From de1b75d62667927842a7acd463bd5b2549f37c69 Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Wed, 3 Nov 2021 01:06:57 +0100 Subject: [PATCH 01/18] Update Simd lib to 4.9.107 version. --- 3rdparty/simdlib/CMakeLists.txt | 16 +- .../Simd/{SimdSse1.h => SimdAlignment.h} | 113 +++-- 3rdparty/simdlib/Simd/SimdAllocator.hpp | 6 +- 3rdparty/simdlib/Simd/SimdArray.h | 30 +- 3rdparty/simdlib/Simd/SimdAvx1.h | 9 +- ...SimdBaseRgbaToGray.cpp => SimdAvx1Cpu.cpp} | 45 +- 3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp | 14 +- 3rdparty/simdlib/Simd/SimdAvx2.h | 22 +- 3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp | 43 +- 3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp | 61 ++- 3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp | 10 +- 3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp | 74 --- 3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp | 149 ++++++ 3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp | 56 ++- 3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp | 72 --- 3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp | 68 +++ .../simdlib/Simd/SimdAvx2Deinterleave.cpp | 59 ++- .../simdlib/Simd/SimdAvx2GaussianBlur.cpp | 3 +- 3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp | 4 +- .../simdlib/Simd/SimdAvx2ReduceGray2x2.cpp | 6 +- .../simdlib/Simd/SimdAvx2ReduceGray3x3.cpp | 4 +- .../simdlib/Simd/SimdAvx2ReduceGray4x4.cpp | 4 +- .../simdlib/Simd/SimdAvx2ReduceGray5x5.cpp | 6 +- .../simdlib/Simd/SimdAvx2ResizeBilinear.cpp | 4 +- 3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp | 23 +- 3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp | 92 ---- 3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp | 97 ---- 3rdparty/simdlib/Simd/SimdBase.h | 18 +- 3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp | 20 +- 3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp | 15 +- 3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp | 4 +- 3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp | 80 --- 3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp | 37 +- 3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp | 15 +- 3rdparty/simdlib/Simd/SimdBaseCpu.cpp | 234 +++++++++ .../simdlib/Simd/SimdBaseDeinterleave.cpp | 43 +- 
.../simdlib/Simd/SimdBaseGaussianBlur.cpp | 2 +- 3rdparty/simdlib/Simd/SimdBaseResizer.cpp | 243 ++++++++- 3rdparty/simdlib/Simd/SimdConfig.h | 10 +- 3rdparty/simdlib/Simd/SimdConst.h | 70 +-- 3rdparty/simdlib/Simd/SimdConversion.h | 55 +-- 3rdparty/simdlib/Simd/SimdCopyPixel.h | 17 + 3rdparty/simdlib/Simd/SimdCpu.h | 101 +++- 3rdparty/simdlib/Simd/SimdDefs.h | 80 +-- 3rdparty/simdlib/Simd/SimdEnable.h | 415 +--------------- 3rdparty/simdlib/Simd/SimdExp.h | 176 ++++++- 3rdparty/simdlib/Simd/SimdExtract.h | 22 +- 3rdparty/simdlib/Simd/SimdFrame.hpp | 98 +++- 3rdparty/simdlib/Simd/SimdInit.h | 35 +- 3rdparty/simdlib/Simd/SimdLib.cpp | 279 ++++++----- 3rdparty/simdlib/Simd/SimdLib.h | 239 +++++---- 3rdparty/simdlib/Simd/SimdLib.hpp | 465 +++++++++++++++++- 3rdparty/simdlib/Simd/SimdLoad.h | 277 +---------- 3rdparty/simdlib/Simd/SimdLoadBlock.h | 251 ++++++++++ 3rdparty/simdlib/Simd/SimdLog.h | 28 +- 3rdparty/simdlib/Simd/SimdMath.h | 47 +- 3rdparty/simdlib/Simd/SimdMemory.h | 104 ++-- 3rdparty/simdlib/Simd/SimdNeon.h | 20 +- 3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp | 45 +- 3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp | 63 ++- 3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp | 10 +- 3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp | 81 --- 3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp | 83 +++- 3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp | 41 +- 3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp | 78 --- .../simdlib/Simd/SimdNeonDeinterleave.cpp | 79 ++- .../simdlib/Simd/SimdNeonGaussianBlur.cpp | 1 + 3rdparty/simdlib/Simd/SimdNeonResizer.cpp | 8 +- 3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp | 71 --- 3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp | 71 --- 3rdparty/simdlib/Simd/SimdPixel.hpp | 200 +++++++- 3rdparty/simdlib/Simd/SimdPow.h | 2 +- 3rdparty/simdlib/Simd/SimdResizer.h | 148 ++++-- 3rdparty/simdlib/Simd/SimdResizerCommon.h | 97 ++++ 3rdparty/simdlib/Simd/SimdRuntime.h | 34 +- 3rdparty/simdlib/Simd/SimdSet.h | 8 +- 3rdparty/simdlib/Simd/SimdSse1Resizer.cpp | 129 ----- 
3rdparty/simdlib/Simd/SimdSse2.h | 8 +- 3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp | 54 +- ...SimdBaseBgraToRgba.cpp => SimdSse2Cpu.cpp} | 44 +- .../simdlib/Simd/SimdSse2GaussianBlur3x3.cpp | 3 +- 3rdparty/simdlib/Simd/SimdSse2Resizer.cpp | 8 +- 3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp | 75 --- 3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp | 96 ---- 3rdparty/simdlib/Simd/SimdSse41.h | 76 +++ ...e3BgrToBgra.cpp => SimdSse41BgrToBgra.cpp} | 185 ++++--- ...e3BgrToGray.cpp => SimdSse41BgrToGray.cpp} | 241 +++++---- ...sse3BgrToRgb.cpp => SimdSse41BgrToRgb.cpp} | 163 +++--- ...e3BgraToBgr.cpp => SimdSse41BgraToBgr.cpp} | 257 ++++++---- ...SimdBaseRgbToGray.cpp => SimdSse41Cpu.cpp} | 46 +- ...terleave.cpp => SimdSse41Deinterleave.cpp} | 60 ++- .../simdlib/Simd/SimdSse41GaussianBlur.cpp | 3 +- ...ur3x3.cpp => SimdSse41GaussianBlur3x3.cpp} | 12 +- ...e3GrayToBgr.cpp => SimdSse41GrayToBgr.cpp} | 147 +++--- ...Interleave.cpp => SimdSse41Interleave.cpp} | 11 +- ...imdSsse3Reduce.cpp => SimdSse41Reduce.cpp} | 401 ++++++++------- ...Gray2x2.cpp => SimdSse41ReduceGray2x2.cpp} | 189 ++++--- ...Gray4x4.cpp => SimdSse41ReduceGray4x4.cpp} | 11 +- ...linear.cpp => SimdSse41ResizeBilinear.cpp} | 9 +- 3rdparty/simdlib/Simd/SimdSse41Resizer.cpp | 311 +++++++++++- 3rdparty/simdlib/Simd/SimdSsse3.h | 77 --- 3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp | 74 --- 3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp | 73 --- .../simdlib/Simd/SimdSsse3CustomFunctions.cpp | 69 --- 3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp | 350 ------------- 3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp | 93 ---- 3rdparty/simdlib/Simd/SimdStore.h | 45 +- 3rdparty/simdlib/Simd/SimdStream.h | 21 +- 3rdparty/simdlib/Simd/SimdUpdate.h | 17 +- 3rdparty/simdlib/Simd/SimdVersion.h | 2 +- 3rdparty/simdlib/Simd/SimdView.hpp | 6 +- modules/core/src/image/vpImageConvert.cpp | 4 +- 112 files changed, 5013 insertions(+), 4067 deletions(-) rename 3rdparty/simdlib/Simd/{SimdSse1.h => SimdAlignment.h} (53%) mode change 100644 => 
100755 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAllocator.hpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdArray.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx1.h rename 3rdparty/simdlib/Simd/{SimdBaseRgbaToGray.cpp => SimdAvx1Cpu.cpp} (57%) mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp create mode 100755 3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp create mode 100644 3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBase.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp mode change 100644 => 100755 
3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseCpu.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseResizer.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdConfig.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdConst.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdConversion.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdCopyPixel.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdCpu.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdDefs.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdEnable.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdExp.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdExtract.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdFrame.hpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdInit.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLib.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLib.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLib.hpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLoad.h create mode 100755 3rdparty/simdlib/Simd/SimdLoadBlock.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLog.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdMath.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdMemory.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeon.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp mode change 100644 => 100755 
3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonResizer.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdPixel.hpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdPow.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdResizer.h create mode 100755 3rdparty/simdlib/Simd/SimdResizerCommon.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdRuntime.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSet.h delete mode 100644 3rdparty/simdlib/Simd/SimdSse1Resizer.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp rename 3rdparty/simdlib/Simd/{SimdBaseBgraToRgba.cpp => SimdSse2Cpu.cpp} (62%) mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2Resizer.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp create mode 100755 3rdparty/simdlib/Simd/SimdSse41.h rename 3rdparty/simdlib/Simd/{SimdSsse3BgrToBgra.cpp => SimdSse41BgrToBgra.cpp} (57%) mode change 100644 => 100755 rename 3rdparty/simdlib/Simd/{SimdSsse3BgrToGray.cpp => SimdSse41BgrToGray.cpp} (56%) mode change 100644 => 100755 rename 3rdparty/simdlib/Simd/{SimdSsse3BgrToRgb.cpp => SimdSse41BgrToRgb.cpp} (84%) mode change 100644 => 100755 
rename 3rdparty/simdlib/Simd/{SimdSsse3BgraToBgr.cpp => SimdSse41BgraToBgr.cpp} (53%) mode change 100644 => 100755 rename 3rdparty/simdlib/Simd/{SimdBaseRgbToGray.cpp => SimdSse41Cpu.cpp} (54%) rename 3rdparty/simdlib/Simd/{SimdSsse3Deinterleave.cpp => SimdSse41Deinterleave.cpp} (74%) mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp rename 3rdparty/simdlib/Simd/{SimdSsse3GaussianBlur3x3.cpp => SimdSse41GaussianBlur3x3.cpp} (95%) rename 3rdparty/simdlib/Simd/{SimdSsse3GrayToBgr.cpp => SimdSse41GrayToBgr.cpp} (92%) mode change 100644 => 100755 rename 3rdparty/simdlib/Simd/{SimdSsse3Interleave.cpp => SimdSse41Interleave.cpp} (96%) rename 3rdparty/simdlib/Simd/{SimdSsse3Reduce.cpp => SimdSse41Reduce.cpp} (96%) mode change 100644 => 100755 rename 3rdparty/simdlib/Simd/{SimdSsse3ReduceGray2x2.cpp => SimdSse41ReduceGray2x2.cpp} (94%) mode change 100644 => 100755 rename 3rdparty/simdlib/Simd/{SimdSsse3ReduceGray4x4.cpp => SimdSse41ReduceGray4x4.cpp} (96%) mode change 100644 => 100755 rename 3rdparty/simdlib/Simd/{SimdSsse3ResizeBilinear.cpp => SimdSse41ResizeBilinear.cpp} (98%) mode change 100644 => 100755 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse41Resizer.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3.h delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdStore.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdStream.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdUpdate.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdView.hpp diff --git a/3rdparty/simdlib/CMakeLists.txt b/3rdparty/simdlib/CMakeLists.txt index e6880b3800..dc6d111aae 100644 --- 
a/3rdparty/simdlib/CMakeLists.txt +++ b/3rdparty/simdlib/CMakeLists.txt @@ -109,23 +109,11 @@ if(X86 OR X86_64) file(GLOB_RECURSE SIMD_BASE_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdBase*.cpp) set_source_files_properties(${SIMD_BASE_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS}") - file(GLOB_RECURSE SIMD_SSE1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse1*.cpp) - set_source_files_properties(${SIMD_SSE1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG}") - file(GLOB_RECURSE SIMD_SSE2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse2*.cpp) - set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE2_FLAG}") - - file(GLOB_RECURSE SIMD_SSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse3*.cpp) - set_source_files_properties(${SIMD_SSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG}") - - file(GLOB_RECURSE SIMD_SSSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSsse3*.cpp) - set_source_files_properties(${SIMD_SSSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSSE3_FLAG}") + set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG} ${SSE2_FLAG}") file(GLOB_RECURSE SIMD_SSE41_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse41*.cpp) - set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_1_FLAG}") - - file(GLOB_RECURSE SIMD_SSE42_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse42*.cpp) - set_source_files_properties(${SIMD_SSE42_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}") + set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG} ${SSSE3_FLAG} ${SSE4_1_FLAG} ${SSE4_2_FLAG}") file(GLOB_RECURSE SIMD_AVX1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx1*.cpp) set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}") diff --git a/3rdparty/simdlib/Simd/SimdSse1.h b/3rdparty/simdlib/Simd/SimdAlignment.h old mode 
100644 new mode 100755 similarity index 53% rename from 3rdparty/simdlib/Simd/SimdSse1.h rename to 3rdparty/simdlib/Simd/SimdAlignment.h index e258d50ab3..9789cbb9e7 --- a/3rdparty/simdlib/Simd/SimdSse1.h +++ b/3rdparty/simdlib/Simd/SimdAlignment.h @@ -1,40 +1,73 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdSse_h__ -#define __SimdSse_h__ - -#include "Simd/SimdDefs.h" - -namespace Simd -{ -#ifdef SIMD_SSE_ENABLE - namespace Sse - { - void SquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum); - - void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum); - } -#endif// SIMD_SSE_ENABLE -} -#endif//__SimdSse_h__ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdAlignment_h__ +#define __SimdAlignment_h__ + +#include "Simd/SimdEnable.h" + +namespace Simd +{ + SIMD_INLINE size_t GetAlignment() + { +#ifdef SIMD_AVX2_ENABLE + if (Avx2::Enable) + return sizeof(__m256i); + else +#endif +#ifdef SIMD_AVX_ENABLE + if (Avx::Enable) + return sizeof(__m256); + else +#endif +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable) + return sizeof(__m128i); + else +#endif +#ifdef SIMD_SSE2_ENABLE + if (Sse2::Enable) + return sizeof(__m128i); + else +#endif +#ifdef SIMD_NEON_ENABLE + if (Neon::Enable) + return sizeof(uint8x16_t); + else +#endif + return sizeof(void *); + } + + extern const size_t ALIGNMENT; + + SIMD_INLINE size_t Alignment() + { +#if defined(WIN32) + return GetAlignment(); +#else + return ALIGNMENT; +#endif + } +} + +#endif//__SimdAlignment_h__ diff --git a/3rdparty/simdlib/Simd/SimdAllocator.hpp b/3rdparty/simdlib/Simd/SimdAllocator.hpp old mode 100644 new mode 100755 index cd65f196f4..8ee548e5ae --- a/3rdparty/simdlib/Simd/SimdAllocator.hpp +++ b/3rdparty/simdlib/Simd/SimdAllocator.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -125,8 +125,8 @@ namespace Simd */ static SIMD_INLINE size_t Alignment() { -#if defined(__SimdEnable_h__) && defined(WIN32) - return Simd::ALIGNMENT; +#if defined(__SimdAlignment_h__) && defined(WIN32) + return Simd::Alignment(); #else return SimdAlignment(); #endif diff --git a/3rdparty/simdlib/Simd/SimdArray.h b/3rdparty/simdlib/Simd/SimdArray.h old mode 100644 new mode 100755 index 30e793080f..2f7f1bbbe0 --- a/3rdparty/simdlib/Simd/SimdArray.h +++ b/3rdparty/simdlib/Simd/SimdArray.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
+* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -57,15 +57,28 @@ namespace Simd } *(size_t*)&size = size_; if (size_) - *(T**)&data = (T*)Simd::Allocate(size * sizeof(T), align); + *(T**)&data = (T*)Simd::Allocate(RawSize(), align); } if (clear) Clear(); } + SIMD_INLINE void Assign(const T * src, size_t size_) + { + Resize(size_, src == NULL); + if(src) + memcpy(data, src, RawSize()); + } + SIMD_INLINE void Clear() { - ::memset(data, 0, size * sizeof(T)); + memset(data, 0, RawSize()); + } + + SIMD_INLINE void Swap(const Array & array) + { + Simd::Swap((T*&)data, (T*&)(array.data)); + Simd::Swap((size_t&)size, (size_t&)(array.size)); } SIMD_INLINE T & operator[] (size_t i) @@ -77,12 +90,19 @@ namespace Simd { return data[i]; } + + SIMD_INLINE size_t RawSize() const + { + return size * sizeof(T); + } }; + typedef Array Array8i; typedef Array Array8u; typedef Array Array16i; typedef Array Array16u; typedef Array Array32i; + typedef Array Array32u; typedef Array Array32f; #if defined(__GNUC__) && __GNUC__ >= 6 @@ -90,8 +110,8 @@ namespace Simd #pragma GCC diagnostic ignored "-Wignored-attributes" #endif -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { typedef Array<__m128> Array128f; } diff --git a/3rdparty/simdlib/Simd/SimdAvx1.h b/3rdparty/simdlib/Simd/SimdAvx1.h old mode 100644 new mode 100755 index 25c070c459..48df913c02 --- a/3rdparty/simdlib/Simd/SimdAvx1.h +++ b/3rdparty/simdlib/Simd/SimdAvx1.h @@ -1,8 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, -* 2019-2019 Facundo Galan. +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -22,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef __SimdAvx1_h__ -#define __SimdAvx1_h__ +#ifndef __SimdAvx_h__ +#define __SimdAvx_h__ #include "Simd/SimdDefs.h" @@ -36,4 +35,4 @@ namespace Simd } #endif// SIMD_AVX_ENABLE } -#endif//__SimdAvx1_h__ +#endif//__SimdAvx_h__ diff --git a/3rdparty/simdlib/Simd/SimdBaseRgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx1Cpu.cpp similarity index 57% rename from 3rdparty/simdlib/Simd/SimdBaseRgbaToGray.cpp rename to 3rdparty/simdlib/Simd/SimdAvx1Cpu.cpp index 22d37b17ee..9d9cbb29d3 100644 --- a/3rdparty/simdlib/Simd/SimdBaseRgbaToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx1Cpu.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -21,23 +21,46 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "Simd/SimdConversion.h" +#include "Simd/SimdEnable.h" +#include "Simd/SimdCpu.h" + +#if defined(_MSC_VER) +#include +#endif namespace Simd { - namespace Base +#ifdef SIMD_AVX_ENABLE + namespace Avx { - void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride) + SIMD_INLINE bool SupportedByCPU() { - for (size_t row = 0; row < height; ++row) + return + Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) && + Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::AVX); + } + + SIMD_INLINE bool SupportedByOS() + { +#if defined(_MSC_VER) + __try { - const uint8_t * pRgba = rgba + row*rgbaStride; - uint8_t * pGray = gray + row*grayStride; - for (const uint8_t *pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgba += 4) - { - *pGray = RgbToGray(pRgba[0], pRgba[1], pRgba[2]); - } + __m256d value = _mm256_set1_pd(1.0);// try to execute of AVX instructions; + return true; } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } +#else + return true; +#endif + } + + bool GetEnable() + { + return SupportedByCPU() && SupportedByOS(); } } +#endif } diff --git a/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp b/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp old mode 100644 new mode 100755 index e409c17ff1..319c609408 --- a/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -42,7 +42,7 @@ namespace Simd float * pbx[2] = { _bx[0].data, _bx[1].data }; int32_t prev = -2; size_t rsa = AlignLo(rs, Avx::F); - size_t rsh = AlignLo(rs, Sse::F); + size_t rsh = AlignLo(rs, Sse2::F); for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) { float fy1 = _ay[dy]; @@ -78,10 +78,10 @@ namespace Simd __m256 m1 = _mm256_mul_ps(fx1, _mm256_shuffle_ps(s0145, s2367, 0xDD)); _mm256_store_ps(pb + dx, _mm256_add_ps(m0, m1)); } - for (; dx < rsh; dx += Sse::F) + for (; dx < rsh; dx += Sse2::F) { - __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); - __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); + __m128 s01 = Sse2::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); + __m128 s23 = Sse2::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); __m128 fx1 = _mm_load_ps(_ax.data + dx); __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1); __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88)); @@ -128,7 +128,7 @@ namespace Simd __m256 m1 = _mm256_mul_ps(_mm256_load_ps(pbx[1] + dx), _fy1); _mm256_storeu_ps(dst + dx, _mm256_add_ps(m0, m1)); } - for (; dx < rsh; dx += Sse::F) + for (; dx < rsh; dx += Sse2::F) { __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0)); __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1)); @@ -144,7 +144,7 @@ namespace Simd void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) { ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m256)); - if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp)) + if (param.IsFloatBilinear()) return new ResizerFloatBilinear(param); else return Sse41::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); 
diff --git a/3rdparty/simdlib/Simd/SimdAvx2.h b/3rdparty/simdlib/Simd/SimdAvx2.h old mode 100644 new mode 100755 index 46d3b2d547..f5957b26c1 --- a/3rdparty/simdlib/Simd/SimdAvx2.h +++ b/3rdparty/simdlib/Simd/SimdAvx2.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, +* Copyright (c) 2011-2021 Yermalayeu Ihar, * 2019-2019 Facundo Galan. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -32,24 +32,22 @@ namespace Simd #ifdef SIMD_AVX2_ENABLE namespace Avx2 { + void BgraToBgr(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* bgr, size_t bgrStride); + void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride); - void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride); + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride); + + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride); void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha); - - void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride); - void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride); - void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, 
uint8_t * gray, size_t grayStride); - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); + void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride); void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride); @@ -87,6 +85,12 @@ namespace Simd void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); + void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); + + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride); + void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); } diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp old mode 100644 new mode 100755 index b1f9ef8417..ffb4828e98 --- a/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -67,6 +67,8 @@ namespace Simd BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); } + //--------------------------------------------------------------------- + template SIMD_INLINE void Bgr48pToBgra32(uint8_t * bgra, const uint8_t * blue, const uint8_t * green, const uint8_t * red, size_t offset, __m256i alpha) { @@ -117,6 +119,45 @@ namespace Simd else Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); } + + //--------------------------------------------------------------------- + + template SIMD_INLINE void RgbToBgra(const uint8_t* rgb, uint8_t* bgra, __m256i alpha) + { + Store((__m256i*)bgra + 0, RgbToBgra(Load((__m256i*)(rgb + 0)), alpha)); + Store((__m256i*)bgra + 1, RgbToBgra(Load((__m256i*)(rgb + 24)), alpha)); + Store((__m256i*)bgra + 2, RgbToBgra(Load((__m256i*)(rgb + 48)), alpha)); + Store((__m256i*)bgra + 3, RgbToBgra(Load((__m256i*)(rgb + 64)), alpha)); + } + + template void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t alignedWidth = AlignLo(width, A); + + __m256i _alpha = _mm256_slli_si256(_mm256_set1_epi32(alpha), 3); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + RgbToBgra(rgb + 3 * col, bgra + 4 * col, _alpha); + if (width != alignedWidth) + RgbToBgra(rgb + 3 * (width - A), bgra + 4 * (width - A), _alpha); + rgb += rgbStride; + bgra += bgraStride; + } + } + + void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) + { + if (Aligned(bgra) && Aligned(bgraStride) && 
Aligned(rgb) && Aligned(rgbStride)) + RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + else + RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + } } #else // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToBgra.cpp.o) has no symbols diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp old mode 100644 new mode 100755 index d40b0f0cc6..7b922e7025 --- a/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -45,7 +45,7 @@ namespace Simd { const __m256i lo = PackI32ToI16(BgraToGray32(bgra[0]), BgraToGray32(bgra[1])); const __m256i hi = PackI32ToI16(BgraToGray32(bgra[2]), BgraToGray32(bgra[3])); - return PackU16ToU8(lo, hi); + return PackI16ToU8(lo, hi); } template SIMD_INLINE __m256i BgrToGray(const uint8_t * bgr) @@ -84,6 +84,63 @@ namespace Simd else BgrToGray(bgr, width, height, bgrStride, gray, grayStride); } + + + //--------------------------------------------------------------------- + + const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); + + SIMD_INLINE __m256i RgbaToGray32(__m256i rgba) + { + const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF); + const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF); + const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_ROUND), _mm256_madd_epi16(r0b0, K16_RED_BLUE)); + return _mm256_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT); + } + + SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4]) + { + const __m256i lo = PackI32ToI16(RgbaToGray32(rgba[0]), 
RgbaToGray32(rgba[1])); + const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); + return PackI16ToU8(lo, hi); + } + + template SIMD_INLINE __m256i RgbToGray(const uint8_t* rgb) + { + __m256i rgba[4]; + rgba[0] = BgrToBgra(Load((__m256i*)(rgb + 0)), K32_01000000); + rgba[1] = BgrToBgra(Load((__m256i*)(rgb + 24)), K32_01000000); + rgba[2] = BgrToBgra(Load((__m256i*)(rgb + 48)), K32_01000000); + rgba[3] = BgrToBgra(Load((__m256i*)(rgb + 64)), K32_01000000); + return RgbaToGray(rgba); + } + + template void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) + { + assert(width >= A); + if (align) + assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t alignedWidth = AlignLo(width, A); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + Store((__m256i*)(gray + col), RgbToGray(rgb + 3 * col)); + if (width != alignedWidth) + Store((__m256i*)(gray + width - A), RgbToGray(rgb + 3 * (width - A))); + rgb += rgbStride; + gray += grayStride; + } + } + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) + { + if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)) + RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + else + RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + } } #else // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToGray.cpp.o) has no symbols diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp old mode 100644 new mode 100755 index 2daae1e7df..a64ed8035e --- a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -64,7 +64,7 @@ namespace Simd _mm256_shuffle_epi8(p1, K8_SHFL_2P1)), _mm256_shuffle_epi8(p2, K8_SHFL_2P2))); } - template void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) + template void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride) { assert(width >= A); if (align) @@ -85,12 +85,12 @@ namespace Simd } } - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) + void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride) { if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)) - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); + BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); else - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); + BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); } } #else diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp deleted file mode 100644 index a4f9efdb2f..0000000000 --- a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void BgrToRgba(const uint8_t * bgr, uint8_t * rgba, __m256i alpha) - { - Store((__m256i*)rgba + 0, BgrToRgba(Load((__m256i*)(bgr + 0)), alpha)); - Store((__m256i*)rgba + 1, BgrToRgba(Load((__m256i*)(bgr + 24)), alpha)); - Store((__m256i*)rgba + 2, BgrToRgba(Load((__m256i*)(bgr + 48)), alpha)); - Store((__m256i*)rgba + 3, BgrToRgba(Load((__m256i*)(bgr + 64)), alpha)); - } - - template void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha) - { - assert(width >= A); - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgba) && Aligned(rgbaStride)); - - size_t alignedWidth = AlignLo(width, A); - - __m256i _alpha = _mm256_slli_si256(_mm256_set1_epi32(alpha), 3); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BgrToRgba(bgr + 3 * col, rgba + 4 * col, _alpha); - if (width != alignedWidth) - BgrToRgba(bgr + 3 * (width - A), rgba + 4 * (width - A), _alpha); - bgr += bgrStride; - rgba += rgbaStride; - } - } - - void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha) - { - if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - else - BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToRgba.cpp.o) has no symbols - void dummy_SimdAvx2BgrToRgba(){}; -#endif//SIMD_AVX2_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp new file mode 100755 index 0000000000..aac574d71c --- /dev/null +++ 
b/3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp @@ -0,0 +1,149 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdMemory.h" +#include "Simd/SimdConst.h" + +namespace Simd +{ +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + template SIMD_INLINE __m256i BgraToBgr(const uint8_t* bgra) + { + __m256i _bgra = Load((__m256i*)bgra); + return _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(_bgra, K8_SHUFFLE_BGRA_TO_BGR), K32_PERMUTE_BGRA_TO_BGR); + } + + template void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) + { + assert(width >= F); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); + + size_t widthF = AlignLo(width, F); + if (width == widthF) + widthF -= F; + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < widthF; col += F) + Store((__m256i*)(bgr + 3 * col), BgraToBgr(bgra + 4 * col)); + if (width != widthF) + Store24(bgr + 3 * (width - F), BgraToBgr(bgra + 4 * (width - F))); + bgra += bgraStride; + bgr += bgrStride; + } + } + + void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) + BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); + else + BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); + } + + //--------------------------------------------------------------------- + + const __m256i K8_SHUFFLE_BGRA_TO_RGB = SIMD_MM256_SETR_EPI8( + 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1, + 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1); + + template SIMD_INLINE __m256i BgraToRgb(const uint8_t* bgra) + { + __m256i _bgra = Load((__m256i*)bgra); + return _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(_bgra, K8_SHUFFLE_BGRA_TO_RGB), K32_PERMUTE_BGRA_TO_BGR); + } + + template void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, 
size_t rgbStride) + { + assert(width >= F); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t widthF = AlignLo(width, F); + if (width == widthF) + widthF -= F; + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < widthF; col += F) + Store((__m256i*)(rgb + 3 * col), BgraToRgb(bgra + 4 * col)); + if (width != widthF) + Store24(rgb + 3 * (width - F), BgraToRgb(bgra + 4 * (width - F))); + bgra += bgraStride; + rgb += rgbStride; + } + } + + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)) + BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); + else + BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); + } + + //--------------------------------------------------------------------- + + const __m256i K8_BGRA_TO_RGBA = SIMD_MM256_SETR_EPI8( + 0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF, + 0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF); + + template SIMD_INLINE void BgraToRgba(const uint8_t* bgra, uint8_t* rgba) + { + Store((__m256i*)rgba, _mm256_shuffle_epi8(Load((__m256i*)bgra), K8_BGRA_TO_RGBA)); + } + + template void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride)); + + size_t size = width * 4; + size_t sizeA = AlignLo(size, A); + + for (size_t row = 0; row < height; ++row) + { + for (size_t i = 0; i < sizeA; i += A) /* whole A-byte vectors only: bounding by size would read/write past the row end */ + BgraToRgba(bgra + i, rgba + i); + if (size != sizeA) + BgraToRgba(bgra + size - A, rgba + size - A); /* tail: the last A bytes of the row (overlaps the previous vector) */ + bgra += bgraStride; + rgba += rgbaStride; + } + } + + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride,
uint8_t* rgba, size_t rgbaStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride)) + BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + else + BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + } + } +#endif// SIMD_AVX2_ENABLE +} diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp old mode 100644 new mode 100755 index f203fcae78..7082801956 --- a/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -46,7 +46,7 @@ namespace Simd { const __m256i lo = PackI32ToI16(BgraToGray32(bgra[0]), BgraToGray32(bgra[1])); const __m256i hi = PackI32ToI16(BgraToGray32(bgra[2]), BgraToGray32(bgra[3])); - return PackU16ToU8(lo, hi); + return PackI16ToU8(lo, hi); } template SIMD_INLINE void Load(const uint8_t* p, __m256i a[4]) @@ -89,6 +89,58 @@ namespace Simd else BgraToGray(bgra, width, height, bgraStride, gray, grayStride); } + + //--------------------------------------------------------------------- + + const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); + + SIMD_INLINE __m256i RgbaToGray32(__m256i rgba) + { + const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF); + const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF); + const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_0000), _mm256_madd_epi16(r0b0, K16_RED_BLUE)); + return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT); + } + + SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4]) + { + const __m256i lo 
= PackI32ToI16(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1])); + const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); + return PackI16ToU8(lo, hi); + } + + template void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) + { + assert(width >= A); + if (align) + assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride)); + + size_t alignedWidth = AlignLo(width, A); + __m256i a[4]; + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + { + Load(rgba + 4 * col, a); + Store((__m256i*)(gray + col), RgbaToGray(a)); + } + if (alignedWidth != width) + { + Load(rgba + 4 * (width - A), a); + Store((__m256i*)(gray + width - A), RgbaToGray(a)); + } + rgba += rgbaStride; + gray += grayStride; + } + } + + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) + { + if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride)) + RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + else + RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + } } #else // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgraToGray.cpp.o) has no symbols diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp deleted file mode 100644 index d64f184cbf..0000000000 --- a/3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void BgraToRgba(const uint8_t * bgra, uint8_t * rgba) - { - Store((__m256i*)rgba + 0, BgraToRgba(Load((__m256i*)(bgra + 0)))); - Store((__m256i*)rgba + 1, BgraToRgba(Load((__m256i*)(bgra + 32)))); - Store((__m256i*)rgba + 2, BgraToRgba(Load((__m256i*)(bgra + 64)))); - Store((__m256i*)rgba + 3, BgraToRgba(Load((__m256i*)(bgra + 96)))); - } - - template void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride)); - - size_t alignedWidth = AlignLo(width, A); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BgraToRgba(bgra + 4 * col, rgba + 4 * col); - if (width != alignedWidth) - BgraToRgba(bgra + 4 * (width - A), rgba + 4 * (width - A)); - bgra += bgraStride; - rgba += rgbaStride; - } - } - - void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride) - { - if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride)) - BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); - else - BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToRgba.cpp.o) has no symbols - void dummy_SimdAvx2BgraToRgba(){}; -#endif//SIMD_AVX2_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp b/3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp new file mode 100644 index 0000000000..778b11803a --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp @@ -0,0 +1,68 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdEnable.h" +#include "Simd/SimdCpu.h" + +#if defined(_MSC_VER) +#include +#endif + +namespace Simd +{ +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + SIMD_INLINE bool SupportedByCPU() + { + return + Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) && + Base::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX2) && + Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::FMA) && + Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::F16C); + } + + SIMD_INLINE bool SupportedByOS() + { +#if defined(_MSC_VER) + __try + { + __m256i value = _mm256_abs_epi8(_mm256_set1_epi8(1));// try to execute of AVX2 instructions; + return true; + } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } +#else + return true; +#endif + } + + bool GetEnable() + { + return SupportedByCPU() && SupportedByOS(); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp b/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp old mode 100644 new mode 100755 index 762d0f37ba..2bf5741a35 --- a/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -69,13 +69,15 @@ namespace Simd DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); } + //--------------------------------------------------------------------- + const __m256i K8_SHUFFLE_BGRA = SIMD_MM256_SETR_EPI8( 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF, 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF); const __m256i K32_PERMUTE_BGRA = SIMD_MM256_SETR_EPI32(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7); - template SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset) + template SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset) { __m256i _bgra[4]; _bgra[0] = _mm256_shuffle_epi8(Load((__m256i*)bgra + 0), K8_SHUFFLE_BGRA); @@ -93,39 +95,58 @@ namespace Simd __m256i rraa1 = _mm256_unpackhi_epi32(_bgra[2], _bgra[3]); Store((__m256i*)(r + offset), _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(rraa0, rraa1), K32_PERMUTE_BGRA)); - Store((__m256i*)(a + offset), _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(rraa0, rraa1), K32_PERMUTE_BGRA)); + if(alpha) + Store((__m256i*)(a + offset), _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(rraa0, rraa1), K32_PERMUTE_BGRA)); } - template void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) + template void DeinterleaveBgra(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height, + uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride) { assert(width >= A); if (align) { assert(Aligned(bgra) && 
Aligned(bgraStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)); + assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL)); } size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) + if (a) { - for (size_t col = 0; col < alignedWidth; col += A) - DeinterleaveBgra(bgra + col * 4, b, g, r, a, col); - if (width != alignedWidth) - DeinterleaveBgra(bgra + 4 * (width - A), b, g, r, a, width - A); - bgra += bgraStride; - b += bStride; - g += gStride; - r += rStride; - a += aStride; + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + DeinterleaveBgra(bgra + col * 4, b, g, r, a, col); + if (width != alignedWidth) + DeinterleaveBgra(bgra + 4 * (width - A), b, g, r, a, width - A); + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; + a += aStride; + } + } + else + { + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + DeinterleaveBgra(bgra + col * 4, b, g, r, NULL, col); + if (width != alignedWidth) + DeinterleaveBgra(bgra + 4 * (width - A), b, g, r, NULL, width - A); + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; + } } } - void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) + void DeinterleaveBgra(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height, + uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride) { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)) + if (Aligned(bgra) && 
Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && + Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL)) DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); else DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); diff --git a/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp b/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp old mode 100644 new mode 100755 index 243663a169..beefb55410 --- a/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2020 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "Simd/SimdMemory.h" +#include "Simd/SimdLoadBlock.h" #include "Simd/SimdStore.h" #include "Simd/SimdGaussianBlur.h" #include "Simd/SimdExtract.h" diff --git a/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp b/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp old mode 100644 new mode 100755 index ca40f5a347..5a85a27334 --- a/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp @@ -42,7 +42,7 @@ namespace Simd _mm256_and_si256(_mm256_srli_si256(s01, 1), K16_00FF), _mm256_and_si256(s11, K16_00FF), _mm256_and_si256(_mm256_srli_si256(s11, 1), K16_00FF)); - return PackU16ToU8(lo, hi); + return PackI16ToU8(lo, hi); } #else SIMD_INLINE __m256i Average16(const __m256i & s0, const __m256i & s1) @@ -52,7 +52,7 @@ namespace Simd SIMD_INLINE __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11) { - return PackU16ToU8(Average16(s00, s10), Average16(s01, s11)); + return PackI16ToU8(Average16(s00, s10), Average16(s01, s11)); } #endif diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp old mode 100644 new mode 100755 index c4ee30e989..d7caad1571 --- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2018 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -42,7 +42,7 @@ namespace Simd _mm256_and_si256(_mm256_srli_si256(s01, 1), K16_00FF), _mm256_and_si256(s11, K16_00FF), _mm256_and_si256(_mm256_srli_si256(s11, 1), K16_00FF)); - return PackU16ToU8(lo, hi); + return PackI16ToU8(lo, hi); } #else SIMD_INLINE __m256i Average16(const __m256i & s0, const __m256i & s1) @@ -52,7 +52,7 @@ namespace Simd SIMD_INLINE __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11) { - return PackU16ToU8(Average16(s00, s10), Average16(s01, s11)); + return PackI16ToU8(Average16(s00, s10), Average16(s01, s11)); } #endif diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp old mode 100644 new mode 100755 index 34b4a91ecb..71f36b978f --- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -78,7 +78,7 @@ namespace Simd template SIMD_INLINE __m256i ReduceRow(const __m256i lo[3], const __m256i hi[3]) { - return PackU16ToU8( + return PackI16ToU8( DivideBy16(BinomialSum16(lo[0], lo[1], lo[2])), DivideBy16(BinomialSum16(hi[0], hi[1], hi[2]))); } diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp old mode 100644 new mode 100755 index bf732178ed..cea41815d3 --- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). 
* -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -119,7 +119,7 @@ namespace Simd { __m256i lo = ReduceRow16(buffer, offset); __m256i hi = ReduceRow16(buffer, offset + HA); - return PackU16ToU8(lo, hi); + return PackI16ToU8(lo, hi); } template void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp old mode 100644 new mode 100755 index 96771d8aee..fe2ebbd3cf --- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -120,14 +120,14 @@ namespace Simd { const __m256i lo = MainRowX5x5(buffer.dst + offset); const __m256i hi = MainRowX5x5(buffer.dst + offset + HA); - return _mm256_and_si256(PackU16ToU8(lo, hi), K16_00FF); + return _mm256_and_si256(PackI16ToU8(lo, hi), K16_00FF); } template SIMD_INLINE void MainRowX5x5(Buffer & buffer, size_t offset, uint8_t * dst) { __m256i lo = MainRowX5x5(buffer, offset); __m256i hi = MainRowX5x5(buffer, offset + A); - Store((__m256i*)dst, PackU16ToU8(lo, hi)); + Store((__m256i*)dst, PackI16ToU8(lo, hi)); } template void ReduceGray5x5( diff --git a/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp b/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp old mode 100644 new mode 100755 index f00b174cb2..53c9cdc9f8 --- a/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp @@ -1,7 
+1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -273,7 +273,7 @@ namespace Simd { __m256i lo = InterpolateY((__m256i*)bx0 + 0, (__m256i*)bx1 + 0, alpha); __m256i hi = InterpolateY((__m256i*)bx0 + 1, (__m256i*)bx1 + 1, alpha); - Store((__m256i*)dst, PackU16ToU8(lo, hi)); + Store((__m256i*)dst, PackI16ToU8(lo, hi)); } template void ResizeBilinear( diff --git a/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp b/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp old mode 100644 new mode 100755 index ab739b7aa9..d75c24989d --- a/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -23,6 +23,7 @@ */ #include "Simd/SimdMemory.h" #include "Simd/SimdResizer.h" +#include "Simd/SimdResizerCommon.h" #include "Simd/SimdStore.h" #include "Simd/SimdSet.h" #include "Simd/SimdUpdate.h" @@ -33,7 +34,7 @@ namespace Simd namespace Avx2 { ResizerByteBilinear::ResizerByteBilinear(const ResParam & param) - : Ssse3::ResizerByteBilinear(param) + : Sse41::ResizerByteBilinear(param) { } @@ -223,7 +224,7 @@ namespace Simd { __m256i lo = ResizerByteBilinearInterpolateY((__m256i*)bx0 + 0, (__m256i*)bx1 + 0, alpha); __m256i hi = ResizerByteBilinearInterpolateY((__m256i*)bx0 + 1, (__m256i*)bx1 + 1, alpha); - Store((__m256i*)dst, PackU16ToU8(lo, hi)); + Store((__m256i*)dst, PackI16ToU8(lo, hi)); } template void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) @@ -523,7 +524,7 @@ namespace Simd float * pbx[2] = { _bx[0].data, _bx[1].data }; int32_t prev = -2; size_t rsa = AlignLo(rs, Avx::F); - size_t rsh = AlignLo(rs, Sse::F); + size_t rsh = AlignLo(rs, Sse2::F); for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) { float fy1 = _ay[dy]; @@ -560,10 +561,10 @@ namespace Simd __m256 s1 = _mm256_shuffle_ps(s0145, s2367, 0xDD); _mm256_store_ps(pb + dx, _mm256_fmadd_ps(s0, fx0, _mm256_mul_ps(s1, fx1))); } - for (; dx < rsh; dx += Sse::F) + for (; dx < rsh; dx += Sse2::F) { - __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); - __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); + __m128 s01 = Sse2::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); + __m128 s23 = Sse2::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); __m128 fx1 = _mm_load_ps(_ax.data + dx); __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1); __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88)); @@ -625,7 +626,7 @@ namespace Simd __m256 b1 = _mm256_load_ps(pbx[1] + dx); 
_mm256_storeu_ps(dst + dx, _mm256_fmadd_ps(b0, _fy0, _mm256_mul_ps(b1, _fy1))); } - for (; dx < rsh; dx += Sse::F) + for (; dx < rsh; dx += Sse2::F) { __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0)); __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1)); @@ -641,11 +642,11 @@ namespace Simd void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) { ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m256i)); - if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && dstX >= A) + if (param.IsByteBilinear() && dstX >= A) return new ResizerByteBilinear(param); - else if (type == SimdResizeChannelByte && method == SimdResizeMethodArea) + else if (param.IsByteArea()) return new ResizerByteArea(param); - else if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp)) + else if (param.IsFloatBilinear()) return new ResizerFloatBilinear(param); else return Avx::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); diff --git a/3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp deleted file mode 100644 index 1533d99dfb..0000000000 --- a/3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); - const __m256i K16_GREEN_ROUND = SIMD_MM256_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m256i RgbaToGray32(__m256i rgba) - { - const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF); - const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF); - const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_ROUND), _mm256_madd_epi16(r0b0, K16_RED_BLUE)); - return _mm256_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4]) - { - const __m256i lo = PackI32ToI16(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1])); - const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); - return PackU16ToU8(lo, hi); - } - - template SIMD_INLINE __m256i RgbToGray(const uint8_t * rgb) - { - __m256i rgba[4]; - rgba[0] = BgrToBgra(Load((__m256i*)(rgb + 0)), K32_01000000); - rgba[1] = BgrToBgra(Load((__m256i*)(rgb + 24)), K32_01000000); - rgba[2] = BgrToBgra(Load((__m256i*)(rgb + 48)), K32_01000000); - rgba[3] = BgrToBgra(Load((__m256i*)(rgb + 64)), K32_01000000); - return RgbaToGray(rgba); - } - - template void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride) - { - assert(width >= A); - if (align) - assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)); - - size_t alignedWidth = AlignLo(width, A); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - Store((__m256i*)(gray + col), RgbToGray(rgb + 3 * col)); - if (width != alignedWidth) - Store((__m256i*)(gray + width - A), RgbToGray(rgb + 3 * (width - A))); - rgb += rgbStride; 
- gray += grayStride; - } - } - - void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride) - { - if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)) - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - else - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2RgbToGray.cpp.o) has no symbols - void dummy_SimdAvx2RgbToGray(){}; -#endif//SIMD_AVX2_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp deleted file mode 100644 index d28cb39832..0000000000 --- a/3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); - const __m256i K16_GREEN_0000 = SIMD_MM256_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, 0x0000); - const __m256i K32_ROUND_TERM = SIMD_MM256_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m256i RgbaToGray32(__m256i rgba) - { - const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF); - const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF); - const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_0000), _mm256_madd_epi16(r0b0, K16_RED_BLUE)); - return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4]) - { - const __m256i lo = PackI32ToI16(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1])); - const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); - return PackU16ToU8(lo, hi); - } - - template SIMD_INLINE void Load(const uint8_t* p, __m256i a[4]) - { - a[0] = Load((__m256i*)p + 0); - a[1] = Load((__m256i*)p + 1); - a[2] = Load((__m256i*)p + 2); - a[3] = Load((__m256i*)p + 3); - } - - template void RgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride) - { - assert(width >= A); - if (align) - assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, A); - __m256i a[4]; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - Load(rgba + 4 * col, a); - Store((__m256i*)(gray + col), RgbaToGray(a)); - } - if (alignedWidth != width) - { - Load(rgba + 4 * (width - A), a); - Store((__m256i*)(gray + width - A), RgbaToGray(a)); - } - rgba += rgbaStride; - gray += 
grayStride; - } - } - - void RgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride) - { - if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride)) - RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); - else - RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2RgbaToGray.cpp.o) has no symbols - void dummy_SimdAvx2RgbaToGray(){}; -#endif// SIMD_AVX2_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdBase.h b/3rdparty/simdlib/Simd/SimdBase.h old mode 100644 new mode 100755 index 57d654751e..998a7b7cbe --- a/3rdparty/simdlib/Simd/SimdBase.h +++ b/3rdparty/simdlib/Simd/SimdBase.h @@ -38,7 +38,9 @@ namespace Simd void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride); - void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride); + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride); + + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride); void BgrToBgra(const uint8_t * bgr, size_t size, uint8_t * bgra, bool fillAlpha, bool lastRow, uint8_t alpha); @@ -47,15 +49,9 @@ namespace Simd void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha); - - void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride); - void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t 
bgrStride, uint8_t * gray, size_t grayStride); - void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride); - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); + void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride); void Copy(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, uint8_t * dst, size_t dstStride); @@ -104,6 +100,12 @@ namespace Simd void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); + void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); + + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride); + void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp old mode 100644 new mode 100755 index b909ee9d20..b5b8140dbe --- a/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -106,5 +106,23 @@ namespace Simd bgra += bgraStride; } } + + void RgbToBgra(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) + { + size_t rgbGap = rgbStride - width * 3; + size_t bgraGap = bgraStride - width * 4; + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < width; ++col, rgb += 3, bgra += 4) + { + bgra[0] = rgb[2]; + bgra[1] = rgb[1]; + bgra[2] = rgb[0]; + bgra[3] = alpha; + } + rgb += rgbGap; + bgra += bgraGap; + } + } } } diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp old mode 100644 new mode 100755 index e6fa81ddb1..26f7bf171b --- a/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -39,5 +39,18 @@ namespace Simd } } } + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) + { + for (size_t row = 0; row < height; ++row) + { + const uint8_t* pRgb = rgb + row * rgbStride; + uint8_t* pGray = gray + row * grayStride; + for (const uint8_t* pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgb += 3) + { + *pGray = BgrToGray(pRgb[2], pRgb[1], pRgb[0]); + } + } + } } } diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp old mode 100644 new mode 100755 index d508115a64..ece4ffc97f --- a/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,7 +27,7 @@ namespace Simd { namespace Base { - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) + void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride) { size_t size = width * 3; for (size_t row = 0; row < height; ++row) diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp deleted file mode 100644 index b7003c067b..0000000000 --- a/3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdDefs.h" -#include - -namespace Simd -{ - namespace Base - { - void BgrToRgba(const uint8_t *bgr, size_t size, uint8_t *rgba, bool fillAlpha, bool lastRow, uint8_t alpha) - { - if (fillAlpha) - { -#ifdef SIMD_BIG_ENDIAN - const int32_t alphaMask = alpha; -#else - const int32_t alphaMask = alpha << 24; -#endif - for (size_t i = (lastRow ? 1 : 0); i < size; ++i, bgr += 3, rgba += 4) - { - *(int32_t*)rgba = (*(int32_t*)bgr) | alphaMask; - std::swap(rgba[0], rgba[2]); - } - if (lastRow) - { - rgba[0] = bgr[2]; - rgba[1] = bgr[1]; - rgba[2] = bgr[0]; - rgba[3] = alpha; - } - } - else - { - for (size_t i = (lastRow ? 
1 : 0); i < size; ++i, bgr += 3, rgba += 4) - { - *(int32_t*)rgba = (*(int32_t*)bgr); - std::swap(rgba[0], rgba[2]); - } - if (lastRow) - { - rgba[0] = bgr[2]; - rgba[1] = bgr[1]; - rgba[2] = bgr[0]; - } - } - } - - void BgrToRgba(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *rgba, size_t bgraStride, uint8_t alpha) - { - for (size_t row = 1; row < height; ++row) - { - BgrToRgba(bgr, width, rgba, true, false, alpha); - bgr += bgrStride; - rgba += bgraStride; - } - BgrToRgba(bgr, width, rgba, true, true, alpha); - } - } -} diff --git a/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp old mode 100644 new mode 100755 index 8d3b1bbc6c..6ee5d55355 --- a/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -51,5 +51,40 @@ namespace Simd } BgraToBgr(bgra, width, bgr, true); } + + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) + { + size_t bgraGap = bgraStride - width * 4; + size_t rgbGap = rgbStride - width * 3; + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < width; ++col, bgra += 4, rgb += 3) + { + rgb[2] = bgra[0]; + rgb[1] = bgra[1]; + rgb[0] = bgra[2]; + } + bgra += bgraGap; + rgb += rgbGap; + } + } + + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride) + { + size_t bgraGap = bgraStride - width * 4; + size_t rgbaGap = rgbaStride - width * 4; + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < width; ++col, bgra += 4, rgba += 4) + { + rgba[2] = bgra[0]; + 
rgba[1] = bgra[1]; + rgba[0] = bgra[2]; + rgba[3] = bgra[3]; + } + bgra += bgraGap; + rgba += rgbaGap; + } + } } } diff --git a/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp b/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp old mode 100644 new mode 100755 index 3d855e749e..16fba3e7ce --- a/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -39,5 +39,18 @@ namespace Simd } } } + + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) + { + for (size_t row = 0; row < height; ++row) + { + const uint8_t* pRgba = rgba + row * rgbaStride; + uint8_t* pGray = gray + row * grayStride; + for (const uint8_t* pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgba += 4) + { + *pGray = BgrToGray(pRgba[2], pRgba[1], pRgba[0]); + } + } + } } } diff --git a/3rdparty/simdlib/Simd/SimdBaseCpu.cpp b/3rdparty/simdlib/Simd/SimdBaseCpu.cpp new file mode 100644 index 0000000000..77fc5718df --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdBaseCpu.cpp @@ -0,0 +1,234 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdCpu.h" + +#include +#include +#include +#include + +#if defined(_MSC_VER) + +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include + +#elif defined(__GNUC__) +#include +#include +#include + +#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) +#include +#endif + +#if defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE) +#include +#include +#if defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE) +#include +#endif +#endif + +#else +# error Do not know how to detect CPU info +#endif + +namespace Simd +{ + namespace Base + { +#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) + bool CheckBit(Cpuid::Level level, Cpuid::Register index, Cpuid::Bit bit) + { + unsigned int registers[4] = { 0, 0, 0, 0 }; +#if defined(_MSC_VER) + __cpuid((int*)registers, level); +#elif (defined __GNUC__) + if (__get_cpuid_max(0, NULL) < level) + return false; + __cpuid_count(level, 0, + registers[Cpuid::Eax], + registers[Cpuid::Ebx], + registers[Cpuid::Ecx], + registers[Cpuid::Edx]); +#else +#error Do not know how to detect CPU info! 
+#endif + return (registers[index] & bit) == bit; + } +#endif//defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) + +#if defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)) + bool CheckBit(int at, int bit) + { + bool result = false; + int file = ::open("/proc/self/auxv", O_RDONLY); + if (file < 0) + return false; + const ssize_t size = 64; + unsigned long buffer[size]; + for (ssize_t count = size; count == size;) + { + count = ::read(file, buffer, sizeof(buffer)) / sizeof(unsigned long); + for (int i = 0; i < count; i += 2) + { + if (buffer[i] == (unsigned)at) + { + result = !!(buffer[i + 1] & bit); + count = 0; + } + if (buffer[i] == AT_NULL) + count = 0; + } + } + ::close(file); + return result; + } +#endif//defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)) + + size_t CpuThreadNumber() + { + return std::thread::hardware_concurrency(); + } + +#if defined(_MSC_VER) + typedef SYSTEM_LOGICAL_PROCESSOR_INFORMATION Info; + + void GetLogicalProcessorInformation(std::vector & info) + { + DWORD size = 0; + ::GetLogicalProcessorInformation(0, &size); + info.resize(size / sizeof(Info)); + ::GetLogicalProcessorInformation(info.data(), &size); + } + + size_t CpuSocketNumber() + { + std::vector info; + GetLogicalProcessorInformation(info); + size_t number = 0; + for (size_t i = 0; i < info.size(); ++i) + if (info[i].Relationship == ::RelationNumaNode) + number++; + return number; + } + + size_t CpuCoreNumber() + { + std::vector info; + GetLogicalProcessorInformation(info); + size_t number = 0; + for (size_t i = 0; i < info.size(); ++i) + if (info[i].Relationship == ::RelationProcessorCore) + number++; + return number; + } + + size_t CpuCacheSize(size_t level) + { + std::vector info; + GetLogicalProcessorInformation(info); + for (size_t i = 0; i < info.size(); ++i) + if (info[i].Relationship == 
::RelationCache && info[i].Cache.Level == level && (info[i].Cache.Type == ::CacheData || info[i].Cache.Type == CacheUnified)) + return info[i].Cache.Size; + return 0; + } +#elif defined(__GNUC__) + size_t CpuSocketNumber() + { + uint32_t number = 0; + ::FILE * p = ::popen("lscpu -b -p=Socket | grep -v '^#' | sort -u | wc -l", "r"); + if (p) + { + char buffer[PATH_MAX]; + while (::fgets(buffer, PATH_MAX, p)); + number = ::atoi(buffer); + ::pclose(p); + } + return number; + } + + size_t CpuCoreNumber() + { + uint32_t number = 0; + ::FILE * p = ::popen("lscpu -b -p=Core | grep -v '^#' | sort -u | wc -l", "r"); + if (p) + { + char buffer[PATH_MAX]; + while (::fgets(buffer, PATH_MAX, p)); + number = ::atoi(buffer); + ::pclose(p); + } + return number; + } + + SIMD_INLINE size_t CorrectIfZero(size_t value, size_t otherwise) + { + return value ? value : otherwise; + } + +#if defined(_SC_LEVEL1_DCACHE_SIZE) && defined(_SC_LEVEL2_CACHE_SIZE) && defined(_SC_LEVEL3_CACHE_SIZE) + size_t CpuCacheSize(size_t level) + { + switch (level) + { + case 1: return CorrectIfZero(::sysconf(_SC_LEVEL1_DCACHE_SIZE), 32 * 1024); + case 2: return CorrectIfZero(::sysconf(_SC_LEVEL2_CACHE_SIZE), 256 * 1024); + case 3: return CorrectIfZero(::sysconf(_SC_LEVEL3_CACHE_SIZE), 2048 * 1024); + default: + return 0; + } + } +#else + size_t CpuCacheSize(size_t level) + { + switch (level) + { + case 1: return 32 * 1024; + case 2: return 256 * 1024; + case 3: return 2048 * 1024; + default: + return 0; + } + } +#endif + +#else +#error This platform is unsupported! 
+#endif + } + + namespace Cpu + { + const size_t SOCKET_NUMBER = Base::CpuSocketNumber(); + const size_t CORE_NUMBER = Base::CpuCoreNumber(); + const size_t THREAD_NUMBER = Base::CpuThreadNumber(); + const size_t L1_CACHE_SIZE = Base::CpuCacheSize(1); + const size_t L2_CACHE_SIZE = Base::CpuCacheSize(2); + const size_t L3_CACHE_SIZE = Base::CpuCacheSize(3); + } +} diff --git a/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp b/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp old mode 100644 new mode 100755 index ecb22ed4b0..366ce1bc0e --- a/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -48,20 +48,39 @@ namespace Simd void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) { - for (size_t row = 0; row < height; ++row) + if (a) { - for (size_t col = 0, offset = 0; col < width; ++col, offset += 4) + for (size_t row = 0; row < height; ++row) { - b[col] = bgra[offset + 0]; - g[col] = bgra[offset + 1]; - r[col] = bgra[offset + 2]; - a[col] = bgra[offset + 3]; + for (size_t col = 0, offset = 0; col < width; ++col, offset += 4) + { + b[col] = bgra[offset + 0]; + g[col] = bgra[offset + 1]; + r[col] = bgra[offset + 2]; + a[col] = bgra[offset + 3]; + } + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; + a += aStride; + } + } + else + { + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0, offset = 0; col < width; ++col, offset += 4) + { + b[col] = bgra[offset + 0]; + g[col] = bgra[offset + 1]; + r[col] = bgra[offset + 2]; + 
} + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; } - bgra += bgraStride; - b += bStride; - g += gStride; - r += rStride; - a += aStride; } } } diff --git a/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp b/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp old mode 100644 new mode 100755 index 560b9d3cb9..1394d919e1 --- a/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2020 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/3rdparty/simdlib/Simd/SimdBaseResizer.cpp b/3rdparty/simdlib/Simd/SimdBaseResizer.cpp old mode 100644 new mode 100755 index 9585a4f1ac..b8c08d2b92 --- a/3rdparty/simdlib/Simd/SimdBaseResizer.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseResizer.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -23,6 +23,7 @@ */ #include "Simd/SimdMemory.h" #include "Simd/SimdResizer.h" +#include "Simd/SimdCopyPixel.h" namespace Simd { @@ -132,8 +133,6 @@ namespace Simd ResizerByteArea::ResizerByteArea(const ResParam & param) : Resizer(param) { - double scale = Simd::Max(float(_param.srcW) / _param.dstW, float(_param.srcH) / _param.dstH); - _ay.Resize(_param.dstH + 1); _iy.Resize(_param.dstH + 1); EstimateParams(_param.srcH, _param.dstH, Base::AREA_RANGE, _ay.data, _iy.data); @@ -234,28 +233,173 @@ namespace Simd //--------------------------------------------------------------------- + ResizerShortBilinear::ResizerShortBilinear(const ResParam& param) + : Resizer(param) + { + _ay.Resize(_param.dstH, false, _param.align); + _iy.Resize(_param.dstH, false, _param.align); + EstimateIndexAlpha(_param.srcH, _param.dstH, 1, _iy.data, _ay.data); + size_t rs = _param.dstW * _param.channels; + _ax.Resize(rs, false, _param.align); + _ix.Resize(rs, false, _param.align); + EstimateIndexAlpha(_param.srcW, _param.dstW, _param.channels, _ix.data, _ax.data); + _bx[0].Resize(rs, false, _param.align); + _bx[1].Resize(rs, false, _param.align); + } + + void ResizerShortBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t* indices, float* alphas) + { + float scale = (float)srcSize / dstSize; + for (size_t i = 0; i < dstSize; ++i) + { + float alpha = (float)((i + 0.5f) * scale - 0.5f); + ptrdiff_t index = (ptrdiff_t)::floor(alpha); + alpha -= index; + if (index < 0) + { + index = 0; + alpha = 0; + } + if (index > (ptrdiff_t)srcSize - 2) + { + index = srcSize - 2; + alpha = 1; + } + for (size_t c = 0; c < channels; c++) + { + size_t offset = i * channels + c; + indices[offset] = (int32_t)(channels * index + c); + alphas[offset] = alpha; + } + } + } + + void ResizerShortBilinear::Run(const uint8_t* src, 
size_t srcStride, uint8_t* dst, size_t dstStride) + { + Run((const uint16_t*)src, srcStride / sizeof(uint16_t), (uint16_t*)dst, dstStride / sizeof(uint16_t)); + } + + template void ResizerShortBilinear::RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride) + { + size_t rs = _param.dstW * N; + float* pbx[2] = { _bx[0].data, _bx[1].data }; + int32_t prev = -2; + for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) + { + float fy1 = _ay[dy]; + float fy0 = 1.0f - fy1; + int32_t sy = _iy[dy]; + int32_t k = 0; + if (sy == prev) + k = 2; + else if (sy == prev + 1) + { + Swap(pbx[0], pbx[1]); + k = 1; + } + prev = sy; + for (; k < 2; k++) + { + float* pb = pbx[k]; + const uint16_t* ps = src + (sy + k) * srcStride; + for (size_t dx = 0; dx < rs; dx++) + { + int32_t sx = _ix[dx]; + float fx = _ax[dx]; + pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + N] * fx; + } + } + for (size_t dx = 0; dx < rs; dx++) + dst[dx] = Round(pbx[0][dx] * fy0 + pbx[1][dx] * fy1); + } + } + + template void ResizerShortBilinear::RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride) + { + size_t rs = _param.dstW * N; + for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) + { + float fy1 = _ay[dy]; + float fy0 = 1.0f - fy1; + int32_t sy = _iy[dy]; + const uint16_t* ps0 = src + (sy + 0) * srcStride; + const uint16_t* ps1 = src + (sy + 1) * srcStride; + for (size_t dx = 0; dx < rs; dx++) + { + int32_t sx = _ix[dx]; + float fx1 = _ax[dx]; + float fx0 = 1.0f - fx1; + float r0 = ps0[sx] * fx0 + ps0[sx + N] * fx1; + float r1 = ps1[sx] * fx0 + ps1[sx + N] * fx1; + dst[dx] = Round(r0 * fy0 + r1 * fy1); + } + } + } + + void ResizerShortBilinear::Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride) + { + bool sparse = _param.dstH * 2.0 <= _param.srcH; + switch (_param.channels) + { + case 1: sparse ? RunS<1>(src, srcStride, dst, dstStride) : RunB<1>(src, srcStride, dst, dstStride); return; + case 2: sparse ? 
RunS<2>(src, srcStride, dst, dstStride) : RunB<2>(src, srcStride, dst, dstStride); return; + case 3: sparse ? RunS<3>(src, srcStride, dst, dstStride) : RunB<3>(src, srcStride, dst, dstStride); return; + case 4: sparse ? RunS<4>(src, srcStride, dst, dstStride) : RunB<4>(src, srcStride, dst, dstStride); return; + default: + assert(0); + } + } + + //--------------------------------------------------------------------- + ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param) : Resizer(param) { _ay.Resize(_param.dstH, false, _param.align); _iy.Resize(_param.dstH, false, _param.align); - EstimateIndexAlpha(_param.srcH, _param.dstH, 1, _param.method == SimdResizeMethodCaffeInterp, _iy.data, _ay.data); + EstimateIndexAlpha(_param.srcH, _param.dstH, 1, _iy.data, _ay.data); size_t rs = _param.dstW * _param.channels; _ax.Resize(rs, false, _param.align); _ix.Resize(rs, false, _param.align); - EstimateIndexAlpha(_param.srcW, _param.dstW, _param.channels, _param.method == SimdResizeMethodCaffeInterp, _ix.data, _ax.data); + EstimateIndexAlpha(_param.srcW, _param.dstW, _param.channels, _ix.data, _ax.data); _bx[0].Resize(rs, false, _param.align); _bx[1].Resize(rs, false, _param.align); } - void ResizerFloatBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, bool caffeInterp, int32_t * indices, float * alphas) + void ResizerFloatBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t * indices, float * alphas) { - if (caffeInterp) + if (_param.method == SimdResizeMethodBilinear) + { + float scale = (float)srcSize / dstSize; + for (size_t i = 0; i < dstSize; ++i) + { + float alpha = (float)((i + 0.5f) * scale - 0.5f); + ptrdiff_t index = (ptrdiff_t)::floor(alpha); + alpha -= index; + if (index < 0) + { + index = 0; + alpha = 0; + } + if (index > (ptrdiff_t)srcSize - 2) + { + index = srcSize - 2; + alpha = 1; + } + for (size_t c = 0; c < channels; c++) + { + size_t offset = i * channels + c; + indices[offset] = 
(int32_t)(channels * index + c); + alphas[offset] = alpha; + } + } + } + else if (_param.method == SimdResizeMethodCaffeInterp) { float scale = dstSize > 1 ? float(srcSize - 1) / float(dstSize - 1) : 0.0f; for (size_t i = 0; i < dstSize; ++i) { - float alpha = float(i)*scale; + float alpha = float(i) * scale; ptrdiff_t index = (ptrdiff_t)::floor(alpha); alpha -= index; if (index > (ptrdiff_t)srcSize - 2) @@ -266,17 +410,17 @@ namespace Simd for (size_t c = 0; c < channels; c++) { size_t offset = i * channels + c; - indices[offset] = (int32_t)(channels*index + c); + indices[offset] = (int32_t)(channels * index + c); alphas[offset] = alpha; } } } - else + else if (_param.method == SimdResizeMethodInferenceEngineInterp) { float scale = (float)srcSize / dstSize; for (size_t i = 0; i < dstSize; ++i) { - float alpha = (float)((i + 0.5f)*scale - 0.5f); + float alpha = float(i) * scale; ptrdiff_t index = (ptrdiff_t)::floor(alpha); alpha -= index; if (index < 0) @@ -284,7 +428,7 @@ namespace Simd index = 0; alpha = 0; } - if (index >(ptrdiff_t)srcSize - 2) + if (index > (ptrdiff_t)srcSize - 2) { index = srcSize - 2; alpha = 1; @@ -292,11 +436,13 @@ namespace Simd for (size_t c = 0; c < channels; c++) { size_t offset = i * channels + c; - indices[offset] = (int32_t)(channels*index + c); + indices[offset] = (int32_t)(channels * index + c); alphas[offset] = alpha; } } } + else + assert(0); } void ResizerFloatBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) @@ -346,15 +492,80 @@ namespace Simd //--------------------------------------------------------------------- + ResizerNearest::ResizerNearest(const ResParam& param) + : Resizer(param) + { + _pixelSize = _param.PixelSize(); + _iy.Resize(_param.dstH, false, _param.align); + EstimateIndex(_param.srcH, _param.dstH, 1, _iy.data); + _ix.Resize(_param.dstW, false, _param.align); + EstimateIndex(_param.srcW, _param.dstW, _pixelSize, _ix.data); + } + + void ResizerNearest::EstimateIndex(size_t 
srcSize, size_t dstSize, size_t pixelSize, int32_t* indices) + { + float scale = (float)srcSize / dstSize; + for (size_t i = 0; i < dstSize; ++i) + { + float alpha = (i + 0.5f) * scale; + int index = RestrictRange((int)::floor(alpha), 0, (int)srcSize - 1); + indices[i] = (int)(index * pixelSize); + } + } + + void ResizerNearest::Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride) + { + for (size_t dy = 0; dy < _param.dstH; dy++) + { + const uint8_t* srcRow = src + _iy[dy] * srcStride; + for (size_t dx = 0, offset = 0; dx < _param.dstW; dx++, offset += _pixelSize) + memcpy(dst + offset, srcRow + _ix[dx], _pixelSize); + dst += dstStride; + } + } + + template void ResizerNearest::Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride) + { + for (size_t dy = 0; dy < _param.dstH; dy++) + { + const uint8_t * srcRow = src + _iy[dy] * srcStride; + for (size_t dx = 0, offset = 0; dx < _param.dstW; dx++, offset += N) + CopyPixel(srcRow + _ix[dx], dst + offset); + dst += dstStride; + } + } + + void ResizerNearest::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride) + { + switch (_pixelSize) + { + case 1: Resize<1>(src, srcStride, dst, dstStride); break; + case 2: Resize<2>(src, srcStride, dst, dstStride); break; + case 3: Resize<3>(src, srcStride, dst, dstStride); break; + case 4: Resize<4>(src, srcStride, dst, dstStride); break; + case 6: Resize<6>(src, srcStride, dst, dstStride); break; + case 8: Resize<8>(src, srcStride, dst, dstStride); break; + case 12: Resize<12>(src, srcStride, dst, dstStride); break; + default: + Resize(src, srcStride, dst, dstStride); + } + } + + //--------------------------------------------------------------------- + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) { ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(void*)); - if (type == SimdResizeChannelByte && 
method == SimdResizeMethodBilinear) + if (param.IsByteBilinear()) return new ResizerByteBilinear(param); - else if (type == SimdResizeChannelByte && method == SimdResizeMethodArea) + else if (param.IsByteArea()) return new ResizerByteArea(param); - else if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp)) + else if (param.IsShortBilinear()) + return new ResizerShortBilinear(param); + else if (param.IsFloatBilinear()) return new ResizerFloatBilinear(param); + else if (param.IsNearest()) + return new ResizerNearest(param); else return NULL; } diff --git a/3rdparty/simdlib/Simd/SimdConfig.h b/3rdparty/simdlib/Simd/SimdConfig.h old mode 100644 new mode 100755 index 8e328e2495..22c7fdd8e6 --- a/3rdparty/simdlib/Simd/SimdConfig.h +++ b/3rdparty/simdlib/Simd/SimdConfig.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -24,18 +24,10 @@ #ifndef __SimdConfig_h__ #define __SimdConfig_h__ -//#define SIMD_SSE_DISABLE - //#define SIMD_SSE2_DISABLE -//#define SIMD_SSE3_DISABLE - -//#define SIMD_SSSE3_DISABLE - //#define SIMD_SSE41_DISABLE -//#define SIMD_SSE42_DISABLE - //#define SIMD_AVX_DISABLE //#define SIMD_AVX2_DISABLE diff --git a/3rdparty/simdlib/Simd/SimdConst.h b/3rdparty/simdlib/Simd/SimdConst.h old mode 100644 new mode 100755 index 38e217d6ca..e18c1b90d0 --- a/3rdparty/simdlib/Simd/SimdConst.h +++ b/3rdparty/simdlib/Simd/SimdConst.h @@ -76,25 +76,13 @@ namespace Simd const int DIVISION_BY_9_FACTOR = (1 << DIVISION_BY_9_SHIFT) / 9; } -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { const size_t F = sizeof(__m128) / sizeof(float); const size_t DF = 2 * F; const size_t QF = 4 * F; const size_t HF = F / 2; 
- } -#endif// SIMD_SSE_ENABLE - -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - using namespace Sse; -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::F; - using Sse::DF; - using Sse::QF; -#endif const size_t A = sizeof(__m128i); const size_t DA = 2 * A; @@ -128,6 +116,7 @@ namespace Simd const __m128i K16_0020 = SIMD_MM_SET1_EPI16(0x0020); const __m128i K16_0080 = SIMD_MM_SET1_EPI16(0x0080); const __m128i K16_00FF = SIMD_MM_SET1_EPI16(0x00FF); + const __m128i K16_0101 = SIMD_MM_SET1_EPI16(0x0101); const __m128i K16_FF00 = SIMD_MM_SET1_EPI16(0xFF00); const __m128i K32_00000001 = SIMD_MM_SET1_EPI32(0x00000001); @@ -138,6 +127,7 @@ namespace Simd const __m128i K32_0000FFFF = SIMD_MM_SET1_EPI32(0x0000FFFF); const __m128i K32_00010000 = SIMD_MM_SET1_EPI32(0x00010000); const __m128i K32_01000000 = SIMD_MM_SET1_EPI32(0x01000000); + const __m128i K32_00FF0000 = SIMD_MM_SET1_EPI32(0x00FF0000); const __m128i K32_00FFFFFF = SIMD_MM_SET1_EPI32(0x00FFFFFF); const __m128i K32_FFFFFF00 = SIMD_MM_SET1_EPI32(0xFFFFFF00); @@ -162,22 +152,15 @@ namespace Simd } #endif// SIMD_SSE2_ENABLE -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { using namespace Sse2; #if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::F; - using Sse::DF; - using Sse::QF; + using Sse2::F; + using Sse2::DF; + using Sse2::QF; #endif - } -#endif// SIMD_SSE3_ENABLE - -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - using namespace Sse3; const __m128i K8_SHUFFLE_GRAY_TO_BGR0 = SIMD_MM_SETR_EPI8(0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5); const __m128i K8_SHUFFLE_GRAY_TO_BGR1 = SIMD_MM_SETR_EPI8(0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xA, 0xA); @@ -207,27 +190,8 @@ namespace Simd const __m128i K8_SHUFFLE_BGR1_TO_RED = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, -1, 
-1, -1, -1, -1, -1); const __m128i K8_SHUFFLE_BGR2_TO_RED = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF); } -#endif// SIMD_SSSE3_ENABLE - -#ifdef SIMD_SSE41_ENABLE - namespace Sse41 - { - using namespace Ssse3; -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::F; - using Sse::DF; - using Sse::QF; -#endif - } #endif// SIMD_SSE41_ENABLE -#ifdef SIMD_SSE42_ENABLE - namespace Sse42 - { - using namespace Sse41; - } -#endif// SIMD_SSE42_ENABLE - #ifdef SIMD_AVX_ENABLE namespace Avx { @@ -282,6 +246,7 @@ namespace Simd const __m256i K16_0020 = SIMD_MM256_SET1_EPI16(0x0020); const __m256i K16_0080 = SIMD_MM256_SET1_EPI16(0x0080); const __m256i K16_00FF = SIMD_MM256_SET1_EPI16(0x00FF); + const __m256i K16_0101 = SIMD_MM256_SET1_EPI16(0x0101); const __m256i K16_FF00 = SIMD_MM256_SET1_EPI16(0xFF00); const __m256i K32_00000001 = SIMD_MM256_SET1_EPI32(0x00000001); @@ -292,6 +257,7 @@ namespace Simd const __m256i K32_0000FFFF = SIMD_MM256_SET1_EPI32(0x0000FFFF); const __m256i K32_00010000 = SIMD_MM256_SET1_EPI32(0x00010000); const __m256i K32_01000000 = SIMD_MM256_SET1_EPI32(0x01000000); + const __m256i K32_00FF0000 = SIMD_MM256_SET1_EPI32(0x00FF0000); const __m256i K32_FFFFFF00 = SIMD_MM256_SET1_EPI32(0xFFFFFF00); const __m256i K16_Y_ADJUST = SIMD_MM256_SET1_EPI16(Base::Y_ADJUST); @@ -311,6 +277,8 @@ namespace Simd const __m256i K16_DIVISION_BY_9_FACTOR = SIMD_MM256_SET1_EPI16(Base::DIVISION_BY_9_FACTOR); + const __m256i K64_00000000FFFFFFFF = SIMD_MM256_SET2_EPI32(0xFFFFFFFF, 0); + const __m256i K8_SHUFFLE_0 = SIMD_MM256_SETR_EPI8( 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0); @@ -389,11 +357,11 @@ namespace Simd -1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 
0x3, 0x6, 0x9, 0xC, 0xF); - const __m256i K8_BGRA_TO_BGR_SHUFFLE = SIMD_MM256_SETR_EPI8( + const __m256i K8_BGR_TO_BGRA_SHUFFLE = SIMD_MM256_SETR_EPI8( 0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1, 0x4, 0x5, 0x6, -1, 0x7, 0x8, 0x9, -1, 0xA, 0xB, 0xC, -1, 0xD, 0xE, 0xF, -1); - const __m256i K8_BGRA_TO_RGB_SHUFFLE = SIMD_MM256_SETR_EPI8( + const __m256i K8_RGB_TO_BGRA_SHUFFLE = SIMD_MM256_SETR_EPI8( 0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1, 0x6, 0x5, 0x4, -1, 0x9, 0x8, 0x7, -1, 0xC, 0xB, 0xA, -1, 0xF, 0xE, 0xD, -1); @@ -402,6 +370,12 @@ namespace Simd 0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF); const __m256i K32_TWO_UNPACK_PERMUTE = SIMD_MM256_SETR_EPI32(0, 2, 4, 6, 1, 3, 5, 7); + + const __m256i K8_SHUFFLE_BGRA_TO_BGR = SIMD_MM256_SETR_EPI8( + 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, + 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1); + + const __m256i K32_PERMUTE_BGRA_TO_BGR = SIMD_MM256_SETR_EPI32(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, -1, -1); } #endif// SIMD_AVX2_ENABLE @@ -459,8 +433,10 @@ namespace Simd const uint32x4_t K32_000000FF = SIMD_VEC_SET1_EPI32(0x000000FF); const uint32x4_t K32_0000FFFF = SIMD_VEC_SET1_EPI32(0x0000FFFF); const uint32x4_t K32_00010000 = SIMD_VEC_SET1_EPI32(0x00010000); + const uint32x4_t K32_00FF0000 = SIMD_VEC_SET1_EPI32(0x00FF0000); const uint32x4_t K32_01000000 = SIMD_VEC_SET1_EPI32(0x01000000); const uint32x4_t K32_08080800 = SIMD_VEC_SET1_EPI32(0x08080800); + const uint32x4_t K32_FF000000 = SIMD_VEC_SET1_EPI32(0xFF000000); const uint32x4_t K32_FFFFFF00 = SIMD_VEC_SET1_EPI32(0xFFFFFF00); const uint32x4_t K32_FFFFFFFF = SIMD_VEC_SET1_EPI32(0xFFFFFFFF); const uint32x4_t K32_0123 = SIMD_VEC_SETR_EPI32(0, 1, 2, 3); diff --git a/3rdparty/simdlib/Simd/SimdConversion.h b/3rdparty/simdlib/Simd/SimdConversion.h old mode 100644 new mode 100755 index e0601a9f61..5f8f0a0b9b --- 
a/3rdparty/simdlib/Simd/SimdConversion.h +++ b/3rdparty/simdlib/Simd/SimdConversion.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, +* Copyright (c) 2011-2021 Yermalayeu Ihar, * 2014-2015 Antonenka Mikhail. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -38,16 +38,10 @@ namespace Simd return (BLUE_TO_GRAY_WEIGHT*blue + GREEN_TO_GRAY_WEIGHT * green + RED_TO_GRAY_WEIGHT * red + BGR_TO_GRAY_ROUND_TERM) >> BGR_TO_GRAY_AVERAGING_SHIFT; } - - SIMD_INLINE int RgbToGray(int red, int green, int blue) - { - return (BLUE_TO_GRAY_WEIGHT*blue + GREEN_TO_GRAY_WEIGHT * green + - RED_TO_GRAY_WEIGHT * red + BGR_TO_GRAY_ROUND_TERM) >> BGR_TO_GRAY_AVERAGING_SHIFT; - } } -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { template __m128i InterleaveBgr(__m128i blue, __m128i green, __m128i red); @@ -99,7 +93,7 @@ namespace Simd _mm_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_RED))); } } -#endif//SIMD_SSSE3_ENABLE +#endif #ifdef SIMD_AVX2_ENABLE namespace Avx2 @@ -181,41 +175,24 @@ namespace Simd template<> SIMD_INLINE __m256i BgrToBgra(const __m256i & bgr, const __m256i & alpha) { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0x94), K8_BGRA_TO_BGR_SHUFFLE), alpha); + return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0x94), K8_BGR_TO_BGRA_SHUFFLE), alpha); } template<> SIMD_INLINE __m256i BgrToBgra(const __m256i & bgr, const __m256i & alpha) { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0xE9), K8_BGRA_TO_BGR_SHUFFLE), alpha); - } - - template __m256i BgrToRgba(const __m256i & bgr, const __m256i & alpha); - - template<> SIMD_INLINE __m256i BgrToRgba(const __m256i & bgr, const __m256i & alpha) - { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0x94), K8_BGRA_TO_RGB_SHUFFLE), alpha); - } - - template<> SIMD_INLINE __m256i BgrToRgba(const 
__m256i & bgr, const __m256i & alpha) - { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0xE9), K8_BGRA_TO_RGB_SHUFFLE), alpha); - } - - SIMD_INLINE __m256i BgraToRgba(const __m256i & bgra) - { - return _mm256_shuffle_epi8(bgra, K8_BGRA_TO_RGBA_SHUFFLE); + return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0xE9), K8_BGR_TO_BGRA_SHUFFLE), alpha); } template __m256i RgbToBgra(const __m256i & rgb, const __m256i & alpha); template<> SIMD_INLINE __m256i RgbToBgra(const __m256i & rgb, const __m256i & alpha) { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0x94), K8_BGRA_TO_RGB_SHUFFLE), alpha); + return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0x94), K8_RGB_TO_BGRA_SHUFFLE), alpha); } template<> SIMD_INLINE __m256i RgbToBgra(const __m256i & rgb, const __m256i & alpha) { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0xE9), K8_BGRA_TO_RGB_SHUFFLE), alpha); + return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0xE9), K8_RGB_TO_BGRA_SHUFFLE), alpha); } } #endif// SIMD_AVX2_ENABLE @@ -236,8 +213,20 @@ namespace Simd template SIMD_INLINE int32x4_t BgrToU(uint16x8_t blue, uint16x8_t green, uint16x8_t red) { - return vshrq_n_s32(vmlal_s16(vmlal_s16(vmlal_s16(K32_BGR_TO_YUV_ROUND_TERM, (int16x4_t)Half(blue), K16_BLUE_TO_U_WEIGHT), - (int16x4_t)Half(green), K16_GREEN_TO_U_WEIGHT), (int16x4_t)Half(red), K16_RED_TO_U_WEIGHT), Base::BGR_TO_YUV_AVERAGING_SHIFT); + return vshrq_n_s32(vmlal_s16(vmlal_s16(vmlal_s16(K32_BGR_TO_YUV_ROUND_TERM, vreinterpret_s16_u16(Half(blue)), K16_BLUE_TO_U_WEIGHT), + vreinterpret_s16_u16(Half(green)), K16_GREEN_TO_U_WEIGHT), vreinterpret_s16_u16(Half(red)), K16_RED_TO_U_WEIGHT), Base::BGR_TO_YUV_AVERAGING_SHIFT); + } + + SIMD_INLINE int16x8_t BgrToU(uint16x8_t blue, uint16x8_t green, uint16x8_t red) + { + return vaddq_s16(K16_UV_ADJUST, PackI32(BgrToU<0>(blue, green, red), BgrToU<1>(blue, green, red))); + 
} + + SIMD_INLINE uint8x16_t BgrToU(uint8x16_t blue, uint8x16_t green, uint8x16_t red) + { + return PackSaturatedI16( + BgrToU(UnpackU8<0>(blue), UnpackU8<0>(green), UnpackU8<0>(red)), + BgrToU(UnpackU8<1>(blue), UnpackU8<1>(green), UnpackU8<1>(red))); } } #endif// SIMD_NEON_ENABLE diff --git a/3rdparty/simdlib/Simd/SimdCopyPixel.h b/3rdparty/simdlib/Simd/SimdCopyPixel.h old mode 100644 new mode 100755 index 6f113e4c39..a5539eba35 --- a/3rdparty/simdlib/Simd/SimdCopyPixel.h +++ b/3rdparty/simdlib/Simd/SimdCopyPixel.h @@ -56,6 +56,23 @@ namespace Simd { ((uint32_t*)dst)[0] = ((uint32_t*)src)[0]; } + + template<> SIMD_INLINE void CopyPixel<6>(const uint8_t* src, uint8_t* dst) + { + ((uint32_t*)dst)[0] = ((uint32_t*)src)[0]; + ((uint16_t*)dst)[2] = ((uint16_t*)src)[2]; + } + + template<> SIMD_INLINE void CopyPixel<8>(const uint8_t* src, uint8_t* dst) + { + ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; + } + + template<> SIMD_INLINE void CopyPixel<12>(const uint8_t* src, uint8_t* dst) + { + ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; + ((uint32_t*)dst)[2] = ((uint32_t*)src)[2]; + } } } diff --git a/3rdparty/simdlib/Simd/SimdCpu.h b/3rdparty/simdlib/Simd/SimdCpu.h old mode 100644 new mode 100755 index adaf916462..b10d9fa98f --- a/3rdparty/simdlib/Simd/SimdCpu.h +++ b/3rdparty/simdlib/Simd/SimdCpu.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,8 +28,103 @@ namespace Simd { -#ifdef SIMD_SSE_ENABLE - namespace Sse +#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) + namespace Cpuid + { + // See http://www.sandpile.org/x86/cpuid.htm for additional information. 
+ enum Level + { + Ordinary = 1, + Extended = 7, + }; + + enum Register + { + Eax = 0, + Ebx = 1, + Ecx = 2, + Edx = 3, + }; + + enum Bit + { + // Ordinary: + // Edx: + SSE = 1 << 25, + SSE2 = 1 << 26, + + // Ecx: + SSE3 = 1 << 0, + SSSE3 = 1 << 9, + FMA = 1 << 12, + SSE41 = 1 << 19, + SSE42 = 1 << 20, + OSXSAVE = 1 << 27, + AVX = 1 << 28, + F16C = 1 << 29, + + // Extended: + // Ebx: + AVX2 = 1 << 5, + AVX512F = 1 << 16, + AVX512DQ = 1 << 17, + AVX512CD = 1 << 28, + AVX512BW = 1 << 30, + AVX512VL = 1 << 31, + + // Ecx: + AVX512VBMI = 1 << 1, + AVX512VNNI = 1 << 11, + }; + } +#endif//defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) + + namespace Cpu + { + extern const size_t SOCKET_NUMBER; + extern const size_t CORE_NUMBER; + extern const size_t THREAD_NUMBER; + extern const size_t L1_CACHE_SIZE; + extern const size_t L2_CACHE_SIZE; + extern const size_t L3_CACHE_SIZE; + } + + namespace Base + { +#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) + bool CheckBit(Cpuid::Level level, Cpuid::Register index, Cpuid::Bit bit); +#endif + +#if defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)) + bool CheckBit(int at, int bit); +#endif + + size_t CpuSocketNumber(); + + size_t CpuCoreNumber(); + + size_t CpuThreadNumber(); + + size_t CpuCacheSize(size_t level); + + SIMD_INLINE size_t AlgCacheL1() + { + return Cpu::L1_CACHE_SIZE; + } + + SIMD_INLINE size_t AlgCacheL2() + { + return Cpu::L3_CACHE_SIZE ? Cpu::L2_CACHE_SIZE : Cpu::L2_CACHE_SIZE * Cpu::SOCKET_NUMBER / Cpu::CORE_NUMBER; + } + + SIMD_INLINE size_t AlgCacheL3() + { + return Cpu::L3_CACHE_SIZE ? 
Cpu::L3_CACHE_SIZE * Cpu::SOCKET_NUMBER / Cpu::CORE_NUMBER : Cpu::L2_CACHE_SIZE; + } + } + +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { const unsigned int SCR_FTZ = 1 << 15; const unsigned int SCR_DAZ = 1 << 6; diff --git a/3rdparty/simdlib/Simd/SimdDefs.h b/3rdparty/simdlib/Simd/SimdDefs.h old mode 100644 new mode 100755 index c2b9274ed4..97d8f06ad6 --- a/3rdparty/simdlib/Simd/SimdDefs.h +++ b/3rdparty/simdlib/Simd/SimdDefs.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -37,10 +37,24 @@ #include #include +#if defined(SIMD_SSE2_DISABLE) && !defined(SIMD_SSE41_DISABLE) +#define SIMD_SSE41_DISABLE +#endif + +#if defined(SIMD_SSE41_DISABLE) && !defined(SIMD_AVX_DISABLE) +#define SIMD_AVX_DISABLE +#endif + +#if defined(SIMD_AVX_DISABLE) && !defined(SIMD_AVX2_DISABLE) +#define SIMD_AVX2_DISABLE +#endif + #if defined(_MSC_VER) && defined(_MSC_FULL_VER) #define SIMD_ALIGNED(x) __declspec(align(x)) +#define SIMD_NOINLINE __declspec(noinline) + #ifdef _M_IX86 #define SIMD_X86_ENABLE #endif @@ -55,30 +69,14 @@ #if defined(SIMD_X64_ENABLE) || defined(SIMD_X86_ENABLE) -#if !defined(SIMD_SSE_DISABLE) && _MSC_VER >= 1200 -#define SIMD_SSE_ENABLE -#endif - #if !defined(SIMD_SSE2_DISABLE) && _MSC_VER >= 1300 #define SIMD_SSE2_ENABLE #endif -#if !defined(SIMD_SSE3_DISABLE) && _MSC_VER >= 1500 -#define SIMD_SSE3_ENABLE -#endif - -#if !defined(SIMD_SSSE3_DISABLE) && _MSC_VER >= 1500 -#define SIMD_SSSE3_ENABLE -#endif - #if !defined(SIMD_SSE41_DISABLE) && _MSC_VER >= 1500 #define SIMD_SSE41_ENABLE #endif -#if !defined(SIMD_SSE42_DISABLE) && _MSC_VER >= 1500 -#define SIMD_SSE42_ENABLE -#endif - #if !defined(SIMD_AVX_DISABLE) && _MSC_FULL_VER >= 160040219 #define SIMD_AVX_ENABLE #endif @@ -88,7 +86,7 @@ #endif #if 
defined(NDEBUG) && _MSC_VER >= 1700 && _MSC_VER < 1900 -#define SIMD_MADDUBS_ERROR // Visual Studio 2012/2013 release mode compiler bug in function _mm256_maddubs_epi16: +#define SIMD_MADDUBS_ERROR // Visual Studio 2012/2013 release mode compiler bug in function _mm256_maddubs_epi16. #endif #if defined(NDEBUG) && _MSC_VER == 1914 @@ -123,6 +121,8 @@ #define SIMD_ALIGNED(x) __attribute__ ((aligned(x))) +#define SIMD_NOINLINE __attribute__ ((noinline)) + #ifdef __i386__ #define SIMD_X86_ENABLE #endif @@ -159,36 +159,16 @@ #define SIMD_ARM64_ENABLE #endif -#if defined __mips__ -#define SIMD_MIPS_ENABLE -#endif - #if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) -#if !defined(SIMD_SSE_DISABLE) && defined(__SSE__) -#define SIMD_SSE_ENABLE -#endif - -#if !defined(SIMD_SSE2_DISABLE) && defined(__SSE2__) +#if !defined(SIMD_SSE2_DISABLE) && defined(__SSE__) && defined(__SSE2__) #define SIMD_SSE2_ENABLE #endif -#if !defined(SIMD_SSE3_DISABLE) && defined(__SSE3__) -#define SIMD_SSE3_ENABLE -#endif - -#if !defined(SIMD_SSSE3_DISABLE) && defined(__SSSE3__) -#define SIMD_SSSE3_ENABLE -#endif - -#if !defined(SIMD_SSE41_DISABLE) && defined(__SSE4_1__) +#if !defined(SIMD_SSE41_DISABLE) && defined(__SSE3__) && defined(__SSSE3__) && defined(__SSE4_1__) && defined(__SSE4_2__) #define SIMD_SSE41_ENABLE #endif -#if !defined(SIMD_SSE42_DISABLE) && defined(__SSE4_2__) -#define SIMD_SSE42_ENABLE -#endif - #if !defined(SIMD_AVX_DISABLE) && defined(__AVX__) #define SIMD_AVX_ENABLE #endif @@ -239,27 +219,11 @@ #endif -#ifdef SIMD_SSE_ENABLE -#include -#endif - #ifdef SIMD_SSE2_ENABLE #include #endif -#ifdef SIMD_SSE3_ENABLE -# include -#endif - -#ifdef SIMD_SSSE3_ENABLE -#include -#endif - #ifdef SIMD_SSE41_ENABLE -#include -#endif - -#ifdef SIMD_SSE42_ENABLE #include #endif @@ -273,10 +237,10 @@ #if defined(SIMD_AVX_ENABLE) || defined(SIMD_AVX2_ENABLE) #define SIMD_ALIGN 32 -#elif defined(SIMD_SSE_ENABLE) || defined(SIMD_SSE2_ENABLE) || defined(SIMD_SSE3_ENABLE) || 
defined(SIMD_SSSE3_ENABLE) || defined(SIMD_SSE41_ENABLE) || defined(SIMD_SSE42_ENABLE) \ +#elif defined(SIMD_SSE2_ENABLE) || defined(SIMD_SSE41_ENABLE) \ || defined(SIMD_NEON_ENABLE) #define SIMD_ALIGN 16 -#elif defined (SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) +#elif defined (SIMD_X64_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM64_ENABLE) #define SIMD_ALIGN 8 #else #define SIMD_ALIGN 4 diff --git a/3rdparty/simdlib/Simd/SimdEnable.h b/3rdparty/simdlib/Simd/SimdEnable.h old mode 100644 new mode 100755 index 6c79eb0d94..a501daf8ad --- a/3rdparty/simdlib/Simd/SimdEnable.h +++ b/3rdparty/simdlib/Simd/SimdEnable.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -62,455 +62,74 @@ namespace Simd { -#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) - namespace Cpuid - { - // See http://www.sandpile.org/x86/cpuid.htm for additional information. 
- enum Level - { - Ordinary = 1, - Extended = 7, - }; - - enum Register - { - Eax = 0, - Ebx = 1, - Ecx = 2, - Edx = 3, - }; - - enum Bit - { - // Ordinary: - // Edx: - SSE = 1 << 25, - SSE2 = 1 << 26, - - // Ecx: - SSE3 = 1 << 0, - SSSE3 = 1 << 9, - FMA = 1 << 12, - SSE41 = 1 << 19, - SSE42 = 1 << 20, - OSXSAVE = 1 << 27, - AVX = 1 << 28, - F16C = 1 << 29, - - // Extended: - // Ebx: - AVX2 = 1 << 5, - AVX512F = 1 << 16, - AVX512BW = 1 << 30, - - // Ecx: - AVX512VBMI = 1 << 1, - }; - - SIMD_INLINE bool CheckBit(Level level, Register index, Bit bit) - { - unsigned int registers[4] = { 0, 0, 0, 0 }; -#if defined(_MSC_VER) - __cpuid((int*)registers, level); -#elif (defined __GNUC__) - if (__get_cpuid_max(0, NULL) < level) - return false; - __cpuid_count(level, 0, registers[Eax], registers[Ebx], registers[Ecx], registers[Edx]); -#else -#error Do not know how to detect CPU info! -#endif - return (registers[index] & bit) == bit; - } - } -#endif//defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) - -#if !defined(__APPLE__) // not macOS, iOS -#if defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)) - namespace CpuInfo - { - SIMD_INLINE bool CheckBit(int at, int bit) - { - bool result = false; - int file = ::open("/proc/self/auxv", O_RDONLY); - if (file < 0) - return false; - const ssize_t size = 64; - unsigned long buffer[size]; - for (ssize_t count = size; count == size;) - { - count = ::read(file, buffer, sizeof(buffer)) / sizeof(unsigned long); - for (int i = 0; i < count; i += 2) - { - if (buffer[i] == (unsigned)at) - { - result = !!(buffer[i + 1] & bit); - count = 0; - } - if (buffer[i] == AT_NULL) - count = 0; - } - } - ::close(file); - return result; - } - } -#endif//defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)) -#endif//(TARGET_OS_IOS == 0) not iOS - -#ifdef SIMD_SSE_ENABLE - namespace Sse - { 
- SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Edx, Cpuid::SSE); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m128 value = _mm_set1_ps(1.0f);// try to execute of SSE instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - #ifdef SIMD_SSE2_ENABLE namespace Sse2 { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Edx, Cpuid::SSE2); - } + bool GetEnable(); - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m128d value = _mm_set1_pd(1.0);// try to execute of SSE2 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 - { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE3); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m128 value = _mm_hadd_ps(_mm_set1_ps(1.0f), _mm_set1_ps(2.0f)); //try to execute of SSE3 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSSE3); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m128i value = _mm_abs_epi8(_mm_set1_epi8(-1)); //try to execute of SSSE3 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && 
SupportedByOS(); + const bool Enable = GetEnable(); } #endif #ifdef SIMD_SSE41_ENABLE namespace Sse41 { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE41); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - int value = _mm_testz_si128(_mm_set1_epi8(0), _mm_set1_epi8(-1)); // try to execute of SSE41 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_SSE42_ENABLE - namespace Sse42 - { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE42); - } + bool GetEnable(); - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - uint32_t value = _mm_crc32_u8(0, 1); // try to execute of SSE42 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); + const bool Enable = GetEnable(); } #endif #ifdef SIMD_AVX_ENABLE namespace Avx { - SIMD_INLINE bool SupportedByCPU() - { - return - Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) && - Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::AVX); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m256d value = _mm256_set1_pd(1.0);// try to execute of AVX instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } + bool GetEnable(); - const bool Enable = SupportedByCPU() && SupportedByOS(); + const bool Enable = GetEnable(); } #endif #ifdef SIMD_AVX2_ENABLE namespace Avx2 { - SIMD_INLINE bool SupportedByCPU() - { - return - Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) && - Cpuid::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX2) && - 
Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::FMA) && - Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::F16C); - } + bool GetEnable(); - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m256i value = _mm256_abs_epi8(_mm256_set1_epi8(1));// try to execute of AVX2 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); + const bool Enable = GetEnable(); } #endif #ifdef SIMD_NEON_ENABLE namespace Neon { - SIMD_INLINE bool SupportedByCPU() - { -#if defined(_MSC_VER) - return true; -#elif defined(__GNUC__) -#if defined(SIMD_ARM64_ENABLE) || (TARGET_OS_IOS != 0) // iOS - return true; -#else - return CpuInfo::CheckBit(AT_HWCAP, HWCAP_NEON); -#endif -#else -#error Do not know how to detect NEON support! -#endif - } + bool GetEnable(); - SIMD_INLINE bool SupportedByOS() - { - return true; - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); + const bool Enable = GetEnable(); } #endif - - SIMD_INLINE size_t Alignment() - { -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable) - return sizeof(__m256i); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable) - return sizeof(__m256); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable) - return sizeof(__m128i); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable) - return sizeof(__m128i); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable) - return sizeof(__m128i); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable) - return sizeof(__m128); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable) - return sizeof(uint8x16_t); - else -#endif - return sizeof(void *); - } - - const size_t ALIGNMENT = Alignment(); } #define SIMD_BASE_FUNC(func) Simd::Base::func -#ifdef SIMD_SSE_ENABLE -#define SIMD_SSE_FUNC(func) Simd::Sse::Enable ? 
Simd::Sse::func : -#else -#define SIMD_SSE_FUNC(func) -#endif - #ifdef SIMD_SSE2_ENABLE -#define SIMD_SSE2_FUNC(func) Simd::Sse2::Enable ? Simd::Sse2::func : -#else -#define SIMD_SSE2_FUNC(func) -#endif - -#ifdef SIMD_SSE3_ENABLE -#define SIMD_SSE3_FUNC(func) Simd::Sse3::Enable ? Simd::Sse3::func : +#define SIMD_SSE2_FUNC(func) Simd::Sse2::Enable ? Simd::Sse2::func : #else -#define SIMD_SSE3_FUNC(func) -#endif - -#ifdef SIMD_SSSE3_ENABLE -#define SIMD_SSSE3_FUNC(func) Simd::Ssse3::Enable ? Simd::Ssse3::func : -#else -#define SIMD_SSSE3_FUNC(func) +#define SIMD_SSE2_FUNC(func) #endif #ifdef SIMD_SSE41_ENABLE -#define SIMD_SSE41_FUNC(func) Simd::Sse41::Enable ? Simd::Sse41::func : -#else -#define SIMD_SSE41_FUNC(func) -#endif - -#ifdef SIMD_SSE42_ENABLE -#define SIMD_SSE42_FUNC(func) Simd::Sse42::Enable ? Simd::Sse42::func : +#define SIMD_SSE41_FUNC(func) Simd::Sse41::Enable ? Simd::Sse41::func : #else -#define SIMD_SSE42_FUNC(func) +#define SIMD_SSE41_FUNC(func) #endif #ifdef SIMD_AVX_ENABLE -#define SIMD_AVX_FUNC(func) Simd::Avx::Enable ? Simd::Avx::func : +#define SIMD_AVX_FUNC(func) Simd::Avx::Enable ? Simd::Avx::func : #else #define SIMD_AVX_FUNC(func) #endif #ifdef SIMD_AVX2_ENABLE -#define SIMD_AVX2_FUNC(func) Simd::Avx2::Enable ? Simd::Avx2::func : +#define SIMD_AVX2_FUNC(func) Simd::Avx2::Enable ? Simd::Avx2::func : #else #define SIMD_AVX2_FUNC(func) #endif diff --git a/3rdparty/simdlib/Simd/SimdExp.h b/3rdparty/simdlib/Simd/SimdExp.h old mode 100644 new mode 100755 index 3bfbc3f8f5..1600275b23 --- a/3rdparty/simdlib/Simd/SimdExp.h +++ b/3rdparty/simdlib/Simd/SimdExp.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -34,6 +34,11 @@ namespace Simd { return ::expf(value); } + + SIMD_INLINE float Log(float value) + { + return ::logf(value); + } } #ifdef SIMD_SSE2_ENABLE @@ -107,20 +112,20 @@ namespace Simd __m128 exp = Exp2(_mm_mul_ps(_k, value)); __m128 neg = _mm_mul_ps(alpha, _mm_sub_ps(exp, _1_0)); __m128 mask = _mm_cmpgt_ps(_mm_setzero_ps(), value); - return Sse::Combine(mask, neg, value); + return Combine(mask, neg, value); } }; namespace Detail { - SIMD_INLINE __m128 Poly5(__m128 x) + SIMD_INLINE __m128 Poly5(__m128 x, float a, float b, float c, float d, float e, float f) { - __m128 p = _mm_set1_ps(1.8775767e-3f); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(8.9893397e-3f)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(5.5826318e-2f)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(2.4015361e-1f)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(6.9315308e-1f)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(9.9999994e-1f)); + __m128 p = _mm_set1_ps(f); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(e)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(d)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(c)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(b)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(a)); return p; } @@ -130,9 +135,19 @@ namespace Simd __m128i ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f))); __m128 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart)); __m128 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23)); - __m128 expfpart = Poly5(fpart); + __m128 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); return _mm_mul_ps(expipart, expfpart); } + + SIMD_INLINE __m128 Log2(__m128 x) + { + __m128 _1 = _mm_set1_ps(1.0f); + __m128i i = _mm_castps_si128(x); + __m128 e = 
_mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, _mm_set1_epi32(0x7F800000)), 23), _mm_set1_epi32(127))); + __m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, _mm_set1_epi32(0x007FFFFF))), _1); + __m128 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + return _mm_add_ps(_mm_mul_ps(p, _mm_sub_ps(m, _1)), e); + } } SIMD_INLINE __m128 Exponent(__m128 value) @@ -145,7 +160,36 @@ namespace Simd __m128 exp = Exponent(value); __m128 neg = _mm_mul_ps(alpha, _mm_sub_ps(exp, _mm_set1_ps(1.0f))); __m128 mask = _mm_cmpgt_ps(_mm_setzero_ps(), value); - return Sse::Combine(mask, neg, value); + return Combine(mask, neg, value); + } + + SIMD_INLINE __m128 Logarithm(__m128 value) + { + return _mm_mul_ps(_mm_set1_ps(0.693147181f), Detail::Log2(value)); + } + + SIMD_INLINE __m128 Mish(__m128 value, __m128 threshold) + { + __m128 _1 = _mm_set1_ps(1.0f); + __m128 mish = _mm_add_ps(Exponent(value), _1); + mish = _mm_add_ps(_mm_mul_ps(mish, mish), _1); + mish = _mm_mul_ps(value, _mm_sub_ps(_1, _mm_div_ps(_mm_set1_ps(2.0f), mish))); + return Combine(_mm_cmpgt_ps(threshold, value), mish, value); + } + + SIMD_INLINE __m128 Softplus(__m128 value, __m128 beta, __m128 threshold) + { + __m128 exp = Exponent(_mm_mul_ps(value, beta)); + __m128 log = Logarithm(_mm_add_ps(_mm_set1_ps(1.0f), exp)); + __m128 mask = _mm_cmpgt_ps(threshold, value); + return Combine(mask, _mm_div_ps(log, beta), value); + } + + SIMD_INLINE __m128 Tanh(__m128 value) + { + __m128 _1 = _mm_set1_ps(1.0f); + __m128 exp = Detail::Exp2(_mm_mul_ps(_mm_set1_ps(2.88539008f), value)); + return _mm_div_ps(_mm_sub_ps(exp, _1), _mm_add_ps(_1, exp)); } } #endif //SIMD_SSE2_ENABLE @@ -227,14 +271,14 @@ namespace Simd namespace Detail { - SIMD_INLINE __m256 Poly5(__m256 x) + SIMD_INLINE __m256 Poly5(__m256 x, float a, float b, float c, float d, float e, float f) { - __m256 p = _mm256_set1_ps(1.8775767e-3f); - p = _mm256_add_ps(_mm256_mul_ps(x, p), 
_mm256_set1_ps(8.9893397e-3f)); - p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(5.5826318e-2f)); - p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(2.4015361e-1f)); - p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(6.9315308e-1f)); - p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(9.9999994e-1f)); + __m256 p = _mm256_set1_ps(f); + p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(e)); + p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(d)); + p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(c)); + p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(b)); + p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(a)); return p; } @@ -244,9 +288,19 @@ namespace Simd __m256i ipart = _mm256_cvtps_epi32(_mm256_sub_ps(x, _mm256_set1_ps(0.5f))); __m256 fpart = _mm256_sub_ps(x, _mm256_cvtepi32_ps(ipart)); __m256 expipart = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(ipart, _mm256_set1_epi32(127)), 23)); - __m256 expfpart = Poly5(fpart); + __m256 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); return _mm256_mul_ps(expipart, expfpart); } + + SIMD_INLINE __m256 Log2(__m256 x) + { + __m256 _1 = _mm256_set1_ps(1.0f); + __m256i i = _mm256_castps_si256(x); + __m256 e = _mm256_cvtepi32_ps(_mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(i, _mm256_set1_epi32(0x7F800000)), 23), _mm256_set1_epi32(127))); + __m256 m = _mm256_or_ps(_mm256_castsi256_ps(_mm256_and_si256(i, _mm256_set1_epi32(0x007FFFFF))), _1); + __m256 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + return _mm256_add_ps(_mm256_mul_ps(p, _mm256_sub_ps(m, _1)), e); + } } SIMD_INLINE __m256 Exponent(__m256 value) @@ -261,6 +315,35 @@ namespace Simd __m256 mask = _mm256_cmp_ps(_mm256_setzero_ps(), value, _CMP_GT_OS); return _mm256_blendv_ps(value, neg, mask); } + + SIMD_INLINE __m256 Logarithm(__m256 value) + { + return 
_mm256_mul_ps(_mm256_set1_ps(0.693147181f), Detail::Log2(value)); + } + + SIMD_INLINE __m256 Mish(__m256 value, __m256 threshold) + { + __m256 _1 = _mm256_set1_ps(1.0f); + __m256 mish = _mm256_add_ps(Exponent(value), _1); + mish = Fmadd(mish, mish, _1); + mish = _mm256_mul_ps(value, _mm256_sub_ps(_1, _mm256_div_ps(_mm256_set1_ps(2.0f), mish))); + return _mm256_blendv_ps(value, mish, _mm256_cmp_ps(threshold, value, _CMP_GT_OS)); + } + + SIMD_INLINE __m256 Softplus(__m256 value, __m256 beta, __m256 threshold) + { + __m256 exp = Exponent(_mm256_mul_ps(value, beta)); + __m256 log = Logarithm(_mm256_add_ps(_mm256_set1_ps(1.0f), exp)); + __m256 mask = _mm256_cmp_ps(threshold, value, _CMP_GT_OS); + return _mm256_blendv_ps(value, _mm256_div_ps(log, beta), mask); + } + + SIMD_INLINE __m256 Tanh(__m256 value) + { + __m256 _1 = _mm256_set1_ps(1.0f); + __m256 exp = Detail::Exp2(_mm256_mul_ps(_mm256_set1_ps(2.88539008f), value)); + return _mm256_div_ps(_mm256_sub_ps(exp, _1), _mm256_add_ps(_1, exp)); + } } #endif //SIMD_AVX2_ENABLE @@ -341,14 +424,14 @@ namespace Simd namespace Detail { - SIMD_INLINE float32x4_t Poly5(float32x4_t x) + SIMD_INLINE float32x4_t Poly5(float32x4_t x, float a, float b, float c, float d, float e, float f) { - float32x4_t p = vdupq_n_f32(1.8775767e-3f); - p = vmlaq_f32(vdupq_n_f32(8.9893397e-3f), x, p); - p = vmlaq_f32(vdupq_n_f32(5.5826318e-2f), x, p); - p = vmlaq_f32(vdupq_n_f32(2.4015361e-1f), x, p); - p = vmlaq_f32(vdupq_n_f32(6.9315308e-1f), x, p); - p = vmlaq_f32(vdupq_n_f32(9.9999994e-1f), x, p); + float32x4_t p = vdupq_n_f32(f); + p = vmlaq_f32(vdupq_n_f32(e), x, p); + p = vmlaq_f32(vdupq_n_f32(d), x, p); + p = vmlaq_f32(vdupq_n_f32(c), x, p); + p = vmlaq_f32(vdupq_n_f32(b), x, p); + p = vmlaq_f32(vdupq_n_f32(a), x, p); return p; } @@ -358,9 +441,19 @@ namespace Simd int32x4_t ipart = vcvtq_s32_f32(vsubq_f32(x, vdupq_n_f32(0.5f))); float32x4_t fpart = vsubq_f32(x, vcvtq_f32_s32(ipart)); float32x4_t expipart = 
vreinterpretq_f32_s32(vshlq_n_s32(vaddq_s32(ipart, vdupq_n_s32(127)), 23)); - float32x4_t expfpart = Poly5(fpart); + float32x4_t expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); return vmulq_f32(expipart, expfpart); } + + SIMD_INLINE float32x4_t Log2(float32x4_t x) + { + float32x4_t _1 = vdupq_n_f32(1.0f); + int32x4_t i = vreinterpretq_s32_f32(x); + float32x4_t e = vcvtq_f32_s32(vsubq_s32(vshrq_n_s32(vandq_s32(i, vdupq_n_s32(0x7F800000)), 23), vdupq_n_s32(127))); + float32x4_t m = Or(vreinterpretq_f32_s32(vandq_s32(i, vdupq_n_s32(0x007FFFFF))), _1); + float32x4_t p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + return vaddq_f32(vmulq_f32(p, vsubq_f32(m, _1)), e); + } } SIMD_INLINE float32x4_t Exponent(float32x4_t value) @@ -375,6 +468,35 @@ namespace Simd uint32x4_t mask = vcgtq_f32(vdupq_n_f32(0.0f), value); return vbslq_f32(mask, neg, value); } + + SIMD_INLINE float32x4_t Logarithm(float32x4_t value) + { + return vmulq_f32(vdupq_n_f32(0.693147181f), Detail::Log2(value)); + } + + template SIMD_INLINE float32x4_t Mish(float32x4_t value, float32x4_t threshold) + { + float32x4_t _1 = vdupq_n_f32(1.0f); + float32x4_t mish = vaddq_f32(Exponent(value), _1); + mish = Fmadd(mish, mish, _1); + mish = vmulq_f32(value, vsubq_f32(_1, Div(vdupq_n_f32(2.0f), mish))); + return vbslq_f32(vcgtq_f32(threshold, value), mish, value); + } + + template SIMD_INLINE float32x4_t Softplus(float32x4_t value, float32x4_t beta, float32x4_t threshold) + { + float32x4_t exp = Exponent(vmulq_f32(value, beta)); + float32x4_t log = Logarithm(vaddq_f32(vdupq_n_f32(1.0f), exp)); + uint32x4_t mask = vcgtq_f32(threshold, value); + return vbslq_f32(mask, Div(log, beta), value); + } + + template SIMD_INLINE float32x4_t Tanh(float32x4_t value) + { + float32x4_t _1 = vdupq_n_f32(1.0f); + float32x4_t exp = Detail::Exp2(vmulq_f32(vdupq_n_f32(2.88539008f), value)); + return Div(vsubq_f32(exp, 
_1), vaddq_f32(_1, exp)); + } } #endif //SIMD_NEON_ENABLE } diff --git a/3rdparty/simdlib/Simd/SimdExtract.h b/3rdparty/simdlib/Simd/SimdExtract.h old mode 100644 new mode 100755 index d0d8184d7c..e30a0c85e5 --- a/3rdparty/simdlib/Simd/SimdExtract.h +++ b/3rdparty/simdlib/Simd/SimdExtract.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,8 +28,8 @@ namespace Simd { -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { SIMD_INLINE float ExtractValue(__m128 a, int i) { @@ -44,12 +44,7 @@ namespace Simd _mm_store_ps(_a, a); return _a[0] + _a[1] + _a[2] + _a[3]; } - } -#endif//SIMD_SSE_ENABLE -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { template SIMD_INLINE int ExtractInt8(__m128i a) { return _mm_extract_epi16(_mm_srli_si128(a, index & 0x1), index >> 1) & 0xFF; @@ -90,8 +85,8 @@ namespace Simd } #endif// SIMD_SSE2_ENABLE -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { SIMD_INLINE float ExtractSum(__m128 a) { @@ -103,7 +98,7 @@ namespace Simd return _mm_hadd_ps(_mm_hadd_ps(a[0], a[1]), _mm_hadd_ps(a[2], a[3])); } } -#endif//SIMD_SSE3_ENABLE +#endif//SIMD_SSE41_ENABLE #ifdef SIMD_AVX_ENABLE namespace Avx @@ -199,6 +194,11 @@ namespace Simd return vgetq_lane_u32(a, 0) + vgetq_lane_u32(a, 1) + vgetq_lane_u32(a, 2) + vgetq_lane_u32(a, 3); } + SIMD_INLINE int32_t ExtractSum32s(const int32x4_t& a) + { + return vgetq_lane_s32(a, 0) + vgetq_lane_s32(a, 1) + vgetq_lane_s32(a, 2) + vgetq_lane_s32(a, 3); + } + SIMD_INLINE uint64_t ExtractSum64u(const uint64x2_t & a) { return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); diff --git a/3rdparty/simdlib/Simd/SimdFrame.hpp b/3rdparty/simdlib/Simd/SimdFrame.hpp old mode 100644 new mode 100755 
index 53cc33879d..45b0b6022a --- a/3rdparty/simdlib/Simd/SimdFrame.hpp +++ b/3rdparty/simdlib/Simd/SimdFrame.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, +* Copyright (c) 2011-2021 Yermalayeu Ihar, * 2014-2019 Antonenka Mikhail, * 2019-2019 Artur Voronkov. * @@ -58,6 +58,10 @@ namespace Simd Bgr24, /*! One plane 8-bit gray pixel format. */ Gray8, + /*! One plane 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */ + Rgb24, + /*! One plane 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. */ + Rgba32, }; const size_t width; /*!< \brief A width of the frame. */ @@ -373,6 +377,8 @@ namespace Simd case View::Gray8: (Format&)format = Gray8; break; case View::Bgr24: (Format&)format = Bgr24; break; case View::Bgra32: (Format&)format = Bgra32; break; + case View::Rgb24: (Format&)format = Rgb24; break; + case View::Rgba32: (Format&)format = Rgba32; break; default: assert(0); } @@ -420,6 +426,14 @@ namespace Simd case Gray8: planes[0] = View(width, height, stride0, View::Gray8, data0); break; + case Rgb24: + planes[0] = View(width, height, stride0, View::Rgb24, data0); + break; + case Rgba32: + planes[0] = View(width, height, stride0, View::Rgba32, data0); + break; + default: + assert(0); } } @@ -494,6 +508,14 @@ namespace Simd case Gray8: planes[0].Recreate(width, height, View::Gray8); break; + case Rgb24: + planes[0].Recreate(width, height, View::Rgb24); + break; + case Rgba32: + planes[0].Recreate(width, height, View::Rgba32); + break; + default: + assert(0); } } @@ -591,6 +613,8 @@ namespace Simd case Bgra32: return 1; case Bgr24: return 1; case Gray8: return 1; + case Rgb24: return 1; + case Rgba32: return 1; default: assert(0); return 0; } } @@ -648,6 +672,12 @@ namespace Simd case Frame::Gray8: BgraToGray(src.planes[0], dst.planes[0]); break; + case Frame::Rgb24: + BgraToRgb(src.planes[0], dst.planes[0]); + break; + case Frame::Rgba32: + 
BgraToRgba(src.planes[0], dst.planes[0]); + break; default: assert(0); } @@ -662,6 +692,12 @@ namespace Simd case Frame::Gray8: BgrToGray(src.planes[0], dst.planes[0]); break; + case Frame::Rgb24: + BgrToRgb(src.planes[0], dst.planes[0]); + break; + case Frame::Rgba32: + BgrToRgba(src.planes[0], dst.planes[0]); + break; default: assert(0); } @@ -676,11 +712,73 @@ namespace Simd case Frame::Bgr24: GrayToBgr(src.planes[0], dst.planes[0]); break; + case Frame::Rgb24: + GrayToRgb(src.planes[0], dst.planes[0]); + break; + case Frame::Rgba32: + GrayToRgba(src.planes[0], dst.planes[0]); + break; default: assert(0); } break; + case Frame::Rgb24: + switch (dst.format) + { + case Frame::Bgra32: + RgbToBgra(src.planes[0], dst.planes[0]); + break; + case Frame::Gray8: + RgbToGray(src.planes[0], dst.planes[0]); + break; + case Frame::Bgr24: + RgbToBgr(src.planes[0], dst.planes[0]); + break; + case Frame::Rgba32: + RgbToRgba(src.planes[0], dst.planes[0]); + break; + default: + assert(0); + } + break; /* end of the Rgb24 source arm: without this, control falls through into the Rgba32 arm below */ + + case Frame::Rgba32: + switch (dst.format) + { + case Frame::Nv12: + { + View bgr(src.Size(), View::Bgr24); + RgbaToBgr(src.planes[0], bgr); + View u(src.Size(), View::Gray8), v(src.Size(), View::Gray8); + BgrToYuv420p(bgr, dst.planes[0], u, v); + InterleaveUv(u, v, dst.planes[1]); + break; + } + case Frame::Yuv420p: + { + View bgr(src.Size(), View::Bgr24); + RgbaToBgr(src.planes[0], bgr); + BgrToYuv420p(bgr, dst.planes[0], dst.planes[1], dst.planes[2]); + break; + } + case Frame::Bgra32: + RgbaToBgra(src.planes[0], dst.planes[0]); + break; + case Frame::Gray8: + RgbaToGray(src.planes[0], dst.planes[0]); + break; + case Frame::Bgr24: + RgbaToBgr(src.planes[0], dst.planes[0]); + break; + case Frame::Rgb24: + RgbaToRgb(src.planes[0], dst.planes[0]); + break; + default: + assert(0); + } + break; /* end of the Rgba32 source arm: without this, control falls through into the outer default assert(0) */ + default: assert(0); } diff --git a/3rdparty/simdlib/Simd/SimdInit.h b/3rdparty/simdlib/Simd/SimdInit.h old mode 100644 new mode 100755 index 179e61bdb4..707ea4c8bc --- a/3rdparty/simdlib/Simd/SimdInit.h +++
b/3rdparty/simdlib/Simd/SimdInit.h @@ -28,7 +28,22 @@ namespace Simd { -#if defined(_MSC_VER) && (defined(SIMD_X64_ENABLE) || defined(SIMD_X86_ENABLE)) + +#if defined(_MSC_VER) && !defined(__clang__) && (defined(SIMD_X64_ENABLE) || defined(SIMD_X86_ENABLE)) + +#define SIMD_INIT_AS_CHAR + +#elif defined(__GNUC__) || defined(__clang__) || (defined(_MSC_VER) && defined(SIMD_NEON_ENABLE)) + +#define SIMD_INIT_AS_LONGLONG + +#else + +#error This platform is unsupported! + +#endif + +#if defined(SIMD_INIT_AS_CHAR) template SIMD_INLINE char GetChar(T value, size_t index) { @@ -50,7 +65,7 @@ namespace Simd Simd::GetChar(int64_t(a), 4), Simd::GetChar(int64_t(a), 5), \ Simd::GetChar(int64_t(a), 6), Simd::GetChar(int64_t(a), 7) -#elif defined(__GNUC__) || (defined(_MSC_VER) && defined(SIMD_NEON_ENABLE)) +#elif defined(SIMD_INIT_AS_LONGLONG) #define SIMD_CHAR_AS_LONGLONG(a) (((long long)a) & 0xFF) @@ -94,11 +109,15 @@ namespace Simd #define SIMD_LL_SET2_EPI32(a, b) \ SIMD_INT_AS_LONGLONG(a) | (SIMD_INT_AS_LONGLONG(b) << 32) -#endif//defined(__GNUC__) || (defined(_MSC_VER) && defined(SIMD_NEON_ENABLE)) +#else + +#error This platform is unsupported! 
+ +#endif #if defined(SIMD_SSE2_ENABLE) -#if defined(_MSC_VER) +#if defined(SIMD_INIT_AS_CHAR) #define SIMD_MM_SET1_EPI8(a) \ {SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ @@ -148,7 +167,7 @@ namespace Simd #define SIMD_MM_SETR_EPI64(a0, a1) \ {SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1)} -#elif defined(__GNUC__) +#elif defined(SIMD_INIT_AS_LONGLONG) #define SIMD_MM_SET1_EPI8(a) \ {SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a)} @@ -192,7 +211,7 @@ namespace Simd #if defined(SIMD_AVX2_ENABLE) -#if defined(_MSC_VER) +#if defined(SIMD_INIT_AS_CHAR) #define SIMD_MM256_SET1_EPI8(a) \ {SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ @@ -263,7 +282,7 @@ namespace Simd #define SIMD_MM256_SETR_EPI64(a0, a1, a2, a3) \ {SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1), SIMD_AS_8CHARS(a2), SIMD_AS_8CHARS(a3)} -#elif defined(__GNUC__) +#elif defined(SIMD_INIT_AS_LONGLONG) #define SIMD_MM256_SET1_EPI8(a) \ {SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a), \ @@ -310,7 +329,7 @@ namespace Simd #define SIMD_MM256_SETR_EPI64(a0, a1, a2, a3) \ {a0, a1, a2, a3} -#endif// defined(_MSC_VER) || defined(__GNUC__) +#endif #endif// SIMD_AVX2_ENABLE diff --git a/3rdparty/simdlib/Simd/SimdLib.cpp b/3rdparty/simdlib/Simd/SimdLib.cpp old mode 100644 new mode 100755 index eb181ec376..b1cac8b1ba --- a/3rdparty/simdlib/Simd/SimdLib.cpp +++ b/3rdparty/simdlib/Simd/SimdLib.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, +* Copyright (c) 2011-2021 Yermalayeu Ihar, * 2014-2018 Antonenka Mikhail, * 2018-2018 Radchenko Andrey, * 2019-2019 Facundo Galan. 
@@ -55,18 +55,18 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved) #include "Simd/SimdLib.h" #include "Simd/SimdMemory.h" +#include "Simd/SimdCpu.h" #include "Simd/SimdEnable.h" +#include "Simd/SimdAlignment.h" #include "Simd/SimdConst.h" -#include "Simd/SimdCpu.h" #include "Simd/SimdLog.h" #include "Simd/SimdResizer.h" #include "Simd/SimdGaussianBlur.h" #include "Simd/SimdBase.h" -#include "Simd/SimdSse1.h" #include "Simd/SimdSse2.h" -#include "Simd/SimdSsse3.h" +#include "Simd/SimdSse41.h" #include "Simd/SimdAvx1.h" #include "Simd/SimdAvx2.h" #include "Simd/SimdNeon.h" @@ -75,6 +75,11 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved) #include "Simd/SimdVersion.h" #endif +namespace Simd +{ + const size_t ALIGNMENT = GetAlignment(); +} + SIMD_API const char * SimdVersion() { return SIMD_VERSION; @@ -118,9 +123,9 @@ SIMD_API void SimdRelease(void * context) SIMD_API SimdBool SimdGetFastMode() { -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable) - return Sse::GetFastMode(); +#ifdef SIMD_SSE2_ENABLE + if (Sse2::Enable) + return Sse2::GetFastMode(); else #endif #ifdef SIMD_NEON_ENABLE @@ -133,9 +138,9 @@ SIMD_API SimdBool SimdGetFastMode() SIMD_API void SimdSetFastMode(SimdBool value) { -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable) - Sse::SetFastMode(value); +#ifdef SIMD_SSE2_ENABLE + if (Sse2::Enable) + Sse2::SetFastMode(value); #endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable) @@ -145,9 +150,9 @@ SIMD_API void SimdSetFastMode(SimdBool value) SIMD_API void SimdBgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) { -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && width >= Sse41::A) + Sse41::BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); else #endif #ifdef SIMD_NEON_ENABLE @@ -178,84 +183,69 @@ SIMD_API void 
SimdBgraToGray(const uint8_t *bgra, size_t width, size_t height, s Base::BgraToGray(bgra, width, height, bgraStride, gray, grayStride); } -SIMD_API void SimdRgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride) +SIMD_API void SimdBgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) { #ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + if (Avx2::Enable && width >= Avx2::A) + Avx2::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); else #endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); else #endif #ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::HA) - Neon::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + if (Neon::Enable && width >= Neon::A) + Neon::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); else #endif - Base::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + Base::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); } -SIMD_API void SimdBgrToBgra(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha) +SIMD_API void SimdBgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride) { -#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR) - if(Avx2::Enable && width >= Avx2::A) - Avx2::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); +#ifdef SIMD_AVX2_ENABLE + if (Avx2::Enable && width >= Avx2::A) + Avx2::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - 
if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); else #endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable && width >= Neon::A) - Neon::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); + Neon::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); else #endif - Base::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); + Base::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); } -SIMD_API void SimdBgrToRgba(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *rgba, size_t rgbaStride, uint8_t alpha) +SIMD_API void SimdBgrToBgra(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha) { #if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR) if(Avx2::Enable && width >= Avx2::A) - Avx2::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); + Avx2::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); else #endif - Base::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); -} - -SIMD_API void SimdBgraToRgba(const uint8_t *bgra, size_t width, size_t height, size_t bgraStride, uint8_t *rgba, size_t rgbaStride) -{ -#if defined(SIMD_AVX2_ENABLE) - if(Avx2::Enable && width >= Avx2::A) - Avx2::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && width >= Sse41::A) + Sse41::BgrToBgra(bgr, width, height, bgrStride, bgra, 
bgraStride, alpha); else #endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); +#ifdef SIMD_VMX_ENABLE + if(Vmx::Enable && width >= Vmx::A) + Vmx::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); else #endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable && width >= Neon::A) - Neon::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + Neon::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); else #endif - Base::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + Base::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); } SIMD_API void SimdBgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, @@ -286,9 +276,9 @@ SIMD_API void SimdBgrToGray(const uint8_t *bgr, size_t width, size_t height, siz Avx2::BgrToGray(bgr, width, height, bgrStride, gray, grayStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgrToGray(bgr, width, height, bgrStride, gray, grayStride); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && width >= Sse41::A) + Sse41::BgrToGray(bgr, width, height, bgrStride, gray, grayStride); else #endif #ifdef SIMD_SSE2_ENABLE @@ -304,49 +294,29 @@ SIMD_API void SimdBgrToGray(const uint8_t *bgr, size_t width, size_t height, siz Base::BgrToGray(bgr, width, height, bgrStride, gray, grayStride); } -SIMD_API void SimdRgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride) +SIMD_API void SimdBgrToRgb(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride) { -#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR) - if (Avx2::Enable && width >= Avx2::A) - Avx2::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::RgbToGray(rgb, 
width, height, rgbStride, gray, grayStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && width >= Sse2::A) - Sse2::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); +#ifdef SIMD_AVX512BW_ENABLE + if (Avx512bw::Enable) + Avx512bw::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); else #endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - else -#endif - Base::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); -} - -SIMD_API void SimdBgrToRgb(const uint8_t *bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) -{ #ifdef SIMD_AVX2_ENABLE if (Avx2::Enable && width >= Avx2::A) - Avx2::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); + Avx2::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); else #endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable && width >= Neon::A) - Neon::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); + Neon::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); else #endif - Base::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); + Base::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); } SIMD_API void SimdCopy(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, uint8_t * dst, size_t dstStride) @@ -368,9 +338,9 @@ SIMD_API void SimdDeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t Avx2::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, 
rStride); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); else #endif #ifdef SIMD_NEON_ENABLE @@ -389,9 +359,9 @@ SIMD_API void SimdDeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size Avx2::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); else #endif #ifdef SIMD_NEON_ENABLE @@ -410,9 +380,9 @@ SIMD_API void SimdGaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t Avx2::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && (width - 1)*channelCount >= Ssse3::A) - Ssse3::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && (width - 1)*channelCount >= Sse41::A) + Sse41::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); else #endif #ifdef SIMD_SSE2_ENABLE @@ -448,9 +418,9 @@ SIMD_API void SimdGrayToBgr(const uint8_t * gray, size_t width, size_t height, s Avx2::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && width >= Sse41::A) + Sse41::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); else #endif #ifdef SIMD_NEON_ENABLE @@ -489,9 +459,9 @@ SIMD_API void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t 
Avx2::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); else #endif #ifdef SIMD_NEON_ENABLE @@ -510,9 +480,9 @@ SIMD_API void SimdInterleaveBgra(const uint8_t * b, size_t bStride, const uint8_ Avx2::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); else #endif #ifdef SIMD_NEON_ENABLE @@ -552,9 +522,9 @@ SIMD_API void SimdReduceColor2x2(const uint8_t *src, size_t srcWidth, size_t src Avx2::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); else #endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && srcWidth >= Ssse3::DA) - Ssse3::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && srcWidth >= Sse41::DA) + Sse41::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); else #endif #ifdef SIMD_SSE2_ENABLE @@ -578,9 +548,9 @@ SIMD_API void SimdReduceGray2x2(const uint8_t *src, size_t srcWidth, size_t srcH Avx2::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && srcWidth >= Ssse3::DA) - Ssse3::ReduceGray2x2(src, srcWidth, 
srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && srcWidth >= Sse41::DA) + Sse41::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); else #endif #ifdef SIMD_SSE2_ENABLE @@ -625,9 +595,9 @@ SIMD_API void SimdReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcH Avx2::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && srcWidth > Ssse3::A) - Ssse3::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && srcWidth > Sse41::A) + Sse41::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); else #endif #ifdef SIMD_SSE2_ENABLE @@ -672,9 +642,9 @@ SIMD_API void SimdResizeBilinear(const uint8_t *src, size_t srcWidth, size_t src Avx2::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); else #endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && dstWidth >= Ssse3::A) - Ssse3::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && dstWidth >= Sse41::A) + Sse41::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); else #endif #ifdef SIMD_SSE2_ENABLE @@ -707,21 +677,11 @@ SIMD_API void * SimdResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t ds return Sse41::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); else #endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable) - return Ssse3::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - else -#endif #ifdef SIMD_SSE2_ENABLE if (Sse2::Enable) return Sse2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); else #endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable) - return Sse::ResizerInit(srcX, 
srcY, dstX, dstY, channels, type, method); - else -#endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable) return Neon::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); @@ -735,6 +695,66 @@ SIMD_API void SimdResizerRun(const void * resizer, const uint8_t * src, size_t s ((Resizer*)resizer)->Run(src, srcStride, dst, dstStride); } +SIMD_API void SimdRgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) +{ +#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR) + if (Avx2::Enable && width >= Avx2::A) + Avx2::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + else +#endif +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + else +#endif +#ifdef SIMD_NEON_ENABLE + if (Neon::Enable && width >= Neon::A) + Neon::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + else +#endif + Base::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); +} + +SIMD_API void SimdRgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) +{ +#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR) + if (Avx2::Enable && width >= Avx2::A) + Avx2::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + else +#endif +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + else +#endif +#ifdef SIMD_NEON_ENABLE + if (Neon::Enable && width >= Neon::A) + Neon::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + else +#endif + Base::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); +} + +SIMD_API void SimdRgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) +{ +#if defined(SIMD_AVX2_ENABLE) + if (Avx2::Enable && width >= Avx2::A) 
+ Avx2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + else +#endif +#ifdef SIMD_SSE2_ENABLE + if (Sse2::Enable && width >= Sse2::A) + Sse2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + else +#endif +#ifdef SIMD_NEON_ENABLE + if (Neon::Enable && width >= Neon::A) + Neon::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + else +#endif + Base::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); +} + SIMD_API void SimdStretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) { @@ -842,6 +862,7 @@ SIMD_API void SimdMatTranspose(const double * mat, size_t rows, size_t cols, dou SIMD_API void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff) { + //TODO: #ifdef SIMD_SSSE3_ENABLE if (Ssse3::Enable && size >= Ssse3::A) Ssse3::SimdImageDifference(img1,img2, size, imgDiff); diff --git a/3rdparty/simdlib/Simd/SimdLib.h b/3rdparty/simdlib/Simd/SimdLib.h old mode 100644 new mode 100755 index c3862f19f1..4838b82261 --- a/3rdparty/simdlib/Simd/SimdLib.h +++ b/3rdparty/simdlib/Simd/SimdLib.h @@ -1,8 +1,8 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, -* 2014-2016 Antonenka Mikhail, +* Copyright (c) 2011-2021 Yermalayeu Ihar, +* 2014-2019 Antonenka Mikhail, * 2019-2019 Facundo Galan. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -27,8 +27,6 @@ #ifndef __SimdLib_h__ #define __SimdLib_h__ -#include "Simd/SimdConfig.h" - #include #if defined(_MSC_VER) || defined(__CODEGEARC__) @@ -107,12 +105,8 @@ typedef enum SimdCpuInfoCacheL1, /*!< A size of level 1 data cache. */ SimdCpuInfoCacheL2, /*!< A size of level 2 cache. */ SimdCpuInfoCacheL3, /*!< A size of level 3 cache. */ - SimdCpuInfoSse, /*!< Availability of SSE (x86). */ SimdCpuInfoSse2, /*!< Availability of SSE2 (x86). 
*/ - SimdCpuInfoSse3, /*!< Availability of SSE3 (x86). */ - SimdCpuInfoSsse3, /*!< Availability of SSSE3 (x86). */ SimdCpuInfoSse41, /*!< Availability of SSE4.1 (x86). */ - SimdCpuInfoSse42, /*!< Availability of SSE4.2 (x86). */ SimdCpuInfoAvx, /*!< Availability of AVX (x86). */ SimdCpuInfoAvx2, /*!< Availability of AVX2 (x86). */ SimdCpuInfoAvx512f, /*!< Availability of AVX-512F (x86). */ @@ -120,7 +114,6 @@ typedef enum SimdCpuInfoVmx, /*!< Availability of VMX or Altivec (PowerPC). */ SimdCpuInfoVsx, /*!< Availability of VSX (PowerPC). */ SimdCpuInfoNeon, /*!< Availability of NEON (ARM). */ - SimdCpuInfoMsa, /*!< Availability of MSA (MIPS). */ } SimdCpuInfoType; /*! @ingroup c_types @@ -188,6 +181,8 @@ typedef enum SimdPixelFormatHsl24, /*! A 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */ SimdPixelFormatRgb24, + /*! A 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. */ + SimdPixelFormatRgba32, } SimdPixelFormatType; /*! @ingroup c_types @@ -208,12 +203,14 @@ typedef enum { /*! 8-bit integer channel type. */ SimdResizeChannelByte, + /*! 16-bit integer channel type. */ + SimdResizeChannelShort, /*! 32-bit float channel type. */ SimdResizeChannelFloat, } SimdResizeChannelType; /*! @ingroup resizing - Describes methods used in oreder to resize image. + Describes methods used in order to resize image. */ typedef enum { @@ -223,6 +220,10 @@ typedef enum SimdResizeMethodCaffeInterp, /*! Area method. */ SimdResizeMethodArea, + /*! InferenceEngine::Extension::Cpu::Interp compatible method. */ + SimdResizeMethodInferenceEngineInterp, + /*! Nearest pixel method. */ + SimdResizeMethodNearest, } SimdResizeMethodType; // ViSP custom SIMD code @@ -317,7 +318,7 @@ extern "C" \fn size_t SimdAlignment(); - \short Gets alignment required for the most productive work of the Simd Library. + \short Gets alignment required for the most productive work of Simd Library. \return a required alignment. 
*/ @@ -359,17 +360,18 @@ extern "C" \fn void SimdBgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride); - \short Converts 32-bit BGRA image to 24-bit BGR image. + \short Converts 32-bit BGRA image to 24-bit BGR image. Also it can be used for 32-bit RGBA to 24-bit RGB conversion. All images must have the same width and height. - \note This function has a C++ wrapper Simd::BgraToBgr(const View& bgra, View& bgr). + \note This function has C++ wrappers: Simd::BgraToBgr(const View& bgra, View& bgr) + and Simd::RgbaToRgb(const View& rgba, View& rgb). - \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image. + \param [in] bgra - a pointer to pixels data of input 32-bit BGRA (or 32-bit RGBA) image. \param [in] width - an image width. \param [in] height - an image height. \param [in] bgraStride - a row size of the bgra image. - \param [out] bgr - a pointer to pixels data of output 24-bit BGR image. + \param [out] bgr - a pointer to pixels data of output 24-bit BGR (or 24-bit RGB) image. \param [in] bgrStride - a row size of the bgr image. */ SIMD_API void SimdBgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride); @@ -395,76 +397,63 @@ extern "C" /*! @ingroup bgra_conversion - \fn void SimdRgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride); + \fn void SimdBgraToRgb(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgb, size_t rgbStride); - \short Converts 32-bit RGBA image to 8-bit gray image. + \short Converts 32-bit BGRA image to 24-bit RGB image. Also it can be used for 32-bit RGBA to 24-bit BGR conversion. All images must have the same width and height. - \param [in] rgba - a pointer to pixels data of input 32-bit RGBA image. 
+ \note This function has C++ wrappers: Simd::BgraToRgb(const View& bgra, View& rgb) + and Simd::RgbaToBgr(const View& rgba, View& bgr). + + \param [in] bgra - a pointer to pixels data of input 32-bit BGRA (or 32-bit RGBA) image. \param [in] width - an image width. \param [in] height - an image height. - \param [in] rgbaStride - a row size of the rgba image. - \param [out] gray - a pointer to pixels data of output 8-bit gray image. - \param [in] grayStride - a row size of the gray image. + \param [in] bgraStride - a row size of the bgra image. + \param [out] rgb - a pointer to pixels data of output 24-bit RGB (or 24-bit BGR) image. + \param [in] rgbStride - a row size of the rgb image. */ - SIMD_API void SimdRgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride); + SIMD_API void SimdBgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride); - /*! @ingroup bgr_conversion + /*! @ingroup bgra_conversion - \fn void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); + \fn void SimdBgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride); - \short Converts 24-bit BGR image to 32-bit BGRA image. + \short Converts 32-bit BGRA image to 32-bit RGBA image. Also it can be used for 32-bit RGBA to 32-bit BGRA conversion. All images must have the same width and height. - \note This function has a C++ wrapper Simd::BgrToBgra(const View& bgr, View& bgra, uint8_t alpha). + \note This function has C++ wrappers: Simd::BgraToRgba(const View& bgra, View& rgba) + and Simd::RgbaToBgra(const View& rgba, View& bgra). - \param [in] bgr - a pointer to pixels data of input 24-bit BGR image. + \param [in] bgra - a pointer to pixels data of input 32-bit BGRA (or 32-bit RGBA) image. \param [in] width - an image width. 
\param [in] height - an image height. - \param [in] bgrStride - a row size of the bgr image. - \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image. \param [in] bgraStride - a row size of the bgra image. - \param [in] alpha - a value of alpha channel. + \param [out] rgba - a pointer to pixels data of output 32-bit RGBA (or 32-bit BGRA) image. + \param [in] rgbaStride - a row size of the rgb image. */ - SIMD_API void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); + SIMD_API void SimdBgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride); /*! @ingroup bgr_conversion - \fn void SimdBgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha); + \fn void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - \short Converts 24-bit BGR image to 32-bit RGBA image. + \short Converts 24-bit BGR image to 32-bit BGRA image. All images must have the same width and height. + \note This function has a C++ wrapper Simd::BgrToBgra(const View& bgr, View& bgra, uint8_t alpha). + \param [in] bgr - a pointer to pixels data of input 24-bit BGR image. \param [in] width - an image width. \param [in] height - an image height. \param [in] bgrStride - a row size of the bgr image. - \param [out] rgba - a pointer to pixels data of output 32-bit BGRA image. - \param [in] rgbaStride - a row size of the bgra image. - \param [in] alpha - a value of alpha channel. - */ - SIMD_API void SimdBgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha); - - /*! 
@ingroup bgr_conversion - - \fn void SimdBgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride); - - \short Converts 32-bit BGRA image to 32-bit RGBA image. - - All images must have the same width and height. - - \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image. - \param [in] width - an image width. - \param [in] height - an image height. + \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image. \param [in] bgraStride - a row size of the bgra image. - \param [out] rgba - a pointer to pixels data of output 32-bit RGBA image. - \param [in] rgbaStride - a row size of the rgba image. \param [in] alpha - a value of alpha channel. */ - SIMD_API void SimdBgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride); + SIMD_API void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); /*! @ingroup other_conversion @@ -512,39 +501,23 @@ extern "C" /*! @ingroup bgr_conversion - \fn void SimdRgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride); + \fn void SimdBgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride); - \short Converts 24-bit RGB image to 8-bit gray image. + \short Converts 24-bit BGR image to 24-bit RGB image. Also it can be used for 24-bit RGB to 24-bit BGR conversion. All images must have the same width and height. - \param [in] rgb - a pointer to pixels data of input 24-bit BGR image. - \param [in] width - an image width. - \param [in] height - an image height. - \param [in] rgbStride - a row size of the bgr image. - \param [out] gray - a pointer to pixels data of output 8-bit gray image. - \param [in] grayStride - a row size of the gray image. 
- */ - SIMD_API void SimdRgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride); - - /*! @ingroup bgr_conversion - - \fn void SimdBgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); - - \short Converts 24-bit BGR image to 24-bit RGB image (also it performs backward conversion). - - All images must have the same width and height. + \note This function has C++ wrappers: Simd::BgrToRgb(const View & bgr, View & rgb) + and Simd::RgbToBgr(const View& rgb, View& bgr). - \note This function has a C++ wrapper Simd::BgrToRgb(const View & bgr, View & rgb). - - \param [in] bgr - a pointer to pixels data of input 24-bit BGR image. - \param [in] bgrStride - a row size of the bgr image. + \param [in] bgr - a pointer to pixels data of input 24-bit BGR image (or 24-bit RGB image). \param [in] width - an image width. \param [in] height - an image height. - \param [out] rgb - a pointer to pixels data of output 24-bit RGB image. + \param [in] bgrStride - a row size of the bgr image. + \param [out] rgb - a pointer to pixels data of output 24-bit RGB image (or 24-bit BGR image). \param [in] rgbStride - a row size of the rgb image. */ - SIMD_API void SimdBgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); + SIMD_API void SimdBgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride); /*! @ingroup copying @@ -591,7 +564,7 @@ extern "C" SIMD_API void SimdCopyFrame(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, size_t frameLeft, size_t frameTop, size_t frameRight, size_t frameBottom, uint8_t * dst, size_t dstStride); - /*! @ingroup other_conversion + /*! 
@ingroup deinterleave_conversion \fn void SimdDeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride); @@ -599,7 +572,9 @@ extern "C" All images must have the same width and height. - \note This function has a C++ wrapper Simd::DeinterleaveBgr(const View& bgr, View& b, View& g, View& r). + \note This function has C++ wrappers: + Simd::DeinterleaveBgr(const View& bgr, View& b, View& g, View& r), + Simd::DeinterleaveRgb(const View& rgb, View& r, View& g, View& b). \param [in] bgr - a pointer to pixels data of input 24-bit BGR interleaved image. \param [in] bgrStride - a row size of the bgr image. @@ -615,7 +590,7 @@ extern "C" SIMD_API void SimdDeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride); - /*! @ingroup other_conversion + /*! @ingroup deinterleave_conversion \fn void SimdDeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride); @@ -623,7 +598,11 @@ extern "C" All images must have the same width and height. - \note This function has a C++ wrapper Simd::DeinterleaveBgra(const View& bgra, View& b, View& g, View& r, View& a). + \note This function has C++ wrappers: + Simd::DeinterleaveBgra(const View& bgra, View& b, View& g, View& r, View& a), + Simd::DeinterleaveBgra(const View& bgra, View& b, View& g, View& r), + Simd::DeinterleaveRgba(const View& rgba, View& r, View& g, View& b, View& a), + Simd::DeinterleaveRgba(const View& rgba, View& r, View& g, View& b). \param [in] bgra - a pointer to pixels data of input 32-bit BGRA interleaved image. \param [in] bgraStride - a row size of the bgra image. @@ -635,7 +614,7 @@ extern "C" \param [in] gStride - a row size of the g image. 
\param [out] r - a pointer to pixels data of 8-bit Red planar image. \param [in] rStride - a row size of the r image. - \param [out] a - a pointer to pixels data of 8-bit Alpha planar image. + \param [out] a - a pointer to pixels data of 8-bit Alpha planar image. It can be NULL. \param [in] aStride - a row size of the a image. */ SIMD_API void SimdDeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, @@ -670,20 +649,27 @@ extern "C" size_t channelCount, uint8_t * dst, size_t dstStride); /*! @ingroup gaussian_filter + \fn void * SimdGaussianBlurInit(size_t width, size_t height, size_t channels, const float * sigma, const float* epsilon); + \short Creates Gaussian blur filter context. + In particular calculates Gaussian blur coefficients: \verbatim half = floor(sqrt(log(1/epsilon)) * sigma); weight[2*half + 1]; + for(x = -half; x <= half; ++x) weight[x + half] = exp(-sqr(x / sigma) / 2); + sum = 0; for (x = -half; x <= half; ++x) sum += weight[x + half]; + for (x = -half; x <= half; ++x) weight[x + half] /= sum; \endverbatim + \param [in] width - a width of input and output image. \param [in] height - a height of input and output image. \param [in] channels - a channel number of input and output image. Its value must be in range [1..4]. @@ -697,8 +683,11 @@ extern "C" SIMD_API void* SimdGaussianBlurInit(size_t width, size_t height, size_t channels, const float * sigma, const float* epsilon); /*! @ingroup gaussian_filter + \fn void SimdGaussianBlurRun(const void* filter, const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride); + \short Performs image Gaussian bluring. + Bluring algorithm for every point: \verbatim sum = 0; @@ -713,6 +702,7 @@ extern "C" } dst[dx, dy] = sum; \endverbatim + \param [in] filter - a filter context. It must be created by function ::SimdGaussianBlurInit and released by function ::SimdRelease. \param [in] src - a pointer to pixels data of the original input image. 
\param [in] srcStride - a row size (in bytes) of the input image. @@ -725,17 +715,18 @@ extern "C" \fn void SimdGrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride); - \short Converts 8-bit gray image to 24-bit BGR image. + \short Converts 8-bit gray image to 24-bit BGR image. Also it can be used for 8-bit gray to 24-bit RGB conversion. All images must have the same width and height. - \note This function has a C++ wrapper Simd::GrayToBgr(const View& gray, View& bgr). + \note This function has C++ wrappers: Simd::GrayToBgr(const View& gray, View& bgr) + and Simd::GrayToRgb(const View& gray, View& rgb). \param [in] gray - a pointer to pixels data of input 8-bit gray image. \param [in] width - an image width. \param [in] height - an image height. \param [in] grayStride - a row size of the gray image. - \param [out] bgr - a pointer to pixels data of output 24-bit BGR image. + \param [out] bgr - a pointer to pixels data of output 24-bit BGR (or 24-bit RGB) image. \param [in] bgrStride - a row size of the bgr image. */ SIMD_API void SimdGrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride); @@ -744,17 +735,18 @@ extern "C" \fn void SimdGrayToBgra(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - \short Converts 8-bit gray image to 32-bit BGRA image. + \short Converts 8-bit gray image to 32-bit BGRA image. Also it can be used for 8-bit gray to 32-bit RGBA conversion. All images must have the same width and height. - \note This function has a C++ wrapper Simd::GrayToBgra(const View& gray, View& bgra, uint8_t alpha). + \note This function has C++ wrappers: Simd::GrayToBgra(const View& gray, View& bgra, uint8_t alpha) + and Simd::GrayToRgba(const View& gray, View& rgba, uint8_t alpha). \param [in] gray - a pointer to pixels data of input 8-bit gray image. 
\param [in] width - an image width. \param [in] height - an image height. \param [in] grayStride - a row size of the gray image. - \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image. + \param [out] bgra - a pointer to pixels data of output 32-bit BGRA (or 32-bit RGBA) image. \param [in] bgraStride - a row size of the bgra image. \param [in] alpha - a value of alpha channel. */ @@ -785,7 +777,7 @@ extern "C" SIMD_API void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - /*! @ingroup other_conversion + /*! @ingroup interleave_conversion \fn void SimdInterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride); @@ -1125,6 +1117,16 @@ extern "C" \short Creates resize context. + An using example (resize of RGBA64 image): + \verbatim + void * resizer = SimdResizerInit(srcX, srcY, dstX, dstY, 4, SimdResizeChannelShort, SimdResizeMethodBilinear); + if (resizer) + { + SimdResizerRun(resizer, (uint8_t*)src, srcStride, (uint8_t*)dst, dstStride); + SimdRelease(resizer); + } + \endverbatim + \param [in] srcX - a width of the input image. \param [in] srcY - a height of the input image. \param [in] dstX - a width of the output image. @@ -1152,6 +1154,65 @@ extern "C" */ SIMD_API void SimdResizerRun(const void * resizer, const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); + /*! @ingroup rgb_conversion + + \fn void SimdRgbToBgra(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); + + \short Converts 24-bit RGB image to 32-bit BGRA image. Also it can be used for 24-bit BGR to 32-bit RGBA conversion. + + All images must have the same width and height. 
+ + \note This function has C++ wrappers: Simd::RgbToBgra(const View& rgb, View& bgra, uint8_t alpha) + and Simd::BgrToRgba(const View& bgr, View& rgba, uint8_t alpha). + + \param [in] rgb - a pointer to pixels data of input 24-bit RGB (or 24-bit BGR) image. + \param [in] width - an image width. + \param [in] height - an image height. + \param [in] rgbStride - a row size of the rgb image. + \param [out] bgra - a pointer to pixels data of output 32-bit BGRA (or 32-bit RGBA) image. + \param [in] bgraStride - a row size of the bgra image. + \param [in] alpha - a value of alpha channel. + */ + SIMD_API void SimdRgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + + /*! @ingroup rgb_conversion + + \fn void SimdRgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride); + + \short Converts 24-bit RGB image to 8-bit gray image. + + All images must have the same width and height. + + \note This function has a C++ wrapper Simd::RgbToGray(const View& rgb, View& gray). + + \param [in] rgb - a pointer to pixels data of input 24-bit RGB image. + \param [in] width - an image width. + \param [in] height - an image height. + \param [in] rgbStride - a row size of the rgb image. + \param [out] gray - a pointer to pixels data of output 8-bit gray image. + \param [in] grayStride - a row size of the gray image. + */ + SIMD_API void SimdRgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); + + /*! @ingroup rgba_conversion + + \fn void SimdRgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride); + + \short Converts 32-bit RGBA image to 8-bit gray image. + + All images must have the same width and height. + + \note This function has a C++ wrapper Simd::RgbaToGray(const View& rgba, View& gray). 
+ + \param [in] rgba - a pointer to pixels data of input 32-bit RGBA image. + \param [in] width - an image width. + \param [in] height - an image height. + \param [in] rgbaStride - a row size of the rgba image. + \param [out] gray - a pointer to pixels data of output 8-bit gray image. + \param [in] grayStride - a row size of the gray image. + */ + SIMD_API void SimdRgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride); + /*! @ingroup resizing \fn void SimdStretchGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride); diff --git a/3rdparty/simdlib/Simd/SimdLib.hpp b/3rdparty/simdlib/Simd/SimdLib.hpp old mode 100644 new mode 100755 index 7f7e6745d5..aaedc571e2 --- a/3rdparty/simdlib/Simd/SimdLib.hpp +++ b/3rdparty/simdlib/Simd/SimdLib.hpp @@ -1,8 +1,8 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, -* 2014-2016 Antonenka Mikhail, +* Copyright (c) 2011-2021 Yermalayeu Ihar, +* 2014-2019 Antonenka Mikhail, * 2019-2019 Facundo Galan. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -31,7 +31,9 @@ #ifndef __SimdLib_hpp__ #define __SimdLib_hpp__ -/*! \namespace Simd */ +/*! @ingroup functions + Simd API C++ wrappers. +*/ namespace Simd { /*! @ingroup bgra_conversion @@ -74,6 +76,46 @@ namespace Simd SimdBgraToGray(bgra.data, bgra.width, bgra.height, bgra.stride, gray.data, gray.stride); } + /*! @ingroup bgra_conversion + + \fn void BgraToRgb(const View& bgra, View& rgb) + + \short Converts 32-bit BGRA image to 24-bit RGB image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdBgraToRgb. + + \param [in] bgra - an input 32-bit BGRA image. + \param [out] rgb - an output 24-bit RGB image. 
+ */ + template class A> SIMD_INLINE void BgraToRgb(const View& bgra, View& rgb) + { + assert(EqualSize(bgra, rgb) && bgra.format == View::Bgra32 && rgb.format == View::Rgb24); + + SimdBgraToRgb(bgra.data, bgra.width, bgra.height, bgra.stride, rgb.data, rgb.stride); + } + + /*! @ingroup bgra_conversion + + \fn void BgraToRgba(const View& bgra, View& rgba) + + \short Converts 32-bit BGRA image to 32-bit RGBA image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdBgraToRgba. + + \param [in] bgra - an input 32-bit BGRA image. + \param [out] rgba - an output 32-bit RGBA image. + */ + template class A> SIMD_INLINE void BgraToRgba(const View& bgra, View& rgba) + { + assert(EqualSize(bgra, rgba) && bgra.format == View::Bgra32 && rgba.format == View::Rgba32); + + SimdBgraToRgba(bgra.data, bgra.width, bgra.height, bgra.stride, rgba.data, rgba.stride); + } + /*! @ingroup bgr_conversion \fn void BgrToBgra(const View& bgr, View& bgra, uint8_t alpha = 0xFF) @@ -142,7 +184,7 @@ namespace Simd \fn void BgrToRgb(const View & bgr, View & rgb) - \short Converts 24-bit BGR image to 24-bit RGB image (also it performs backward conversion). + \short Converts 24-bit BGR image to 24-bit RGB image. All images must have the same width and height. @@ -153,9 +195,30 @@ namespace Simd */ template class A> SIMD_INLINE void BgrToRgb(const View & bgr, View & rgb) { - assert(EqualSize(bgr, rgb) && bgr.PixelSize() == 3 && rgb.PixelSize() == 3); + assert(EqualSize(bgr, rgb) && bgr.format == View::Bgr24 && rgb.format == View::Rgb24); - SimdBgrToRgb(bgr.data, bgr.stride, bgr.width, bgr.height, rgb.data, rgb.stride); + SimdBgrToRgb(bgr.data, bgr.width, bgr.height, bgr.stride, rgb.data, rgb.stride); + } + + /*! @ingroup bgr_conversion + + \fn void BgrToRgba(const View& bgr, View& rgba, uint8_t alpha = 0xFF) + + \short Converts 24-bit BGR image to 32-bit RGBA image. + + All images must have the same width and height. 
+ + \note This function is a C++ wrapper for function ::SimdRgbToBgra. + + \param [in] bgr - an input 24-bit BGR image. + \param [out] rgba - an output 32-bit RGBA image. + \param [in] alpha - a value of alpha channel. It is equal to 255 by default. + */ + template class A> SIMD_INLINE void BgrToRgba(const View& bgr, View& rgba, uint8_t alpha = 0xFF) + { + assert(EqualSize(bgr, rgba) && rgba.format == View::Rgba32 && bgr.format == View::Bgr24); + + SimdRgbToBgra(bgr.data, bgr.width, bgr.height, bgr.stride, rgba.data, rgba.stride, alpha); } /*! @ingroup copying @@ -204,7 +267,7 @@ namespace Simd frame.left, frame.top, frame.right, frame.bottom, dst.data, dst.stride); } - /*! @ingroup other_conversion + /*! @ingroup deinterleave_conversion \fn void DeinterleaveBgr(const View& bgr, View& b, View& g, View& r) @@ -226,7 +289,7 @@ namespace Simd SimdDeinterleaveBgr(bgr.data, bgr.stride, bgr.width, bgr.height, b.data, b.stride, g.data, g.stride, r.data, r.stride); } - /*! @ingroup other_conversion + /*! @ingroup deinterleave_conversion \fn void DeinterleaveBgra(const View& bgra, View& b, View& g, View& r, View& a) @@ -249,6 +312,95 @@ namespace Simd SimdDeinterleaveBgra(bgra.data, bgra.stride, bgra.width, bgra.height, b.data, b.stride, g.data, g.stride, r.data, r.stride, a.data, a.stride); } + /*! @ingroup deinterleave_conversion + + \fn void DeinterleaveBgra(const View& bgra, View& b, View& g, View& r) + + \short Deinterleaves 32-bit BGRA interleaved image into separated 8-bit Blue, Green and Red planar images (Alpha channel is ignored). + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdDeinterleaveBgra. + + \param [in] bgra - an input 32-bit BGRA interleaved image. + \param [out] b - an output 8-bit Blue planar image. + \param [out] g - an output 8-bit Green planar image. + \param [out] r - an output 8-bit Red planar image.
+ */ + template class A> SIMD_INLINE void DeinterleaveBgra(const View& bgra, View& b, View& g, View& r) + { + assert(EqualSize(bgra, b) && Compatible(b, g, r) && bgra.format == View::Bgra32 && b.format == View::Gray8); + + SimdDeinterleaveBgra(bgra.data, bgra.stride, bgra.width, bgra.height, b.data, b.stride, g.data, g.stride, r.data, r.stride, NULL, 0); + } + + /*! @ingroup deinterleave_conversion + + \fn void DeinterleaveRgb(const View& rgb, View& r, View& g, View& b) + + \short Deinterleaves 24-bit RGB interleaved image into separated 8-bit Red, Green and Blue planar images. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdDeinterleaveBgr. + + \param [in] rgb - an input 24-bit RGB interleaved image. + \param [out] r - an output 8-bit Red planar image. + \param [out] g - an output 8-bit Green planar image. + \param [out] b - an output 8-bit Blue planar image. + */ + template class A> SIMD_INLINE void DeinterleaveRgb(const View& rgb, View& r, View& g, View& b) + { + assert(EqualSize(rgb, b) && Compatible(b, g, r) && rgb.format == View::Rgb24 && b.format == View::Gray8); + + SimdDeinterleaveBgr(rgb.data, rgb.stride, rgb.width, rgb.height, r.data, r.stride, g.data, g.stride, b.data, b.stride); + } + + /*! @ingroup deinterleave_conversion + + \fn void DeinterleaveRgba(const View& rgba, View& r, View& g, View& b, View& a) + + \short Deinterleaves 32-bit RGBA interleaved image into separated 8-bit Red, Green, Blue and Alpha planar images. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdDeinterleaveBgra. + + \param [in] rgba - an input 32-bit RGBA interleaved image. + \param [out] r - an output 8-bit Red planar image. + \param [out] g - an output 8-bit Green planar image. + \param [out] b - an output 8-bit Blue planar image. + \param [out] a - an output 8-bit Alpha planar image. 
+ */ + template class A> SIMD_INLINE void DeinterleaveRgba(const View& rgba, View& r, View& g, View& b, View& a) + { + assert(EqualSize(rgba, b) && Compatible(b, g, r, a) && rgba.format == View::Rgba32 && b.format == View::Gray8); + + SimdDeinterleaveBgra(rgba.data, rgba.stride, rgba.width, rgba.height, r.data, r.stride, g.data, g.stride, b.data, b.stride, a.data, a.stride); + } + + /*! @ingroup deinterleave_conversion + + \fn void DeinterleaveRgba(const View& rgba, View& r, View& g, View& b) + + \short Deinterleaves 32-bit RGBA interleaved image into separated 8-bit Red, Green and Blue planar images (Alpha channel is ignored). + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdDeinterleaveBgra. + + \param [in] rgba - an input 32-bit RGBA interleaved image. + \param [out] r - an output 8-bit Red planar image. + \param [out] g - an output 8-bit Green planar image. + \param [out] b - an output 8-bit Blue planar image. + */ + template class A> SIMD_INLINE void DeinterleaveRgba(const View& rgba, View& r, View& g, View& b) + { + assert(EqualSize(rgba, b) && Compatible(b, g, r) && rgba.format == View::Rgba32 && b.format == View::Gray8); + + SimdDeinterleaveBgra(rgba.data, rgba.stride, rgba.width, rgba.height, r.data, r.stride, g.data, g.stride, b.data, b.stride, NULL, 0); + } + /*! @ingroup other_filter \fn void GaussianBlur3x3(const View& src, View& dst) @@ -295,6 +447,26 @@ namespace Simd SimdGrayToBgr(gray.data, gray.width, gray.height, gray.stride, bgr.data, bgr.stride); } + /*! @ingroup gray_conversion + + \fn void GrayToRgb(const View& gray, View& rgb) + + \short Converts 8-bit gray image to 24-bit RGB image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdGrayToBgr. + + \param [in] gray - an input 8-bit gray image. + \param [out] rgb - an output 24-bit RGB image. 
+ */ + template class A> SIMD_INLINE void GrayToRgb(const View& gray, View& rgb) + { + assert(EqualSize(gray, rgb) && rgb.format == View::Rgb24 && gray.format == View::Gray8); + + SimdGrayToBgr(gray.data, gray.width, gray.height, gray.stride, rgb.data, rgb.stride); + } + /*! @ingroup gray_conversion \fn void GrayToBgra(const View& gray, View& bgra, uint8_t alpha = 0xFF) @@ -316,6 +488,27 @@ namespace Simd SimdGrayToBgra(gray.data, gray.width, gray.height, gray.stride, bgra.data, bgra.stride, alpha); } + /*! @ingroup gray_conversion + + \fn void GrayToRgba(const View& gray, View& rgba, uint8_t alpha = 0xFF) + + \short Converts 8-bit gray image to 32-bit RGBA image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdGrayToBgra. + + \param [in] gray - an input 8-bit gray image. + \param [out] rgba - an output 32-bit RGBA image. + \param [in] alpha - a value of alpha channel. It is equal to 255 by default. + */ + template class A> SIMD_INLINE void GrayToRgba(const View& gray, View& rgba, uint8_t alpha = 0xFF) + { + assert(EqualSize(gray, rgba) && rgba.format == View::Rgba32 && gray.format == View::Gray8); + + SimdGrayToBgra(gray.data, gray.width, gray.height, gray.stride, rgba.data, rgba.stride, alpha); + } + /*! @ingroup other_conversion \fn void InterleaveBgr(const View & b, const View & g, const View & r, View & bgr) @@ -338,7 +531,7 @@ namespace Simd SimdInterleaveBgr(b.data, b.stride, g.data, g.stride, r.data, r.stride, bgr.width, bgr.height, bgr.data, bgr.stride); } - /*! @ingroup other_conversion + /*! @ingroup interleave_conversion \fn void InterleaveBgra(const View& b, const View& g, const View& r, const View& a, View& bgra) @@ -798,6 +991,200 @@ namespace Simd } } + /*! @ingroup resizing + + \fn void Resize(const View & src, View & dst, const Point & size, ::SimdResizeMethodType method = ::SimdResizeMethodBilinear) + + \short Performs resizing of image. + + \param [in] src - an original input image. 
+ \param [out] dst - a resized output image. The input image can be the output. + \param [in] size - a size of output image. + \param [in] method - a resizing method. By default it is equal to ::SimdResizeMethodBilinear. + */ + template class A> SIMD_INLINE void Resize(const View& src, View& dst, const Point & size, ::SimdResizeMethodType method = ::SimdResizeMethodBilinear) + { + assert(src.format == View::Float || src.ChannelSize() == 1); + + if (&src == &dst) + { + if (src.Size() != size) + { + View tmp(size, src.format); + Resize(src, tmp, method); + dst.Swap(tmp); + } + } + else + { + if (dst.Size() != size) + dst.Recreate(size, src.format); + Resize(src, dst, method); + } + } + + /*! @ingroup rgb_conversion + + \fn void RgbToBgr(const View & rgb, View & bgr) + + \short Converts 24-bit RGB image to 24-bit BGR image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdBgrToRgb. + + \param [in] rgb - an input 24-bit RGB image. + \param [out] bgr - an output 24-bit BGR image. + */ + template class A> SIMD_INLINE void RgbToBgr(const View& rgb, View& bgr) + { + assert(EqualSize(bgr, rgb) && rgb.format == View::Rgb24 && bgr.format == View::Bgr24); + + SimdBgrToRgb(rgb.data, rgb.width, rgb.height, rgb.stride, bgr.data, bgr.stride); + } + + /*! @ingroup rgb_conversion + + \fn void RgbToBgra(const View& rgb, View& bgra, uint8_t alpha = 0xFF) + + \short Converts 24-bit RGB image to 32-bit BGRA image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdRgbToBgra. + + \param [in] rgb - an input 24-bit RGB image. + \param [out] bgra - an output 32-bit BGRA image. + \param [in] alpha - a value of alpha channel. It is equal to 255 by default.
+ */ + template class A> SIMD_INLINE void RgbToBgra(const View& rgb, View& bgra, uint8_t alpha = 0xFF) + { + assert(EqualSize(rgb, bgra) && bgra.format == View::Bgra32 && rgb.format == View::Rgb24); + + SimdRgbToBgra(rgb.data, rgb.width, rgb.height, rgb.stride, bgra.data, bgra.stride, alpha); + } + + /*! @ingroup rgb_conversion + + \fn void RgbToGray(const View& rgb, View& gray) + + \short Converts 24-bit RGB image to 8-bit gray image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdRgbToGray. + + \param [in] rgb - an input 24-bit RGB image. + \param [out] gray - an output 8-bit gray image. + */ + template class A> SIMD_INLINE void RgbToGray(const View& rgb, View& gray) + { + assert(EqualSize(rgb, gray) && rgb.format == View::Rgb24 && gray.format == View::Gray8); + + SimdRgbToGray(rgb.data, rgb.width, rgb.height, rgb.stride, gray.data, gray.stride); + } + + /*! @ingroup rgb_conversion + + \fn void RgbToRgba(const View& rgb, View& rgba, uint8_t alpha = 0xFF) + + \short Converts 24-bit RGB image to 32-bit RGBA image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdBgrToBgra. + + \param [in] rgb - an input 24-bit RGB image. + \param [out] rgba - an output 32-bit RGBA image. + \param [in] alpha - a value of alpha channel. It is equal to 255 by default. + */ + template class A> SIMD_INLINE void RgbToRgba(const View& rgb, View& rgba, uint8_t alpha = 0xFF) + { + assert(EqualSize(rgb, rgba) && rgba.format == View::Rgba32 && rgb.format == View::Rgb24); + + SimdBgrToBgra(rgb.data, rgb.width, rgb.height, rgb.stride, rgba.data, rgba.stride, alpha); + } + + /*! @ingroup rgba_conversion + + \fn void RgbaToBgr(const View& rgba, View& bgr) + + \short Converts 32-bit RGBA image to 24-bit BGR image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdBgraToRgb.
+ + \param [in] rgba - an input 32-bit RGBA image. + \param [out] bgr - an output 24-bit BGR image. + */ + template class A> SIMD_INLINE void RgbaToBgr(const View& rgba, View& bgr) + { + assert(EqualSize(rgba, bgr) && rgba.format == View::Rgba32 && bgr.format == View::Bgr24); + + SimdBgraToRgb(rgba.data, rgba.width, rgba.height, rgba.stride, bgr.data, bgr.stride); + } + + /*! @ingroup rgba_conversion + + \fn void RgbaToBgra(const View& rgba, View& bgra) + + \short Converts 32-bit RGBA image to 32-bit BGRA image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdBgraToRgba. + + \param [in] rgba - an input 32-bit RGBA image. + \param [out] bgra - an output 32-bit BGRA image. + */ + template class A> SIMD_INLINE void RgbaToBgra(const View& rgba, View& bgra) + { + assert(EqualSize(bgra, rgba) && bgra.format == View::Bgra32 && rgba.format == View::Rgba32); + + SimdBgraToRgba(rgba.data, rgba.width, rgba.height, rgba.stride, bgra.data, bgra.stride); + } + + /*! @ingroup rgba_conversion + + \fn void RgbaToGray(const View& rgba, View& gray) + + \short Converts 32-bit RGBA image to 8-bit gray image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdRgbaToGray. + + \param [in] rgba - an input 32-bit RGBA image. + \param [out] gray - an output 8-bit gray image. + */ + template class A> SIMD_INLINE void RgbaToGray(const View& rgba, View& gray) + { + assert(EqualSize(rgba, gray) && rgba.format == View::Rgba32 && gray.format == View::Gray8); + + SimdRgbaToGray(rgba.data, rgba.width, rgba.height, rgba.stride, gray.data, gray.stride); + } + + /*! @ingroup rgba_conversion + + \fn void RgbaToRgb(const View& rgba, View& rgb) + + \short Converts 32-bit RGBA image to 24-bit RGB image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdBgraToBgr. + + \param [in] rgba - an input 32-bit RGBA image.
+ \param [out] rgb - an output 24-bit RGB image. + */ + template class A> SIMD_INLINE void RgbaToRgb(const View& rgba, View& rgb) + { + assert(EqualSize(rgba, rgb) && rgba.format == View::Rgba32 && rgb.format == View::Rgb24); + + SimdBgraToBgr(rgba.data, rgba.width, rgba.height, rgba.stride, rgb.data, rgb.stride); + } + /*! @ingroup resizing \fn void StretchGray2x2(const View& src, View& dst) @@ -825,7 +1212,7 @@ namespace Simd The input and output images must have the same width and height. - \note This function supports conversion between Gray8, Bgr24 and Bgra32 image formats. + \note This function supports conversion between View::Gray8, View::Bgr24, View::Bgra32, View::Rgb24 and View::Rgba32 image formats. \param [in] src - an input image. \param [out] dst - an output image. @@ -848,9 +1235,15 @@ namespace Simd case View::Bgra32: GrayToBgra(src, dst); break; + case View::Rgba32: + GrayToRgba(src, dst); + break; case View::Bgr24: GrayToBgr(src, dst); break; + case View::Rgb24: + GrayToRgb(src, dst); + break; default: assert(0); } @@ -865,6 +1258,32 @@ namespace Simd case View::Gray8: BgrToGray(src, dst); break; + case View::Rgb24: + BgrToRgb(src, dst); + break; + case View::Rgba32: + BgrToRgba(src, dst); + break; + default: + assert(0); + } + break; + + case View::Rgb24: + switch (dst.format) + { + case View::Bgra32: + RgbToBgra(src, dst); + break; + case View::Bgr24: + RgbToBgr(src, dst); + break; + case View::Gray8: + RgbToGray(src, dst); + break; + case View::Rgba32: + RgbToRgba(src, dst); + break; default: assert(0); } @@ -879,6 +1298,32 @@ namespace Simd case View::Gray8: BgraToGray(src, dst); break; + case View::Rgb24: + BgraToRgb(src, dst); + break; + case View::Rgba32: + BgraToRgba(src, dst); + break; + default: + assert(0); + } + break; + + case View::Rgba32: + switch (dst.format) + { + case View::Bgra32: + RgbaToBgra(src, dst); + break; + case View::Bgr24: + RgbaToBgr(src, dst); + break; + case View::Gray8: + RgbaToGray(src, dst); + break; + case 
View::Rgb24: + RgbaToRgb(src, dst); + break; default: assert(0); } diff --git a/3rdparty/simdlib/Simd/SimdLoad.h b/3rdparty/simdlib/Simd/SimdLoad.h old mode 100644 new mode 100755 index 97d7af7098..243858ca1b --- a/3rdparty/simdlib/Simd/SimdLoad.h +++ b/3rdparty/simdlib/Simd/SimdLoad.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,16 +28,8 @@ namespace Simd { - enum PadType - { - PadNose1, - PadNone, - PadTail1, - PadTail2, - }; - -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { template SIMD_INLINE __m128 Load(const float * p); @@ -56,7 +48,7 @@ namespace Simd return _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)p0), (__m64*)p1); } - SIMD_INLINE __m128 LoadPadZeroNose1(const float * p) + SIMD_INLINE __m128 LoadPadZeroNose1(const float* p) { SIMD_ALIGNED(16) const int32_t m[F] = { 0, -1, -1, -1 }; __m128 a = _mm_loadu_ps(p + 1); @@ -64,7 +56,7 @@ namespace Simd return _mm_and_ps(b, _mm_load_ps((float*)m)); } - SIMD_INLINE __m128 LoadPadZeroTail1(const float * p) + SIMD_INLINE __m128 LoadPadZeroTail1(const float* p) { SIMD_ALIGNED(16) const int32_t m[F] = { -1, -1, -1, 0 }; __m128 a = _mm_loadu_ps(p - 1); @@ -72,20 +64,15 @@ namespace Simd return _mm_and_ps(b, _mm_load_ps((float*)m)); } - SIMD_INLINE __m128 LoadPadZeroTail2(const float * p) + SIMD_INLINE __m128 LoadPadZeroTail2(const float* p) { SIMD_ALIGNED(16) const int32_t m[F] = { -1, -1, 0, 0 }; __m128 a = _mm_loadu_ps(p - 2); __m128 b = _mm_shuffle_ps(a, a, 0xFE); return _mm_and_ps(b, _mm_load_ps((float*)m)); } - } -#endif//SIMD_SSE_ENABLE -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - using namespace Sse; + //--------------------------------------------------------------------- template 
SIMD_INLINE __m128i Load(const __m128i * p); @@ -99,6 +86,11 @@ namespace Simd return _mm_load_si128(p); } + SIMD_INLINE __m128i Load(const __m128i* p0, const __m128i* p1) + { + return _mm_castps_si128(_mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)p0), (__m64*)p1)); + } + template SIMD_INLINE __m128i LoadMaskI8(const __m128i * p, __m128i index) { return _mm_cmpeq_epi8(Load(p), index); @@ -113,90 +105,13 @@ namespace Simd { return _mm_or_si128(_mm_srli_si128(last, count), _mm_and_si128(last, _mm_slli_si128(K_INV_ZERO, A - count))); } - - template SIMD_INLINE void LoadNose3(const uint8_t * p, __m128i a[3]) - { - a[1] = Load((__m128i*)p); - a[0] = LoadBeforeFirst(a[1]); - a[2] = _mm_loadu_si128((__m128i*)(p + step)); - } - - template SIMD_INLINE void LoadBody3(const uint8_t * p, __m128i a[3]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - step)); - a[1] = Load((__m128i*)p); - a[2] = _mm_loadu_si128((__m128i*)(p + step)); - } - - template SIMD_INLINE void LoadTail3(const uint8_t * p, __m128i a[3]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - step)); - a[1] = Load((__m128i*)p); - a[2] = LoadAfterLast(a[1]); - } - - template SIMD_INLINE void LoadNose5(const uint8_t * p, __m128i a[5]) - { - a[2] = Load((__m128i*)p); - a[1] = LoadBeforeFirst(a[2]); - a[0] = LoadBeforeFirst(a[1]); - a[3] = _mm_loadu_si128((__m128i*)(p + step)); - a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step)); - } - - template SIMD_INLINE void LoadBody5(const uint8_t * p, __m128i a[5]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step)); - a[1] = _mm_loadu_si128((__m128i*)(p - step)); - a[2] = Load((__m128i*)p); - a[3] = _mm_loadu_si128((__m128i*)(p + step)); - a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step)); - } - - template SIMD_INLINE void LoadTail5(const uint8_t * p, __m128i a[5]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step)); - a[1] = _mm_loadu_si128((__m128i*)(p - step)); - a[2] = Load((__m128i*)p); - a[3] = LoadAfterLast(a[2]); - a[4] = LoadAfterLast(a[3]); - } - - SIMD_INLINE void 
LoadNoseDx(const uint8_t * p, __m128i a[3]) - { - a[0] = LoadBeforeFirst<1>(_mm_loadu_si128((__m128i*)p)); - a[2] = _mm_loadu_si128((__m128i*)(p + 1)); - } - - SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m128i a[3]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - 1)); - a[2] = _mm_loadu_si128((__m128i*)(p + 1)); - } - - SIMD_INLINE void LoadTailDx(const uint8_t * p, __m128i a[3]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - 1)); - a[2] = LoadAfterLast<1>(_mm_loadu_si128((__m128i*)p)); - } } #endif//SIMD_SSE2_ENABLE -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 - { -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::Load; - using Sse2::Load; -#endif - } -#endif - #ifdef SIMD_SSE41_ENABLE namespace Sse41 { #if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::Load; using Sse2::Load; #endif } @@ -219,12 +134,17 @@ namespace Simd template SIMD_INLINE __m256 Load(const float * p0, const float * p1) { - return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse::Load(p0)), Sse::Load(p1), 1); + return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse2::Load(p0)), Sse2::Load(p1), 1); } SIMD_INLINE __m256 Load(const float * p0, const float * p1, const float * p2, const float * p3) { - return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse::Load(p0, p1)), Sse::Load(p2, p3), 1); + return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse2::Load(p0, p1)), Sse2::Load(p2, p3), 1); + } + + SIMD_INLINE __m256 Load(const float * ptr, __m256i mask) + { + return _mm256_maskload_ps(ptr, mask); } } #endif//SIMD_AVX_ENABLE @@ -333,86 +253,6 @@ namespace Simd __m128i secondHi = LoadHalfAfterLast(firstHi); second = _mm256_inserti128_si256(_mm256_castsi128_si256(secondLo), secondHi, 0x1); } - - template SIMD_INLINE void LoadNose3(const uint8_t * p, __m256i a[3]) - { - a[0] = LoadBeforeFirst(p); - a[1] = Load((__m256i*)p); - a[2] = _mm256_loadu_si256((__m256i*)(p + step)); - } - - 
template SIMD_INLINE void LoadBody3(const uint8_t * p, __m256i a[3]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - step)); - a[1] = Load((__m256i*)p); - a[2] = _mm256_loadu_si256((__m256i*)(p + step)); - } - - template SIMD_INLINE void LoadTail3(const uint8_t * p, __m256i a[3]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - step)); - a[1] = Load((__m256i*)p); - a[2] = LoadAfterLast(p); - } - - template SIMD_INLINE void LoadNose5(const uint8_t * p, __m256i a[5]) - { - LoadBeforeFirst(p, a[1], a[0]); - a[2] = Load((__m256i*)p); - a[3] = _mm256_loadu_si256((__m256i*)(p + step)); - a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step)); - } - - template SIMD_INLINE void LoadBody5(const uint8_t * p, __m256i a[5]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step)); - a[1] = _mm256_loadu_si256((__m256i*)(p - step)); - a[2] = Load((__m256i*)p); - a[3] = _mm256_loadu_si256((__m256i*)(p + step)); - a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step)); - } - - template SIMD_INLINE void LoadTail5(const uint8_t * p, __m256i a[5]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step)); - a[1] = _mm256_loadu_si256((__m256i*)(p - step)); - a[2] = Load((__m256i*)p); - LoadAfterLast(p, a[3], a[4]); - } - - SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m256i a[3]) - { - a[0] = LoadBeforeFirst(p); - a[2] = _mm256_loadu_si256((__m256i*)(p + 1)); - } - - SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m256i a[3]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - 1)); - a[2] = _mm256_loadu_si256((__m256i*)(p + 1)); - } - - SIMD_INLINE void LoadTailDx(const uint8_t * p, __m256i a[3]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - 1)); - a[2] = LoadAfterLast(p); - } - - template SIMD_INLINE __m256 Load(const float * p); - - template <> SIMD_INLINE __m256 Load(const float * p) - { - return _mm256_loadu_ps(p); - } - - template <> SIMD_INLINE __m256 Load(const float * p) - { -#ifdef _MSC_VER - return _mm256_castsi256_ps(_mm256_load_si256((__m256i*)p)); -#else - return 
_mm256_load_ps(p); -#endif - } } #endif//SIMD_AVX2_ENABLE @@ -456,12 +296,12 @@ namespace Simd template SIMD_INLINE int32x4_t Load(const int32_t * p) { - return (int32x4_t)Load((const uint8_t*)p); + return vreinterpretq_s32_u8(Load((const uint8_t*)p)); } template SIMD_INLINE uint32x4_t Load(const uint32_t * p) { - return (uint32x4_t)Load((const uint8_t*)p); + return vreinterpretq_u32_u8(Load((const uint8_t*)p)); } template SIMD_INLINE float32x4_t Load(const float * p); @@ -829,81 +669,6 @@ namespace Simd return vextq_u8(last, vextq_u8(last, last, 16 - count), count); } - template SIMD_INLINE void LoadNose3(const uint8_t * p, uint8x16_t a[3]) - { - a[1] = Load(p); - a[0] = LoadBeforeFirst(a[1]); - a[2] = vld1q_u8(p + step); - } - - template SIMD_INLINE void LoadBody3(const uint8_t * p, uint8x16_t a[3]) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - a[0] = vld1q_u8(p - step); - a[1] = Load(p); - a[2] = vld1q_u8(p + step); - } - - template SIMD_INLINE void LoadTail3(const uint8_t * p, uint8x16_t a[3]) - { - a[0] = vld1q_u8(p - step); - a[1] = Load(p); - a[2] = LoadAfterLast(a[1]); - } - - template SIMD_INLINE void LoadNose5(const uint8_t * p, uint8x16_t a[5]) - { - a[2] = Load(p); - a[1] = LoadBeforeFirst(a[2]); - a[0] = LoadBeforeFirst(a[1]); - a[3] = vld1q_u8(p + step); - a[4] = vld1q_u8(p + 2 * step); - } - - template SIMD_INLINE void LoadBody5(const uint8_t * p, uint8x16_t a[5]) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - a[0] = vld1q_u8(p - 2 * step); - a[1] = vld1q_u8(p - step); - a[2] = Load(p); - a[3] = vld1q_u8(p + step); - a[4] = vld1q_u8(p + 2 * step); - } - - template SIMD_INLINE void LoadTail5(const uint8_t * p, uint8x16_t a[5]) - { - a[0] = vld1q_u8(p - 2 * step); - a[1] = vld1q_u8(p - step); - a[2] = Load(p); - a[3] = LoadAfterLast(a[2]); - a[4] = LoadAfterLast(a[3]); - } - - SIMD_INLINE void LoadNoseDx(const uint8_t * 
p, uint8x16_t a[3]) - { - a[0] = LoadBeforeFirst<1>(vld1q_u8(p)); - a[2] = vld1q_u8(p + 1); - } - - SIMD_INLINE void LoadBodyDx(const uint8_t * p, uint8x16_t a[3]) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - a[0] = vld1q_u8(p - 1); - a[2] = vld1q_u8(p + 1); - } - - SIMD_INLINE void LoadTailDx(const uint8_t * p, uint8x16_t a[3]) - { - a[0] = vld1q_u8(p - 1); - a[2] = LoadAfterLast<1>(vld1q_u8(p)); - } - template SIMD_INLINE uint8x8_t LoadBeforeFirst(uint8x8_t first) { return vext_u8(vext_u8(first, first, count), first, 8 - count); diff --git a/3rdparty/simdlib/Simd/SimdLoadBlock.h b/3rdparty/simdlib/Simd/SimdLoadBlock.h new file mode 100755 index 0000000000..8a46e07687 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdLoadBlock.h @@ -0,0 +1,251 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdLoadBlock_h__ +#define __SimdLoadBlock_h__ + +#include "Simd/SimdLoad.h" + +namespace Simd +{ +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 + { + template SIMD_INLINE void LoadNose3(const uint8_t * p, __m128i a[3]) + { + a[1] = Load((__m128i*)p); + a[0] = LoadBeforeFirst(a[1]); + a[2] = _mm_loadu_si128((__m128i*)(p + step)); + } + + template SIMD_INLINE void LoadBody3(const uint8_t * p, __m128i a[3]) + { + a[0] = _mm_loadu_si128((__m128i*)(p - step)); + a[1] = Load((__m128i*)p); + a[2] = _mm_loadu_si128((__m128i*)(p + step)); + } + + template SIMD_INLINE void LoadTail3(const uint8_t * p, __m128i a[3]) + { + a[0] = _mm_loadu_si128((__m128i*)(p - step)); + a[1] = Load((__m128i*)p); + a[2] = LoadAfterLast(a[1]); + } + + template SIMD_INLINE void LoadNose5(const uint8_t * p, __m128i a[5]) + { + a[2] = Load((__m128i*)p); + a[1] = LoadBeforeFirst(a[2]); + a[0] = LoadBeforeFirst(a[1]); + a[3] = _mm_loadu_si128((__m128i*)(p + step)); + a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step)); + } + + template SIMD_INLINE void LoadBody5(const uint8_t * p, __m128i a[5]) + { + a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step)); + a[1] = _mm_loadu_si128((__m128i*)(p - step)); + a[2] = Load((__m128i*)p); + a[3] = _mm_loadu_si128((__m128i*)(p + step)); + a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step)); + } + + template SIMD_INLINE void LoadTail5(const uint8_t * p, __m128i a[5]) + { + a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step)); + a[1] = _mm_loadu_si128((__m128i*)(p - step)); + a[2] = Load((__m128i*)p); + a[3] = LoadAfterLast(a[2]); + a[4] = LoadAfterLast(a[3]); + } + + SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m128i a[3]) + { + a[0] = LoadBeforeFirst<1>(_mm_loadu_si128((__m128i*)p)); + a[2] = _mm_loadu_si128((__m128i*)(p + 1)); + } + + SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m128i a[3]) + { + a[0] = _mm_loadu_si128((__m128i*)(p - 1)); + a[2] = _mm_loadu_si128((__m128i*)(p + 1)); + } + + SIMD_INLINE void LoadTailDx(const uint8_t * p, __m128i a[3]) + { + 
a[0] = _mm_loadu_si128((__m128i*)(p - 1)); + a[2] = LoadAfterLast<1>(_mm_loadu_si128((__m128i*)p)); + } + } +#endif//SIMD_SSE2_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + template SIMD_INLINE void LoadNose3(const uint8_t * p, __m256i a[3]) + { + a[0] = LoadBeforeFirst(p); + a[1] = Load((__m256i*)p); + a[2] = _mm256_loadu_si256((__m256i*)(p + step)); + } + + template SIMD_INLINE void LoadBody3(const uint8_t * p, __m256i a[3]) + { + a[0] = _mm256_loadu_si256((__m256i*)(p - step)); + a[1] = Load((__m256i*)p); + a[2] = _mm256_loadu_si256((__m256i*)(p + step)); + } + + template SIMD_INLINE void LoadTail3(const uint8_t * p, __m256i a[3]) + { + a[0] = _mm256_loadu_si256((__m256i*)(p - step)); + a[1] = Load((__m256i*)p); + a[2] = LoadAfterLast(p); + } + + template SIMD_INLINE void LoadNose5(const uint8_t * p, __m256i a[5]) + { + LoadBeforeFirst(p, a[1], a[0]); + a[2] = Load((__m256i*)p); + a[3] = _mm256_loadu_si256((__m256i*)(p + step)); + a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step)); + } + + template SIMD_INLINE void LoadBody5(const uint8_t * p, __m256i a[5]) + { + a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step)); + a[1] = _mm256_loadu_si256((__m256i*)(p - step)); + a[2] = Load((__m256i*)p); + a[3] = _mm256_loadu_si256((__m256i*)(p + step)); + a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step)); + } + + template SIMD_INLINE void LoadTail5(const uint8_t * p, __m256i a[5]) + { + a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step)); + a[1] = _mm256_loadu_si256((__m256i*)(p - step)); + a[2] = Load((__m256i*)p); + LoadAfterLast(p, a[3], a[4]); + } + + SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m256i a[3]) + { + a[0] = LoadBeforeFirst(p); + a[2] = _mm256_loadu_si256((__m256i*)(p + 1)); + } + + SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m256i a[3]) + { + a[0] = _mm256_loadu_si256((__m256i*)(p - 1)); + a[2] = _mm256_loadu_si256((__m256i*)(p + 1)); + } + + SIMD_INLINE void LoadTailDx(const uint8_t * p, __m256i a[3]) + { + a[0] = 
_mm256_loadu_si256((__m256i*)(p - 1)); + a[2] = LoadAfterLast(p); + } + } +#endif//SIMD_AVX2_ENABLE + +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + template SIMD_INLINE void LoadNose3(const uint8_t * p, uint8x16_t a[3]) + { + a[1] = Load(p); + a[0] = LoadBeforeFirst(a[1]); + a[2] = vld1q_u8(p + step); + } + + template SIMD_INLINE void LoadBody3(const uint8_t * p, uint8x16_t a[3]) + { +#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE + __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); +#endif + a[0] = vld1q_u8(p - step); + a[1] = Load(p); + a[2] = vld1q_u8(p + step); + } + + template SIMD_INLINE void LoadTail3(const uint8_t * p, uint8x16_t a[3]) + { + a[0] = vld1q_u8(p - step); + a[1] = Load(p); + a[2] = LoadAfterLast(a[1]); + } + + template SIMD_INLINE void LoadNose5(const uint8_t * p, uint8x16_t a[5]) + { + a[2] = Load(p); + a[1] = LoadBeforeFirst(a[2]); + a[0] = LoadBeforeFirst(a[1]); + a[3] = vld1q_u8(p + step); + a[4] = vld1q_u8(p + 2 * step); + } + + template SIMD_INLINE void LoadBody5(const uint8_t * p, uint8x16_t a[5]) + { +#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE + __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); +#endif + a[0] = vld1q_u8(p - 2 * step); + a[1] = vld1q_u8(p - step); + a[2] = Load(p); + a[3] = vld1q_u8(p + step); + a[4] = vld1q_u8(p + 2 * step); + } + + template SIMD_INLINE void LoadTail5(const uint8_t * p, uint8x16_t a[5]) + { + a[0] = vld1q_u8(p - 2 * step); + a[1] = vld1q_u8(p - step); + a[2] = Load(p); + a[3] = LoadAfterLast(a[2]); + a[4] = LoadAfterLast(a[3]); + } + + SIMD_INLINE void LoadNoseDx(const uint8_t * p, uint8x16_t a[3]) + { + a[0] = LoadBeforeFirst<1>(vld1q_u8(p)); + a[2] = vld1q_u8(p + 1); + } + + SIMD_INLINE void LoadBodyDx(const uint8_t * p, uint8x16_t a[3]) + { +#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE + __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); +#endif + a[0] = vld1q_u8(p - 1); + a[2] = vld1q_u8(p + 1); + } + + SIMD_INLINE void LoadTailDx(const uint8_t * p, uint8x16_t a[3]) + { + a[0] = vld1q_u8(p - 1); 
+ a[2] = LoadAfterLast<1>(vld1q_u8(p)); + } + } +#endif//SIMD_NEON_ENABLE +} +#endif//__SimdLoadBlock_h__ diff --git a/3rdparty/simdlib/Simd/SimdLog.h b/3rdparty/simdlib/Simd/SimdLog.h old mode 100644 new mode 100755 index 45ba3f3be5..923a16dc70 --- a/3rdparty/simdlib/Simd/SimdLog.h +++ b/3rdparty/simdlib/Simd/SimdLog.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -58,8 +58,8 @@ namespace Simd Log(array.data, array.size, name); } -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { SIMD_INLINE void Log(const __m128 & value, const std::string & name) { @@ -67,12 +67,7 @@ namespace Simd _mm_storeu_ps(buffer, value); Simd::Log(buffer, F, name); } - } -#endif //SIMD_SSE_ENABLE -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { template SIMD_INLINE void Log(const __m128i & value, const std::string & name) { const size_t n = sizeof(__m128i) / sizeof(T); @@ -86,7 +81,7 @@ namespace Simd #ifdef SIMD_SSE41_ENABLE namespace Sse41 { - using namespace Sse; + using namespace Sse2; } #endif //SIMD_SSE41_ENABLE @@ -173,14 +168,15 @@ namespace Simd #define SIMD_LOG2(value) Log(value, #value) #define SIMD_LOG4(value) Log(value, #value) -#define SIMD_LOG_SS(message) \ +#define SIMD_LOG_ERROR(message) \ { \ - std::cout << __FUNCTION__ << " : " << message << std::endl; \ - std::cout.flush(); \ + std::stringstream ss; \ + ss << std::endl << " In function " << SIMD_FUNCTION << ":" << std::endl; \ + ss << " In file " << __FILE__ << ":" << __LINE__ << ":" << std::endl; \ + ss << " Error: " << message << std::endl << std::endl; \ + std::cerr << ss.str() << std::flush; \ } -#define SIMD_LOG_LINE() std::cout << __FUNCTION__ << " : " << __LINE__ << std::endl << std::flush; - 
#else//SIMD_LOG_ENABLE #define SIMD_LOG(value) @@ -188,9 +184,7 @@ namespace Simd #define SIMD_LOG2(value) #define SIMD_LOG4(value) -#define SIMD_LOG_SS(message) - -#define SIMD_LOG_LINE() +#define SIMD_LOG_ERROR(message) #endif//SIMD_LOG_ENABLE diff --git a/3rdparty/simdlib/Simd/SimdMath.h b/3rdparty/simdlib/Simd/SimdMath.h old mode 100644 new mode 100755 index 4b674ea512..0f7425f76e --- a/3rdparty/simdlib/Simd/SimdMath.h +++ b/3rdparty/simdlib/Simd/SimdMath.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, +* Copyright (c) 2011-2021 Yermalayeu Ihar, * 2018-2019 Radchenko Andrey. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -66,11 +66,21 @@ namespace Simd #define SIMD_ROUND SIMD_INLINE int Round(double value) { -#if defined(SIMD_SSE2_ENABLE) && ((defined(_MSC_VER) && defined(_M_X64)) || (defined(__GNUC__) && defined(__x86_64__))) - __m128d t = _mm_set_sd(value); - return _mm_cvtsd_si32(t); +#if defined(SIMD_X64_ENABLE) && !defined(SIMD_SSE2_DISABLE) + __m128d _value = _mm_set_sd(value); + return _mm_cvtsd_si32(_value); #else - return (int)(value + (value >= 0 ? 0.5 : -0.5)); + return (int)(value + (value >= 0.0 ? 0.5 : -0.5)); +#endif + } + + SIMD_INLINE int Round(float value) + { +#if defined(SIMD_X64_ENABLE) && !defined(SIMD_SSE2_DISABLE) + __m128 _value = _mm_set_ss(value); + return _mm_cvtss_si32(_value); +#else + return (int)(value + (value >= 0.0f ? 
0.5f : -0.5f)); #endif } #endif @@ -263,8 +273,8 @@ namespace Simd } } -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { SIMD_INLINE __m128 Square(__m128 value) { @@ -330,12 +340,7 @@ namespace Simd __m128 m = _mm_max_ps(s0, s1); return _mm_store_ss(dst, _mm_max_ss(m, _mm_shuffle_ps(m, m, 1))); } - } -#endif//SIMD_SSE_ENABLE -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { SIMD_INLINE __m128i SaturateI16ToU8(__m128i value) { return _mm_min_epi16(K16_00FF, _mm_max_epi16(value, K_ZERO)); @@ -508,17 +513,8 @@ namespace Simd } #endif// SIMD_SSE2_ENABLE -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 - { -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::RightNotZero; -#endif - } -#endif//SIMD_SSE3_ENABLE - -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { using namespace Sse2; @@ -538,12 +534,7 @@ namespace Simd { return _mm_maddubs_epi16(UnpackU8(a, b), K8_01_FF); } - } -#endif// SIMD_SSSE3_ENABLE -#ifdef SIMD_SSE41_ENABLE - namespace Sse41 - { #if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::RightNotZero; #endif diff --git a/3rdparty/simdlib/Simd/SimdMemory.h b/3rdparty/simdlib/Simd/SimdMemory.h old mode 100644 new mode 100755 index de45abb291..d7772ffa3c --- a/3rdparty/simdlib/Simd/SimdMemory.h +++ b/3rdparty/simdlib/Simd/SimdMemory.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2018 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * 2016-2016 Sintegrial Technologies. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -28,9 +28,10 @@ #include "Simd/SimdDefs.h" #include "Simd/SimdMath.h" -#if defined(__GNUC__) && defined(SIMD_ALLOCATE_ERROR_MESSAGE) +#if defined(SIMD_ALLOCATE_ERROR_MESSAGE) #include #endif +#include namespace Simd { @@ -88,17 +89,18 @@ namespace Simd align = AlignHi(align, sizeof(void *)); size = AlignHi(size, align); int result = ::posix_memalign(&ptr, align, size); -#ifdef SIMD_ALLOCATE_ERROR_MESSAGE if (result != 0) + ptr = NULL; +#else + ptr = malloc(size); +#endif +#ifdef SIMD_ALLOCATE_ERROR_MESSAGE + if (ptr == NULL) std::cout << "The function posix_memalign can't allocate " << size << " bytes with align " << align << " !" << std::endl << std::flush; #endif #ifdef SIMD_ALLOCATE_ASSERT - assert(result == 0); -#endif -#else - ptr = malloc(size); + assert(ptr); #endif - #ifdef SIMD_NO_MANS_LAND if (ptr) ptr = (char*)ptr + SIMD_NO_MANS_LAND; @@ -121,60 +123,86 @@ namespace Simd #endif } + //--------------------------------------------------------------------------------------------- + struct Deletable { virtual ~Deletable() {} }; -#ifdef SIMD_SSE_ENABLE - namespace Sse + //--------------------------------------------------------------------------------------------- + +#if defined(SIMD_CPP_2011_ENABLE) + template using Holder = std::unique_ptr; +#else + template class Holder { - SIMD_INLINE bool Aligned(size_t size, size_t align = sizeof(__m128)) + T* _ptr; + + public: + Holder(T* ptr) + : _ptr(ptr) { - return Simd::Aligned(size, align); } - SIMD_INLINE bool Aligned(const void * ptr, size_t align = sizeof(__m128)) + ~Holder() { - return Simd::Aligned(ptr, align); + if (_ptr) + delete _ptr; + } + + T& operator * () + { + return *_ptr; + } + + const T& operator * () const + { + return *_ptr; + } + + T* operator -> () + { + return _ptr; } - } -#endif// SIMD_SSE_ENABLE + + const T* operator -> () const + { + return _ptr; + } + + operator bool() const + { + return _ptr != NULL; + } + 
}; +#endif + + //--------------------------------------------------------------------------------------------- + #ifdef SIMD_SSE2_ENABLE namespace Sse2 { - using Sse::Aligned; - } -#endif// SIMD_SSE2_ENABLE - -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 - { - using Sse::Aligned; - } -#endif// SIMD_SSE3_ENABLE + SIMD_INLINE bool Aligned(size_t size, size_t align = sizeof(__m128)) + { + return Simd::Aligned(size, align); + } -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - using Sse::Aligned; + SIMD_INLINE bool Aligned(const void * ptr, size_t align = sizeof(__m128)) + { + return Simd::Aligned(ptr, align); + } } -#endif// SIMD_SSSE3_ENABLE +#endif// SIMD_SSE2_ENABLE #ifdef SIMD_SSE41_ENABLE namespace Sse41 { - using Sse::Aligned; + using Sse2::Aligned; } #endif// SIMD_SSE41_ENABLE -#ifdef SIMD_SSE42_ENABLE - namespace Sse42 - { - } -#endif// SIMD_SSE42_ENABLE - #ifdef SIMD_AVX_ENABLE namespace Avx { diff --git a/3rdparty/simdlib/Simd/SimdNeon.h b/3rdparty/simdlib/Simd/SimdNeon.h old mode 100644 new mode 100755 index 54373b506e..bf2b98be69 --- a/3rdparty/simdlib/Simd/SimdNeon.h +++ b/3rdparty/simdlib/Simd/SimdNeon.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, +* Copyright (c) 2011-2021 Yermalayeu Ihar, * 2018-2018 Radchenko Andrey. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -36,22 +36,18 @@ namespace Simd void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride); - void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride); + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride); - void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha); + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride); - void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride); + void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride); - void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride); - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); + void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride); void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, 
size_t rStride); @@ -93,6 +89,12 @@ namespace Simd void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); + void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); + + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride); + void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); } diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp old mode 100644 new mode 100755 index bb25c0c6e8..98a360b0e6 --- a/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -74,6 +74,8 @@ namespace Simd BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); } + //--------------------------------------------------------------------- + template SIMD_INLINE void Bgr48pToBgra32(uint8_t * bgra, const uint8_t * blue, const uint8_t * green, const uint8_t * red, size_t offset, const uint8x16_t & alpha) { @@ -128,6 +130,47 @@ namespace Simd else Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); } + + //--------------------------------------------------------------------- + + template SIMD_INLINE void RgbToBgra(const uint8_t* rgb, uint8_t* bgra, uint8x16_t alpha) + { + uint8x16x3_t _rgb = Load3(rgb); + uint8x16x4_t _bgra; + _bgra.val[0] = _rgb.val[2]; + _bgra.val[1] = _rgb.val[1]; + _bgra.val[2] = _rgb.val[0]; + _bgra.val[3] = alpha; + Store4(bgra, _bgra); + } + + template void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t alignedWidth = AlignLo(width, A); + uint8x16_t _alpha = vdupq_n_u8(alpha); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0, colBgra = 0, colRgb = 0; col < alignedWidth; col += A, colBgra += A4, colRgb += A3) + RgbToBgra(rgb + colRgb, bgra + colBgra, _alpha); + if (width != alignedWidth) + RgbToBgra(rgb + 3 * (width - A), bgra + 4 * (width - A), _alpha); + rgb += rgbStride; + bgra += bgraStride; + } + } + + void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)) + RgbToBgra(rgb, width, height, 
rgbStride, bgra, bgraStride, alpha); + else + RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + } } #endif// SIMD_NEON_ENABLE } diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp old mode 100644 new mode 100755 index 0b9fdeaedf..57cf19f18d --- a/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -30,29 +30,31 @@ namespace Simd #ifdef SIMD_NEON_ENABLE namespace Neon { - SIMD_INLINE uint8x8_t BgrToGray(uint8x8x3_t bgr) + SIMD_INLINE uint8x16_t BgrToGray(uint8x16x3_t bgr) { - return vmovn_u16(BgrToGray(vmovl_u8(bgr.val[0]), vmovl_u8(bgr.val[1]), vmovl_u8(bgr.val[2]))); + uint8x8_t lo = vmovn_u16(BgrToGray(UnpackU8<0>(bgr.val[0]), UnpackU8<0>(bgr.val[1]), UnpackU8<0>(bgr.val[2]))); + uint8x8_t hi = vmovn_u16(BgrToGray(UnpackU8<1>(bgr.val[0]), UnpackU8<1>(bgr.val[1]), UnpackU8<1>(bgr.val[2]))); + return vcombine_u8(lo, hi); } - template void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) + template void BgrToGray(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* gray, size_t grayStride) { - assert(width >= HA); + assert(width >= A); if (align) assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride)); - size_t alignedWidth = AlignLo(width, HA); + size_t alignedWidth = AlignLo(width, A); for (size_t row = 0; row < height; ++row) { - for (size_t col = 0; col < alignedWidth; col += HA) + for (size_t col = 0; col < alignedWidth; col += A) { - uint8x8x3_t _bgr = LoadHalf3(bgr + 3 * col); + uint8x16x3_t _bgr = Load3(bgr + 3 * col); 
Store(gray + col, BgrToGray(_bgr)); } if (alignedWidth != width) { - uint8x8x3_t _bgr = LoadHalf3(bgr + 3 * (width - HA)); - Store(gray + width - HA, BgrToGray(_bgr)); + uint8x16x3_t _bgr = Load3(bgr + 3 * (width - A)); + Store(gray + width - A, BgrToGray(_bgr)); } bgr += bgrStride; gray += grayStride; @@ -66,6 +68,47 @@ namespace Simd else BgrToGray(bgr, width, height, bgrStride, gray, grayStride); } + + //--------------------------------------------------------------------- + + SIMD_INLINE uint8x16_t RgbToGray(uint8x16x3_t rgb) + { + uint8x8_t lo = vmovn_u16(BgrToGray(UnpackU8<0>(rgb.val[2]), UnpackU8<0>(rgb.val[1]), UnpackU8<0>(rgb.val[0]))); + uint8x8_t hi = vmovn_u16(BgrToGray(UnpackU8<1>(rgb.val[2]), UnpackU8<1>(rgb.val[1]), UnpackU8<1>(rgb.val[0]))); + return vcombine_u8(lo, hi); + } + + template void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) + { + assert(width >= A); + if (align) + assert(Aligned(rgb) && Aligned(rgbStride) && Aligned(gray) && Aligned(grayStride)); + + size_t alignedWidth = AlignLo(width, A); + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + { + uint8x16x3_t _rgb = Load3(rgb + 3 * col); + Store(gray + col, RgbToGray(_rgb)); + } + if (alignedWidth != width) + { + uint8x16x3_t _rgb = Load3(rgb + 3 * (width - A)); + Store(gray + width - A, RgbToGray(_rgb)); + } + rgb += rgbStride; + gray += grayStride; + } + } + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) + { + if (Aligned(rgb) && Aligned(gray) && Aligned(rgbStride) && Aligned(grayStride)) + RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + else + RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + } } #endif// SIMD_NEON_ENABLE } diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp old mode 100644 new mode 100755 index 
fb69a04b5f..b1e69cc3aa --- a/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -38,7 +38,7 @@ namespace Simd Store3(rgb, _bgr); } - template void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) + template void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride) { assert(width >= A); if (align) @@ -59,12 +59,12 @@ namespace Simd } } - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) + void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride) { if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)) - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); + BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); else - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); + BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); } } #endif//SIMD_NEON_ENABLE diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp deleted file mode 100644 index b2950c7da1..0000000000 --- a/3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - const size_t A3 = A * 3; - const size_t A4 = A * 4; - - union Bgra - { - uint8x16x4_t bgra; - uint8x16x3_t bgr; - }; - - template SIMD_INLINE void BgrToRgba(const uint8_t * bgr, uint8_t * rgba, Bgra & _bgra) - { - _bgra.bgr = Load3(bgr); - uint8x16_t tmp = _bgra.bgr.val[0]; - _bgra.bgr.val[0] = _bgra.bgr.val[2]; - _bgra.bgr.val[2] = tmp; - Store4(rgba, _bgra.bgra); - } - - template void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha) - { - assert(width >= A); - if (align) - assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t alignedWidth = AlignLo(width, A); - - Bgra _bgra; - _bgra.bgra.val[3] = vdupq_n_u8(alpha); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colRgba = 0, colBgr = 0; col < alignedWidth; col += A, colRgba += A4, colBgr += A3) - BgrToRgba(bgr + colBgr, rgba + colRgba, _bgra); - if (width != alignedWidth) - BgrToRgba(bgr + 3 * (width - A), rgba + 4 * (width - A), _bgra); - bgr += bgrStride; - rgba += rgbaStride; - } - } - - void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha) - { - if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - else - BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp old mode 100644 new mode 100755 index f95e1a9118..944fe5b45e --- a/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). 
* -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -66,6 +66,87 @@ namespace Simd else BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); } + + //--------------------------------------------------------------------- + + template SIMD_INLINE void BgraToRgb(const uint8_t* bgra, uint8_t* rgb) + { + uint8x16x4_t _bgra = Load4(bgra); + uint8x16x3_t _rgb; + _rgb.val[0] = _bgra.val[2]; + _rgb.val[1] = _bgra.val[1]; + _rgb.val[2] = _bgra.val[0]; + Store3(rgb, _rgb); + } + + template void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t alignedWidth = AlignLo(width, A); + if (width == alignedWidth) + alignedWidth -= A; + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0, colBgra = 0, colRgb = 0; col < alignedWidth; col += A, colBgra += A4, colRgb += A3) + BgraToRgb(bgra + colBgra, rgb + colRgb); + if (width != alignedWidth) + BgraToRgb(bgra + 4 * (width - A), rgb + 3 * (width - A)); + bgra += bgraStride; + rgb += rgbStride; + } + } + + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)) + BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); + else + BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); + } + + //--------------------------------------------------------------------- + + template SIMD_INLINE void BgraToRgba(const uint8_t* bgra, uint8_t* rgba) + { + uint8x16x4_t _bgra = Load4(bgra); + uint8x16_t tmp = _bgra.val[0]; + _bgra.val[0] = _bgra.val[2]; + _bgra.val[2] = tmp; + 
Store4(rgba, _bgra); + } + + template void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride)); + + size_t alignedWidth = AlignLo(width, A); + if (width == alignedWidth) + alignedWidth -= A; + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0, colBgra = 0, colRgba = 0; col < alignedWidth; col += A, colBgra += A4, colRgba += A4) + BgraToRgba(bgra + colBgra, rgba + colRgba); + if (width != alignedWidth) + BgraToRgba(bgra + 4 * (width - A), rgba + 4 * (width - A)); + bgra += bgraStride; + rgba += rgbaStride; + } + } + + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride)) + BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + else + BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + } } #endif// SIMD_NEON_ENABLE } diff --git a/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp old mode 100644 new mode 100755 index 24fc228560..6b2eb4de48 --- a/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -66,6 +66,45 @@ namespace Simd else BgraToGray(bgra, width, height, bgraStride, gray, grayStride); } + + //--------------------------------------------------------------------- + + SIMD_INLINE uint8x8_t RgbaToGray(uint8x8x4_t rgba) + { + return vmovn_u16(BgrToGray(vmovl_u8(rgba.val[2]), vmovl_u8(rgba.val[1]), vmovl_u8(rgba.val[0]))); + } + + template void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) + { + assert(width >= HA); + if (align) + assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride)); + + size_t alignedWidth = AlignLo(width, HA); + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += HA) + { + uint8x8x4_t _rgba = LoadHalf4(rgba + 4 * col); + Store(gray + col, RgbaToGray(_rgba)); + } + if (alignedWidth != width) + { + uint8x8x4_t _rgba = LoadHalf4(rgba + 4 * (width - HA)); + Store(gray + width - HA, RgbaToGray(_rgba)); + } + rgba += rgbaStride; + gray += grayStride; + } + } + + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) + { + if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride)) + RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + else + RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + } } #endif// SIMD_NEON_ENABLE } diff --git a/3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp b/3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp deleted file mode 100644 index d1873eddcb..0000000000 --- a/3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - const size_t A4 = A * 4; - - union Bgra - { - uint8x16x4_t bgra; - }; - - template SIMD_INLINE void BgraToRgba(const uint8_t * bgra, uint8_t * rgba, Bgra & _bgra) - { - _bgra.bgra = Load4(bgra); - uint8x16_t tmp = _bgra.bgra.val[0]; - _bgra.bgra.val[0] = _bgra.bgra.val[2]; - _bgra.bgra.val[2] = tmp; - Store4(rgba, _bgra.bgra); - } - - template void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride) - { - assert(width >= A); - if (align) - assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride)); - - size_t alignedWidth = AlignLo(width, A); - - Bgra _bgra; - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colRgba = 0; col < alignedWidth; col += A, colRgba += A4) - BgraToRgba(bgra + colRgba, rgba + colRgba, _bgra); - if (width != alignedWidth) - BgraToRgba(bgra + 4 * (width - A), rgba + 4 * (width - A), _bgra); - bgra += bgraStride; - rgba += rgbaStride; - } - } - - void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride) - { - if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride)) - BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); - else - BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp b/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp old mode 100644 new mode 100755 index 53530a788d..36a623efb5 --- a/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -71,6 +71,8 @@ namespace Simd DeinterleaveUv(uv, uvStride, width, height, u, uStride, v, vStride); } + //--------------------------------------------------------------------- + template void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride) { @@ -118,6 +120,8 @@ namespace Simd DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); } + //--------------------------------------------------------------------- + template void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) { @@ -125,36 +129,65 @@ namespace Simd if (align) { assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)); + assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL)); } size_t bodyWidth = AlignLo(width, A); size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; ++row) + if (a) { - for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += QA) + for (size_t row = 0; row < height; ++row) { - uint8x16x4_t _bgra = Load4(bgra + offset); - Store(b + col, _bgra.val[0]); - Store(g + col, _bgra.val[1]); - Store(r + col, _bgra.val[2]); - Store(a + col, _bgra.val[3]); + for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += QA) + { + uint8x16x4_t _bgra = Load4(bgra + offset); + Store(b + col, _bgra.val[0]); + Store(g + col, _bgra.val[1]); + Store(r + col, _bgra.val[2]); + Store(a + col, _bgra.val[3]); + } + if 
(tail) + { + size_t col = width - A; + size_t offset = 4 * col; + uint8x16x4_t _bgra = Load4(bgra + offset); + Store(b + col, _bgra.val[0]); + Store(g + col, _bgra.val[1]); + Store(r + col, _bgra.val[2]); + Store(a + col, _bgra.val[3]); + } + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; + a += aStride; } - if (tail) + } + else + { + for (size_t row = 0; row < height; ++row) { - size_t col = width - A; - size_t offset = 4 * col; - uint8x16x4_t _bgra = Load4(bgra + offset); - Store(b + col, _bgra.val[0]); - Store(g + col, _bgra.val[1]); - Store(r + col, _bgra.val[2]); - Store(a + col, _bgra.val[3]); + for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += QA) + { + uint8x16x4_t _bgra = Load4(bgra + offset); + Store(b + col, _bgra.val[0]); + Store(g + col, _bgra.val[1]); + Store(r + col, _bgra.val[2]); + } + if (tail) + { + size_t col = width - A; + size_t offset = 4 * col; + uint8x16x4_t _bgra = Load4(bgra + offset); + Store(b + col, _bgra.val[0]); + Store(g + col, _bgra.val[1]); + Store(r + col, _bgra.val[2]); + } + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; } - bgra += bgraStride; - b += bStride; - g += gStride; - r += rStride; - a += aStride; } } @@ -162,7 +195,7 @@ namespace Simd uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) { if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && - Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)) + Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL)) DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); else DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); diff --git a/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp b/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp old mode 
100644 new mode 100755 index 752778be2a..1d63a6510b --- a/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "Simd/SimdMemory.h" +#include "Simd/SimdLoadBlock.h" #include "Simd/SimdStore.h" #include "Simd/SimdGaussianBlur.h" #include "Simd/SimdLog.h" diff --git a/3rdparty/simdlib/Simd/SimdNeonResizer.cpp b/3rdparty/simdlib/Simd/SimdNeonResizer.cpp old mode 100644 new mode 100755 index b2e965200e..d11a0e29a8 --- a/3rdparty/simdlib/Simd/SimdNeonResizer.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonResizer.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -578,11 +578,11 @@ namespace Simd void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) { ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(float32x4_t)); - if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && dstX >= A) + if (param.IsByteBilinear() && dstX >= A) return new ResizerByteBilinear(param); - else if (type == SimdResizeChannelByte && method == SimdResizeMethodArea) + else if (param.IsByteArea()) return new ResizerByteArea(param); - else if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp)) + else if (param.IsFloatBilinear()) return new ResizerFloatBilinear(param); else return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); diff --git a/3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp deleted file mode 100644 index 37b288b277..0000000000 --- a/3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp +++ /dev/null 
@@ -1,71 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE uint8x8_t RgbToGray(uint8x8x3_t rgb) - { - return vmovn_u16(BgrToGray(vmovl_u8(rgb.val[2]), vmovl_u8(rgb.val[1]), vmovl_u8(rgb.val[0]))); - } - - template void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride) - { - assert(width >= HA); - if (align) - assert(Aligned(rgb) && Aligned(rgbStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, HA); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += HA) - { - uint8x8x3_t _rgb = LoadHalf3(rgb + 3 * col); - Store(gray + col, RgbToGray(_rgb)); - } - if (alignedWidth != width) - { - uint8x8x3_t _rgb = LoadHalf3(rgb + 3 * (width - HA)); - Store(gray + width - HA, RgbToGray(_rgb)); - } - rgb += rgbStride; - gray += grayStride; - } - } - - void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride) - { - if (Aligned(rgb) && Aligned(gray) && Aligned(rgbStride) && Aligned(grayStride)) - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - else - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp deleted file mode 100644 index 377d6fcb42..0000000000 --- a/3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE uint8x8_t RgbaToGray(uint8x8x4_t rgba) - { - return vmovn_u16(BgrToGray(vmovl_u8(rgba.val[2]), vmovl_u8(rgba.val[1]), vmovl_u8(rgba.val[0]))); - } - - template void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride) - { - assert(width >= HA); - if (align) - assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, HA); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += HA) - { - uint8x8x4_t _rgba = LoadHalf4(rgba + 4 * col); - Store(gray + col, RgbaToGray(_rgba)); - } - if (alignedWidth != width) - { - uint8x8x4_t _rgba = LoadHalf4(rgba + 4 * (width - HA)); - Store(gray + width - HA, RgbaToGray(_rgba)); - } - rgba += rgbaStride; - gray += grayStride; - } - } - - void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride) - { - if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride)) - RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); - else - RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdPixel.hpp b/3rdparty/simdlib/Simd/SimdPixel.hpp old mode 100644 new mode 100755 index 109c18ec1d..f95ce46ee6 --- a/3rdparty/simdlib/Simd/SimdPixel.hpp +++ b/3rdparty/simdlib/Simd/SimdPixel.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -41,6 +41,7 @@ namespace Simd struct Hsv24; struct Hsl24; struct Rgb24; + struct Rgba32; //------------------------------------------------------------------------- @@ -86,6 +87,13 @@ namespace Simd */ Bgr24(const Rgb24 & p); + /*! + Creates a new 24-bit BGR pixel structure on the base of 32-bit RGBA pixel. + + \param [in] p - 32-bit RGBA pixel. + */ + Bgr24(const Rgba32& p); + /*! Creates a copy of 24-bit BGR pixel structure. @@ -165,6 +173,13 @@ namespace Simd */ Bgra32(const Rgb24 & p, const uint8_t & a = uint8_t(255)); + /*! + Creates a new 32-bit BGRA pixel structure on the base of 32-bit RGBA pixel. + + \param [in] p - 32-bit RGBA pixel. + */ + Bgra32(const Rgba32& p); + /*! Creates a copy of 32-bit BGRA pixel structure. @@ -360,6 +375,13 @@ namespace Simd */ Rgb24(const Bgr24 & p); + /*! + Creates a new 24-bit RGB pixel structure on the base of 32-bit RGBA pixel. + + \param [in] p - 32-bit RGBA pixel. + */ + Rgb24(const Rgba32& p); + /*! Creates a copy of 24-bit RGB pixel structure. @@ -392,6 +414,92 @@ namespace Simd template class A> static Rgb24 & At(View & view, ptrdiff_t col, ptrdiff_t row); }; + /*! @ingroup cpp_pixels + + \short 32-bit RGBA pixel. + + Provides manipulation of 32-bit RGBA (Red, Blue, Green, Alpha) pixels of the View struct. + */ + struct Rgba32 + { + uint8_t red; /*!< \brief 8-bit red channel 32-bit BGRA pixel. */ + uint8_t green; /*!< \brief 8-bit green channel 32-bit BGRA pixel. */ + uint8_t blue; /*!< \brief 8-bit blue channel 32-bit BGRA pixel. */ + uint8_t alpha; /*!< \brief 8-bit alpha channel 32-bit RGBA pixel. */ + + /*! + Creates a new 32-bit RGBA pixel structure with specified channel values. + + \param [in] gray - initial value for all channels. It is equal to 0 by default. + \param [in] a - initial value for alpha channel. It is equal to 255 by default. 
+ */ + Rgba32(const uint8_t& gray = uint8_t(0), const uint8_t& a = uint8_t(255)); + + /*! + Creates a new 32-bit RGBA pixel structure with specified channel values. + + \param [in] r - initial value for red channel. + \param [in] g - initial value for green channel. + \param [in] b - initial value for blue channel. + \param [in] a - initial value for alpha channel. It is equal to 255 by default. + */ + Rgba32(const uint8_t& r, const uint8_t& g, const uint8_t& b, const uint8_t& a = uint8_t(255)); + + /*! + Creates a new 32-bit RGBA pixel structure on the base of 32-bit BGRA pixel. + + \param [in] p - 32-bit BGRA pixel. + */ + Rgba32(const Bgra32& p); + + /*! + Creates a new 32-bit RGBA pixel structure on the base of 24-bit BGR pixel. + + \param [in] p - 24-bit BGR pixel. + \param [in] a - initial value for alpha channel. It is equal to 255 by default. + */ + Rgba32(const Bgr24& p, const uint8_t& a = uint8_t(255)); + + /*! + Creates a new of 32-bit RGBA pixel structure on the base of 24-bit RGB pixel. + + \param [in] p - 24-bit RGB pixel. + \param [in] a - initial value for alpha channel. It is equal to 255 by default. + */ + Rgba32(const Rgb24& p, const uint8_t& a = uint8_t(255)); + + /*! + Creates a copy of 32-bit RGBA pixel structure. + + \param [in] p - 32-bit RGBA pixel. + */ + Rgba32(const Rgba32& p); + + /*! + \fn template class A> static const Rgba32 & At(const View & view, ptrdiff_t col, ptrdiff_t row); + + Gets constant reference to the pixel with specific coordinates at the image view. + + \param [in] view - an image view of 32-bit RGBA pixel format. + \param [in] col - x-coordinate of the pixel. + \param [in] row - y-coordinate of the pixel. + \return a constant reference to the pixel. + */ + template class A> static const Rgba32& At(const View& view, ptrdiff_t col, ptrdiff_t row); + + /*! + \fn template class A> static Rgba32 & At(View & view, ptrdiff_t col, ptrdiff_t row); + + Gets reference to the pixel with specific coordinates at the image view. 
+ + \param [in] view - an image view of 32-bit RGBA pixel format. + \param [in] col - x-coordinate of the pixel. + \param [in] row - y-coordinate of the pixel. + \return a reference to the pixel. + */ + template class A> static Rgba32& At(View& view, ptrdiff_t col, ptrdiff_t row); + }; + //------------------------------------------------------------------------- // struct Bgr24 implementation: @@ -417,14 +525,21 @@ namespace Simd { } - SIMD_INLINE Bgr24::Bgr24(const Bgr24 & p) + SIMD_INLINE Bgr24::Bgr24(const Rgb24 & p) : blue(p.blue) , green(p.green) , red(p.red) { } - SIMD_INLINE Bgr24::Bgr24(const Rgb24 & p) + SIMD_INLINE Bgr24::Bgr24(const Rgba32& p) + : blue(p.blue) + , green(p.green) + , red(p.red) + { + } + + SIMD_INLINE Bgr24::Bgr24(const Bgr24 & p) : blue(p.blue) , green(p.green) , red(p.red) @@ -479,6 +594,14 @@ namespace Simd { } + SIMD_INLINE Bgra32::Bgra32(const Rgba32& p) + : blue(p.blue) + , green(p.green) + , red(p.red) + , alpha(p.alpha) + { + } + SIMD_INLINE Bgra32::Bgra32(const Bgra32 & p) : blue(p.blue) , green(p.green) @@ -605,6 +728,13 @@ namespace Simd { } + SIMD_INLINE Rgb24::Rgb24(const Rgba32& p) + : red(p.red) + , green(p.green) + , blue(p.blue) + { + } + SIMD_INLINE Rgb24::Rgb24(const Rgb24 & p) : red(p.red) , green(p.green) @@ -625,6 +755,70 @@ namespace Simd return Simd::At(view, col, row); } + + // struct Rgba32 implementation: + + SIMD_INLINE Rgba32::Rgba32(const uint8_t& gray, const uint8_t& a) + : red(gray) + , green(gray) + , blue(gray) + , alpha(a) + { + } + + SIMD_INLINE Rgba32::Rgba32(const uint8_t& r, const uint8_t& g, const uint8_t& b, const uint8_t& a) + : red(r) + , green(g) + , blue(b) + , alpha(a) + { + } + + SIMD_INLINE Rgba32::Rgba32(const Bgra32& p) + : red(p.red) + , green(p.green) + , blue(p.blue) + , alpha(p.alpha) + { + } + + SIMD_INLINE Rgba32::Rgba32(const Bgr24& p, const uint8_t& a) + : red(p.red) + , green(p.green) + , blue(p.blue) + , alpha(a) + { + } + + SIMD_INLINE Rgba32::Rgba32(const Rgb24& p, const 
uint8_t& a) + : red(p.red) + , green(p.green) + , blue(p.blue) + , alpha(a) + { + } + + SIMD_INLINE Rgba32::Rgba32(const Rgba32& p) + : red(p.red) + , green(p.green) + , blue(p.blue) + , alpha(p.alpha) + { + } + + template class A> SIMD_INLINE const Rgba32& Rgba32::At(const View& view, ptrdiff_t col, ptrdiff_t row) + { + assert(view.format == View::Rgba32); + + return Simd::At(view, col, row); + } + + template class A> SIMD_INLINE Rgba32& Rgba32::At(View& view, ptrdiff_t col, ptrdiff_t row) + { + assert(view.format == View::Rgba32); + + return Simd::At(view, col, row); + } } } diff --git a/3rdparty/simdlib/Simd/SimdPow.h b/3rdparty/simdlib/Simd/SimdPow.h old mode 100644 new mode 100755 index 309e3104f0..ca0db18eb5 --- a/3rdparty/simdlib/Simd/SimdPow.h +++ b/3rdparty/simdlib/Simd/SimdPow.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/3rdparty/simdlib/Simd/SimdResizer.h b/3rdparty/simdlib/Simd/SimdResizer.h old mode 100644 new mode 100755 index 0a70ee0ad6..15dacfcd0c --- a/3rdparty/simdlib/Simd/SimdResizer.h +++ b/3rdparty/simdlib/Simd/SimdResizer.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -46,6 +46,43 @@ namespace Simd this->channels = channels; this->align = align; } + + bool IsByteBilinear() const + { + return type == SimdResizeChannelByte && method == SimdResizeMethodBilinear; + } + + bool IsByteArea() const + { + return type == SimdResizeChannelByte && method == SimdResizeMethodArea; + } + + bool IsShortBilinear() const + { + return type == SimdResizeChannelShort && method == SimdResizeMethodBilinear; + } + + bool IsFloatBilinear() const + { + return type == SimdResizeChannelFloat && + (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp || method == SimdResizeMethodInferenceEngineInterp); + } + + bool IsNearest() const + { + return method == SimdResizeMethodNearest; + } + + size_t ChannelSize() const + { + static const size_t sizes[3] = { 1, 2, 4 }; + return sizes[(int)type]; + } + + size_t PixelSize() const + { + return ChannelSize() * channels; + } }; class Resizer : Deletable @@ -94,13 +131,32 @@ namespace Simd virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); }; + class ResizerShortBilinear : public Resizer + { + protected: + Array32i _ix, _iy; + Array32f _ax, _ay, _bx[2]; + + void EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t* indices, float* alphas); + + template void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + template void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + + virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + + public: + ResizerShortBilinear(const ResParam& param); + + virtual void Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride); + }; + class ResizerFloatBilinear : public Resizer { protected: Array32i _ix, _iy; Array32f _ax, _ay, _bx[2]; 
- void EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, bool caffeInterp, int32_t * indices, float * alphas); + void EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t * indices, float * alphas); virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride); @@ -110,22 +166,23 @@ namespace Simd virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); }; - void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); - } - -#ifdef SIMD_SSE_ENABLE - namespace Sse - { - class ResizerFloatBilinear : public Base::ResizerFloatBilinear + class ResizerNearest : public Resizer { - virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride); + void Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride); + template void Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride); + protected: + size_t _pixelSize; + Array32i _ix, _iy; + + void EstimateIndex(size_t srcSize, size_t dstSize, size_t pixelSize, int32_t* indices); public: - ResizerFloatBilinear(const ResParam & param); - }; + ResizerNearest(const ResParam& param); + virtual void Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride); + }; + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); } -#endif //SIMD_SSE_ENABLE #ifdef SIMD_SSE2_ENABLE namespace Sse2 @@ -156,12 +213,19 @@ namespace Simd virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); }; + class ResizerFloatBilinear : public Base::ResizerFloatBilinear + { + virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride); + public: + ResizerFloatBilinear(const ResParam & param); + }; + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, 
size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); } #endif //SIMD_SSE2_ENABLE -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { class ResizerByteBilinear : public Sse2::ResizerByteBilinear { @@ -183,15 +247,8 @@ namespace Simd ResizerByteBilinear(const ResParam & param); virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); - }; - - void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); - } -#endif //SIMD_SSSE3_ENABLE - -#ifdef SIMD_SSE41_ENABLE - namespace Sse41 - { + }; + class ResizerByteArea : public Sse2::ResizerByteArea { protected: @@ -202,6 +259,17 @@ namespace Simd virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); }; + class ResizerShortBilinear : public Base::ResizerShortBilinear + { + protected: + template void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + template void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + + virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + public: + ResizerShortBilinear(const ResParam& param); + }; + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); } #endif //SIMD_SSE41_ENABLE @@ -223,15 +291,7 @@ namespace Simd #ifdef SIMD_AVX2_ENABLE namespace Avx2 { - template SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t * src, const Idx & index, const uint8_t * alpha, uint8_t * dst) - { - __m256i _src = _mm256_loadu_si256((__m256i*)(src + index.src)); - __m256i _shuffle = _mm256_loadu_si256((__m256i*)&index.shuffle); - __m256i _alpha = _mm256_loadu_si256((__m256i*)(alpha + index.dst)); - _mm256_storeu_si256((__m256i*)(dst + index.dst), 
_mm256_maddubs_epi16(Avx2::Shuffle(_src, _shuffle), _alpha)); - } - - class ResizerByteBilinear : public Ssse3::ResizerByteBilinear + class ResizerByteBilinear : public Sse41::ResizerByteBilinear { protected: struct Idx @@ -260,6 +320,17 @@ namespace Simd virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); }; + class ResizerShortBilinear : public Sse41::ResizerShortBilinear + { + protected: + template void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + template void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + + virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + public: + ResizerShortBilinear(const ResParam& param); + }; + class ResizerFloatBilinear : public Base::ResizerFloatBilinear { virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride); @@ -308,6 +379,17 @@ namespace Simd virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); }; + class ResizerShortBilinear : public Base::ResizerShortBilinear + { + protected: + template void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + template void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + + virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + public: + ResizerShortBilinear(const ResParam& param); + }; + class ResizerFloatBilinear : public Base::ResizerFloatBilinear { virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride); diff --git a/3rdparty/simdlib/Simd/SimdResizerCommon.h b/3rdparty/simdlib/Simd/SimdResizerCommon.h new file mode 100755 index 0000000000..3e6ab00ffa --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdResizerCommon.h @@ -0,0 +1,97 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdResizerCommon_h__ +#define __SimdResizerCommon_h__ + +#include "Simd/SimdLoad.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + const __m128i RSB_1_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x4, 0x5, -1, -1, 0x8, 0x9, -1, -1, 0xC, 0xD, -1, -1); + const __m128i RSB_1_1 = SIMD_MM_SETR_EPI8(0x2, 0x3, -1, -1, 0x6, 0x7, -1, -1, 0xA, 0xB, -1, -1, 0xE, 0xF, -1, -1); + + SIMD_INLINE __m128 BilColS1(const uint16_t* src, const int32_t* idx, __m128 fx0, __m128 fx1) + { + __m128i s = _mm_setr_epi32( + *(uint32_t*)(src + idx[0]), *(uint32_t*)(src + idx[1]), + *(uint32_t*)(src + idx[2]), *(uint32_t*)(src + idx[3])); + __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_1_0))); + __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_1_1))); + return _mm_add_ps(m0, m1); + } + + const __m128i RSB_2_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x2, 0x3, -1, -1, 0x8, 0x9, -1, -1, 0xA, 0xB, -1, -1); + const __m128i RSB_2_1 = SIMD_MM_SETR_EPI8(0x4, 0x5, -1, -1, 0x6, 0x7, -1, -1, 0xC, 0xD, -1, -1, 0xE, 0xF, -1, -1); + + SIMD_INLINE __m128 BilColS2(const uint16_t* src, const int32_t* idx, __m128 fx0, __m128 fx1) + { + __m128i s = Sse2::Load((__m128i*)(src + idx[0]), (__m128i*)(src + idx[2])); + __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_2_0))); + __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_2_1))); + return _mm_add_ps(m0, m1); + } + + const __m128i RSB_3_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x2, 0x3, -1, -1, 0x4, 0x5, -1, -1, -1, -1, -1, -1); + const __m128i RSB_3_1 = SIMD_MM_SETR_EPI8(0x6, 0x7, -1, -1, 0x8, 0x9, -1, -1, 0xA, 0xB, -1, -1, -1, -1, -1, -1); + + SIMD_INLINE __m128 BilColS3(const uint16_t* src, __m128 fx0, __m128 fx1) + { + __m128i s = _mm_loadu_si128((__m128i*)src); + __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_3_0))); + __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_3_1))); + return _mm_add_ps(m0, m1); + } + + 
const __m128i RSB_4_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x2, 0x3, -1, -1, 0x4, 0x5, -1, -1, 0x6, 0x7, -1, -1); + const __m128i RSB_4_1 = SIMD_MM_SETR_EPI8(0x8, 0x9, -1, -1, 0xA, 0xB, -1, -1, 0xC, 0xD, -1, -1, 0xE, 0xF, -1, -1); + + SIMD_INLINE __m128 BilColS4(const uint16_t* src, __m128 fx0, __m128 fx1) + { + __m128i s = _mm_loadu_si128((__m128i*)src); + __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_4_0))); + __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_4_1))); + return _mm_add_ps(m0, m1); + } + + const __m128i RSB_3_P = SIMD_MM_SETR_EPI8(0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, -1, -1, -1, -1); + } +#endif //SIMD_SSE41_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + template SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t * src, const Idx & index, const uint8_t * alpha, uint8_t * dst) + { + __m256i _src = _mm256_loadu_si256((__m256i*)(src + index.src)); + __m256i _shuffle = _mm256_loadu_si256((__m256i*)&index.shuffle); + __m256i _alpha = _mm256_loadu_si256((__m256i*)(alpha + index.dst)); + _mm256_storeu_si256((__m256i*)(dst + index.dst), _mm256_maddubs_epi16(Avx2::Shuffle(_src, _shuffle), _alpha)); + } + } +#endif //SIMD_AVX2_ENABLE +} +#endif//__SimdResizerCommon_h__ diff --git a/3rdparty/simdlib/Simd/SimdRuntime.h b/3rdparty/simdlib/Simd/SimdRuntime.h old mode 100644 new mode 100755 index 5fb82ebd00..de098cdb94 --- a/3rdparty/simdlib/Simd/SimdRuntime.h +++ b/3rdparty/simdlib/Simd/SimdRuntime.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -54,10 +54,13 @@ namespace Simd if (!_info.empty()) { std::sort(_candidates.begin(), _candidates.end(), [](const Candidate & a, const Candidate & b) { return a.Mean() < b.Mean(); }); - std::cout << std::setprecision(3) << std::fixed; std::cout << "Simd::Runtime " << _info << " : "; + int64_t f = TimeFrequency(); for (size_t i = 0; i < _candidates.size(); ++i) - std::cout << _candidates[i].func.Name() << ": " << _candidates[i].Mean()*1000.0 << " "; + { + int64_t t = _candidates[i].Mean(); + std::cout << _candidates[i].func.Name() << ": " << t * 1000 / f << "." << (t * 1000000 / f) % 1000 << " "; + } std::cout << std::endl; } #endif @@ -104,18 +107,18 @@ namespace Simd { Func func; size_t count; - double sum, min, max; + int64_t sum, min, max; SIMD_INLINE Candidate(const Func & f) : func(f) , count(0) , sum(0) - , min(std::numeric_limits::max()) - , max(std::numeric_limits::min()) + , min(std::numeric_limits::max()) + , max(0) { } - SIMD_INLINE void Update(const double & value) + SIMD_INLINE void Update(int64_t value) { count += 1; sum += value; @@ -123,9 +126,14 @@ namespace Simd max = std::max(max, value); } - SIMD_INLINE double Mean() const + SIMD_INLINE int64_t Mean() const { - return (sum - min - max) / (count - 2); + if( count > 2) + return (sum - min - max) / (count - 2); + else if (count > 0) + return sum / count; + else + return sum; } }; typedef std::vector Candidates; @@ -144,9 +152,9 @@ namespace Simd if (_info.empty()) _info = current->func.Info(args); #endif - double start = Simd::Time(); + int64_t start = Simd::TimeCounter(); current->func.Run(args); - current->Update(Simd::Time() - start); + current->Update(Simd::TimeCounter() - start); } else { @@ -173,10 +181,10 @@ namespace Simd SIMD_INLINE Candidate * Best() { Candidate * best = &_candidates[0]; - double min = best->Mean(); + int64_t 
min = best->Mean(); for (size_t i = 1; i < _candidates.size(); ++i) { - double mean = _candidates[i].Mean(); + int64_t mean = _candidates[i].Mean(); if (mean < min) { min = mean; diff --git a/3rdparty/simdlib/Simd/SimdSet.h b/3rdparty/simdlib/Simd/SimdSet.h old mode 100644 new mode 100755 index ae1bb6066a..22b5622e73 --- a/3rdparty/simdlib/Simd/SimdSet.h +++ b/3rdparty/simdlib/Simd/SimdSet.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2018 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -109,6 +109,12 @@ namespace Simd const float a[4] = { a0, a1, a2, a3 }; return vld1q_f32(a); } + + SIMD_INLINE int32x4_t SetI32(int32_t a0, int32_t a1, int32_t a2, int32_t a3) + { + const int32_t a[4] = { a0, a1, a2, a3 }; + return vld1q_s32(a); + } } #endif// SIMD_NEON_ENABLE } diff --git a/3rdparty/simdlib/Simd/SimdSse1Resizer.cpp b/3rdparty/simdlib/Simd/SimdSse1Resizer.cpp deleted file mode 100644 index 405ee03f4f..0000000000 --- a/3rdparty/simdlib/Simd/SimdSse1Resizer.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdResizer.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_SSE_ENABLE - namespace Sse - { - ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param) - : Base::ResizerFloatBilinear(param) - { - } - - void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - size_t cn = _param.channels; - size_t rs = _param.dstW * cn; - float * pbx[2] = { _bx[0].data, _bx[1].data }; - int32_t prev = -2; - size_t rsa = AlignLo(rs, Sse::F); - for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) - { - float fy1 = _ay[dy]; - float fy0 = 1.0f - fy1; - int32_t sy = _iy[dy]; - int32_t k = 0; - - if (sy == prev) - k = 2; - else if (sy == prev + 1) - { - Swap(pbx[0], pbx[1]); - k = 1; - } - - prev = sy; - - for (; k < 2; k++) - { - float * pb = pbx[k]; - const float * ps = src + (sy + k)*srcStride; - size_t dx = 0; - if (cn == 1) - { - __m128 _1 = _mm_set1_ps(1.0f); - for (; dx < rsa; dx += Sse::F) - { - __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); - __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); - __m128 fx1 = _mm_load_ps(_ax.data + dx); - __m128 fx0 = _mm_sub_ps(_1, fx1); - __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88)); - __m128 m1 = _mm_mul_ps(fx1, _mm_shuffle_ps(s01, s23, 0xDD)); - _mm_store_ps(pb + dx, _mm_add_ps(m0, m1)); - } - } - if (cn == 3 && rs > 3) - { - __m128 _1 = _mm_set1_ps(1.0f); - size_t rs3 = rs - 3; - for (; dx < rs3; dx += 
3) - { - __m128 s0 = _mm_loadu_ps(ps + _ix[dx] + 0); - __m128 s1 = _mm_loadu_ps(ps + _ix[dx] + 3); - __m128 fx1 = _mm_set1_ps(_ax.data[dx]); - __m128 fx0 = _mm_sub_ps(_1, fx1); - _mm_storeu_ps(pb + dx, _mm_add_ps(_mm_mul_ps(fx0, s0), _mm_mul_ps(fx1, s1))); - } - } - for (; dx < rs; dx++) - { - int32_t sx = _ix[dx]; - float fx = _ax[dx]; - pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + cn] * fx; - } - } - - size_t dx = 0; - __m128 _fy0 = _mm_set1_ps(fy0); - __m128 _fy1 = _mm_set1_ps(fy1); - for (; dx < rsa; dx += Sse::F) - { - __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _fy0); - __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _fy1); - _mm_storeu_ps(dst + dx, _mm_add_ps(m0, m1)); - } - for (; dx < rs; dx++) - dst[dx] = pbx[0][dx] * fy0 + pbx[1][dx] * fy1; - } - } - - //--------------------------------------------------------------------- - - void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) - { - ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m128)); - if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp)) - return new ResizerFloatBilinear(param); - else - return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - } - } -#endif //SIMD_SSE_ENABLE -} - diff --git a/3rdparty/simdlib/Simd/SimdSse2.h b/3rdparty/simdlib/Simd/SimdSse2.h old mode 100644 new mode 100755 index ce304774f5..66a0d22700 --- a/3rdparty/simdlib/Simd/SimdSse2.h +++ b/3rdparty/simdlib/Simd/SimdSse2.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,15 +33,11 @@ namespace Simd { void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride); - void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride); - void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); void BgrToGray(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *gray, size_t grayStride); - void RgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride); - void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); @@ -68,6 +64,8 @@ namespace Simd void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride); + void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); diff --git a/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp b/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp old mode 100644 new mode 100755 index c150220b82..b818225858 --- a/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -88,6 +88,58 @@ namespace Simd else BgraToGray(bgra, width, height, bgraStride, gray, grayStride); } + + //--------------------------------------------------------------------- + + const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); + + SIMD_INLINE __m128i RgbaToGray32(__m128i rgba) + { + const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF); + const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF); + const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_0000), _mm_madd_epi16(r0b0, K16_RED_BLUE)); + return _mm_srli_epi32(_mm_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT); + } + + SIMD_INLINE __m128i RgbaToGray(__m128i rgba[4]) + { + const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1])); + const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); + return _mm_packus_epi16(lo, hi); + } + + template void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) + { + assert(width >= A); + if (align) + assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride)); + + size_t alignedWidth = AlignLo(width, A); + __m128i a[4]; + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + { + Load(rgba + 4 * col, a); + Store((__m128i*)(gray + col), RgbaToGray(a)); + } + if (alignedWidth != width) + { + Load(rgba + 4 * (width - A), a); + Store((__m128i*)(gray + width - A), RgbaToGray(a)); + } + rgba += rgbaStride; + gray += grayStride; + } + } + + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) + { + if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && 
Aligned(grayStride)) + RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + else + RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + } } #else // Work arround to avoid warning: libvisp_simdlib.a(SimdSse2BgraToGray.cpp.o) has no symbols diff --git a/3rdparty/simdlib/Simd/SimdBaseBgraToRgba.cpp b/3rdparty/simdlib/Simd/SimdSse2Cpu.cpp similarity index 62% rename from 3rdparty/simdlib/Simd/SimdBaseBgraToRgba.cpp rename to 3rdparty/simdlib/Simd/SimdSse2Cpu.cpp index 8ada2f6a2c..3d1dfe00fb 100644 --- a/3rdparty/simdlib/Simd/SimdBaseBgraToRgba.cpp +++ b/3rdparty/simdlib/Simd/SimdSse2Cpu.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -21,30 +21,44 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "Simd/SimdDefs.h" -#include +#include "Simd/SimdEnable.h" +#include "Simd/SimdCpu.h" + +#if defined(_MSC_VER) +#include +#endif namespace Simd { - namespace Base +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { - void BgraToRgba(const uint8_t *bgra, size_t size, uint8_t *rgba) + SIMD_INLINE bool SupportedByCPU() { - for (size_t i = 0; i < size; ++i, bgra += 4, rgba += 4) - { - *(int32_t*)rgba = (*(int32_t*)bgra); - std::swap(rgba[0], rgba[2]); - } + return Base::CheckBit(Cpuid::Ordinary, Cpuid::Edx, Cpuid::SSE2); } - void BgraToRgba(const uint8_t *bgra, size_t width, size_t height, size_t bgraStride, uint8_t *rgba, size_t rgbaStride) + SIMD_INLINE bool SupportedByOS() { - for (size_t row = 0; row < height; ++row) +#if defined(_MSC_VER) + __try + { + __m128d value = _mm_set1_pd(1.0);// try to execute of SSE2 instructions; + return true; + } + __except (EXCEPTION_EXECUTE_HANDLER) { - BgraToRgba(bgra, width, rgba); - bgra += bgraStride; - rgba += rgbaStride; + return false; } +#else + return true; +#endif + } + + bool GetEnable() + { + return SupportedByCPU() && SupportedByOS(); } } +#endif } diff --git a/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp b/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp old mode 100644 new mode 100755 index 394488a804..70e4f139ea --- a/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp +++ b/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "Simd/SimdMemory.h" +#include "Simd/SimdLoadBlock.h" #include "Simd/SimdStore.h" namespace Simd diff --git a/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp b/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp old mode 100644 new mode 100755 index f29d96eeb1..c289ab7f75 --- a/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp +++ b/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -297,12 +297,12 @@ namespace Simd void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) { ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m128i)); - if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && (channels == 1 || channels == 2) && dstX >= A) + if (param.IsByteBilinear() && (channels == 1 || channels == 2) && dstX >= A) return new ResizerByteBilinear(param); - else if (type == SimdResizeChannelByte && method == SimdResizeMethodArea) + else if (param.IsByteArea()) return new ResizerByteArea(param); else - return Sse::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); } } #else diff --git a/3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp b/3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp deleted file mode 100644 index 927dde0dae..0000000000 --- a/3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSse2.h" - -namespace Simd -{ -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint8_t) * 4 * width); - rgba = (uint8_t*)_p; - } - - ~Buffer() - { - Free(_p); - } - - uint8_t * rgba; - private: - void *_p; - }; - } - - void RgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride) - { - assert(width >= A); - - Buffer buffer(width); - - for (size_t row = 1; row < height; ++row) - { - Base::BgrToBgra(rgb, width, buffer.rgba, false, false, 0xFF); - Sse2::RgbaToGray(buffer.rgba, width, 1, 4 * width, gray, width); - rgb += rgbStride; - gray += grayStride; - } - Base::BgrToBgra(rgb, width, buffer.rgba, false, true, 0xFF); - Sse2::RgbaToGray(buffer.rgba, width, 1, 4 * width, gray, width); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSse2RgbToGray.cpp.o) has no symbols - void dummy_SimdSse2RgbToGray(){}; -#endif//SIMD_SSE2_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp deleted file mode 100644 index 884f09924b..0000000000 --- a/3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp +++ /dev/null @@ -1,96 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); - const __m128i K16_GREEN_0000 = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, 0x0000); - const __m128i K32_ROUND_TERM = SIMD_MM_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m128i RgbaToGray32(__m128i rgba) - { - const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF); - const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF); - const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_0000), _mm_madd_epi16(r0b0, K16_RED_BLUE)); - return _mm_srli_epi32(_mm_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - SIMD_INLINE __m128i RgbaToGray(__m128i rgba[4]) - { - const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1])); - const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); - return _mm_packus_epi16(lo, hi); - } - - template SIMD_INLINE void Load(const uint8_t* p, __m128i a[4]) - { - a[0] = Load((__m128i*)p + 0); - a[1] = Load((__m128i*)p + 1); - a[2] = Load((__m128i*)p + 2); - a[3] = Load((__m128i*)p + 3); - } - - template void RgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride) - { - assert(width >= A); - if (align) - assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, A); - __m128i a[4]; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - Load(rgba + 4 * col, a); - Store((__m128i*)(gray + col), RgbaToGray(a)); - } - if (alignedWidth != width) - { - Load(rgba + 4 * (width - A), a); - Store((__m128i*)(gray + width - A), RgbaToGray(a)); - } - rgba += rgbaStride; - gray += grayStride; - } - } - - void RgbaToGray(const uint8_t 
*rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride) - { - if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride)) - RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); - else - RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSse2RgbaToGray.cpp.o) has no symbols - void dummy_SimdSse2RgbaToGray(){}; -#endif// SIMD_SSE2_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdSse41.h b/3rdparty/simdlib/Simd/SimdSse41.h new file mode 100755 index 0000000000..958fc11bc5 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdSse41.h @@ -0,0 +1,76 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdSse41_h__ +#define __SimdSse41_h__ + +#include "Simd/SimdDefs.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + void BgraToBgr(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* bgr, size_t bgrStride); + + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride); + + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride); + + void BgrToBgra(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + + void BgrToGray(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* gray, size_t grayStride); + + void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride); + + void DeinterleaveBgr(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride); + + void DeinterleaveBgra(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height, uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride); + + void GaussianBlur3x3(const uint8_t* src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t* dst, size_t dstStride); + + void GrayToBgr(const uint8_t* gray, size_t width, size_t height, size_t grayStride, uint8_t* bgr, size_t bgrStride); + + void InterleaveBgr(const uint8_t* b, size_t bStride, const uint8_t* g, size_t gStride, const uint8_t* r, size_t rStride, size_t width, size_t height, uint8_t* bgr, size_t bgrStride); + + void InterleaveBgra(const uint8_t* b, size_t bStride, const uint8_t* g, size_t gStride, const uint8_t* r, size_t rStride, const uint8_t* a, size_t aStride, size_t width, size_t height, uint8_t* bgra, size_t bgraStride); + + void ReduceColor2x2(const uint8_t* src, 
size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); + + void ReduceGray2x2(const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride); + + void ReduceGray4x4(const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride); + + void ResizeBilinear(const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); + + void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); + } +#endif// SIMD_SSE41_ENABLE +} +#endif//__SimdSse41_h__ diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdSse41BgrToBgra.cpp old mode 100644 new mode 100755 similarity index 57% rename from 3rdparty/simdlib/Simd/SimdSsse3BgrToBgra.cpp rename to 3rdparty/simdlib/Simd/SimdSse41BgrToBgra.cpp index 2c7f277758..65787e1a45 --- a/3rdparty/simdlib/Simd/SimdSsse3BgrToBgra.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41BgrToBgra.cpp @@ -1,74 +1,111 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - template SIMD_INLINE void BgrToBgra(const uint8_t * bgr, uint8_t * bgra, __m128i alpha, __m128i shuffle) - { - Store((__m128i*)bgra + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 0)), shuffle))); - Store((__m128i*)bgra + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 12)), shuffle))); - Store((__m128i*)bgra + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 24)), shuffle))); - Store((__m128i*)bgra + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(bgr + 32)), 4), shuffle))); - } - - template void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert(width >= A); - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t alignedWidth = AlignLo(width, A); - - __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3); - __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BgrToBgra(bgr + 3 * col, bgra + 4 * col, _alpha, _shuffle); - if (width != alignedWidth) - BgrToBgra(bgr + 3 * (width - A), bgra + 4 * (width - A), _alpha, _shuffle); - bgr += bgrStride; - bgra += bgraStride; - } - } - - void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); - else - BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToBgra.cpp.o) has no symbols - void 
dummy_SimdSsse3BgrToBgra(){}; -#endif// SIMD_SSSE3_ENABLE -} +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdMemory.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + template SIMD_INLINE void BgrToBgra(const uint8_t * bgr, uint8_t * bgra, __m128i alpha, __m128i shuffle) + { + Store((__m128i*)bgra + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 0)), shuffle))); + Store((__m128i*)bgra + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 12)), shuffle))); + Store((__m128i*)bgra + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 24)), shuffle))); + Store((__m128i*)bgra + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(bgr + 32)), 4), shuffle))); + } + + template void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); + + size_t alignedWidth = AlignLo(width, A); + + __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3); + __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + BgrToBgra(bgr + 3 * col, bgra + 4 * col, _alpha, _shuffle); + if (width != alignedWidth) + BgrToBgra(bgr + 3 * (width - A), bgra + 4 * (width - A), _alpha, _shuffle); + bgr += bgrStride; + bgra += bgraStride; + } + } + + void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) + BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); + else + BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); + } + + //--------------------------------------------------------------------- + + template SIMD_INLINE void RgbToBgra(const 
uint8_t* rgb, uint8_t* bgra, __m128i alpha, __m128i shuffle) + { + Store((__m128i*)bgra + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(rgb + 0)), shuffle))); + Store((__m128i*)bgra + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(rgb + 12)), shuffle))); + Store((__m128i*)bgra + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(rgb + 24)), shuffle))); + Store((__m128i*)bgra + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(rgb + 32)), 4), shuffle))); + } + + template void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t alignedWidth = AlignLo(width, A); + + __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3); + __m128i _shuffle = _mm_setr_epi8(0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + RgbToBgra(rgb + 3 * col, bgra + 4 * col, _alpha, _shuffle); + if (width != alignedWidth) + RgbToBgra(rgb + 3 * (width - A), bgra + 4 * (width - A), _alpha, _shuffle); + rgb += rgbStride; + bgra += bgraStride; + } + } + + void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)) + RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + else + RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToGray.cpp b/3rdparty/simdlib/Simd/SimdSse41BgrToGray.cpp old mode 100644 new mode 100755 similarity index 56% rename from 3rdparty/simdlib/Simd/SimdSsse3BgrToGray.cpp rename to 3rdparty/simdlib/Simd/SimdSse41BgrToGray.cpp index 
224a87bbce..b089e35631 --- a/3rdparty/simdlib/Simd/SimdSsse3BgrToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41BgrToGray.cpp @@ -1,93 +1,148 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - const __m128i K16_BLUE_RED = SIMD_MM_SET2_EPI16(Base::BLUE_TO_GRAY_WEIGHT, Base::RED_TO_GRAY_WEIGHT); - const __m128i K16_GREEN_ROUND = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m128i BgraToGray32(__m128i bgra) - { - const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(bgra, 1), K16_00FF); - const __m128i b0r0 = _mm_and_si128(bgra, K16_00FF); - const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(b0r0, K16_BLUE_RED)); - return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - SIMD_INLINE __m128i BgraToGray(__m128i bgra[4]) - { - const __m128i lo = _mm_packs_epi32(BgraToGray32(bgra[0]), BgraToGray32(bgra[1])); - const __m128i hi = _mm_packs_epi32(BgraToGray32(bgra[2]), BgraToGray32(bgra[3])); - return _mm_packus_epi16(lo, hi); - } - - template SIMD_INLINE __m128i BgrToGray(const uint8_t * bgr, __m128i shuffle) - { - __m128i bgra[4]; - bgra[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(bgr + 0)), shuffle)); - bgra[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(bgr + 12)), shuffle)); - bgra[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(bgr + 24)), shuffle)); - bgra[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(bgr + 32)), 4), shuffle)); - return BgraToGray(bgra); - } - - template void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) - { - assert(width >= A); - if (align) - assert(Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t alignedWidth = AlignLo(width, A); - - __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); - - for (size_t row = 0; row < height; ++row) - { - for 
(size_t col = 0; col < alignedWidth; col += A) - Store((__m128i*)(gray + col), BgrToGray(bgr + 3 * col, _shuffle)); - if (width != alignedWidth) - Store((__m128i*)(gray + width - A), BgrToGray(bgr + 3 * (width - A), _shuffle)); - bgr += bgrStride; - gray += grayStride; - } - } - - void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) - { - if (Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - else - BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToGray.cpp.o) has no symbols - void dummy_SimdSsse3BgrToGray(){}; -#endif// SIMD_SSSE3_ENABLE -} +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdMemory.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + const __m128i K16_BLUE_RED = SIMD_MM_SET2_EPI16(Base::BLUE_TO_GRAY_WEIGHT, Base::RED_TO_GRAY_WEIGHT); + const __m128i K16_GREEN_ROUND = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM); + + SIMD_INLINE __m128i BgraToGray32(__m128i bgra) + { + const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(bgra, 1), K16_00FF); + const __m128i b0r0 = _mm_and_si128(bgra, K16_00FF); + const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(b0r0, K16_BLUE_RED)); + return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT); + } + + SIMD_INLINE __m128i BgraToGray(__m128i bgra[4]) + { + const __m128i lo = _mm_packs_epi32(BgraToGray32(bgra[0]), BgraToGray32(bgra[1])); + const __m128i hi = _mm_packs_epi32(BgraToGray32(bgra[2]), BgraToGray32(bgra[3])); + return _mm_packus_epi16(lo, hi); + } + + template SIMD_INLINE __m128i BgrToGray(const uint8_t * bgr, __m128i shuffle) + { + __m128i bgra[4]; + bgra[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(bgr + 0)), shuffle)); + bgra[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(bgr + 12)), shuffle)); + bgra[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(bgr + 24)), shuffle)); + bgra[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(bgr + 32)), 4), shuffle)); + return BgraToGray(bgra); + } + + template void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) + { + assert(width >= A); + if (align) + assert(Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride)); + + size_t alignedWidth = AlignLo(width, A); + + __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); + + for (size_t row = 0; row < height; ++row) + { + for 
(size_t col = 0; col < alignedWidth; col += A) + Store((__m128i*)(gray + col), BgrToGray(bgr + 3 * col, _shuffle)); + if (width != alignedWidth) + Store((__m128i*)(gray + width - A), BgrToGray(bgr + 3 * (width - A), _shuffle)); + bgr += bgrStride; + gray += grayStride; + } + } + + void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) + { + if (Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride)) + BgrToGray(bgr, width, height, bgrStride, gray, grayStride); + else + BgrToGray(bgr, width, height, bgrStride, gray, grayStride); + } + + //--------------------------------------------------------------------- + + const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); + + SIMD_INLINE __m128i RgbaToGray32(__m128i rgba) + { + const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF); + const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF); + const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(r0b0, K16_RED_BLUE)); + return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT); + } + + SIMD_INLINE __m128i RgbaToGray(__m128i rgba[4]) + { + const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1])); + const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); + return _mm_packus_epi16(lo, hi); + } + + template SIMD_INLINE __m128i RgbToGray(const uint8_t* rgb, __m128i shuffle) + { + __m128i rgba[4]; + rgba[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(rgb + 0)), shuffle)); + rgba[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(rgb + 12)), shuffle)); + rgba[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(rgb + 24)), shuffle)); + rgba[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(rgb + 32)), 4), shuffle)); + return RgbaToGray(rgba); + } + + template void 
RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) + { + assert(width >= A); + if (align) + assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t alignedWidth = AlignLo(width, A); + + __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + Store((__m128i*)(gray + col), RgbToGray(rgb + 3 * col, _shuffle)); + if (width != alignedWidth) + Store((__m128i*)(gray + width - A), RgbToGray(rgb + 3 * (width - A), _shuffle)); + rgb += rgbStride; + gray += grayStride; + } + } + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) + { + if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)) + RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + else + RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdSse41BgrToRgb.cpp old mode 100644 new mode 100755 similarity index 84% rename from 3rdparty/simdlib/Simd/SimdSsse3BgrToRgb.cpp rename to 3rdparty/simdlib/Simd/SimdSse41BgrToRgb.cpp index 0f74b41b91..14a351a5c9 --- a/3rdparty/simdlib/Simd/SimdSsse3BgrToRgb.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41BgrToRgb.cpp @@ -1,83 +1,80 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - const __m128i K8_CVT_00 = SIMD_MM_SETR_EPI8(0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1); - const __m128i K8_CVT_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); - const __m128i K8_CVT_10 = SIMD_MM_SETR_EPI8(-1, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_CVT_11 = SIMD_MM_SETR_EPI8(0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF); - const __m128i K8_CVT_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, -1); - const __m128i K8_CVT_21 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_CVT_22 = SIMD_MM_SETR_EPI8(-1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD); - - template SIMD_INLINE void BgrToRgb(const uint8_t * src, uint8_t * dst) - { - __m128i s0 = Load((__m128i*)src + 0); - __m128i s1 = Load((__m128i*)src + 1); - __m128i s2 = Load((__m128i*)src + 2); - Store((__m128i*)dst + 0, _mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_00), _mm_shuffle_epi8(s1, K8_CVT_01))); - Store((__m128i*)dst + 1, _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_10), _mm_shuffle_epi8(s1, K8_CVT_11)), _mm_shuffle_epi8(s2, K8_CVT_12))); - Store((__m128i*)dst + 2, _mm_or_si128(_mm_shuffle_epi8(s1, K8_CVT_21), _mm_shuffle_epi8(s2, K8_CVT_22))); - } - - template void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)); - - const size_t A3 = A * 3; - size_t size = width * 3; - size_t aligned = AlignLo(width, A) * 3; - - for (size_t row = 0; row < height; ++row) - { - for (size_t i = 0; i < aligned; i += A3) - BgrToRgb(bgr + i, rgb + 
i); - if (aligned < size) - BgrToRgb(bgr + size - A3, rgb + size - A3); - bgr += bgrStride; - rgb += rgbStride; - } - } - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) - { - if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)) - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); - else - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToRgb.cpp.o) has no symbols - void dummy_SimdSsse3BgrToRgb(){}; -#endif//SIMD_SSSE3_ENABLE -} +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + const __m128i K8_CVT_00 = SIMD_MM_SETR_EPI8(0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1); + const __m128i K8_CVT_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); + const __m128i K8_CVT_10 = SIMD_MM_SETR_EPI8(-1, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + const __m128i K8_CVT_11 = SIMD_MM_SETR_EPI8(0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF); + const __m128i K8_CVT_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, -1); + const __m128i K8_CVT_21 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + const __m128i K8_CVT_22 = SIMD_MM_SETR_EPI8(-1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD); + + template SIMD_INLINE void BgrToRgb(const uint8_t * src, uint8_t * dst) + { + __m128i s0 = Load((__m128i*)src + 0); + __m128i s1 = Load((__m128i*)src + 1); + __m128i s2 = Load((__m128i*)src + 2); + Store((__m128i*)dst + 0, _mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_00), _mm_shuffle_epi8(s1, K8_CVT_01))); + Store((__m128i*)dst + 1, _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_10), _mm_shuffle_epi8(s1, K8_CVT_11)), _mm_shuffle_epi8(s2, K8_CVT_12))); + Store((__m128i*)dst + 2, _mm_or_si128(_mm_shuffle_epi8(s1, K8_CVT_21), _mm_shuffle_epi8(s2, K8_CVT_22))); + } + + template void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)); + + const size_t A3 = A * 3; + size_t size = width * 3; + size_t aligned = AlignLo(width, A) * 3; + + for (size_t row = 0; row < height; ++row) + { + for (size_t i = 0; i < aligned; i += A3) + BgrToRgb(bgr + i, rgb + 
i); + if (aligned < size) + BgrToRgb(bgr + size - A3, rgb + size - A3); + bgr += bgrStride; + rgb += rgbStride; + } + } + + void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride) + { + if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)) + BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); + else + BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdSse41BgraToBgr.cpp old mode 100644 new mode 100755 similarity index 53% rename from 3rdparty/simdlib/Simd/SimdSsse3BgraToBgr.cpp rename to 3rdparty/simdlib/Simd/SimdSse41BgraToBgr.cpp index ccf4c51c97..a3000972e6 --- a/3rdparty/simdlib/Simd/SimdSsse3BgraToBgr.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41BgraToBgr.cpp @@ -1,92 +1,165 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - template SIMD_INLINE void BgraToBgrBody(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2]) - { - Store((__m128i*)(bgr + 0), _mm_shuffle_epi8(Load((__m128i*)bgra + 0), k[0][0])); - Store((__m128i*)(bgr + 12), _mm_shuffle_epi8(Load((__m128i*)bgra + 1), k[0][0])); - Store((__m128i*)(bgr + 24), _mm_shuffle_epi8(Load((__m128i*)bgra + 2), k[0][0])); - Store((__m128i*)(bgr + 36), _mm_shuffle_epi8(Load((__m128i*)bgra + 3), k[0][0])); - } - - template SIMD_INLINE void BgraToBgr(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2]) - { - __m128i bgra0 = Load((__m128i*)bgra + 0); - __m128i bgra1 = Load((__m128i*)bgra + 1); - __m128i bgra2 = Load((__m128i*)bgra + 2); - __m128i bgra3 = Load((__m128i*)bgra + 3); - Store((__m128i*)bgr + 0, _mm_or_si128(_mm_shuffle_epi8(bgra0, k[0][0]), _mm_shuffle_epi8(bgra1, k[0][1]))); - Store((__m128i*)bgr + 1, _mm_or_si128(_mm_shuffle_epi8(bgra1, k[1][0]), _mm_shuffle_epi8(bgra2, k[1][1]))); - Store((__m128i*)bgr + 2, _mm_or_si128(_mm_shuffle_epi8(bgra2, k[2][0]), _mm_shuffle_epi8(bgra3, k[2][1]))); - } - - template void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t alignedWidth = AlignLo(width, A); - if (width == alignedWidth) - alignedWidth -= A; - - __m128i k[3][2]; - k[0][0] = _mm_setr_epi8(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1); - k[0][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
0x0, 0x1, 0x2, 0x4); - k[1][0] = _mm_setr_epi8(0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1); - k[1][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9); - k[2][0] = _mm_setr_epi8(0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - k[2][1] = _mm_setr_epi8(-1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BgraToBgrBody(bgra + 4 * col, bgr + 3 * col, k); - if (width != alignedWidth) - BgraToBgr(bgra + 4 * (width - A), bgr + 3 * (width - A), k); - bgra += bgraStride; - bgr += bgrStride; - } - } - - void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) - BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); - else - BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgraToBgr.cpp.o) has no symbols - void dummy_SimdSsse3BgraToBgr(){}; -#endif// SIMD_SSSE3_ENABLE -} +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdMemory.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + template SIMD_INLINE void BgraToBgrBody(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2]) + { + Store((__m128i*)(bgr + 0), _mm_shuffle_epi8(Load((__m128i*)bgra + 0), k[0][0])); + Store((__m128i*)(bgr + 12), _mm_shuffle_epi8(Load((__m128i*)bgra + 1), k[0][0])); + Store((__m128i*)(bgr + 24), _mm_shuffle_epi8(Load((__m128i*)bgra + 2), k[0][0])); + Store((__m128i*)(bgr + 36), _mm_shuffle_epi8(Load((__m128i*)bgra + 3), k[0][0])); + } + + template SIMD_INLINE void BgraToBgr(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2]) + { + __m128i bgra0 = Load((__m128i*)bgra + 0); + __m128i bgra1 = Load((__m128i*)bgra + 1); + __m128i bgra2 = Load((__m128i*)bgra + 2); + __m128i bgra3 = Load((__m128i*)bgra + 3); + Store((__m128i*)bgr + 0, _mm_or_si128(_mm_shuffle_epi8(bgra0, k[0][0]), _mm_shuffle_epi8(bgra1, k[0][1]))); + Store((__m128i*)bgr + 1, _mm_or_si128(_mm_shuffle_epi8(bgra1, k[1][0]), _mm_shuffle_epi8(bgra2, k[1][1]))); + Store((__m128i*)bgr + 2, _mm_or_si128(_mm_shuffle_epi8(bgra2, k[2][0]), _mm_shuffle_epi8(bgra3, k[2][1]))); + } + + template void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); + + size_t alignedWidth = AlignLo(width, A); + if (width == alignedWidth) + 
alignedWidth -= A; + + __m128i k[3][2]; + k[0][0] = _mm_setr_epi8(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1); + k[0][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4); + k[1][0] = _mm_setr_epi8(0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1); + k[1][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9); + k[2][0] = _mm_setr_epi8(0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + k[2][1] = _mm_setr_epi8(-1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + BgraToBgrBody(bgra + 4 * col, bgr + 3 * col, k); + if (width != alignedWidth) + BgraToBgr(bgra + 4 * (width - A), bgr + 3 * (width - A), k); + bgra += bgraStride; + bgr += bgrStride; + } + } + + void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) + BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); + else + BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); + } + + //--------------------------------------------------------------------- + + template void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t alignedWidth = AlignLo(width, A); + if (width == alignedWidth) + alignedWidth -= A; + + __m128i k[3][2]; + k[0][0] = _mm_setr_epi8(0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1); + k[0][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x1, 0x0, 0x6); + k[1][0] = _mm_setr_epi8(0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1, 
-1, -1, -1, -1); + k[1][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9); + k[2][0] = _mm_setr_epi8(0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + k[2][1] = _mm_setr_epi8(-1, -1, -1, -1, 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + BgraToBgrBody(bgra + 4 * col, rgb + 3 * col, k); + if (width != alignedWidth) + BgraToBgr(bgra + 4 * (width - A), rgb + 3 * (width - A), k); + bgra += bgraStride; + rgb += rgbStride; + } + } + + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)) + BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); + else + BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); + } + + //--------------------------------------------------------------------- + + const __m128i K8_BGRA_TO_RGBA = SIMD_MM_SETR_EPI8(0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF); + + template SIMD_INLINE void BgraToRgba(const uint8_t* bgra, uint8_t* rgba) + { + Store((__m128i*)rgba, _mm_shuffle_epi8(Load((__m128i*)bgra), K8_BGRA_TO_RGBA)); + } + + template void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride)); + + size_t size = width * 4; + size_t sizeA = AlignLo(size, A); + + for (size_t row = 0; row < height; ++row) + { + for (size_t i = 0; i < size; i += A) + BgraToRgba(bgra + i, rgba + i); + if (size != sizeA) + BgraToRgba(bgra + size - sizeA, rgba + size - sizeA); + bgra += bgraStride; + rgba += rgbaStride; + } + } + + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, 
uint8_t* rgba, size_t rgbaStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride)) + BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + else + BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdBaseRgbToGray.cpp b/3rdparty/simdlib/Simd/SimdSse41Cpu.cpp similarity index 54% rename from 3rdparty/simdlib/Simd/SimdBaseRgbToGray.cpp rename to 3rdparty/simdlib/Simd/SimdSse41Cpu.cpp index 6ac7f88791..9b5719ce97 100644 --- a/3rdparty/simdlib/Simd/SimdBaseRgbToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41Cpu.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -21,23 +21,47 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "Simd/SimdConversion.h" +#include "Simd/SimdEnable.h" +#include "Simd/SimdCpu.h" + +#if defined(_MSC_VER) +#include +#endif namespace Simd { - namespace Base +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { - void RgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride) + SIMD_INLINE bool SupportedByCPU() { - for (size_t row = 0; row < height; ++row) + return + Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE41) && + Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE42); + } + + SIMD_INLINE bool SupportedByOS() + { +#if defined(_MSC_VER) + __try { - const uint8_t * pRgb = rgb + row*rgbStride; - uint8_t * pGray = gray + row*grayStride; - for (const uint8_t *pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgb += 3) - { - *pGray = RgbToGray(pRgb[0], pRgb[1], pRgb[2]); - } + int value = _mm_testz_si128(_mm_set1_epi8(0), _mm_set1_epi8(-1)); // try to execute of SSE41 instructions; + uint32_t crc = _mm_crc32_u8(0, 1); // try to execute of SSE42 instructions; + return true; } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } +#else + return true; +#endif + } + + bool GetEnable() + { + return SupportedByCPU() && SupportedByOS(); } } +#endif } diff --git a/3rdparty/simdlib/Simd/SimdSsse3Deinterleave.cpp b/3rdparty/simdlib/Simd/SimdSse41Deinterleave.cpp similarity index 74% rename from 3rdparty/simdlib/Simd/SimdSsse3Deinterleave.cpp rename to 3rdparty/simdlib/Simd/SimdSse41Deinterleave.cpp index 45ff364d03..68ae14efc5 100644 --- a/3rdparty/simdlib/Simd/SimdSsse3Deinterleave.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41Deinterleave.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,8 +27,8 @@ namespace Simd { -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { template SIMD_INLINE void DeinterleaveBgr(const uint8_t * bgr, uint8_t * b, uint8_t * g, uint8_t * r, size_t offset) { @@ -69,9 +69,11 @@ namespace Simd DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); } + //--------------------------------------------------------------------- + const __m128i K8_SHUFFLE_BGRA = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF); - template SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset) + template SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset) { __m128i _bgra[4]; _bgra[0] = _mm_shuffle_epi8(Load((__m128i*)bgra + 0), K8_SHUFFLE_BGRA); @@ -89,7 +91,8 @@ namespace Simd __m128i rraa1 = _mm_unpackhi_epi32(_bgra[2], _bgra[3]); Store((__m128i*)(r + offset), _mm_unpacklo_epi64(rraa0, rraa1)); - Store((__m128i*)(a + offset), _mm_unpackhi_epi64(rraa0, rraa1)); + if(alpha) + Store((__m128i*)(a + offset), _mm_unpackhi_epi64(rraa0, rraa1)); } template void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, @@ -99,36 +102,51 @@ namespace Simd if (align) { assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)); + assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL)); } size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) + if (a) { - for (size_t col = 0; col < alignedWidth; 
col += A) - DeinterleaveBgra(bgra + col * 4, b, g, r, a, col); - if (width != alignedWidth) - DeinterleaveBgra(bgra + 4 * (width - A), b, g, r, a, width - A); - bgra += bgraStride; - b += bStride; - g += gStride; - r += rStride; - a += aStride; + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + DeinterleaveBgra(bgra + col * 4, b, g, r, a, col); + if (width != alignedWidth) + DeinterleaveBgra(bgra + 4 * (width - A), b, g, r, a, width - A); + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; + a += aStride; + } + } + else + { + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + DeinterleaveBgra(bgra + col * 4, b, g, r, NULL, col); + if (width != alignedWidth) + DeinterleaveBgra(bgra + 4 * (width - A), b, g, r, NULL, width - A); + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; + } } } void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)) + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && + Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL)) DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); else DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); } } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Deinterleave.cpp.o) has no symbols - void dummy_SimdSsse3Deinterleave(){}; -#endif// SIMD_SSSE3_ENABLE +#endif } diff --git a/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp 
b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp old mode 100644 new mode 100755 index bacd2f7d91..73334c635d --- a/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2020 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "Simd/SimdMemory.h" +#include "Simd/SimdLoadBlock.h" #include "Simd/SimdStore.h" #include "Simd/SimdGaussianBlur.h" diff --git a/3rdparty/simdlib/Simd/SimdSsse3GaussianBlur3x3.cpp b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur3x3.cpp similarity index 95% rename from 3rdparty/simdlib/Simd/SimdSsse3GaussianBlur3x3.cpp rename to 3rdparty/simdlib/Simd/SimdSse41GaussianBlur3x3.cpp index 74ff76aa8a..11573a696b 100644 --- a/3rdparty/simdlib/Simd/SimdSsse3GaussianBlur3x3.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur3x3.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -22,12 +22,13 @@ * SOFTWARE. 
*/ #include "Simd/SimdMemory.h" +#include "Simd/SimdLoadBlock.h" #include "Simd/SimdStore.h" namespace Simd { -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { namespace { @@ -154,8 +155,5 @@ namespace Simd GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); } } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3GaussianBlur3x3.cpp.o) has no symbols - void dummy_SimdSsse3GaussianBlur3x3(){}; -#endif// SIMD_SSSE3_ENABLE +#endif } diff --git a/3rdparty/simdlib/Simd/SimdSsse3GrayToBgr.cpp b/3rdparty/simdlib/Simd/SimdSse41GrayToBgr.cpp old mode 100644 new mode 100755 similarity index 92% rename from 3rdparty/simdlib/Simd/SimdSsse3GrayToBgr.cpp rename to 3rdparty/simdlib/Simd/SimdSse41GrayToBgr.cpp index 8106f6451a..db79b3e4f0 --- a/3rdparty/simdlib/Simd/SimdSsse3GrayToBgr.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41GrayToBgr.cpp @@ -1,75 +1,72 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - template SIMD_INLINE void GrayToBgr(uint8_t * bgr, __m128i gray) - { - Store((__m128i*)bgr + 0, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR0)); - Store((__m128i*)bgr + 1, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR1)); - Store((__m128i*)bgr + 2, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR2)); - } - - template void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - __m128i _gray = Load((__m128i*)(gray + col)); - GrayToBgr(bgr + 3 * col, _gray); - } - if (alignedWidth != width) - { - __m128i _gray = Load((__m128i*)(gray + width - A)); - GrayToBgr(bgr + 3 * (width - A), _gray); - } - gray += grayStride; - bgr += bgrStride; - } - } - - void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride) - { - if (Aligned(bgr) && Aligned(gray) && Aligned(bgrStride) && Aligned(grayStride)) - GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); - else - GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3GrayToBgr.cpp.o) has no symbols - void dummy_SimdSsse3GrayToBgr(){}; -#endif// SIMD_SSSE3_ENABLE -} +/* +* Simd Library (http://ermig1979.github.io/Simd). 
+* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdMemory.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + template SIMD_INLINE void GrayToBgr(uint8_t * bgr, __m128i gray) + { + Store((__m128i*)bgr + 0, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR0)); + Store((__m128i*)bgr + 1, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR1)); + Store((__m128i*)bgr + 2, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR2)); + } + + template void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride)); + + size_t alignedWidth = AlignLo(width, A); + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + { + __m128i _gray = Load((__m128i*)(gray + col)); + GrayToBgr(bgr + 3 * col, _gray); + } + if (alignedWidth != width) + { + __m128i _gray = Load((__m128i*)(gray + width - A)); + GrayToBgr(bgr + 3 * (width - A), _gray); + } + gray += grayStride; + bgr += bgrStride; + } + } + + void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride) + { + if (Aligned(bgr) && Aligned(gray) && Aligned(bgrStride) && Aligned(grayStride)) + GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); + else + GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdSsse3Interleave.cpp b/3rdparty/simdlib/Simd/SimdSse41Interleave.cpp similarity index 96% rename from 3rdparty/simdlib/Simd/SimdSsse3Interleave.cpp rename to 3rdparty/simdlib/Simd/SimdSse41Interleave.cpp index c7213577fd..bb6354405e 100644 --- a/3rdparty/simdlib/Simd/SimdSsse3Interleave.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41Interleave.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
+* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,8 +27,8 @@ namespace Simd { -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { template SIMD_INLINE void InterleaveBgr(const uint8_t * b, const uint8_t * g, const uint8_t * r, size_t offset, uint8_t * bgr) { @@ -124,8 +124,5 @@ namespace Simd InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); } } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Interleave.cpp.o) has no symbols - void dummy_SimdSsse3Interleave(){}; -#endif// SIMD_SSSE3_ENABLE +#endif } diff --git a/3rdparty/simdlib/Simd/SimdSsse3Reduce.cpp b/3rdparty/simdlib/Simd/SimdSse41Reduce.cpp old mode 100644 new mode 100755 similarity index 96% rename from 3rdparty/simdlib/Simd/SimdSsse3Reduce.cpp rename to 3rdparty/simdlib/Simd/SimdSse41Reduce.cpp index faded50ec7..9905a6f171 --- a/3rdparty/simdlib/Simd/SimdSsse3Reduce.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41Reduce.cpp @@ -1,202 +1,199 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1) - { - return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2); - } - - SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) - { - return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11)); - } - - template __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11); - - template<> SIMD_INLINE __m128i Average8<1>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) - { - return Average8(s00, s01, s10, s11); - } - - const __m128i K8_RC2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF); - - template<> SIMD_INLINE __m128i Average8<2>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) - { - return Average8(_mm_shuffle_epi8(s00, K8_RC2), _mm_shuffle_epi8(s01, K8_RC2), _mm_shuffle_epi8(s10, K8_RC2), _mm_shuffle_epi8(s11, K8_RC2)); - } - - const __m128i K8_RC4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF); - - template<> SIMD_INLINE __m128i Average8<4>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) - { - return 
Average8(_mm_shuffle_epi8(s00, K8_RC4), _mm_shuffle_epi8(s01, K8_RC4), _mm_shuffle_epi8(s10, K8_RC4), _mm_shuffle_epi8(s11, K8_RC4)); - } - - template SIMD_INLINE void ReduceColor2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - __m128i s00 = Load((__m128i*)src0 + 0); - __m128i s01 = Load((__m128i*)src0 + 1); - __m128i s10 = Load((__m128i*)src1 + 0); - __m128i s11 = Load((__m128i*)src1 + 1); - Store((__m128i*)dst, Average8(s00, s01, s10, s11)); - } - - template void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t evenWidth = AlignLo(srcWidth, 2); - size_t evenSize = evenWidth * channelCount; - size_t alignedSize = AlignLo(evenSize, DA); - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t *src0 = src; - const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); - size_t srcOffset = 0, dstOffset = 0; - for (; srcOffset < alignedSize; srcOffset += DA, dstOffset += A) - ReduceColor2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); - if (alignedSize != evenSize) - { - srcOffset = evenSize - DA; - dstOffset = srcOffset / 2; - ReduceColor2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); - } - if (evenWidth != srcWidth) - { - for (size_t c = 0; c < channelCount; ++c) - dst[evenSize/2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]); - } - src += 2 * srcStride; - dst += dstStride; - } - } - - const __m128i K8_BGR0 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1); - const __m128i K8_BGR1 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0); - const __m128i K8_BGR2 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_BGR3 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1); - const __m128i K8_BGR4 = SIMD_MM_SETR_EPI8(-1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); - const __m128i K8_BGR5 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_BGR6 = SIMD_MM_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF); - - template SIMD_INLINE void ReduceBgr2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - __m128i s00 = Load((__m128i*)src0 + 0); - __m128i s01 = Load((__m128i*)src0 + 1); - __m128i s02 = Load((__m128i*)src0 + 2); - __m128i s10 = Load((__m128i*)src1 + 0); - __m128i s11 = Load((__m128i*)src1 + 1); - __m128i s12 = Load((__m128i*)src1 + 2); - __m128i m00 = _mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR0), _mm_shuffle_epi8(s01, K8_BGR1)); - __m128i m01 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR2), _mm_shuffle_epi8(s01, K8_BGR3)), _mm_shuffle_epi8(s02, K8_BGR4)); - __m128i m10 = _mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR0), _mm_shuffle_epi8(s11, K8_BGR1)); - __m128i m11 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR2), _mm_shuffle_epi8(s11, K8_BGR3)), _mm_shuffle_epi8(s12, K8_BGR4)); - Store((__m128i*)dst + 0, Average8(m00, m01, m10, m11)); - __m128i s03 = Load((__m128i*)src0 + 3); - __m128i s04 = Load((__m128i*)src0 + 4); - __m128i s13 = Load((__m128i*)src1 + 3); - __m128i s14 = Load((__m128i*)src1 + 4); - __m128i m02 = _mm_or_si128(_mm_shuffle_epi8(s01, K8_BGR5), _mm_shuffle_epi8(s02, K8_BGR6)); - __m128i m03 = _mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR0), _mm_shuffle_epi8(s04, K8_BGR1)); - __m128i m12 = _mm_or_si128(_mm_shuffle_epi8(s11, K8_BGR5), _mm_shuffle_epi8(s12, K8_BGR6)); - __m128i m13 = _mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR0), _mm_shuffle_epi8(s14, K8_BGR1)); - Store((__m128i*)dst + 1, Average8(m02, m03, m12, m13)); - __m128i s05 = Load((__m128i*)src0 + 5); - __m128i s15 = Load((__m128i*)src1 + 5); - __m128i m04 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR2), _mm_shuffle_epi8(s04, K8_BGR3)), _mm_shuffle_epi8(s05, K8_BGR4)); - 
__m128i m05 = _mm_or_si128(_mm_shuffle_epi8(s04, K8_BGR5), _mm_shuffle_epi8(s05, K8_BGR6)); - __m128i m14 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR2), _mm_shuffle_epi8(s14, K8_BGR3)), _mm_shuffle_epi8(s15, K8_BGR4)); - __m128i m15 = _mm_or_si128(_mm_shuffle_epi8(s14, K8_BGR5), _mm_shuffle_epi8(s15, K8_BGR6)); - Store((__m128i*)dst + 2, Average8(m04, m05, m14, m15)); - } - - template void ReduceBgr2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t evenWidth = AlignLo(srcWidth, 2); - size_t alignedWidth = AlignLo(srcWidth, DA); - size_t evenSize = evenWidth * 3; - size_t alignedSize = alignedWidth*3; - size_t srcStep = DA * 3, dstStep = A*3; - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t *src0 = src; - const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); - size_t srcOffset = 0, dstOffset = 0; - for (; srcOffset < alignedSize; srcOffset += srcStep, dstOffset += dstStep) - ReduceBgr2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); - if (alignedSize != evenSize) - { - srcOffset = evenSize - srcStep; - dstOffset = srcOffset / 2; - ReduceBgr2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); - } - if (evenWidth != srcWidth) - { - for (size_t c = 0; c < 3; ++c) - dst[evenSize / 2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]); - } - src += 2 * srcStride; - dst += dstStride; - } - } - - template void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA); - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(dst) && Aligned(dstStride)); - } - - switch (channelCount) - { - case 1: ReduceColor2x2<1, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; 
- case 2: ReduceColor2x2<2, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 3: ReduceBgr2x2(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 4: ReduceColor2x2<4, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - default: assert(0); - } - } - - void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else - ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Reduce.cpp.o) has no symbols - void dummy_SimdSsse3Reduce(){}; -#endif// SIMD_SSSE3_ENABLE -} +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1) + { + return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2); + } + + SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) + { + return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11)); + } + + template __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11); + + template<> SIMD_INLINE __m128i Average8<1>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) + { + return Average8(s00, s01, s10, s11); + } + + const __m128i K8_RC2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF); + + template<> SIMD_INLINE __m128i Average8<2>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) + { + return Average8(_mm_shuffle_epi8(s00, K8_RC2), _mm_shuffle_epi8(s01, K8_RC2), _mm_shuffle_epi8(s10, K8_RC2), _mm_shuffle_epi8(s11, K8_RC2)); + } + + const __m128i K8_RC4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF); + + template<> SIMD_INLINE __m128i Average8<4>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) + { + return Average8(_mm_shuffle_epi8(s00, K8_RC4), _mm_shuffle_epi8(s01, K8_RC4), _mm_shuffle_epi8(s10, K8_RC4), _mm_shuffle_epi8(s11, K8_RC4)); + } + + template SIMD_INLINE void ReduceColor2x2(const uint8_t * src0, const uint8_t * 
src1, uint8_t * dst) + { + __m128i s00 = Load((__m128i*)src0 + 0); + __m128i s01 = Load((__m128i*)src0 + 1); + __m128i s10 = Load((__m128i*)src1 + 0); + __m128i s11 = Load((__m128i*)src1 + 1); + Store((__m128i*)dst, Average8(s00, s01, s10, s11)); + } + + template void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride) + { + size_t evenWidth = AlignLo(srcWidth, 2); + size_t evenSize = evenWidth * channelCount; + size_t alignedSize = AlignLo(evenSize, DA); + for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) + { + const uint8_t *src0 = src; + const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); + size_t srcOffset = 0, dstOffset = 0; + for (; srcOffset < alignedSize; srcOffset += DA, dstOffset += A) + ReduceColor2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); + if (alignedSize != evenSize) + { + srcOffset = evenSize - DA; + dstOffset = srcOffset / 2; + ReduceColor2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); + } + if (evenWidth != srcWidth) + { + for (size_t c = 0; c < channelCount; ++c) + dst[evenSize/2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]); + } + src += 2 * srcStride; + dst += dstStride; + } + } + + const __m128i K8_BGR0 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1); + const __m128i K8_BGR1 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0); + const __m128i K8_BGR2 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + const __m128i K8_BGR3 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1); + const __m128i K8_BGR4 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); + const __m128i K8_BGR5 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + const __m128i K8_BGR6 = SIMD_MM_SETR_EPI8(-1, 0x2, 
0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF); + + template SIMD_INLINE void ReduceBgr2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) + { + __m128i s00 = Load((__m128i*)src0 + 0); + __m128i s01 = Load((__m128i*)src0 + 1); + __m128i s02 = Load((__m128i*)src0 + 2); + __m128i s10 = Load((__m128i*)src1 + 0); + __m128i s11 = Load((__m128i*)src1 + 1); + __m128i s12 = Load((__m128i*)src1 + 2); + __m128i m00 = _mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR0), _mm_shuffle_epi8(s01, K8_BGR1)); + __m128i m01 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR2), _mm_shuffle_epi8(s01, K8_BGR3)), _mm_shuffle_epi8(s02, K8_BGR4)); + __m128i m10 = _mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR0), _mm_shuffle_epi8(s11, K8_BGR1)); + __m128i m11 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR2), _mm_shuffle_epi8(s11, K8_BGR3)), _mm_shuffle_epi8(s12, K8_BGR4)); + Store((__m128i*)dst + 0, Average8(m00, m01, m10, m11)); + __m128i s03 = Load((__m128i*)src0 + 3); + __m128i s04 = Load((__m128i*)src0 + 4); + __m128i s13 = Load((__m128i*)src1 + 3); + __m128i s14 = Load((__m128i*)src1 + 4); + __m128i m02 = _mm_or_si128(_mm_shuffle_epi8(s01, K8_BGR5), _mm_shuffle_epi8(s02, K8_BGR6)); + __m128i m03 = _mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR0), _mm_shuffle_epi8(s04, K8_BGR1)); + __m128i m12 = _mm_or_si128(_mm_shuffle_epi8(s11, K8_BGR5), _mm_shuffle_epi8(s12, K8_BGR6)); + __m128i m13 = _mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR0), _mm_shuffle_epi8(s14, K8_BGR1)); + Store((__m128i*)dst + 1, Average8(m02, m03, m12, m13)); + __m128i s05 = Load((__m128i*)src0 + 5); + __m128i s15 = Load((__m128i*)src1 + 5); + __m128i m04 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR2), _mm_shuffle_epi8(s04, K8_BGR3)), _mm_shuffle_epi8(s05, K8_BGR4)); + __m128i m05 = _mm_or_si128(_mm_shuffle_epi8(s04, K8_BGR5), _mm_shuffle_epi8(s05, K8_BGR6)); + __m128i m14 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR2), _mm_shuffle_epi8(s14, K8_BGR3)), 
_mm_shuffle_epi8(s15, K8_BGR4)); + __m128i m15 = _mm_or_si128(_mm_shuffle_epi8(s14, K8_BGR5), _mm_shuffle_epi8(s15, K8_BGR6)); + Store((__m128i*)dst + 2, Average8(m04, m05, m14, m15)); + } + + template void ReduceBgr2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride) + { + size_t evenWidth = AlignLo(srcWidth, 2); + size_t alignedWidth = AlignLo(srcWidth, DA); + size_t evenSize = evenWidth * 3; + size_t alignedSize = alignedWidth*3; + size_t srcStep = DA * 3, dstStep = A*3; + for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) + { + const uint8_t *src0 = src; + const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); + size_t srcOffset = 0, dstOffset = 0; + for (; srcOffset < alignedSize; srcOffset += srcStep, dstOffset += dstStep) + ReduceBgr2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); + if (alignedSize != evenSize) + { + srcOffset = evenSize - srcStep; + dstOffset = srcOffset / 2; + ReduceBgr2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); + } + if (evenWidth != srcWidth) + { + for (size_t c = 0; c < 3; ++c) + dst[evenSize / 2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]); + } + src += 2 * srcStride; + dst += dstStride; + } + } + + template void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) + { + assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA); + if (align) + { + assert(Aligned(src) && Aligned(srcStride)); + assert(Aligned(dst) && Aligned(dstStride)); + } + + switch (channelCount) + { + case 1: ReduceColor2x2<1, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; + case 2: ReduceColor2x2<2, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; + case 3: ReduceBgr2x2(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; + case 4: 
ReduceColor2x2<4, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; + default: assert(0); + } + } + + void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) + { + if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) + ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); + else + ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray2x2.cpp b/3rdparty/simdlib/Simd/SimdSse41ReduceGray2x2.cpp old mode 100644 new mode 100755 similarity index 94% rename from 3rdparty/simdlib/Simd/SimdSsse3ReduceGray2x2.cpp rename to 3rdparty/simdlib/Simd/SimdSse41ReduceGray2x2.cpp index 24d071182d..dd8bd5b0e3 --- a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray2x2.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41ReduceGray2x2.cpp @@ -1,96 +1,93 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1) - { - return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2); - } - - SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) - { - return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11)); - } - - template void ReduceGray2x2( - const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA); - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(dst) && Aligned(dstStride) && Aligned(dstWidth)); - } - - size_t alignedWidth = AlignLo(srcWidth, DA); - size_t evenWidth = AlignLo(srcWidth, 2); - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t *src0 = src; - const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); - size_t srcOffset = 0, dstOffset = 0; - for (; srcOffset < alignedWidth; srcOffset += DA, dstOffset += A) - { - Store((__m128i*)(dst + dstOffset), Average8( - Load((__m128i*)(src0 + srcOffset)), Load((__m128i*)(src0 + srcOffset + A)), - Load((__m128i*)(src1 + srcOffset)), Load((__m128i*)(src1 + srcOffset + A)))); - } - if (alignedWidth != srcWidth) - { - dstOffset = dstWidth - A - (evenWidth != srcWidth ? 
1 : 0); - srcOffset = evenWidth - DA; - Store((__m128i*)(dst + dstOffset), Average8( - Load((__m128i*)(src0 + srcOffset)), Load((__m128i*)(src0 + srcOffset + A)), - Load((__m128i*)(src1 + srcOffset)), Load((__m128i*)(src1 + srcOffset + A)))); - if (evenWidth != srcWidth) - { - dst[dstWidth - 1] = Base::Average(src0[evenWidth], src1[evenWidth]); - } - } - src += 2 * srcStride; - dst += dstStride; - } - } - - void ReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - if (Aligned(src) && Aligned(srcWidth) && Aligned(srcStride) && Aligned(dst) && Aligned(dstWidth) && Aligned(dstStride)) - ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Reduce2x2.cpp.o) has no symbols - void dummy_SimdSsse3Reduce2x2(){}; -#endif// SIMD_SSSE3_ENABLE -} +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1) + { + return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2); + } + + SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) + { + return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11)); + } + + template void ReduceGray2x2( + const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) + { + assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA); + if (align) + { + assert(Aligned(src) && Aligned(srcStride)); + assert(Aligned(dst) && Aligned(dstStride) && Aligned(dstWidth)); + } + + size_t alignedWidth = AlignLo(srcWidth, DA); + size_t evenWidth = AlignLo(srcWidth, 2); + for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) + { + const uint8_t *src0 = src; + const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); + size_t srcOffset = 0, dstOffset = 0; + for (; srcOffset < alignedWidth; srcOffset += DA, dstOffset += A) + { + Store((__m128i*)(dst + dstOffset), Average8( + Load((__m128i*)(src0 + srcOffset)), Load((__m128i*)(src0 + srcOffset + A)), + Load((__m128i*)(src1 + srcOffset)), Load((__m128i*)(src1 + srcOffset + A)))); + } + if (alignedWidth != srcWidth) + { + dstOffset = dstWidth - A - (evenWidth != srcWidth ? 
1 : 0); + srcOffset = evenWidth - DA; + Store((__m128i*)(dst + dstOffset), Average8( + Load((__m128i*)(src0 + srcOffset)), Load((__m128i*)(src0 + srcOffset + A)), + Load((__m128i*)(src1 + srcOffset)), Load((__m128i*)(src1 + srcOffset + A)))); + if (evenWidth != srcWidth) + { + dst[dstWidth - 1] = Base::Average(src0[evenWidth], src1[evenWidth]); + } + } + src += 2 * srcStride; + dst += dstStride; + } + } + + void ReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) + { + if (Aligned(src) && Aligned(srcWidth) && Aligned(srcStride) && Aligned(dst) && Aligned(dstWidth) && Aligned(dstStride)) + ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); + else + ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray4x4.cpp b/3rdparty/simdlib/Simd/SimdSse41ReduceGray4x4.cpp old mode 100644 new mode 100755 similarity index 96% rename from 3rdparty/simdlib/Simd/SimdSsse3ReduceGray4x4.cpp rename to 3rdparty/simdlib/Simd/SimdSse41ReduceGray4x4.cpp index 261e84c918..7754b290ba --- a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray4x4.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41ReduceGray4x4.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,8 +26,8 @@ namespace Simd { -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { namespace { @@ -170,8 +170,5 @@ namespace Simd ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); } } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Reduce4x4.cpp.o) has no symbols - void dummy_SimdSsse3Reduce4x4(){}; -#endif// SIMD_SSSE3_ENABLE +#endif } diff --git a/3rdparty/simdlib/Simd/SimdSsse3ResizeBilinear.cpp b/3rdparty/simdlib/Simd/SimdSse41ResizeBilinear.cpp old mode 100644 new mode 100755 similarity index 98% rename from 3rdparty/simdlib/Simd/SimdSsse3ResizeBilinear.cpp rename to 3rdparty/simdlib/Simd/SimdSse41ResizeBilinear.cpp index b39f619005..50a708aa20 --- a/3rdparty/simdlib/Simd/SimdSsse3ResizeBilinear.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41ResizeBilinear.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,8 +27,8 @@ namespace Simd { -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { namespace { @@ -401,9 +401,6 @@ namespace Simd } } } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3ResizeBilinear.cpp.o) has no symbols - void dummy_SimdSsse3ResizeBilinear(){}; #endif } diff --git a/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp b/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp old mode 100644 new mode 100755 index b766a8a209..e3e8e7b360 --- a/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,6 +32,309 @@ namespace Simd #ifdef SIMD_SSE41_ENABLE namespace Sse41 { + ResizerByteBilinear::ResizerByteBilinear(const ResParam& param) + : Sse2::ResizerByteBilinear(param) + , _blocks(0) + { + } + + size_t ResizerByteBilinear::BlockCountMax(size_t align) + { + return (size_t)Simd::Max(::ceil(float(_param.srcW) / (align - 1)), ::ceil(float(_param.dstW) * 2.0f / align)); + } + + void ResizerByteBilinear::EstimateParams() + { + if (_ax.data) + return; + if (_param.channels == 1 && _param.srcW < 4 * _param.dstW) + _blocks = BlockCountMax(A); + float scale = (float)_param.srcW / _param.dstW; + _ax.Resize(AlignHi(_param.dstW, A) * _param.channels * 2, false, _param.align); + uint8_t* alphas = _ax.data; + if (_blocks) + { + _ixg.Resize(_blocks); + int block = 0; + _ixg[0].src = 0; + _ixg[0].dst = 0; + for (int dstIndex = 0; dstIndex < (int)_param.dstW; ++dstIndex) + { + float alpha = (float)((dstIndex + 
0.5) * scale - 0.5); + int srcIndex = (int)::floor(alpha); + alpha -= srcIndex; + + if (srcIndex < 0) + { + srcIndex = 0; + alpha = 0; + } + + if (srcIndex > (int)_param.srcW - 2) + { + srcIndex = (int)_param.srcW - 2; + alpha = 1; + } + + int dst = 2 * dstIndex - _ixg[block].dst; + int src = srcIndex - _ixg[block].src; + if (src >= A - 1 || dst >= A) + { + block++; + _ixg[block].src = Simd::Min(srcIndex, int(_param.srcW - A)); + _ixg[block].dst = 2 * dstIndex; + dst = 0; + src = srcIndex - _ixg[block].src; + } + _ixg[block].shuffle[dst] = src; + _ixg[block].shuffle[dst + 1] = src + 1; + + alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); + alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); + alphas += 2; + } + _blocks = block + 1; + } + else + { + _ix.Resize(_param.dstW); + for (size_t i = 0; i < _param.dstW; ++i) + { + float alpha = (float)((i + 0.5) * scale - 0.5); + ptrdiff_t index = (ptrdiff_t)::floor(alpha); + alpha -= index; + + if (index < 0) + { + index = 0; + alpha = 0; + } + + if (index > (ptrdiff_t)_param.srcW - 2) + { + index = _param.srcW - 2; + alpha = 1; + } + + _ix[i] = (int)index; + alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); + alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); + for (size_t channel = 1; channel < _param.channels; channel++) + ((uint16_t*)alphas)[channel] = *(uint16_t*)alphas; + alphas += 2 * _param.channels; + } + } + size_t size = AlignHi(_param.dstW, _param.align) * _param.channels * 2; + _bx[0].Resize(size, false, _param.align); + _bx[1].Resize(size, false, _param.align); + } + + template void ResizerByteBilinearInterpolateX(const __m128i* alpha, __m128i* buffer); + + template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<1>(const __m128i* alpha, __m128i* buffer) + { + _mm_store_si128(buffer, _mm_maddubs_epi16(_mm_load_si128(buffer), _mm_load_si128(alpha))); + } + + const __m128i K8_SHUFFLE_X2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 
0xD, 0xF); + + SIMD_INLINE void ResizerByteBilinearInterpolateX2(const __m128i* alpha, __m128i* buffer) + { + __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X2); + _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha))); + } + + template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<2>(const __m128i* alpha, __m128i* buffer) + { + ResizerByteBilinearInterpolateX2(alpha + 0, buffer + 0); + ResizerByteBilinearInterpolateX2(alpha + 1, buffer + 1); + } + + const __m128i K8_SHUFFLE_X3_00 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1); + const __m128i K8_SHUFFLE_X3_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0); + const __m128i K8_SHUFFLE_X3_10 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + const __m128i K8_SHUFFLE_X3_11 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1); + const __m128i K8_SHUFFLE_X3_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); + const __m128i K8_SHUFFLE_X3_21 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + const __m128i K8_SHUFFLE_X3_22 = SIMD_MM_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF); + + template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<3>(const __m128i* alpha, __m128i* buffer) + { + __m128i src[3], shuffled[3]; + src[0] = _mm_load_si128(buffer + 0); + src[1] = _mm_load_si128(buffer + 1); + src[2] = _mm_load_si128(buffer + 2); + shuffled[0] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_00); + shuffled[0] = _mm_or_si128(shuffled[0], _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_01)); + _mm_store_si128(buffer + 0, _mm_maddubs_epi16(shuffled[0], _mm_load_si128(alpha + 0))); + shuffled[1] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_10); + shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[1], 
K8_SHUFFLE_X3_11)); + shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_12)); + _mm_store_si128(buffer + 1, _mm_maddubs_epi16(shuffled[1], _mm_load_si128(alpha + 1))); + shuffled[2] = _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_21); + shuffled[2] = _mm_or_si128(shuffled[2], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_22)); + _mm_store_si128(buffer + 2, _mm_maddubs_epi16(shuffled[2], _mm_load_si128(alpha + 2))); + } + + const __m128i K8_SHUFFLE_X4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF); + + SIMD_INLINE void ResizerByteBilinearInterpolateX4(const __m128i* alpha, __m128i* buffer) + { + __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X4); + _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha))); + } + + template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<4>(const __m128i* alpha, __m128i* buffer) + { + ResizerByteBilinearInterpolateX4(alpha + 0, buffer + 0); + ResizerByteBilinearInterpolateX4(alpha + 1, buffer + 1); + ResizerByteBilinearInterpolateX4(alpha + 2, buffer + 2); + ResizerByteBilinearInterpolateX4(alpha + 3, buffer + 3); + } + + const __m128i K16_FRACTION_ROUND_TERM = SIMD_MM_SET1_EPI16(Base::BILINEAR_ROUND_TERM); + + template SIMD_INLINE __m128i ResizerByteBilinearInterpolateY(const __m128i* pbx0, const __m128i* pbx1, __m128i alpha[2]) + { + __m128i sum = _mm_add_epi16(_mm_mullo_epi16(Load(pbx0), alpha[0]), _mm_mullo_epi16(Load(pbx1), alpha[1])); + return _mm_srli_epi16(_mm_add_epi16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT); + } + + template SIMD_INLINE void ResizerByteBilinearInterpolateY(const uint8_t* bx0, const uint8_t* bx1, __m128i alpha[2], uint8_t* dst) + { + __m128i lo = ResizerByteBilinearInterpolateY((__m128i*)bx0 + 0, (__m128i*)bx1 + 0, alpha); + __m128i hi = ResizerByteBilinearInterpolateY((__m128i*)bx0 + 1, (__m128i*)bx1 + 1, alpha); + Store((__m128i*)dst, _mm_packus_epi16(lo, hi)); + } + + template void 
ResizerByteBilinear::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride) + { + struct One { uint8_t val[N * 1]; }; + struct Two { uint8_t val[N * 2]; }; + + size_t size = 2 * _param.dstW * N; + size_t aligned = AlignHi(size, DA) - DA; + const size_t step = A * N; + ptrdiff_t previous = -2; + __m128i a[2]; + uint8_t* bx[2] = { _bx[0].data, _bx[1].data }; + const uint8_t* ax = _ax.data; + const int32_t* ix = _ix.data; + size_t dstW = _param.dstW; + + for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride) + { + a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst])); + a[1] = _mm_set1_epi16(int16_t(_ay[yDst])); + + ptrdiff_t sy = _iy[yDst]; + int k = 0; + + if (sy == previous) + k = 2; + else if (sy == previous + 1) + { + Swap(bx[0], bx[1]); + k = 1; + } + + previous = sy; + + for (; k < 2; k++) + { + Two* pb = (Two*)bx[k]; + const One* psrc = (const One*)(src + (sy + k) * srcStride); + for (size_t x = 0; x < dstW; x++) + pb[x] = *(Two*)(psrc + ix[x]); + + uint8_t* pbx = bx[k]; + for (size_t i = 0; i < size; i += step) + ResizerByteBilinearInterpolateX((__m128i*)(ax + i), (__m128i*)(pbx + i)); + } + + for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A) + ResizerByteBilinearInterpolateY(bx[0] + ib, bx[1] + ib, a, dst + id); + size_t i = size - DA; + ResizerByteBilinearInterpolateY(bx[0] + i, bx[1] + i, a, dst + i / 2); + } + } + + template SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t* src, const Idx& index, const uint8_t* alpha, uint8_t* dst) + { + __m128i _src = _mm_loadu_si128((__m128i*)(src + index.src)); + __m128i _shuffle = _mm_loadu_si128((__m128i*) & index.shuffle); + __m128i _alpha = _mm_loadu_si128((__m128i*)(alpha + index.dst)); + _mm_storeu_si128((__m128i*)(dst + index.dst), _mm_maddubs_epi16(_mm_shuffle_epi8(_src, _shuffle), _alpha)); + } + + void ResizerByteBilinear::RunG(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride) + { + size_t bufW = 
AlignHi(_param.dstW, A) * 2; + size_t size = 2 * _param.dstW; + size_t aligned = AlignHi(size, DA) - DA; + size_t blocks = _blocks; + ptrdiff_t previous = -2; + __m128i a[2]; + uint8_t* bx[2] = { _bx[0].data, _bx[1].data }; + const uint8_t* ax = _ax.data; + const Idx* ixg = _ixg.data; + + for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride) + { + a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst])); + a[1] = _mm_set1_epi16(int16_t(_ay[yDst])); + + ptrdiff_t sy = _iy[yDst]; + int k = 0; + + if (sy == previous) + k = 2; + else if (sy == previous + 1) + { + Swap(bx[0], bx[1]); + k = 1; + } + + previous = sy; + + for (; k < 2; k++) + { + const uint8_t* psrc = src + (sy + k) * srcStride; + uint8_t* pdst = bx[k]; + for (size_t i = 0; i < blocks; ++i) + ResizerByteBilinearLoadGrayInterpolated(psrc, ixg[i], ax, pdst); + } + + for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A) + ResizerByteBilinearInterpolateY(bx[0] + ib, bx[1] + ib, a, dst + id); + size_t i = size - DA; + ResizerByteBilinearInterpolateY(bx[0] + i, bx[1] + i, a, dst + i / 2); + } + } + + void ResizerByteBilinear::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride) + { + assert(_param.dstW >= A); + + EstimateParams(); + switch (_param.channels) + { + case 1: + if (_blocks) + RunG(src, srcStride, dst, dstStride); + else + Run<1>(src, srcStride, dst, dstStride); + break; + case 2: Run<2>(src, srcStride, dst, dstStride); break; + case 3: Run<3>(src, srcStride, dst, dstStride); break; + case 4: Run<4>(src, srcStride, dst, dstStride); break; + default: + assert(0); + } + } + + //--------------------------------------------------------------------- + ResizerByteArea::ResizerByteArea(const ResParam & param) : Sse2::ResizerByteArea(param) { @@ -200,10 +503,12 @@ namespace Simd void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) { ResParam param(srcX, srcY, dstX, 
dstY, channels, type, method, sizeof(__m128i)); - if (type == SimdResizeChannelByte && method == SimdResizeMethodArea) + if (param.IsByteBilinear() && dstX >= A) + return new ResizerByteBilinear(param); + else if (param.IsByteArea()) return new ResizerByteArea(param); else - return Ssse3::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + return Sse2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); } } #else diff --git a/3rdparty/simdlib/Simd/SimdSsse3.h b/3rdparty/simdlib/Simd/SimdSsse3.h deleted file mode 100644 index ed7849f39d..0000000000 --- a/3rdparty/simdlib/Simd/SimdSsse3.h +++ /dev/null @@ -1,77 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#ifndef __SimdSsse3_h__ -#define __SimdSsse3_h__ - -#include "Simd/SimdDefs.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride); - - void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha); - - void BgraToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride); - - void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride); - - void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride); - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); - - void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride); - - void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride); - - void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); - - void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride); - - void InterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void InterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const 
uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride); - - void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); - - void ReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void ReduceGray4x4(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); - - // ViSP custom SIMD code - void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff); - } -#endif// SIMD_SSSE3_ENABLE -} -#endif//__SimdSsse3_h__ diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp b/3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp deleted file mode 100644 index bb01107812..0000000000 --- a/3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - template SIMD_INLINE void BgrToRgba(const uint8_t * bgr, uint8_t * rgba, __m128i alpha, __m128i shuffle) - { - Store((__m128i*)rgba + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 0)), shuffle))); - Store((__m128i*)rgba + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 12)), shuffle))); - Store((__m128i*)rgba + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 24)), shuffle))); - Store((__m128i*)rgba + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(bgr + 32)), 4), shuffle))); - } - - template void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha) - { - assert(width >= A); - if (align) - assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t alignedWidth = AlignLo(width, A); - - __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3); - __m128i _shuffle = _mm_setr_epi8(0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BgrToRgba(bgr + 3 * col, rgba + 4 * col, _alpha, _shuffle); - if (width != alignedWidth) - BgrToRgba(bgr + 3 * (width - A), rgba + 4 * (width - A), _alpha, _shuffle); - bgr += bgrStride; - rgba += rgbaStride; - } - } - - void 
BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha) - { - if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - else - BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToRGBa.cpp.o) has no symbols - void dummy_SimdSsse3BgrToRGBa(){}; -#endif// SIMD_SSSE3_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp b/3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp deleted file mode 100644 index d455781ed3..0000000000 --- a/3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - template SIMD_INLINE void BgraToRgba(const uint8_t * bgra, uint8_t * rgba, __m128i shuffle) - { - Store((__m128i*)rgba + 0, _mm_shuffle_epi8(Load((__m128i*)(bgra + 0)), shuffle)); - Store((__m128i*)rgba + 1, _mm_shuffle_epi8(Load((__m128i*)(bgra + 16)), shuffle)); - Store((__m128i*)rgba + 2, _mm_shuffle_epi8(Load((__m128i*)(bgra + 32)), shuffle)); - Store((__m128i*)rgba + 3, _mm_shuffle_epi8(Load((__m128i*)(bgra + 48)), shuffle)); - } - - template void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride) - { - assert(width >= A); - if (align) - assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride)); - - size_t alignedWidth = AlignLo(width, A); - - __m128i _shuffle = _mm_setr_epi8(0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BgraToRgba(bgra + 4 * col, rgba + 4 * col, _shuffle); - if (width != alignedWidth) - BgraToRgba(bgra + 4 * (width - A), rgba + 4 * (width - A), _shuffle); - bgra += bgraStride; - rgba += rgbaStride; - } - } - - void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride) - { - if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride)) - BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); - else - BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToRGBa.cpp.o) has no symbols - void dummy_SimdSsse3BgraToRGBa(){}; -#endif// SIMD_SSSE3_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp b/3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp deleted file mode 100644 index 
985a772d47..0000000000 --- a/3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff) - { - const __m128i mask1 = _mm_set_epi8(-1, 14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0); - const __m128i mask2 = _mm_set_epi8(-1, 15, -1, 13, -1, 11, -1, 9, -1, 7, -1, 5, -1, 3, -1, 1); - const __m128i mask_out2 = _mm_set_epi8(14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0, -1); - - size_t i = 0; - for (; i <= size-16; i+= 16) { - const __m128i vdata1 = _mm_loadu_si128(reinterpret_cast(img1 + i)); - const __m128i vdata2 = _mm_loadu_si128(reinterpret_cast(img2 + i)); - - __m128i vdata1_reorg = _mm_shuffle_epi8(vdata1, mask1); - __m128i vdata2_reorg = _mm_shuffle_epi8(vdata2, mask1); - - const __m128i vshift = _mm_set1_epi16(128); - __m128i vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift); - - const __m128i v255 = _mm_set1_epi16(255); - const __m128i vzero = _mm_setzero_si128(); - const __m128i vdata_diff_min_max1 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero); - - vdata1_reorg = _mm_shuffle_epi8(vdata1, mask2); - vdata2_reorg = _mm_shuffle_epi8(vdata2, mask2); - - vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift); - const __m128i vdata_diff_min_max2 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero); - - _mm_storeu_si128(reinterpret_cast<__m128i *>(imgDiff + i), _mm_or_si128(_mm_shuffle_epi8(vdata_diff_min_max1, mask1), - _mm_shuffle_epi8(vdata_diff_min_max2, mask_out2))); - } - - if (i < size) { - Base::SimdImageDifference(img1 + i, img2 + i, size - i, imgDiff + i); - } - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3CustomFunctions.cpp.o) has no symbols - void dummy_SimdSsse3CustomFunctions(){}; -#endif// SIMD_SSSE3_ENABLE -} diff --git 
a/3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp b/3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp deleted file mode 100644 index 37f2eca6c1..0000000000 --- a/3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp +++ /dev/null @@ -1,350 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdResizer.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - ResizerByteBilinear::ResizerByteBilinear(const ResParam & param) - : Sse2::ResizerByteBilinear(param) - , _blocks(0) - { - } - - size_t ResizerByteBilinear::BlockCountMax(size_t align) - { - return (size_t)Simd::Max(::ceil(float(_param.srcW) / (align - 1)), ::ceil(float(_param.dstW) * 2.0f / align )); - } - - void ResizerByteBilinear::EstimateParams() - { - if (_ax.data) - return; - if (_param.channels == 1 && _param.srcW < 4 * _param.dstW) - _blocks = BlockCountMax(A); - float scale = (float)_param.srcW / _param.dstW; - _ax.Resize(AlignHi(_param.dstW, A) * _param.channels * 2, false, _param.align); - uint8_t * alphas = _ax.data; - if (_blocks) - { - _ixg.Resize(_blocks); - int block = 0; - _ixg[0].src = 0; - _ixg[0].dst = 0; - for (int dstIndex = 0; dstIndex < (int)_param.dstW; ++dstIndex) - { - float alpha = (float)((dstIndex + 0.5)*scale - 0.5); - int srcIndex = (int)::floor(alpha); - alpha -= srcIndex; - - if (srcIndex < 0) - { - srcIndex = 0; - alpha = 0; - } - - if (srcIndex > (int)_param.srcW - 2) - { - srcIndex = (int)_param.srcW - 2; - alpha = 1; - } - - int dst = 2 * dstIndex - _ixg[block].dst; - int src = srcIndex - _ixg[block].src; - if (src >= A - 1 || dst >= A) - { - block++; - _ixg[block].src = Simd::Min(srcIndex, int(_param.srcW - A)); - _ixg[block].dst = 2 * dstIndex; - dst = 0; - src = srcIndex - _ixg[block].src; - } - _ixg[block].shuffle[dst] = src; - _ixg[block].shuffle[dst + 1] = src + 1; - - alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); - alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); - alphas += 2; - } - _blocks = block + 1; - } - else - { - _ix.Resize(_param.dstW); - for (size_t i = 0; i < _param.dstW; ++i) - { - float alpha = (float)((i + 0.5)*scale - 0.5); - ptrdiff_t index = (ptrdiff_t)::floor(alpha); - alpha -= index; - - if (index < 0) - { - index = 0; 
- alpha = 0; - } - - if (index >(ptrdiff_t)_param.srcW - 2) - { - index = _param.srcW - 2; - alpha = 1; - } - - _ix[i] = (int)index; - alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); - alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); - for (size_t channel = 1; channel < _param.channels; channel++) - ((uint16_t*)alphas)[channel] = *(uint16_t*)alphas; - alphas += 2 * _param.channels; - } - } - size_t size = AlignHi(_param.dstW, _param.align)*_param.channels * 2; - _bx[0].Resize(size, false, _param.align); - _bx[1].Resize(size, false, _param.align); - } - - template void ResizerByteBilinearInterpolateX(const __m128i * alpha, __m128i * buffer); - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<1>(const __m128i * alpha, __m128i * buffer) - { - _mm_store_si128(buffer, _mm_maddubs_epi16(_mm_load_si128(buffer), _mm_load_si128(alpha))); - } - - const __m128i K8_SHUFFLE_X2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF); - - SIMD_INLINE void ResizerByteBilinearInterpolateX2(const __m128i * alpha, __m128i * buffer) - { - __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X2); - _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha))); - } - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<2>(const __m128i * alpha, __m128i * buffer) - { - ResizerByteBilinearInterpolateX2(alpha + 0, buffer + 0); - ResizerByteBilinearInterpolateX2(alpha + 1, buffer + 1); - } - - const __m128i K8_SHUFFLE_X3_00 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1); - const __m128i K8_SHUFFLE_X3_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0); - const __m128i K8_SHUFFLE_X3_10 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_X3_11 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, 
-1); - const __m128i K8_SHUFFLE_X3_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); - const __m128i K8_SHUFFLE_X3_21 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_X3_22 = SIMD_MM_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF); - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<3>(const __m128i * alpha, __m128i * buffer) - { - __m128i src[3], shuffled[3]; - src[0] = _mm_load_si128(buffer + 0); - src[1] = _mm_load_si128(buffer + 1); - src[2] = _mm_load_si128(buffer + 2); - shuffled[0] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_00); - shuffled[0] = _mm_or_si128(shuffled[0], _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_01)); - _mm_store_si128(buffer + 0, _mm_maddubs_epi16(shuffled[0], _mm_load_si128(alpha + 0))); - shuffled[1] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_10); - shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_11)); - shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_12)); - _mm_store_si128(buffer + 1, _mm_maddubs_epi16(shuffled[1], _mm_load_si128(alpha + 1))); - shuffled[2] = _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_21); - shuffled[2] = _mm_or_si128(shuffled[2], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_22)); - _mm_store_si128(buffer + 2, _mm_maddubs_epi16(shuffled[2], _mm_load_si128(alpha + 2))); - } - - const __m128i K8_SHUFFLE_X4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF); - - SIMD_INLINE void ResizerByteBilinearInterpolateX4(const __m128i * alpha, __m128i * buffer) - { - __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X4); - _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha))); - } - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<4>(const __m128i * alpha, __m128i * buffer) - { - ResizerByteBilinearInterpolateX4(alpha + 0, buffer + 0); - 
ResizerByteBilinearInterpolateX4(alpha + 1, buffer + 1); - ResizerByteBilinearInterpolateX4(alpha + 2, buffer + 2); - ResizerByteBilinearInterpolateX4(alpha + 3, buffer + 3); - } - - const __m128i K16_FRACTION_ROUND_TERM = SIMD_MM_SET1_EPI16(Base::BILINEAR_ROUND_TERM); - - template SIMD_INLINE __m128i ResizerByteBilinearInterpolateY(const __m128i * pbx0, const __m128i * pbx1, __m128i alpha[2]) - { - __m128i sum = _mm_add_epi16(_mm_mullo_epi16(Load(pbx0), alpha[0]), _mm_mullo_epi16(Load(pbx1), alpha[1])); - return _mm_srli_epi16(_mm_add_epi16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT); - } - - template SIMD_INLINE void ResizerByteBilinearInterpolateY(const uint8_t * bx0, const uint8_t * bx1, __m128i alpha[2], uint8_t * dst) - { - __m128i lo = ResizerByteBilinearInterpolateY((__m128i*)bx0 + 0, (__m128i*)bx1 + 0, alpha); - __m128i hi = ResizerByteBilinearInterpolateY((__m128i*)bx0 + 1, (__m128i*)bx1 + 1, alpha); - Store((__m128i*)dst, _mm_packus_epi16(lo, hi)); - } - - template void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - struct One { uint8_t val[N * 1]; }; - struct Two { uint8_t val[N * 2]; }; - - size_t size = 2 * _param.dstW*N; - size_t aligned = AlignHi(size, DA) - DA; - const size_t step = A * N; - ptrdiff_t previous = -2; - __m128i a[2]; - uint8_t * bx[2] = { _bx[0].data, _bx[1].data }; - const uint8_t * ax = _ax.data; - const int32_t * ix = _ix.data; - size_t dstW = _param.dstW; - - for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride) - { - a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst])); - a[1] = _mm_set1_epi16(int16_t(_ay[yDst])); - - ptrdiff_t sy = _iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(bx[0], bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - Two * pb = (Two *)bx[k]; - const One * psrc = (const One *)(src + (sy + k)*srcStride); - for (size_t x = 0; x < dstW; x++) - pb[x] = 
*(Two *)(psrc + ix[x]); - - uint8_t * pbx = bx[k]; - for (size_t i = 0; i < size; i += step) - ResizerByteBilinearInterpolateX((__m128i*)(ax + i), (__m128i*)(pbx + i)); - } - - for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A) - ResizerByteBilinearInterpolateY(bx[0] + ib, bx[1] + ib, a, dst + id); - size_t i = size - DA; - ResizerByteBilinearInterpolateY(bx[0] + i, bx[1] + i, a, dst + i / 2); - } - } - - template SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t * src, const Idx & index, const uint8_t * alpha, uint8_t * dst) - { - __m128i _src = _mm_loadu_si128((__m128i*)(src + index.src)); - __m128i _shuffle = _mm_loadu_si128((__m128i*)&index.shuffle); - __m128i _alpha = _mm_loadu_si128((__m128i*)(alpha + index.dst)); - _mm_storeu_si128((__m128i*)(dst + index.dst), _mm_maddubs_epi16(_mm_shuffle_epi8(_src, _shuffle), _alpha)); - } - - void ResizerByteBilinear::RunG(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t bufW = AlignHi(_param.dstW, A) * 2; - size_t size = 2 * _param.dstW; - size_t aligned = AlignHi(size, DA) - DA; - size_t blocks = _blocks; - ptrdiff_t previous = -2; - __m128i a[2]; - uint8_t * bx[2] = { _bx[0].data, _bx[1].data }; - const uint8_t * ax = _ax.data; - const Idx * ixg = _ixg.data; - - for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride) - { - a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst])); - a[1] = _mm_set1_epi16(int16_t(_ay[yDst])); - - ptrdiff_t sy = _iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(bx[0], bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - const uint8_t * psrc = src + (sy + k)*srcStride; - uint8_t * pdst = bx[k]; - for (size_t i = 0; i < blocks; ++i) - ResizerByteBilinearLoadGrayInterpolated(psrc, ixg[i], ax, pdst); - } - - for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A) - ResizerByteBilinearInterpolateY(bx[0] + ib, bx[1] + ib, a, dst + id); - 
size_t i = size - DA; - ResizerByteBilinearInterpolateY(bx[0] + i, bx[1] + i, a, dst + i / 2); - } - } - - void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - assert(_param.dstW >= A); - - EstimateParams(); - switch (_param.channels) - { - case 1: - if(_blocks) - RunG(src, srcStride, dst, dstStride); - else - Run<1>(src, srcStride, dst, dstStride); - break; - case 2: Run<2>(src, srcStride, dst, dstStride); break; - case 3: Run<3>(src, srcStride, dst, dstStride); break; - case 4: Run<4>(src, srcStride, dst, dstStride); break; - default: - assert(0); - } - } - - //--------------------------------------------------------------------- - - void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) - { - ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m128i)); - if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && dstX >= A) - return new ResizerByteBilinear(param); - else - return Sse2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Resizer.cpp.o) has no symbols - void dummy_SimdSsse3Resizer(){}; -#endif//SIMD_SSSE3_ENABLE -} - diff --git a/3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp b/3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp deleted file mode 100644 index cf79dd55bd..0000000000 --- a/3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); - const __m128i K16_GREEN_ROUND = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m128i RgbaToGray32(__m128i rgba) - { - const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF); - const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF); - const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(r0b0, K16_RED_BLUE)); - return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - SIMD_INLINE __m128i RgbToGray(__m128i rgba[4]) - { - const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1])); - const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); - return _mm_packus_epi16(lo, hi); - } - - template SIMD_INLINE __m128i RgbToGray(const uint8_t * rgb, __m128i shuffle) - { - __m128i rgba[4]; - rgba[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(rgb + 0)), shuffle)); - rgba[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(rgb + 12)), shuffle)); - rgba[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(rgb + 24)), shuffle)); - rgba[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(rgb + 32)), 4), shuffle)); - return RgbToGray(rgba); - } - - template void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride) - { - assert(width >= A); - if (align) - assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)); - - size_t alignedWidth = AlignLo(width, A); - - __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); - - for (size_t row = 0; row < height; ++row) - { - for (size_t 
col = 0; col < alignedWidth; col += A) - Store((__m128i*)(gray + col), RgbToGray(rgb + 3 * col, _shuffle)); - if (width != alignedWidth) - Store((__m128i*)(gray + width - A), RgbToGray(rgb + 3 * (width - A), _shuffle)); - rgb += rgbStride; - gray += grayStride; - } - } - - void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride) - { - if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)) - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - else - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3RgbToGray.cpp.o) has no symbols - void dummy_SimdSsse3RgbToGray(){}; -#endif// SIMD_SSSE3_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdStore.h b/3rdparty/simdlib/Simd/SimdStore.h old mode 100644 new mode 100755 index 11ae3f7815..2b22a9616d --- a/3rdparty/simdlib/Simd/SimdStore.h +++ b/3rdparty/simdlib/Simd/SimdStore.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -31,8 +31,8 @@ namespace Simd { -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { template SIMD_INLINE void Store(float * p, __m128 a); @@ -63,13 +63,6 @@ namespace Simd __m128 old = Load(p); Store(p, Combine(mask, value, old)); } - } -#endif//SIMD_SSE_ENABLE - -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - using namespace Sse; template SIMD_INLINE void Store(__m128i * p, __m128i a); @@ -83,6 +76,11 @@ namespace Simd _mm_store_si128(p, a); } + template SIMD_INLINE void StoreHalf(__m128i* p, __m128i a) + { + StoreHalf((float*)p, _mm_castsi128_ps(a)); + } + template SIMD_INLINE void StoreMasked(__m128i * p, __m128i value, __m128i mask) { __m128i old = Load(p); @@ -95,7 +93,6 @@ namespace Simd namespace Sse41 { #if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug - using Sse::Store; using Sse2::Store; #endif } @@ -118,8 +115,8 @@ namespace Simd template SIMD_INLINE void Store(float * p0, float * p1, __m256 a) { - Sse::Store(p0, _mm256_extractf128_ps(a, 0)); - Sse::Store(p1, _mm256_extractf128_ps(a, 1)); + Sse2::Store(p0, _mm256_extractf128_ps(a, 0)); + Sse2::Store(p1, _mm256_extractf128_ps(a, 1)); } template SIMD_INLINE void StoreMasked(float * p, __m256 value, __m256 mask) @@ -163,11 +160,6 @@ namespace Simd return _mm256_permute4x64_epi64(_mm256_packus_epi16(lo, hi), 0xD8); } - SIMD_INLINE __m256i PackU16ToU8(__m256i lo, __m256i hi) - { - return _mm256_permute4x64_epi64(_mm256_packus_epi16(lo, hi), 0xD8); - } - SIMD_INLINE __m256i PackI32ToI16(__m256i lo, __m256i hi) { return _mm256_permute4x64_epi64(_mm256_packs_epi32(lo, hi), 0xD8); @@ -184,6 +176,12 @@ namespace Simd lo = _mm256_permute2x128_si256(lo, hi, 0x20); hi = _mm256_permute2x128_si256(_lo, hi, 0x31); } + + template SIMD_INLINE void Store24(uint8_t * p, __m256i a) + { 
+ Sse2::Store((__m128i*)p, _mm256_extractf128_si256(a, 0)); + Sse2::StoreHalf<0>((__m128i*)p + 1, _mm256_extractf128_si256(a, 1)); + } } #endif//SIMD_SAVX2_ENABLE @@ -230,27 +228,27 @@ namespace Simd template SIMD_INLINE void Store(uint16_t * p, uint16x8_t a) { - Store((uint8_t*)p, (uint8x16_t)a); + Store((uint8_t*)p, vreinterpretq_u8_u16(a)); } template SIMD_INLINE void Store(uint16_t * p, uint16x4_t a) { - Store((uint8_t*)p, (uint8x8_t)a); + Store((uint8_t*)p, vreinterpret_u8_u16(a)); } template SIMD_INLINE void Store(int16_t * p, int16x8_t a) { - Store((uint8_t*)p, (uint8x16_t)a); + Store((uint8_t*)p, vreinterpretq_u8_s16(a)); } template SIMD_INLINE void Store(uint32_t * p, uint32x4_t a) { - Store((uint8_t*)p, (uint8x16_t)a); + Store((uint8_t*)p, vreinterpretq_u8_u32(a)); } template SIMD_INLINE void Store(int32_t * p, int32x4_t a) { - Store((uint8_t*)p, (uint8x16_t)a); + Store((uint8_t*)p, vreinterpretq_u8_s32(a)); } template SIMD_INLINE void Store2(uint8_t * p, uint8x16x2_t a); @@ -310,7 +308,6 @@ namespace Simd #endif } - template SIMD_INLINE void Store3(uint8_t * p, uint8x16x3_t a); template <> SIMD_INLINE void Store3(uint8_t * p, uint8x16x3_t a) diff --git a/3rdparty/simdlib/Simd/SimdStream.h b/3rdparty/simdlib/Simd/SimdStream.h old mode 100644 new mode 100755 index b6399bd1f1..6abf65cf68 --- a/3rdparty/simdlib/Simd/SimdStream.h +++ b/3rdparty/simdlib/Simd/SimdStream.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -30,36 +30,31 @@ namespace Simd { const size_t STREAM_SIZE_MIN = 0x00100000; -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { - template SIMD_INLINE void Stream(float * p, __m128 a); + template SIMD_INLINE void Stream(float* p, __m128 a); - template <> SIMD_INLINE void Stream(float * p, __m128 a) + template <> SIMD_INLINE void Stream(float* p, __m128 a) { _mm_storeu_ps(p, a); } - template <> SIMD_INLINE void Stream(float * p, __m128 a) + template <> SIMD_INLINE void Stream(float* p, __m128 a) { _mm_storeu_ps(p, a); } - template <> SIMD_INLINE void Stream(float * p, __m128 a) + template <> SIMD_INLINE void Stream(float* p, __m128 a) { _mm_store_ps(p, a); } - template <> SIMD_INLINE void Stream(float * p, __m128 a) + template <> SIMD_INLINE void Stream(float* p, __m128 a) { _mm_stream_ps(p, a); } - } -#endif//SIMD_SSE_ENABLE -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { template SIMD_INLINE void Stream(__m128i * p, __m128i a); template <> SIMD_INLINE void Stream(__m128i * p, __m128i a) diff --git a/3rdparty/simdlib/Simd/SimdUpdate.h b/3rdparty/simdlib/Simd/SimdUpdate.h old mode 100644 new mode 100755 index 47e9b22dc2..4c4d64b1c0 --- a/3rdparty/simdlib/Simd/SimdUpdate.h +++ b/3rdparty/simdlib/Simd/SimdUpdate.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -47,8 +47,8 @@ namespace Simd } } -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { template SIMD_INLINE void Update(float * p, __m128 a) { @@ -63,13 +63,10 @@ namespace Simd template <> SIMD_INLINE void Update(float * p, __m128 a) { Store(p, _mm_add_ps(Load(p), a)); - } - } -#endif//SIMD_SSE_ENABLE + } -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { + //----------------------------------------------------------------------------------------- + template SIMD_INLINE void Update(int32_t * p, __m128i a) { Store((__m128i*)p, a); @@ -160,6 +157,6 @@ namespace Simd Store(p, vaddq_f32(Load(p), a)); } } -#endif//SIMD_SSE_ENABLE +#endif//SIMD_NEON_ENABLE } #endif//__SimdUpdate_h__ diff --git a/3rdparty/simdlib/Simd/SimdVersion.h b/3rdparty/simdlib/Simd/SimdVersion.h index 72ae751ade..09efd5de91 100644 --- a/3rdparty/simdlib/Simd/SimdVersion.h +++ b/3rdparty/simdlib/Simd/SimdVersion.h @@ -34,7 +34,7 @@ #ifndef __SimdVersion_h__ #define __SimdVersion_h__ -#define SIMD_VERSION "4.4.82" +#define SIMD_VERSION "4.9.107" #endif//__SimdVersion_h__ diff --git a/3rdparty/simdlib/Simd/SimdView.hpp b/3rdparty/simdlib/Simd/SimdView.hpp old mode 100644 new mode 100755 index c9a51c5f61..0c61a0e6e8 --- a/3rdparty/simdlib/Simd/SimdView.hpp +++ b/3rdparty/simdlib/Simd/SimdView.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, +* Copyright (c) 2011-2021 Yermalayeu Ihar, * 2014-2019 Antonenka Mikhail, * 2018-2019 Dmitry Fedorov, * 2019-2019 Artur Voronkov. @@ -95,7 +95,9 @@ namespace Simd /*! A single channel 64-bit float point pixel format. */ Double, /*! A 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */ - Rgb24 + Rgb24, + /*! A 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. 
*/ + Rgba32, }; /*! diff --git a/modules/core/src/image/vpImageConvert.cpp b/modules/core/src/image/vpImageConvert.cpp index 7d7ef289e9..b3a95e4372 100644 --- a/modules/core/src/image/vpImageConvert.cpp +++ b/modules/core/src/image/vpImageConvert.cpp @@ -673,7 +673,7 @@ dest.resize((unsigned int)src.rows, (unsigned int)src.cols); } } else if (src.type() == CV_8UC3) { if (src.isContinuous() && !flip) { - SimdBgrToRgba(src.data, src.cols, src.rows, src.step[0], reinterpret_cast(dest.bitmap), + SimdRgbToBgra(src.data, src.cols, src.rows, src.step[0], reinterpret_cast(dest.bitmap), dest.getWidth() * sizeof(vpRGBa), vpRGBa::alpha_default); } else { vpRGBa rgbaVal; @@ -3519,7 +3519,7 @@ void vpImageConvert::BGRToRGBa(unsigned char *bgr, unsigned char *rgba, unsigned bool flip) { if (!flip) { - SimdBgrToRgba(bgr, width, height, width*3, rgba, width * sizeof(vpRGBa), vpRGBa::alpha_default); + SimdRgbToBgra(bgr, width, height, width*3, rgba, width * sizeof(vpRGBa), vpRGBa::alpha_default); } else { // if we have to flip the image, we start from the end last scanline so the // step is negative From 04044b64040143b5887237b15dd2f1239c5218a6 Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Wed, 3 Nov 2021 10:17:07 +0100 Subject: [PATCH 02/18] Add missing file. --- 3rdparty/simdlib/Simd/SimdNeonCpu.cpp | 59 +++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 3rdparty/simdlib/Simd/SimdNeonCpu.cpp diff --git a/3rdparty/simdlib/Simd/SimdNeonCpu.cpp b/3rdparty/simdlib/Simd/SimdNeonCpu.cpp new file mode 100644 index 0000000000..8b644c04f6 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdNeonCpu.cpp @@ -0,0 +1,59 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdEnable.h" +#include "Simd/SimdCpu.h" + +#if defined(__GNUC__) && (defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)) +#include +#include +#include +#endif + +namespace Simd +{ +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + SIMD_INLINE bool SupportedByCPU() + { +#if defined(_MSC_VER) + return true; +#elif defined(__GNUC__) +#if defined(SIMD_ARM64_ENABLE) + return true; +#else + return Base::CheckBit(AT_HWCAP, HWCAP_NEON); +#endif +#else +#error Do not know how to detect NEON support! +#endif + } + + bool GetEnable() + { + return SupportedByCPU(); + } + } +#endif +} From df0461608768ea77f71ef27fd360c8e6594240d2 Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Wed, 3 Nov 2021 11:18:25 +0100 Subject: [PATCH 03/18] Remove not used SSE flags. Add missing SSE 4.1 implementation. 
--- 3rdparty/simdlib/CMakeLists.txt | 64 ++--------------- 3rdparty/simdlib/Simd/SimdLib.cpp | 7 +- 3rdparty/simdlib/Simd/SimdSse41.h | 3 + .../simdlib/Simd/SimdSse41CustomFunctions.cpp | 69 +++++++++++++++++++ modules/io/src/image/vpImageIo.cpp | 2 +- 5 files changed, 83 insertions(+), 62 deletions(-) create mode 100644 3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp diff --git a/3rdparty/simdlib/CMakeLists.txt b/3rdparty/simdlib/CMakeLists.txt index dc6d111aae..1acb1341be 100644 --- a/3rdparty/simdlib/CMakeLists.txt +++ b/3rdparty/simdlib/CMakeLists.txt @@ -20,46 +20,31 @@ file(GLOB_RECURSE SIMD_BASE_HDR ${CMAKE_CURRENT_SOURCE_DIR}/Simd/*.h ${CMAKE_CUR if(X86 OR X86_64) # Flags check - set(SSE_FLAG "") set(SSE2_FLAG "") - set(SSE3_FLAG "") - set(SSSE3_FLAG "") - set(SSE4_1_FLAG "") set(SSE4_2_FLAG "") set(AVX_FLAG "") set(AVX2_FLAG "") if(MSVC) if(NOT MSVC64) - vp_check_compiler_flag(CXX "/arch:SSE" HAVE_SSE_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse.cpp") vp_check_compiler_flag(CXX "/arch:SSE2" HAVE_SSE2_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp") endif() vp_check_compiler_flag(CXX "/arch:AVX" HAVE_AVX_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx.cpp") vp_check_compiler_flag(CXX "/arch:AVX2" HAVE_AVX2_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp") - if(HAVE_SSE_FLAG) - set(SSE_FLAG "/arch:SSE") - endif() if(HAVE_SSE2_FLAG) set(SSE2_FLAG "/arch:SSE2") endif() if(HAVE_AVX_FLAG) set(AVX_FLAG "/arch:AVX") set(SSE4_2_FLAG "/arch:AVX") - set(SSE4_1_FLAG "/arch:AVX") - set(SSSE3_FLAG "/arch:AVX") - set(SSE3_FLAG "/arch:AVX") endif() if(HAVE_AVX2_FLAG) set(AVX2_FLAG "/arch:AVX2") endif() else() - vp_check_compiler_flag(CXX "-msse" HAVE_SSE_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse.cpp") vp_check_compiler_flag(CXX "-msse2" HAVE_SSE2_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp") - vp_check_compiler_flag(CXX "-msse3" HAVE_SSE3_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse3.cpp") - vp_check_compiler_flag(CXX "-mssse3" 
HAVE_SSSE3_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_ssse3.cpp") - vp_check_compiler_flag(CXX "-msse4.1" HAVE_SSE4_1_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse41.cpp") vp_check_compiler_flag(CXX "-msse4.2" HAVE_SSE4_2_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse42.cpp") vp_check_compiler_flag(CXX "-mavx" HAVE_AVX_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx.cpp") vp_check_compiler_flag(CXX "-mavx2" HAVE_AVX2_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp") @@ -68,23 +53,11 @@ if(X86 OR X86_64) vp_check_compiler_flag(CXX "-Wno-sign-compare" HAVE_NO_SIGN_COMPARE_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_warning.cpp") vp_check_compiler_flag(CXX "-Wno-ignored-qualifiers" HAVE_NO_IGNORED_QUALIFIERS "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_warning.cpp") - if(HAVE_SSE_FLAG) - set(SSE_FLAG "-msse") - endif() if(HAVE_SSE2_FLAG) - set(SSE2_FLAG "-msse2") - endif() - if(HAVE_SSE3_FLAG) - set(SSE3_FLAG "-msse3") - endif() - if(HAVE_SSSE3_FLAG) - set(SSSE3_FLAG "-mssse3") - endif() - if(HAVE_SSE4_1_FLAG) - set(SSE4_1_FLAG "-msse4.1") + set(SSE2_FLAG "-msse -msse2") endif() if(HAVE_SSE4_2_FLAG) - set(SSE4_2_FLAG "-msse4.2") + set(SSE4_2_FLAG "-msse3 -mssse3 -msse4.1 -msse4.2") endif() if(HAVE_AVX_FLAG) set(AVX_FLAG "-mavx") @@ -110,10 +83,10 @@ if(X86 OR X86_64) set_source_files_properties(${SIMD_BASE_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS}") file(GLOB_RECURSE SIMD_SSE2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse2*.cpp) - set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG} ${SSE2_FLAG}") + set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE2_FLAG}") file(GLOB_RECURSE SIMD_SSE41_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse41*.cpp) - set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG} ${SSSE3_FLAG} ${SSE4_1_FLAG} ${SSE4_2_FLAG}") + set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES 
COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}") file(GLOB_RECURSE SIMD_AVX1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx1*.cpp) set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}") @@ -126,7 +99,7 @@ if(X86 OR X86_64) endif() set(SIMD_LIB_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}") - set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE1_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE3_SRC} ${SIMD_SSSE3_SRC} ${SIMD_SSE41_SRC} ${SIMD_SSE42_SRC} ${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC}) + set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE41_SRC} ${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC}) file(GLOB_RECURSE SIMD_LIB_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdLib.cpp) set_source_files_properties(${SIMD_LIB_SRC} PROPERTIES COMPILE_FLAGS "${SIMD_LIB_FLAGS}") @@ -171,32 +144,21 @@ elseif(WINRT) add_library(${SIMD_LIBRARY} STATIC ${SIMD_LIB_SRC} ${SIMD_BASE_SRC} ${SIMD_NEON_SRC} ${SIMD_BASE_HDR}) else() # Flags check - set(SSE_FLAG "") set(SSE2_FLAG "") - set(SSE3_FLAG "") - set(SSSE3_FLAG "") - set(SSE4_1_FLAG "") set(SSE4_2_FLAG "") set(AVX_FLAG "") set(AVX2_FLAG "") - vp_check_compiler_flag(CXX "/arch:SSE" HAVE_SSE_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse.cpp") vp_check_compiler_flag(CXX "/arch:SSE2" HAVE_SSE2_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp") vp_check_compiler_flag(CXX "/arch:AVX" HAVE_AVX_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx.cpp") vp_check_compiler_flag(CXX "/arch:AVX2" HAVE_AVX2_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp") - if(HAVE_SSE_FLAG) - set(SSE_FLAG "/arch:SSE") - endif() if(HAVE_SSE2_FLAG) set(SSE2_FLAG "/arch:SSE2") endif() if(HAVE_AVX_FLAG) set(AVX_FLAG "/arch:AVX") set(SSE4_2_FLAG "/arch:AVX") - set(SSE4_1_FLAG "/arch:AVX") - set(SSSE3_FLAG "/arch:AVX") - set(SSE3_FLAG "/arch:AVX") endif() if(HAVE_AVX2_FLAG) set(AVX2_FLAG "/arch:AVX2") @@ -205,23 +167,11 @@ elseif(WINRT) file(GLOB_RECURSE SIMD_BASE_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdBase*.cpp) 
set_source_files_properties(${SIMD_BASE_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS}") - file(GLOB_RECURSE SIMD_SSE1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse1*.cpp) - set_source_files_properties(${SIMD_SSE1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG}") - file(GLOB_RECURSE SIMD_SSE2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse2*.cpp) set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE2_FLAG}") - file(GLOB_RECURSE SIMD_SSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse3*.cpp) - set_source_files_properties(${SIMD_SSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG}") - - file(GLOB_RECURSE SIMD_SSSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSsse3*.cpp) - set_source_files_properties(${SIMD_SSSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSSE3_FLAG}") - file(GLOB_RECURSE SIMD_SSE41_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse41*.cpp) - set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_1_FLAG}") - - file(GLOB_RECURSE SIMD_SSE42_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse42*.cpp) - set_source_files_properties(${SIMD_SSE42_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}") + set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}") file(GLOB_RECURSE SIMD_AVX1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx1*.cpp) set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}") @@ -230,7 +180,7 @@ elseif(WINRT) set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}") set(SIMD_LIB_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}") - set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE1_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE3_SRC} ${SIMD_SSSE3_SRC} ${SIMD_SSE41_SRC} ${SIMD_SSE42_SRC} ${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC}) + set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE41_SRC} 
${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC}) file(GLOB_RECURSE SIMD_LIB_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdLib.cpp) set_source_files_properties(${SIMD_LIB_SRC} PROPERTIES COMPILE_FLAGS "${SIMD_LIB_FLAGS}") diff --git a/3rdparty/simdlib/Simd/SimdLib.cpp b/3rdparty/simdlib/Simd/SimdLib.cpp index b1cac8b1ba..89718bb80e 100755 --- a/3rdparty/simdlib/Simd/SimdLib.cpp +++ b/3rdparty/simdlib/Simd/SimdLib.cpp @@ -862,10 +862,9 @@ SIMD_API void SimdMatTranspose(const double * mat, size_t rows, size_t cols, dou SIMD_API void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff) { - //TODO: -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && size >= Ssse3::A) - Ssse3::SimdImageDifference(img1,img2, size, imgDiff); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && size >= Sse41::A) + Sse41::SimdImageDifference(img1,img2, size, imgDiff); else #endif Base::SimdImageDifference(img1, img2, size, imgDiff); diff --git a/3rdparty/simdlib/Simd/SimdSse41.h b/3rdparty/simdlib/Simd/SimdSse41.h index 958fc11bc5..7a4bb04ad8 100755 --- a/3rdparty/simdlib/Simd/SimdSse41.h +++ b/3rdparty/simdlib/Simd/SimdSse41.h @@ -70,6 +70,9 @@ namespace Simd void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); + + // ViSP custom SIMD code + void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff); } #endif// SIMD_SSE41_ENABLE } diff --git a/3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp b/3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp new file mode 100644 index 0000000000..f34a29329d --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp @@ -0,0 +1,69 @@ +/* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated 
documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdBase.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff) + { + const __m128i mask1 = _mm_set_epi8(-1, 14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0); + const __m128i mask2 = _mm_set_epi8(-1, 15, -1, 13, -1, 11, -1, 9, -1, 7, -1, 5, -1, 3, -1, 1); + const __m128i mask_out2 = _mm_set_epi8(14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0, -1); + + size_t i = 0; + for (; i <= size-16; i+= 16) { + const __m128i vdata1 = _mm_loadu_si128(reinterpret_cast(img1 + i)); + const __m128i vdata2 = _mm_loadu_si128(reinterpret_cast(img2 + i)); + + __m128i vdata1_reorg = _mm_shuffle_epi8(vdata1, mask1); + __m128i vdata2_reorg = _mm_shuffle_epi8(vdata2, mask1); + + const __m128i vshift = _mm_set1_epi16(128); + __m128i vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift); + + const 
__m128i v255 = _mm_set1_epi16(255); + const __m128i vzero = _mm_setzero_si128(); + const __m128i vdata_diff_min_max1 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero); + + vdata1_reorg = _mm_shuffle_epi8(vdata1, mask2); + vdata2_reorg = _mm_shuffle_epi8(vdata2, mask2); + + vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift); + const __m128i vdata_diff_min_max2 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero); + + _mm_storeu_si128(reinterpret_cast<__m128i *>(imgDiff + i), _mm_or_si128(_mm_shuffle_epi8(vdata_diff_min_max1, mask1), + _mm_shuffle_epi8(vdata_diff_min_max2, mask_out2))); + } + + if (i < size) { + Base::SimdImageDifference(img1 + i, img2 + i, size - i, imgDiff + i); + } + } + } +#else + // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3CustomFunctions.cpp.o) has no symbols + void dummy_SimdSse41CustomFunctions(){}; +#endif// SIMD_SSE41_ENABLE +} diff --git a/modules/io/src/image/vpImageIo.cpp b/modules/io/src/image/vpImageIo.cpp index 633503389c..ab290fa5f7 100644 --- a/modules/io/src/image/vpImageIo.cpp +++ b/modules/io/src/image/vpImageIo.cpp @@ -102,7 +102,7 @@ void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const st while (cpt_elt != nb_elt) { // Skip empty lines or lines starting with # (comment) while (std::getline(fd, line) && (line.compare(0, 1, "#") == 0 || line.size() == 0)) { - }; + } if (fd.eof()) { fd.close(); From 9c193041c09bdf2fc7f20568d3e5677904191168 Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Wed, 3 Nov 2021 18:07:41 +0100 Subject: [PATCH 04/18] WIP code to add and test image loading/saving using Simd and for JPEG and PNG image format. 
--- 3rdparty/simdlib/CMakeLists.txt | 4 +- 3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp | 158 ++ 3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp | 138 + .../simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp | 351 +++ .../simdlib/Simd/SimdAvx2ImageSavePng.cpp | 369 +++ 3rdparty/simdlib/Simd/SimdBase.h | 4 + 3rdparty/simdlib/Simd/SimdBaseCrc32.cpp | 978 +++++++ 3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp | 371 +++ .../simdlib/Simd/SimdBaseImageLoadJpeg.cpp | 2456 +++++++++++++++++ .../simdlib/Simd/SimdBaseImageLoadPng.cpp | 1317 +++++++++ 3rdparty/simdlib/Simd/SimdBaseImageSave.cpp | 340 +++ .../simdlib/Simd/SimdBaseImageSaveJpeg.cpp | 451 +++ .../simdlib/Simd/SimdBaseImageSavePng.cpp | 379 +++ 3rdparty/simdlib/Simd/SimdImageLoad.h | 396 +++ 3rdparty/simdlib/Simd/SimdImageSave.h | 386 +++ 3rdparty/simdlib/Simd/SimdImageSaveJpeg.h | 649 +++++ 3rdparty/simdlib/Simd/SimdImageSavePng.h | 235 ++ 3rdparty/simdlib/Simd/SimdLib.cpp | 32 +- 3rdparty/simdlib/Simd/SimdLib.h | 109 +- 3rdparty/simdlib/Simd/SimdMath.h | 5 + 3rdparty/simdlib/Simd/SimdMemory.h | 19 + 3rdparty/simdlib/Simd/SimdMemoryStream.h | 510 ++++ 3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp | 154 ++ 3rdparty/simdlib/Simd/SimdNeonImageSave.cpp | 134 + 3rdparty/simdlib/Simd/SimdPerformance.h | 197 ++ 3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp | 159 ++ .../simdlib/Simd/SimdSse41ImageLoadPng.cpp | 1805 ++++++++++++ 3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp | 139 + .../simdlib/Simd/SimdSse41ImageSaveJpeg.cpp | 431 +++ .../simdlib/Simd/SimdSse41ImageSavePng.cpp | 370 +++ 3rdparty/simdlib/Simd/SimdView.hpp | 209 +- CMakeLists.txt | 2 + modules/io/CMakeLists.txt | 14 +- modules/io/include/visp3/io/vpImageIo.h | 8 + modules/io/src/image/vpImageIo.cpp | 63 + modules/io/test/perfImageLoadSave.cpp | 461 ++++ 36 files changed, 13646 insertions(+), 157 deletions(-) create mode 100644 3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp create mode 100644 3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp create mode 100644 
3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp create mode 100644 3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseCrc32.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageSave.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp create mode 100644 3rdparty/simdlib/Simd/SimdImageLoad.h create mode 100644 3rdparty/simdlib/Simd/SimdImageSave.h create mode 100644 3rdparty/simdlib/Simd/SimdImageSaveJpeg.h create mode 100644 3rdparty/simdlib/Simd/SimdImageSavePng.h create mode 100644 3rdparty/simdlib/Simd/SimdMemoryStream.h create mode 100644 3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp create mode 100644 3rdparty/simdlib/Simd/SimdNeonImageSave.cpp create mode 100644 3rdparty/simdlib/Simd/SimdPerformance.h create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp create mode 100644 modules/io/test/perfImageLoadSave.cpp diff --git a/3rdparty/simdlib/CMakeLists.txt b/3rdparty/simdlib/CMakeLists.txt index 1acb1341be..95b3358ad2 100644 --- a/3rdparty/simdlib/CMakeLists.txt +++ b/3rdparty/simdlib/CMakeLists.txt @@ -93,9 +93,9 @@ if(X86 OR X86_64) file(GLOB_RECURSE SIMD_AVX2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx2*.cpp) if(MSVC) - set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}") + set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mno-avx256-split-unaligned-load 
-mno-avx256-split-unaligned-store") else() - set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mfma") + set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mfma -mbmi -mbmi2 -mlzcnt -fabi-version=4 -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store") endif() set(SIMD_LIB_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}") diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp new file mode 100644 index 0000000000..aad4785761 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp @@ -0,0 +1,158 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdAvx2.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_AVX2_ENABLE
+    namespace Avx2
+    {
+        ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param)
+            : Sse41::ImagePgmTxtLoader(param)
+        {
+        }
+
+        void ImagePgmTxtLoader::SetConverters() // Installs the SSE4.1 converters, then overrides them with AVX2 ones where possible.
+        {
+            Sse41::ImagePgmTxtLoader::SetConverters();
+            if (_image.width >= A) // AVX2 converters are only selected for images at least A wide.
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Avx2::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Avx2::GrayToBgra; break;
+                case SimdPixelFormatRgb24: _toAny = Avx2::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Avx2::GrayToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param)
+            : Sse41::ImagePgmBinLoader(param)
+        {
+        }
+
+        void ImagePgmBinLoader::SetConverters() // Same converter selection as the text PGM loader.
+        {
+            Sse41::ImagePgmBinLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Avx2::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Avx2::GrayToBgra; break;
+                case SimdPixelFormatRgb24: _toAny = Avx2::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Avx2::GrayToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param)
+            : Sse41::ImagePpmTxtLoader(param)
+        {
+        }
+
+        void ImagePpmTxtLoader::SetConverters() // Installs the SSE4.1 converters, then overrides them with AVX2 ones where possible.
+        {
+            Sse41::ImagePpmTxtLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Avx2::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Avx2::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Avx2::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Avx2::BgrToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param)
+            : Sse41::ImagePpmBinLoader(param)
+        {
+        }
+
+        void ImagePpmBinLoader::SetConverters() // Same converter selection as the text PPM loader.
+        {
+            Sse41::ImagePpmBinLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Avx2::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Avx2::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Avx2::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Avx2::BgrToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageLoader* CreateImageLoader(const ImageLoaderParam& param) // Returns a new loader for param.file, or NULL for an unsupported file type; the caller owns the result.
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtLoader(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinLoader(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtLoader(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinLoader(param);
+            case SimdImageFilePng: return new Sse41::ImagePngLoader(param); // PNG falls back to the SSE4.1 loader.
+            case SimdImageFileJpeg: return new Base::ImageJpegLoader(param); // JPEG falls back to the Base loader.
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format) // Returns the buffer released by the loader, or NULL when validation, loader creation or decoding fails.
+        {
+            ImageLoaderParam param(data, size, *format); // *format is read here as the requested format and passed to Release() below.
+            if (param.Validate())
+            {
+                Holder<ImageLoader> loader(CreateImageLoader(param));
+                if (loader)
+                {
+                    if (loader->FromStream())
+                        return loader->Release(stride, width, height, format);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp
new file mode 100644
index 0000000000..bd7e057092
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp
@@ -0,0 +1,138 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdAvx2.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_AVX2_ENABLE
+    namespace Avx2
+    {
+        ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param) // Builds on the SSE4.1 saver and swaps in an AVX2 to-gray converter when possible.
+            : Sse41::ImagePgmTxtSaver(param)
+        {
+            if (_param.width >= A) // AVX2 converters are only selected for images at least A wide.
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Avx2::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Avx2::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Avx2::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Avx2::RgbaToGray; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param) // Same converter selection as the text PGM saver.
+            : Sse41::ImagePgmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Avx2::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Avx2::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Avx2::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Avx2::RgbaToGray; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param) // Builds on the SSE4.1 saver and swaps in an AVX2 three-channel converter when possible.
+            : Sse41::ImagePpmTxtSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Avx2::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Avx2::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Avx2::BgraToRgb; break;
+                case SimdPixelFormatRgba32: _convert = Avx2::BgraToBgr; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param) // Same converter selection as the text PPM saver.
+            : Sse41::ImagePpmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Avx2::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Avx2::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Avx2::BgraToRgb; break;
+                case SimdPixelFormatRgba32: _convert = Avx2::BgraToBgr; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageSaver* CreateImageSaver(const ImageSaverParam& param) // Returns a new saver for param.file, or NULL for an unsupported file type; the caller owns the result.
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinSaver(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinSaver(param);
+            case SimdImageFilePng: return new ImagePngSaver(param);
+            case SimdImageFileJpeg: return new ImageJpegSaver(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size) // Returns the buffer released by the saver, or NULL when validation, saver creation or encoding fails.
+        {
+            ImageSaverParam param(width, height, format, file, quality);
+            if (param.Validate())
+            {
+                Holder<ImageSaver> saver(CreateImageSaver(param));
+                if (saver)
+                {
+                    if (saver->ToStream(src, stride))
+                        return saver->Release(size);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp
new file mode 100644
index 0000000000..2ff51e4dc1
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp
@@ -0,0 +1,351 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSaveJpeg.h"
+#include "Simd/SimdLoad.h"
+#include "Simd/SimdAvx2.h"
+
+namespace Simd
+{
+#ifdef SIMD_AVX2_ENABLE
+    namespace Avx2
+    {
+        const uint32_t JpegZigZagTi32[64] = { // Gather indices used by JpegProcessDu to reorder the 64 coefficients produced by JpegDct.
+            0, 8, 1, 2, 9, 16, 24, 17,
+            10, 3, 4, 11, 18, 25, 32, 40,
+            33, 26, 19, 12, 5, 6, 13, 20,
+            27, 34, 41, 48, 56, 49, 42, 35,
+            28, 21, 14, 7, 15, 22, 29, 36,
+            43, 50, 57, 58, 51, 44, 37, 30,
+            23, 31, 38, 45, 52, 59, 60, 53,
+            46, 39, 47, 54, 61, 62, 55, 63 };
+
+        //---------------------------------------------------------------------
+
+        static int JpegProcessDu(Base::BitBuf& bitBuf, float* CDU, int stride, const float* fdtbl, int DC, const uint16_t HTDC[256][2], const uint16_t HTAC[256][2]) // Runs JpegDct on CDU, pushes the resulting codes to bitBuf and returns DU[0], which callers store as the next DC value.
+        {
+            SIMD_ALIGNED(32) int DUO[64], DU[64];
+            JpegDct(CDU, stride, fdtbl, DUO);
+            union
+            {
+                uint64_t u64[1];
+                uint32_t u32[2];
+                uint8_t u8[8];
+            } dum; // 64-bit map of the reordered coefficients: a set bit marks a non-zero value.
+            for (int i = 0, j = 0; i < 64; i += 8, j++)
+            {
+                __m256i du = _mm256_i32gather_epi32(DUO, _mm256_loadu_si256((__m256i*)(JpegZigZagTi32 + i)), 4); // DU[i + k] = DUO[JpegZigZagTi32[i + k]]
+                dum.u8[j] = ~_mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpeq_epi32(du, Avx2::K_ZERO))); // bit k of byte j is set when DU[8 * j + k] != 0
+                _mm256_storeu_si256((__m256i*)(DU + i), du);
+            }
+            int diff = DU[0] - DC; // the DC coefficient is coded as a difference from the value passed in
+            if (diff == 0)
+                bitBuf.Push(HTDC[0]);
+            else
+            {
+                uint16_t bits[2];
+                Base::JpegCalcBits(diff, bits);
+                bitBuf.Push(HTDC[bits[1]]);
+                bitBuf.Push(bits);
+            }
+#if defined(SIMD_X64_ENABLE)
+            if (dum.u64[0] == 0) // every coefficient is zero
+            {
+                bitBuf.Push(HTAC[0x00]);
+                return DU[0];
+            }
+            dum.u64[0] >>= 1; // drop the DC bit; bit 0 now corresponds to DU[1]
+            int i = 1;
+            for (; dum.u64[0]; ++i, dum.u64[0] >>= 1)
+            {
+                int nrzeroes = (int)_tzcnt_u64(dum.u64[0]); // length of the zero run before the next non-zero coefficient
+                i += nrzeroes;
+                dum.u64[0] >>= nrzeroes;
+                if (nrzeroes >= 16)
+                {
+                    for (int nrmarker = 16; nrmarker <= nrzeroes; nrmarker += 16)
+                        bitBuf.Push(HTAC[0xF0]); // one 0xF0 code per full run of 16 zeros
+                    nrzeroes &= 15;
+                }
+                uint16_t bits[2];
+                Base::JpegCalcBits(DU[i], bits);
+                bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]);
+                bitBuf.Push(bits);
+            }
+            if (i < 64) // trailing zeros remain after the last non-zero coefficient
+                bitBuf.Push(HTAC[0x00]);
+#else
+            int end0pos = 64;
+            do // scan 8 coefficients at a time from the end for the last non-zero one
+            {
+                end0pos -= 8;
+                int mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_loadu_si256((__m256i*)(DU + end0pos)), Avx2::K_ZERO));
+                if (mask)
+                {
+                    end0pos += 7 - _lzcnt_u32(mask) / 4; // the byte mask has 4 bits per 32-bit coefficient
+                    break;
+                }
+            }
+            while (end0pos > 0);
+            if (end0pos == 0)
+            {
+                bitBuf.Push(HTAC[0x00]);
+                return DU[0];
+            }
+            for (int i = 1; i <= end0pos; ++i)
+            {
+                int startpos = i;
+                for (; DU[i] == 0 && i <= end0pos; ++i); // skip the zero run
+                int nrzeroes = i - startpos;
+                if (nrzeroes >= 16)
+                {
+                    int lng = nrzeroes >> 4;
+                    int nrmarker;
+                    for (nrmarker = 1; nrmarker <= lng; ++nrmarker)
+                        bitBuf.Push(HTAC[0xF0]); // one 0xF0 code per full run of 16 zeros
+                    nrzeroes &= 15;
+                }
+                uint16_t bits[2];
+                Base::JpegCalcBits(DU[i], bits);
+                bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]);
+                bitBuf.Push(bits);
+            }
+            if (end0pos != 63)
+                bitBuf.Push(HTAC[0x00]);
+#endif
+            return DU[0];
+        }
+
+        SIMD_INLINE void RgbToYuvInit(__m256 k[10]) // Fills k with the broadcast coefficients consumed by RgbToYuv and GrayToY.
+        {
+            k[0] = _mm256_set1_ps(+0.29900f); // k[0..2]: Y weights for r, g, b
+            k[1] = _mm256_set1_ps(+0.58700f);
+            k[2] = _mm256_set1_ps(+0.11400f);
+            k[3] = _mm256_set1_ps(-128.000f); // k[3]: offset added to Y
+            k[4] = _mm256_set1_ps(-0.16874f); // k[4..6]: U weights for r, g, b
+            k[5] = _mm256_set1_ps(-0.33126f);
+            k[6] = _mm256_set1_ps(+0.50000f);
+            k[7] = _mm256_set1_ps(+0.50000f); // k[7..9]: V weights for r, g, b
+            k[8] = _mm256_set1_ps(-0.41869f);
+            k[9] = _mm256_set1_ps(-0.08131f);
+        }
+
+        SIMD_INLINE void RgbToYuv(const uint8_t* r, const uint8_t* g, const uint8_t* b, int stride, int height, // Converts a size x size tile of planar r/g/b bytes into float y/u/v tiles.
+            const __m256 k[10], float* y, float* u, float* v, int size)
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 8)
+                {
+                    __m256 _r = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(r + col))));
+                    __m256 _g = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(g + col))));
+                    __m256 _b = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(b + col))));
+                    _mm256_storeu_ps(y + col, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, k[0]), _mm256_mul_ps(_g, k[1])), _mm256_mul_ps(_b, k[2])), k[3]));
+                    //_mm256_storeu_ps(y + col, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, _yr), _mm256_mul_ps(_g, _yg)), _mm256_add_ps(_mm256_mul_ps(_b, _yb), _yt)));
+                    _mm256_storeu_ps(u + col, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, k[4]), _mm256_mul_ps(_g, k[5])), _mm256_mul_ps(_b, k[6])));
+                    _mm256_storeu_ps(v + col, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, k[7]), _mm256_mul_ps(_g, k[8])), _mm256_mul_ps(_b, k[9])));
+                }
+                if(++row < height) // past the last available row the source pointers stay put, so that row is repeated
+                    r += stride, g += stride, b += stride;
+                y += size, u += size, v += size;
+            }
+        }
+
+        SIMD_INLINE void GrayToY(const uint8_t* g, int stride, int height, const __m256 k[10], float* y, int size) // Converts a size x size tile of gray bytes into a float y tile (value + k[3]).
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 8)
+                {
+                    __m256 _g = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(g + col))));
+                    _mm256_storeu_ps(y + col, _mm256_add_ps(_g, k[3]));
+                }
+                if (++row < height) // past the last available row the source pointer stays put, so that row is repeated
+                    g += stride;
+                y += size;
+            }
+        }
+
+        SIMD_INLINE void SubUv(const float * src, float * dst) // Reduces a 16x16 float tile to 8x8; each output is 0.25 times a sum formed from two adjacent source rows.
+        {
+            __m256 _0_25 = _mm256_set1_ps(0.25f), s0, s1;
+            for (int yy = 0; yy < 8; yy += 1)
+            {
+                s0 = _mm256_add_ps(_mm256_loadu_ps(src + 0), _mm256_loadu_ps(src + 16)); // left half: row + next row
+                s1 = _mm256_add_ps(_mm256_loadu_ps(src + 8), _mm256_loadu_ps(src + 24)); // right half: row + next row
+                _mm256_storeu_ps(dst + 0, _mm256_mul_ps(PermutedHorizontalAdd(s0, s1), _0_25));
+                src += 32;
+                dst += 8;
+            }
+        }
+
+        void JpegWriteBlockSubs(OutputMemoryStream& stream, int width, int height, const uint8_t* red, // Encodes the image in 16x16 tiles: four Y units plus one subsampled U and one subsampled V unit per tile.
+            const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3])
+        {
+            __m256 k[10];
+            RgbToYuvInit(k);
+            int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            int width16 = width & (~15);
+            bool gray = red == green && red == blue; // identical plane pointers are treated as a gray image
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 16)
+            {
+                int x = 0;
+                SIMD_ALIGNED(16) float Y[256], U[256], V[256];
+                SIMD_ALIGNED(16) float subU[64], subV[64];
+                for (; x < width16; x += 16) // full-width tiles use the AVX2 converters
+                {
+                    if (gray)
+                        GrayToY(red + x, stride, height - y, k, Y, 16);
+                    else
+                        RgbToYuv(red + x, green + x, blue + x, stride, height - y, k, Y, U, V, 16);
+                    DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        SubUv(U, subU);
+                        SubUv(V, subV);
+                        DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+                for (; x < width; x += 16) // partial tile at the right edge uses the Base converters, which take the remaining width
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 16);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 16);
+                    DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        SubUv(U, subU);
+                        SubUv(V, subV);
+                        DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                }
+            }
+            Base::WriteBits(stream, bitBuf.data, bitBuf.size); // flush whatever is still buffered
+            bitBuf.Clear();
+        }
+
+        void JpegWriteBlockFull(OutputMemoryStream& stream, int width, int height, const uint8_t* red, // Encodes the image in 8x8 tiles with one Y, one U and one V unit per tile (no chroma subsampling).
+            const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3])
+        {
+            __m256 k[10];
+            RgbToYuvInit(k);
+            int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            int width8 = width & (~7);
+            bool gray = red == green && red == blue; // identical plane pointers are treated as a gray image
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 8)
+            {
+                int x = 0;
+                SIMD_ALIGNED(16) float Y[64], U[64], V[64];
+                for (; x < width8; x += 8) // full-width tiles use the AVX2 converters
+                {
+                    if (gray)
+                        GrayToY(red + x, stride, height - y, k, Y, 8);
+                    else
+                        RgbToYuv(red + x, green + x, blue + x, stride, height - y, k, Y, U, V, 8);
+                    DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+                for (; x < width; x += 8) // partial tile at the right edge uses the Base converters, which take the remaining width
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 8);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 8);
+                    DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                }
+                Base::WriteBits(stream, bitBuf.data, bitBuf.size); // here the buffer is flushed once per tile row, inside the y loop
+                bitBuf.Clear();
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageJpegSaver::ImageJpegSaver(const ImageSaverParam& param)
+            : Sse41::ImageJpegSaver(param)
+        {
+        }
+
+        void ImageJpegSaver::Init() // Runs the SSE4.1 initialisation, then installs the AVX2 deinterleavers and block writers.
+        {
+            Sse41::ImageJpegSaver::Init();
+            if (_param.width >= 32) // AVX2 deinterleavers are only selected for images at least 32 wide
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24:
+                case SimdPixelFormatRgb24:
+                    _deintBgr = Avx2::DeinterleaveBgr;
+                    break;
+                case SimdPixelFormatBgra32:
+                case SimdPixelFormatRgba32:
+                    _deintBgra = Avx2::DeinterleaveBgra;
+                    break;
+                default:
+                    break;
+                }
+            }
+            _writeBlock = _subSample ? JpegWriteBlockSubs : JpegWriteBlockFull;
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp
new file mode 100644
index 0000000000..3cfa79fc62
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp
@@ -0,0 +1,369 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSavePng.h"
+#include "Simd/SimdAvx2.h"
+#include "Simd/SimdExtract.h"
+
+namespace Simd
+{
+#ifdef SIMD_AVX2_ENABLE
+    namespace Avx2
+    {
+        static uint32_t ZlibAdler32(uint8_t* data, int size) // Computes the (hi << 16) | lo checksum of data[0..size), reducing modulo 65521 after each chunk of at most 5552 bytes.
+        {
+            __m256i _i0 = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7), _8 = _mm256_set1_epi32(8);
+            uint32_t lo = 1, hi = 0;
+            for (int b = 0, n = (int)(size % 5552); b < size;) // the first chunk takes the remainder, every later chunk is 5552 bytes
+            {
+                int n8 = n & (~7), i = 0;
+                __m256i _i = _mm256_add_epi32(_i0, _mm256_set1_epi32(n)); // per-lane weight (n - i) of the byte at offset i
+                __m256i _l = _mm256_setzero_si256(), _h = _mm256_setzero_si256();
+                for (; i < n8; i += 8)
+                {
+                    __m256i d = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(data + b + i)));
+                    _l = _mm256_add_epi32(_l, d);
+                    _h = _mm256_add_epi32(_h, _mm256_mullo_epi32(d, _i));
+                    _i = _mm256_sub_epi32(_i, _8);
+                }
+                int l = Avx2::ExtractSum(_l), h = Avx2::ExtractSum(_h);
+                for (; i < n; ++i) // scalar tail of the chunk
+                {
+                    l += data[b + i];
+                    h += data[b + i]*(n - i);
+                }
+                hi = (hi + h + lo*n) % 65521;
+                lo = (lo + l) % 65521;
+                b += n;
+                n = 5552;
+            }
+            return (hi << 16) | lo;
+        }
+
+        void ZlibCompress(uint8_t* data, int size, int quality, OutputMemoryStream& stream) // Writes data[0..size) to stream as a zlib stream: 2-byte header, one compressed block, then the ZlibAdler32 checksum.
+        {
+            const int ZHASH = 16384;
+            if (quality < 5) // quality is clamped to at least 5
+                quality = 5;
+            const int basket = quality * 2; // number of candidate positions kept per hash bucket
+            Array32i hashTable(ZHASH * basket);
+            memset(hashTable.data, -1, hashTable.RawSize()); // -1 marks an unused bucket slot
+
+            stream.Write(uint8_t(0x78));
+            stream.Write(uint8_t(0x5e));
+            stream.WriteBits(1, 1);
+            stream.WriteBits(1, 2);
+
+            int i = 0, j;
+            while (i < size - 3)
+            {
+                int h = Base::ZlibHash(data + i) & (ZHASH - 1), best = 3;
+                uint8_t* bestLoc = 0;
+                int* hList = hashTable.data + h * basket;
+                for (j = 0; j < basket && hList[j] != -1; ++j) // bound is checked first so a full bucket never reads hList[basket]
+                {
+                    if (hList[j] > i - 32768)
+                    {
+                        int d = Avx2::ZlibCount(data + hList[j], data + i, size - i);
+                        if (d >= best)
+                        {
+                            best = d;
+                            bestLoc = data + hList[j];
+                        }
+                    }
+                }
+                if (j == basket) // bucket is full: keep the newer half and free the rest
+                {
+                    memcpy(hList, hList + quality, quality * sizeof(int));
+                    memset(hList + quality, -1, quality * sizeof(int));
+                    j = quality;
+                }
+                hList[j] = i;
+
+                if (bestLoc) // drop this match if the next position offers a longer one
+                {
+                    h = Base::ZlibHash(data + i + 1) & (ZHASH - 1);
+                    int* hList = hashTable.data + h * basket;
+                    for (j = 0; j < basket && hList[j] != -1; ++j) // bound is checked first so a full bucket never reads hList[basket]
+                    {
+                        if (hList[j] > i - 32767)
+                        {
+                            int e = Avx2::ZlibCount(data + hList[j], data + i + 1, size - i - 1);
+                            if (e > best)
+                            {
+                                bestLoc = NULL;
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                if (bestLoc) // emit a length/distance pair
+                {
+                    int d = (int)(data + i - bestLoc);
+                    assert(d <= 32767 && best <= 258);
+                    for (j = 0; best > Base::ZlibLenC[j + 1] - 1; ++j);
+                    Base::ZlibHuff(j + 257, stream);
+                    if (Base::ZlibLenEb[j])
+                        stream.WriteBits(best - Base::ZlibLenC[j], Base::ZlibLenEb[j]);
+                    for (j = 0; d > Base::ZlibDistC[j + 1] - 1; ++j);
+                    stream.WriteBits(Base::ZlibBitRev(j, 5), 5);
+                    if (Base::ZlibDistEb[j])
+                        stream.WriteBits(d - Base::ZlibDistC[j], Base::ZlibDistEb[j]);
+                    i += best;
+                }
+                else // emit a literal byte
+                {
+                    Base::ZlibHuffB(data[i], stream);
+                    ++i;
+                }
+            }
+            for (; i < size; ++i) // the last bytes are always written as literals
+                Base::ZlibHuffB(data[i], stream);
+            Base::ZlibHuff(256, stream);
+            stream.FlushBits();
+            stream.WriteBe32u(ZlibAdler32(data, size));
+        }
+
+        uint32_t EncodeLine0(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) // dst[i] = src[i]; returns the sum of |dst[i]|.
+        {
+            size_t i = 0, sizeA = AlignLo(size, A);
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src = _mm256_loadu_si256((__m256i*)(src + i));
+                _mm256_storeu_si256((__m256i*)(dst + i), _src);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_src)));
+            }
+            uint32_t sum = Avx2::ExtractSum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine1(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) // First n bytes are copied, then dst[i] = src[i] - src[i - n]; returns the sum of |dst[i]|.
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n));
+                __m256i _dst = _mm256_sub_epi8(_src0, _src1);
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine2(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) // dst[i] = src[i] - src[i - stride] for every byte; returns the sum of |dst[i]|.
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - stride));
+                __m256i _dst = _mm256_sub_epi8(_src0, _src1);
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine3(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) // dst[i] = src[i] - ((src[i - n] + src[i - stride]) >> 1), with only src[i - stride] >> 1 for the first n bytes; returns the sum of |dst[i]|.
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i] - (src[i - stride] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n));
+                __m256i _src2 = _mm256_loadu_si256((__m256i*)(src + i - stride));
+                __m256i lo = _mm256_srli_epi16(_mm256_add_epi16(UnpackU8<0>(_src1), UnpackU8<0>(_src2)), 1); // average computed in 16 bits
+                __m256i hi = _mm256_srli_epi16(_mm256_add_epi16(UnpackU8<1>(_src1), UnpackU8<1>(_src2)), 1);
+                __m256i _dst = _mm256_sub_epi8(_src0, _mm256_packus_epi16(lo, hi));
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - ((src[i - n] + src[i - stride]) >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        SIMD_INLINE __m256i Paeth(__m256i a, __m256i b, __m256i c) // Per 16-bit lane: with p = a + b - c, returns a unless |p - a| exceeds |p - b| or |p - c|, otherwise b or c (c when |p - b| > |p - c|).
+        {
+            __m256i p = _mm256_sub_epi16(_mm256_add_epi16(a, b), c);
+            __m256i pa = _mm256_abs_epi16(_mm256_sub_epi16(p, a));
+            __m256i pb = _mm256_abs_epi16(_mm256_sub_epi16(p, b));
+            __m256i pc = _mm256_abs_epi16(_mm256_sub_epi16(p, c));
+            __m256i mbc = _mm256_or_si256(_mm256_cmpgt_epi16(pa, pb), _mm256_cmpgt_epi16(pa, pc));
+            __m256i mc = _mm256_cmpgt_epi16(pb, pc);
+            return _mm256_blendv_epi8(a, _mm256_blendv_epi8(b, c, mc), mbc);
+        }
+
+        uint32_t EncodeLine4(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) // dst[i] = src[i] - Paeth(src[i - n], src[i - stride], src[i - stride - n]), with src[i - stride] alone for the first n bytes; returns the sum of |dst[i]|.
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = (int8_t)(src[i] - src[i - stride]);
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n));
+                __m256i _src2 = _mm256_loadu_si256((__m256i*)(src + i - stride));
+                __m256i _src3 = _mm256_loadu_si256((__m256i*)(src + i - stride - n));
+                __m256i lo = Paeth(UnpackU8<0>(_src1), UnpackU8<0>(_src2), UnpackU8<0>(_src3));
+                __m256i hi = Paeth(UnpackU8<1>(_src1), UnpackU8<1>(_src2), UnpackU8<1>(_src3));
+                __m256i _dst = _mm256_sub_epi8(_src0, _mm256_packus_epi16(lo, hi));
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - Base::Paeth(src[i - n], src[i - stride], src[i - stride - n]);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine5(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) // First n bytes are copied, then dst[i] = src[i] - (src[i - n] >> 1); never reads src[i - stride]; returns the sum of |dst[i]|.
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n));
+                __m256i lo = _mm256_srli_epi16(UnpackU8<0>(_src1), 1);
+                __m256i hi = _mm256_srli_epi16(UnpackU8<1>(_src1), 1);
+                __m256i _dst = _mm256_sub_epi8(_src0, _mm256_packus_epi16(lo, hi));
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - (src[i - n] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine6(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) // Same arithmetic as EncodeLine1: first n bytes copied, then dst[i] = src[i] - src[i - n]; returns the sum of |dst[i]|.
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n));
+                __m256i _dst = _mm256_sub_epi8(_src0, _src1);
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        ImagePngSaver::ImagePngSaver(const ImageSaverParam& param) // Builds on the SSE4.1 saver and installs the AVX2 converter, line encoders and compressor.
+            : Sse41::ImagePngSaver(param)
+        {
+            if (_param.format == SimdPixelFormatBgr24)
+                _convert = Avx2::BgrToRgb;
+            else if (_param.format == SimdPixelFormatBgra32)
+                _convert = Avx2::BgraToRgba;
+            _encode[0] = Avx2::EncodeLine0;
+            _encode[1] = Avx2::EncodeLine1;
+            _encode[2] = Avx2::EncodeLine2;
+            _encode[3] = Avx2::EncodeLine3;
+            _encode[4] = Avx2::EncodeLine4;
+            _encode[5] = Avx2::EncodeLine5;
+            _encode[6] = Avx2::EncodeLine6;
+            _compress = Avx2::ZlibCompress;
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdBase.h b/3rdparty/simdlib/Simd/SimdBase.h
index 998a7b7cbe..3ad6e60d96 100755
--- a/3rdparty/simdlib/Simd/SimdBase.h
+++ b/3rdparty/simdlib/Simd/SimdBase.h
@@ -32,6 +32,10 @@ namespace Simd
 {
     namespace Base
     {
+        uint32_t Crc32(const void* src, size_t size);
+
+        uint32_t Crc32c(const void * src, size_t size);
+
         void BgraToBgr(const uint8_t * bgra, size_t size, uint8_t * bgr, bool lastRow);
 
         void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride);
diff --git a/3rdparty/simdlib/Simd/SimdBaseCrc32.cpp b/3rdparty/simdlib/Simd/SimdBaseCrc32.cpp
new file mode 100644
index 0000000000..4008b0f0d8
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseCrc32.cpp
@@ -0,0 +1,978 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdDefs.h" + +namespace Simd +{ + namespace Base + { + static SIMD_INLINE uint32_t Reorder32(uint32_t x) + { +#if defined(__GNUC__) || defined(__clang__) + return __builtin_bswap32(x); +#else + return (x >> 24) | + ((x >> 8) & 0x0000FF00) | + ((x << 8) & 0x00FF0000) | + (x << 24); +#endif + } + + // Precalculated CRC32c lookup table for polynomial 0xEDB88320. + static const uint32_t Crc32Table[16][256] = + { + { + 0x00000000,0x77073096,0xEE0E612C,0x990951BA,0x076DC419,0x706AF48F,0xE963A535,0x9E6495A3, + 0x0EDB8832,0x79DCB8A4,0xE0D5E91E,0x97D2D988,0x09B64C2B,0x7EB17CBD,0xE7B82D07,0x90BF1D91, + 0x1DB71064,0x6AB020F2,0xF3B97148,0x84BE41DE,0x1ADAD47D,0x6DDDE4EB,0xF4D4B551,0x83D385C7, + 0x136C9856,0x646BA8C0,0xFD62F97A,0x8A65C9EC,0x14015C4F,0x63066CD9,0xFA0F3D63,0x8D080DF5, + 0x3B6E20C8,0x4C69105E,0xD56041E4,0xA2677172,0x3C03E4D1,0x4B04D447,0xD20D85FD,0xA50AB56B, + 0x35B5A8FA,0x42B2986C,0xDBBBC9D6,0xACBCF940,0x32D86CE3,0x45DF5C75,0xDCD60DCF,0xABD13D59, + 0x26D930AC,0x51DE003A,0xC8D75180,0xBFD06116,0x21B4F4B5,0x56B3C423,0xCFBA9599,0xB8BDA50F, + 0x2802B89E,0x5F058808,0xC60CD9B2,0xB10BE924,0x2F6F7C87,0x58684C11,0xC1611DAB,0xB6662D3D, + 0x76DC4190,0x01DB7106,0x98D220BC,0xEFD5102A,0x71B18589,0x06B6B51F,0x9FBFE4A5,0xE8B8D433, + 0x7807C9A2,0x0F00F934,0x9609A88E,0xE10E9818,0x7F6A0DBB,0x086D3D2D,0x91646C97,0xE6635C01, + 0x6B6B51F4,0x1C6C6162,0x856530D8,0xF262004E,0x6C0695ED,0x1B01A57B,0x8208F4C1,0xF50FC457, + 0x65B0D9C6,0x12B7E950,0x8BBEB8EA,0xFCB9887C,0x62DD1DDF,0x15DA2D49,0x8CD37CF3,0xFBD44C65, + 0x4DB26158,0x3AB551CE,0xA3BC0074,0xD4BB30E2,0x4ADFA541,0x3DD895D7,0xA4D1C46D,0xD3D6F4FB, + 0x4369E96A,0x346ED9FC,0xAD678846,0xDA60B8D0,0x44042D73,0x33031DE5,0xAA0A4C5F,0xDD0D7CC9, + 
0x5005713C,0x270241AA,0xBE0B1010,0xC90C2086,0x5768B525,0x206F85B3,0xB966D409,0xCE61E49F, + 0x5EDEF90E,0x29D9C998,0xB0D09822,0xC7D7A8B4,0x59B33D17,0x2EB40D81,0xB7BD5C3B,0xC0BA6CAD, + 0xEDB88320,0x9ABFB3B6,0x03B6E20C,0x74B1D29A,0xEAD54739,0x9DD277AF,0x04DB2615,0x73DC1683, + 0xE3630B12,0x94643B84,0x0D6D6A3E,0x7A6A5AA8,0xE40ECF0B,0x9309FF9D,0x0A00AE27,0x7D079EB1, + 0xF00F9344,0x8708A3D2,0x1E01F268,0x6906C2FE,0xF762575D,0x806567CB,0x196C3671,0x6E6B06E7, + 0xFED41B76,0x89D32BE0,0x10DA7A5A,0x67DD4ACC,0xF9B9DF6F,0x8EBEEFF9,0x17B7BE43,0x60B08ED5, + 0xD6D6A3E8,0xA1D1937E,0x38D8C2C4,0x4FDFF252,0xD1BB67F1,0xA6BC5767,0x3FB506DD,0x48B2364B, + 0xD80D2BDA,0xAF0A1B4C,0x36034AF6,0x41047A60,0xDF60EFC3,0xA867DF55,0x316E8EEF,0x4669BE79, + 0xCB61B38C,0xBC66831A,0x256FD2A0,0x5268E236,0xCC0C7795,0xBB0B4703,0x220216B9,0x5505262F, + 0xC5BA3BBE,0xB2BD0B28,0x2BB45A92,0x5CB36A04,0xC2D7FFA7,0xB5D0CF31,0x2CD99E8B,0x5BDEAE1D, + 0x9B64C2B0,0xEC63F226,0x756AA39C,0x026D930A,0x9C0906A9,0xEB0E363F,0x72076785,0x05005713, + 0x95BF4A82,0xE2B87A14,0x7BB12BAE,0x0CB61B38,0x92D28E9B,0xE5D5BE0D,0x7CDCEFB7,0x0BDBDF21, + 0x86D3D2D4,0xF1D4E242,0x68DDB3F8,0x1FDA836E,0x81BE16CD,0xF6B9265B,0x6FB077E1,0x18B74777, + 0x88085AE6,0xFF0F6A70,0x66063BCA,0x11010B5C,0x8F659EFF,0xF862AE69,0x616BFFD3,0x166CCF45, + 0xA00AE278,0xD70DD2EE,0x4E048354,0x3903B3C2,0xA7672661,0xD06016F7,0x4969474D,0x3E6E77DB, + 0xAED16A4A,0xD9D65ADC,0x40DF0B66,0x37D83BF0,0xA9BCAE53,0xDEBB9EC5,0x47B2CF7F,0x30B5FFE9, + 0xBDBDF21C,0xCABAC28A,0x53B39330,0x24B4A3A6,0xBAD03605,0xCDD70693,0x54DE5729,0x23D967BF, + 0xB3667A2E,0xC4614AB8,0x5D681B02,0x2A6F2B94,0xB40BBE37,0xC30C8EA1,0x5A05DF1B,0x2D02EF8D, + }, + { + 0x00000000,0x191B3141,0x32366282,0x2B2D53C3,0x646CC504,0x7D77F445,0x565AA786,0x4F4196C7, + 0xC8D98A08,0xD1C2BB49,0xFAEFE88A,0xE3F4D9CB,0xACB54F0C,0xB5AE7E4D,0x9E832D8E,0x87981CCF, + 0x4AC21251,0x53D92310,0x78F470D3,0x61EF4192,0x2EAED755,0x37B5E614,0x1C98B5D7,0x05838496, + 
0x821B9859,0x9B00A918,0xB02DFADB,0xA936CB9A,0xE6775D5D,0xFF6C6C1C,0xD4413FDF,0xCD5A0E9E, + 0x958424A2,0x8C9F15E3,0xA7B24620,0xBEA97761,0xF1E8E1A6,0xE8F3D0E7,0xC3DE8324,0xDAC5B265, + 0x5D5DAEAA,0x44469FEB,0x6F6BCC28,0x7670FD69,0x39316BAE,0x202A5AEF,0x0B07092C,0x121C386D, + 0xDF4636F3,0xC65D07B2,0xED705471,0xF46B6530,0xBB2AF3F7,0xA231C2B6,0x891C9175,0x9007A034, + 0x179FBCFB,0x0E848DBA,0x25A9DE79,0x3CB2EF38,0x73F379FF,0x6AE848BE,0x41C51B7D,0x58DE2A3C, + 0xF0794F05,0xE9627E44,0xC24F2D87,0xDB541CC6,0x94158A01,0x8D0EBB40,0xA623E883,0xBF38D9C2, + 0x38A0C50D,0x21BBF44C,0x0A96A78F,0x138D96CE,0x5CCC0009,0x45D73148,0x6EFA628B,0x77E153CA, + 0xBABB5D54,0xA3A06C15,0x888D3FD6,0x91960E97,0xDED79850,0xC7CCA911,0xECE1FAD2,0xF5FACB93, + 0x7262D75C,0x6B79E61D,0x4054B5DE,0x594F849F,0x160E1258,0x0F152319,0x243870DA,0x3D23419B, + 0x65FD6BA7,0x7CE65AE6,0x57CB0925,0x4ED03864,0x0191AEA3,0x188A9FE2,0x33A7CC21,0x2ABCFD60, + 0xAD24E1AF,0xB43FD0EE,0x9F12832D,0x8609B26C,0xC94824AB,0xD05315EA,0xFB7E4629,0xE2657768, + 0x2F3F79F6,0x362448B7,0x1D091B74,0x04122A35,0x4B53BCF2,0x52488DB3,0x7965DE70,0x607EEF31, + 0xE7E6F3FE,0xFEFDC2BF,0xD5D0917C,0xCCCBA03D,0x838A36FA,0x9A9107BB,0xB1BC5478,0xA8A76539, + 0x3B83984B,0x2298A90A,0x09B5FAC9,0x10AECB88,0x5FEF5D4F,0x46F46C0E,0x6DD93FCD,0x74C20E8C, + 0xF35A1243,0xEA412302,0xC16C70C1,0xD8774180,0x9736D747,0x8E2DE606,0xA500B5C5,0xBC1B8484, + 0x71418A1A,0x685ABB5B,0x4377E898,0x5A6CD9D9,0x152D4F1E,0x0C367E5F,0x271B2D9C,0x3E001CDD, + 0xB9980012,0xA0833153,0x8BAE6290,0x92B553D1,0xDDF4C516,0xC4EFF457,0xEFC2A794,0xF6D996D5, + 0xAE07BCE9,0xB71C8DA8,0x9C31DE6B,0x852AEF2A,0xCA6B79ED,0xD37048AC,0xF85D1B6F,0xE1462A2E, + 0x66DE36E1,0x7FC507A0,0x54E85463,0x4DF36522,0x02B2F3E5,0x1BA9C2A4,0x30849167,0x299FA026, + 0xE4C5AEB8,0xFDDE9FF9,0xD6F3CC3A,0xCFE8FD7B,0x80A96BBC,0x99B25AFD,0xB29F093E,0xAB84387F, + 0x2C1C24B0,0x350715F1,0x1E2A4632,0x07317773,0x4870E1B4,0x516BD0F5,0x7A468336,0x635DB277, + 0xCBFAD74E,0xD2E1E60F,0xF9CCB5CC,0xE0D7848D,0xAF96124A,0xB68D230B,0x9DA070C8,0x84BB4189, 
+ 0x03235D46,0x1A386C07,0x31153FC4,0x280E0E85,0x674F9842,0x7E54A903,0x5579FAC0,0x4C62CB81, + 0x8138C51F,0x9823F45E,0xB30EA79D,0xAA1596DC,0xE554001B,0xFC4F315A,0xD7626299,0xCE7953D8, + 0x49E14F17,0x50FA7E56,0x7BD72D95,0x62CC1CD4,0x2D8D8A13,0x3496BB52,0x1FBBE891,0x06A0D9D0, + 0x5E7EF3EC,0x4765C2AD,0x6C48916E,0x7553A02F,0x3A1236E8,0x230907A9,0x0824546A,0x113F652B, + 0x96A779E4,0x8FBC48A5,0xA4911B66,0xBD8A2A27,0xF2CBBCE0,0xEBD08DA1,0xC0FDDE62,0xD9E6EF23, + 0x14BCE1BD,0x0DA7D0FC,0x268A833F,0x3F91B27E,0x70D024B9,0x69CB15F8,0x42E6463B,0x5BFD777A, + 0xDC656BB5,0xC57E5AF4,0xEE530937,0xF7483876,0xB809AEB1,0xA1129FF0,0x8A3FCC33,0x9324FD72, + }, + { + 0x00000000,0x01C26A37,0x0384D46E,0x0246BE59,0x0709A8DC,0x06CBC2EB,0x048D7CB2,0x054F1685, + 0x0E1351B8,0x0FD13B8F,0x0D9785D6,0x0C55EFE1,0x091AF964,0x08D89353,0x0A9E2D0A,0x0B5C473D, + 0x1C26A370,0x1DE4C947,0x1FA2771E,0x1E601D29,0x1B2F0BAC,0x1AED619B,0x18ABDFC2,0x1969B5F5, + 0x1235F2C8,0x13F798FF,0x11B126A6,0x10734C91,0x153C5A14,0x14FE3023,0x16B88E7A,0x177AE44D, + 0x384D46E0,0x398F2CD7,0x3BC9928E,0x3A0BF8B9,0x3F44EE3C,0x3E86840B,0x3CC03A52,0x3D025065, + 0x365E1758,0x379C7D6F,0x35DAC336,0x3418A901,0x3157BF84,0x3095D5B3,0x32D36BEA,0x331101DD, + 0x246BE590,0x25A98FA7,0x27EF31FE,0x262D5BC9,0x23624D4C,0x22A0277B,0x20E69922,0x2124F315, + 0x2A78B428,0x2BBADE1F,0x29FC6046,0x283E0A71,0x2D711CF4,0x2CB376C3,0x2EF5C89A,0x2F37A2AD, + 0x709A8DC0,0x7158E7F7,0x731E59AE,0x72DC3399,0x7793251C,0x76514F2B,0x7417F172,0x75D59B45, + 0x7E89DC78,0x7F4BB64F,0x7D0D0816,0x7CCF6221,0x798074A4,0x78421E93,0x7A04A0CA,0x7BC6CAFD, + 0x6CBC2EB0,0x6D7E4487,0x6F38FADE,0x6EFA90E9,0x6BB5866C,0x6A77EC5B,0x68315202,0x69F33835, + 0x62AF7F08,0x636D153F,0x612BAB66,0x60E9C151,0x65A6D7D4,0x6464BDE3,0x662203BA,0x67E0698D, + 0x48D7CB20,0x4915A117,0x4B531F4E,0x4A917579,0x4FDE63FC,0x4E1C09CB,0x4C5AB792,0x4D98DDA5, + 0x46C49A98,0x4706F0AF,0x45404EF6,0x448224C1,0x41CD3244,0x400F5873,0x4249E62A,0x438B8C1D, + 
0x54F16850,0x55330267,0x5775BC3E,0x56B7D609,0x53F8C08C,0x523AAABB,0x507C14E2,0x51BE7ED5, + 0x5AE239E8,0x5B2053DF,0x5966ED86,0x58A487B1,0x5DEB9134,0x5C29FB03,0x5E6F455A,0x5FAD2F6D, + 0xE1351B80,0xE0F771B7,0xE2B1CFEE,0xE373A5D9,0xE63CB35C,0xE7FED96B,0xE5B86732,0xE47A0D05, + 0xEF264A38,0xEEE4200F,0xECA29E56,0xED60F461,0xE82FE2E4,0xE9ED88D3,0xEBAB368A,0xEA695CBD, + 0xFD13B8F0,0xFCD1D2C7,0xFE976C9E,0xFF5506A9,0xFA1A102C,0xFBD87A1B,0xF99EC442,0xF85CAE75, + 0xF300E948,0xF2C2837F,0xF0843D26,0xF1465711,0xF4094194,0xF5CB2BA3,0xF78D95FA,0xF64FFFCD, + 0xD9785D60,0xD8BA3757,0xDAFC890E,0xDB3EE339,0xDE71F5BC,0xDFB39F8B,0xDDF521D2,0xDC374BE5, + 0xD76B0CD8,0xD6A966EF,0xD4EFD8B6,0xD52DB281,0xD062A404,0xD1A0CE33,0xD3E6706A,0xD2241A5D, + 0xC55EFE10,0xC49C9427,0xC6DA2A7E,0xC7184049,0xC25756CC,0xC3953CFB,0xC1D382A2,0xC011E895, + 0xCB4DAFA8,0xCA8FC59F,0xC8C97BC6,0xC90B11F1,0xCC440774,0xCD866D43,0xCFC0D31A,0xCE02B92D, + 0x91AF9640,0x906DFC77,0x922B422E,0x93E92819,0x96A63E9C,0x976454AB,0x9522EAF2,0x94E080C5, + 0x9FBCC7F8,0x9E7EADCF,0x9C381396,0x9DFA79A1,0x98B56F24,0x99770513,0x9B31BB4A,0x9AF3D17D, + 0x8D893530,0x8C4B5F07,0x8E0DE15E,0x8FCF8B69,0x8A809DEC,0x8B42F7DB,0x89044982,0x88C623B5, + 0x839A6488,0x82580EBF,0x801EB0E6,0x81DCDAD1,0x8493CC54,0x8551A663,0x8717183A,0x86D5720D, + 0xA9E2D0A0,0xA820BA97,0xAA6604CE,0xABA46EF9,0xAEEB787C,0xAF29124B,0xAD6FAC12,0xACADC625, + 0xA7F18118,0xA633EB2F,0xA4755576,0xA5B73F41,0xA0F829C4,0xA13A43F3,0xA37CFDAA,0xA2BE979D, + 0xB5C473D0,0xB40619E7,0xB640A7BE,0xB782CD89,0xB2CDDB0C,0xB30FB13B,0xB1490F62,0xB08B6555, + 0xBBD72268,0xBA15485F,0xB853F606,0xB9919C31,0xBCDE8AB4,0xBD1CE083,0xBF5A5EDA,0xBE9834ED, + }, + { + 0x00000000,0xB8BC6765,0xAA09C88B,0x12B5AFEE,0x8F629757,0x37DEF032,0x256B5FDC,0x9DD738B9, + 0xC5B428EF,0x7D084F8A,0x6FBDE064,0xD7018701,0x4AD6BFB8,0xF26AD8DD,0xE0DF7733,0x58631056, + 0x5019579F,0xE8A530FA,0xFA109F14,0x42ACF871,0xDF7BC0C8,0x67C7A7AD,0x75720843,0xCDCE6F26, + 
0x95AD7F70,0x2D111815,0x3FA4B7FB,0x8718D09E,0x1ACFE827,0xA2738F42,0xB0C620AC,0x087A47C9, + 0xA032AF3E,0x188EC85B,0x0A3B67B5,0xB28700D0,0x2F503869,0x97EC5F0C,0x8559F0E2,0x3DE59787, + 0x658687D1,0xDD3AE0B4,0xCF8F4F5A,0x7733283F,0xEAE41086,0x525877E3,0x40EDD80D,0xF851BF68, + 0xF02BF8A1,0x48979FC4,0x5A22302A,0xE29E574F,0x7F496FF6,0xC7F50893,0xD540A77D,0x6DFCC018, + 0x359FD04E,0x8D23B72B,0x9F9618C5,0x272A7FA0,0xBAFD4719,0x0241207C,0x10F48F92,0xA848E8F7, + 0x9B14583D,0x23A83F58,0x311D90B6,0x89A1F7D3,0x1476CF6A,0xACCAA80F,0xBE7F07E1,0x06C36084, + 0x5EA070D2,0xE61C17B7,0xF4A9B859,0x4C15DF3C,0xD1C2E785,0x697E80E0,0x7BCB2F0E,0xC377486B, + 0xCB0D0FA2,0x73B168C7,0x6104C729,0xD9B8A04C,0x446F98F5,0xFCD3FF90,0xEE66507E,0x56DA371B, + 0x0EB9274D,0xB6054028,0xA4B0EFC6,0x1C0C88A3,0x81DBB01A,0x3967D77F,0x2BD27891,0x936E1FF4, + 0x3B26F703,0x839A9066,0x912F3F88,0x299358ED,0xB4446054,0x0CF80731,0x1E4DA8DF,0xA6F1CFBA, + 0xFE92DFEC,0x462EB889,0x549B1767,0xEC277002,0x71F048BB,0xC94C2FDE,0xDBF98030,0x6345E755, + 0x6B3FA09C,0xD383C7F9,0xC1366817,0x798A0F72,0xE45D37CB,0x5CE150AE,0x4E54FF40,0xF6E89825, + 0xAE8B8873,0x1637EF16,0x048240F8,0xBC3E279D,0x21E91F24,0x99557841,0x8BE0D7AF,0x335CB0CA, + 0xED59B63B,0x55E5D15E,0x47507EB0,0xFFEC19D5,0x623B216C,0xDA874609,0xC832E9E7,0x708E8E82, + 0x28ED9ED4,0x9051F9B1,0x82E4565F,0x3A58313A,0xA78F0983,0x1F336EE6,0x0D86C108,0xB53AA66D, + 0xBD40E1A4,0x05FC86C1,0x1749292F,0xAFF54E4A,0x322276F3,0x8A9E1196,0x982BBE78,0x2097D91D, + 0x78F4C94B,0xC048AE2E,0xD2FD01C0,0x6A4166A5,0xF7965E1C,0x4F2A3979,0x5D9F9697,0xE523F1F2, + 0x4D6B1905,0xF5D77E60,0xE762D18E,0x5FDEB6EB,0xC2098E52,0x7AB5E937,0x680046D9,0xD0BC21BC, + 0x88DF31EA,0x3063568F,0x22D6F961,0x9A6A9E04,0x07BDA6BD,0xBF01C1D8,0xADB46E36,0x15080953, + 0x1D724E9A,0xA5CE29FF,0xB77B8611,0x0FC7E174,0x9210D9CD,0x2AACBEA8,0x38191146,0x80A57623, + 0xD8C66675,0x607A0110,0x72CFAEFE,0xCA73C99B,0x57A4F122,0xEF189647,0xFDAD39A9,0x45115ECC, + 0x764DEE06,0xCEF18963,0xDC44268D,0x64F841E8,0xF92F7951,0x41931E34,0x5326B1DA,0xEB9AD6BF, 
+ 0xB3F9C6E9,0x0B45A18C,0x19F00E62,0xA14C6907,0x3C9B51BE,0x842736DB,0x96929935,0x2E2EFE50, + 0x2654B999,0x9EE8DEFC,0x8C5D7112,0x34E11677,0xA9362ECE,0x118A49AB,0x033FE645,0xBB838120, + 0xE3E09176,0x5B5CF613,0x49E959FD,0xF1553E98,0x6C820621,0xD43E6144,0xC68BCEAA,0x7E37A9CF, + 0xD67F4138,0x6EC3265D,0x7C7689B3,0xC4CAEED6,0x591DD66F,0xE1A1B10A,0xF3141EE4,0x4BA87981, + 0x13CB69D7,0xAB770EB2,0xB9C2A15C,0x017EC639,0x9CA9FE80,0x241599E5,0x36A0360B,0x8E1C516E, + 0x866616A7,0x3EDA71C2,0x2C6FDE2C,0x94D3B949,0x090481F0,0xB1B8E695,0xA30D497B,0x1BB12E1E, + 0x43D23E48,0xFB6E592D,0xE9DBF6C3,0x516791A6,0xCCB0A91F,0x740CCE7A,0x66B96194,0xDE0506F1, + }, + { + 0x00000000,0x3D6029B0,0x7AC05360,0x47A07AD0,0xF580A6C0,0xC8E08F70,0x8F40F5A0,0xB220DC10, + 0x30704BC1,0x0D106271,0x4AB018A1,0x77D03111,0xC5F0ED01,0xF890C4B1,0xBF30BE61,0x825097D1, + 0x60E09782,0x5D80BE32,0x1A20C4E2,0x2740ED52,0x95603142,0xA80018F2,0xEFA06222,0xD2C04B92, + 0x5090DC43,0x6DF0F5F3,0x2A508F23,0x1730A693,0xA5107A83,0x98705333,0xDFD029E3,0xE2B00053, + 0xC1C12F04,0xFCA106B4,0xBB017C64,0x866155D4,0x344189C4,0x0921A074,0x4E81DAA4,0x73E1F314, + 0xF1B164C5,0xCCD14D75,0x8B7137A5,0xB6111E15,0x0431C205,0x3951EBB5,0x7EF19165,0x4391B8D5, + 0xA121B886,0x9C419136,0xDBE1EBE6,0xE681C256,0x54A11E46,0x69C137F6,0x2E614D26,0x13016496, + 0x9151F347,0xAC31DAF7,0xEB91A027,0xD6F18997,0x64D15587,0x59B17C37,0x1E1106E7,0x23712F57, + 0x58F35849,0x659371F9,0x22330B29,0x1F532299,0xAD73FE89,0x9013D739,0xD7B3ADE9,0xEAD38459, + 0x68831388,0x55E33A38,0x124340E8,0x2F236958,0x9D03B548,0xA0639CF8,0xE7C3E628,0xDAA3CF98, + 0x3813CFCB,0x0573E67B,0x42D39CAB,0x7FB3B51B,0xCD93690B,0xF0F340BB,0xB7533A6B,0x8A3313DB, + 0x0863840A,0x3503ADBA,0x72A3D76A,0x4FC3FEDA,0xFDE322CA,0xC0830B7A,0x872371AA,0xBA43581A, + 0x9932774D,0xA4525EFD,0xE3F2242D,0xDE920D9D,0x6CB2D18D,0x51D2F83D,0x167282ED,0x2B12AB5D, + 0xA9423C8C,0x9422153C,0xD3826FEC,0xEEE2465C,0x5CC29A4C,0x61A2B3FC,0x2602C92C,0x1B62E09C, + 
0xF9D2E0CF,0xC4B2C97F,0x8312B3AF,0xBE729A1F,0x0C52460F,0x31326FBF,0x7692156F,0x4BF23CDF, + 0xC9A2AB0E,0xF4C282BE,0xB362F86E,0x8E02D1DE,0x3C220DCE,0x0142247E,0x46E25EAE,0x7B82771E, + 0xB1E6B092,0x8C869922,0xCB26E3F2,0xF646CA42,0x44661652,0x79063FE2,0x3EA64532,0x03C66C82, + 0x8196FB53,0xBCF6D2E3,0xFB56A833,0xC6368183,0x74165D93,0x49767423,0x0ED60EF3,0x33B62743, + 0xD1062710,0xEC660EA0,0xABC67470,0x96A65DC0,0x248681D0,0x19E6A860,0x5E46D2B0,0x6326FB00, + 0xE1766CD1,0xDC164561,0x9BB63FB1,0xA6D61601,0x14F6CA11,0x2996E3A1,0x6E369971,0x5356B0C1, + 0x70279F96,0x4D47B626,0x0AE7CCF6,0x3787E546,0x85A73956,0xB8C710E6,0xFF676A36,0xC2074386, + 0x4057D457,0x7D37FDE7,0x3A978737,0x07F7AE87,0xB5D77297,0x88B75B27,0xCF1721F7,0xF2770847, + 0x10C70814,0x2DA721A4,0x6A075B74,0x576772C4,0xE547AED4,0xD8278764,0x9F87FDB4,0xA2E7D404, + 0x20B743D5,0x1DD76A65,0x5A7710B5,0x67173905,0xD537E515,0xE857CCA5,0xAFF7B675,0x92979FC5, + 0xE915E8DB,0xD475C16B,0x93D5BBBB,0xAEB5920B,0x1C954E1B,0x21F567AB,0x66551D7B,0x5B3534CB, + 0xD965A31A,0xE4058AAA,0xA3A5F07A,0x9EC5D9CA,0x2CE505DA,0x11852C6A,0x562556BA,0x6B457F0A, + 0x89F57F59,0xB49556E9,0xF3352C39,0xCE550589,0x7C75D999,0x4115F029,0x06B58AF9,0x3BD5A349, + 0xB9853498,0x84E51D28,0xC34567F8,0xFE254E48,0x4C059258,0x7165BBE8,0x36C5C138,0x0BA5E888, + 0x28D4C7DF,0x15B4EE6F,0x521494BF,0x6F74BD0F,0xDD54611F,0xE03448AF,0xA794327F,0x9AF41BCF, + 0x18A48C1E,0x25C4A5AE,0x6264DF7E,0x5F04F6CE,0xED242ADE,0xD044036E,0x97E479BE,0xAA84500E, + 0x4834505D,0x755479ED,0x32F4033D,0x0F942A8D,0xBDB4F69D,0x80D4DF2D,0xC774A5FD,0xFA148C4D, + 0x78441B9C,0x4524322C,0x028448FC,0x3FE4614C,0x8DC4BD5C,0xB0A494EC,0xF704EE3C,0xCA64C78C, + }, + { + 0x00000000,0xCB5CD3A5,0x4DC8A10B,0x869472AE,0x9B914216,0x50CD91B3,0xD659E31D,0x1D0530B8, + 0xEC53826D,0x270F51C8,0xA19B2366,0x6AC7F0C3,0x77C2C07B,0xBC9E13DE,0x3A0A6170,0xF156B2D5, + 0x03D6029B,0xC88AD13E,0x4E1EA390,0x85427035,0x9847408D,0x531B9328,0xD58FE186,0x1ED33223, + 
0xEF8580F6,0x24D95353,0xA24D21FD,0x6911F258,0x7414C2E0,0xBF481145,0x39DC63EB,0xF280B04E, + 0x07AC0536,0xCCF0D693,0x4A64A43D,0x81387798,0x9C3D4720,0x57619485,0xD1F5E62B,0x1AA9358E, + 0xEBFF875B,0x20A354FE,0xA6372650,0x6D6BF5F5,0x706EC54D,0xBB3216E8,0x3DA66446,0xF6FAB7E3, + 0x047A07AD,0xCF26D408,0x49B2A6A6,0x82EE7503,0x9FEB45BB,0x54B7961E,0xD223E4B0,0x197F3715, + 0xE82985C0,0x23755665,0xA5E124CB,0x6EBDF76E,0x73B8C7D6,0xB8E41473,0x3E7066DD,0xF52CB578, + 0x0F580A6C,0xC404D9C9,0x4290AB67,0x89CC78C2,0x94C9487A,0x5F959BDF,0xD901E971,0x125D3AD4, + 0xE30B8801,0x28575BA4,0xAEC3290A,0x659FFAAF,0x789ACA17,0xB3C619B2,0x35526B1C,0xFE0EB8B9, + 0x0C8E08F7,0xC7D2DB52,0x4146A9FC,0x8A1A7A59,0x971F4AE1,0x5C439944,0xDAD7EBEA,0x118B384F, + 0xE0DD8A9A,0x2B81593F,0xAD152B91,0x6649F834,0x7B4CC88C,0xB0101B29,0x36846987,0xFDD8BA22, + 0x08F40F5A,0xC3A8DCFF,0x453CAE51,0x8E607DF4,0x93654D4C,0x58399EE9,0xDEADEC47,0x15F13FE2, + 0xE4A78D37,0x2FFB5E92,0xA96F2C3C,0x6233FF99,0x7F36CF21,0xB46A1C84,0x32FE6E2A,0xF9A2BD8F, + 0x0B220DC1,0xC07EDE64,0x46EAACCA,0x8DB67F6F,0x90B34FD7,0x5BEF9C72,0xDD7BEEDC,0x16273D79, + 0xE7718FAC,0x2C2D5C09,0xAAB92EA7,0x61E5FD02,0x7CE0CDBA,0xB7BC1E1F,0x31286CB1,0xFA74BF14, + 0x1EB014D8,0xD5ECC77D,0x5378B5D3,0x98246676,0x852156CE,0x4E7D856B,0xC8E9F7C5,0x03B52460, + 0xF2E396B5,0x39BF4510,0xBF2B37BE,0x7477E41B,0x6972D4A3,0xA22E0706,0x24BA75A8,0xEFE6A60D, + 0x1D661643,0xD63AC5E6,0x50AEB748,0x9BF264ED,0x86F75455,0x4DAB87F0,0xCB3FF55E,0x006326FB, + 0xF135942E,0x3A69478B,0xBCFD3525,0x77A1E680,0x6AA4D638,0xA1F8059D,0x276C7733,0xEC30A496, + 0x191C11EE,0xD240C24B,0x54D4B0E5,0x9F886340,0x828D53F8,0x49D1805D,0xCF45F2F3,0x04192156, + 0xF54F9383,0x3E134026,0xB8873288,0x73DBE12D,0x6EDED195,0xA5820230,0x2316709E,0xE84AA33B, + 0x1ACA1375,0xD196C0D0,0x5702B27E,0x9C5E61DB,0x815B5163,0x4A0782C6,0xCC93F068,0x07CF23CD, + 0xF6999118,0x3DC542BD,0xBB513013,0x700DE3B6,0x6D08D30E,0xA65400AB,0x20C07205,0xEB9CA1A0, + 0x11E81EB4,0xDAB4CD11,0x5C20BFBF,0x977C6C1A,0x8A795CA2,0x41258F07,0xC7B1FDA9,0x0CED2E0C, 
+ 0xFDBB9CD9,0x36E74F7C,0xB0733DD2,0x7B2FEE77,0x662ADECF,0xAD760D6A,0x2BE27FC4,0xE0BEAC61, + 0x123E1C2F,0xD962CF8A,0x5FF6BD24,0x94AA6E81,0x89AF5E39,0x42F38D9C,0xC467FF32,0x0F3B2C97, + 0xFE6D9E42,0x35314DE7,0xB3A53F49,0x78F9ECEC,0x65FCDC54,0xAEA00FF1,0x28347D5F,0xE368AEFA, + 0x16441B82,0xDD18C827,0x5B8CBA89,0x90D0692C,0x8DD55994,0x46898A31,0xC01DF89F,0x0B412B3A, + 0xFA1799EF,0x314B4A4A,0xB7DF38E4,0x7C83EB41,0x6186DBF9,0xAADA085C,0x2C4E7AF2,0xE712A957, + 0x15921919,0xDECECABC,0x585AB812,0x93066BB7,0x8E035B0F,0x455F88AA,0xC3CBFA04,0x089729A1, + 0xF9C19B74,0x329D48D1,0xB4093A7F,0x7F55E9DA,0x6250D962,0xA90C0AC7,0x2F987869,0xE4C4ABCC, + }, + { + 0x00000000,0xA6770BB4,0x979F1129,0x31E81A9D,0xF44F2413,0x52382FA7,0x63D0353A,0xC5A73E8E, + 0x33EF4E67,0x959845D3,0xA4705F4E,0x020754FA,0xC7A06A74,0x61D761C0,0x503F7B5D,0xF64870E9, + 0x67DE9CCE,0xC1A9977A,0xF0418DE7,0x56368653,0x9391B8DD,0x35E6B369,0x040EA9F4,0xA279A240, + 0x5431D2A9,0xF246D91D,0xC3AEC380,0x65D9C834,0xA07EF6BA,0x0609FD0E,0x37E1E793,0x9196EC27, + 0xCFBD399C,0x69CA3228,0x582228B5,0xFE552301,0x3BF21D8F,0x9D85163B,0xAC6D0CA6,0x0A1A0712, + 0xFC5277FB,0x5A257C4F,0x6BCD66D2,0xCDBA6D66,0x081D53E8,0xAE6A585C,0x9F8242C1,0x39F54975, + 0xA863A552,0x0E14AEE6,0x3FFCB47B,0x998BBFCF,0x5C2C8141,0xFA5B8AF5,0xCBB39068,0x6DC49BDC, + 0x9B8CEB35,0x3DFBE081,0x0C13FA1C,0xAA64F1A8,0x6FC3CF26,0xC9B4C492,0xF85CDE0F,0x5E2BD5BB, + 0x440B7579,0xE27C7ECD,0xD3946450,0x75E36FE4,0xB044516A,0x16335ADE,0x27DB4043,0x81AC4BF7, + 0x77E43B1E,0xD19330AA,0xE07B2A37,0x460C2183,0x83AB1F0D,0x25DC14B9,0x14340E24,0xB2430590, + 0x23D5E9B7,0x85A2E203,0xB44AF89E,0x123DF32A,0xD79ACDA4,0x71EDC610,0x4005DC8D,0xE672D739, + 0x103AA7D0,0xB64DAC64,0x87A5B6F9,0x21D2BD4D,0xE47583C3,0x42028877,0x73EA92EA,0xD59D995E, + 0x8BB64CE5,0x2DC14751,0x1C295DCC,0xBA5E5678,0x7FF968F6,0xD98E6342,0xE86679DF,0x4E11726B, + 0xB8590282,0x1E2E0936,0x2FC613AB,0x89B1181F,0x4C162691,0xEA612D25,0xDB8937B8,0x7DFE3C0C, + 
0xEC68D02B,0x4A1FDB9F,0x7BF7C102,0xDD80CAB6,0x1827F438,0xBE50FF8C,0x8FB8E511,0x29CFEEA5, + 0xDF879E4C,0x79F095F8,0x48188F65,0xEE6F84D1,0x2BC8BA5F,0x8DBFB1EB,0xBC57AB76,0x1A20A0C2, + 0x8816EAF2,0x2E61E146,0x1F89FBDB,0xB9FEF06F,0x7C59CEE1,0xDA2EC555,0xEBC6DFC8,0x4DB1D47C, + 0xBBF9A495,0x1D8EAF21,0x2C66B5BC,0x8A11BE08,0x4FB68086,0xE9C18B32,0xD82991AF,0x7E5E9A1B, + 0xEFC8763C,0x49BF7D88,0x78576715,0xDE206CA1,0x1B87522F,0xBDF0599B,0x8C184306,0x2A6F48B2, + 0xDC27385B,0x7A5033EF,0x4BB82972,0xEDCF22C6,0x28681C48,0x8E1F17FC,0xBFF70D61,0x198006D5, + 0x47ABD36E,0xE1DCD8DA,0xD034C247,0x7643C9F3,0xB3E4F77D,0x1593FCC9,0x247BE654,0x820CEDE0, + 0x74449D09,0xD23396BD,0xE3DB8C20,0x45AC8794,0x800BB91A,0x267CB2AE,0x1794A833,0xB1E3A387, + 0x20754FA0,0x86024414,0xB7EA5E89,0x119D553D,0xD43A6BB3,0x724D6007,0x43A57A9A,0xE5D2712E, + 0x139A01C7,0xB5ED0A73,0x840510EE,0x22721B5A,0xE7D525D4,0x41A22E60,0x704A34FD,0xD63D3F49, + 0xCC1D9F8B,0x6A6A943F,0x5B828EA2,0xFDF58516,0x3852BB98,0x9E25B02C,0xAFCDAAB1,0x09BAA105, + 0xFFF2D1EC,0x5985DA58,0x686DC0C5,0xCE1ACB71,0x0BBDF5FF,0xADCAFE4B,0x9C22E4D6,0x3A55EF62, + 0xABC30345,0x0DB408F1,0x3C5C126C,0x9A2B19D8,0x5F8C2756,0xF9FB2CE2,0xC813367F,0x6E643DCB, + 0x982C4D22,0x3E5B4696,0x0FB35C0B,0xA9C457BF,0x6C636931,0xCA146285,0xFBFC7818,0x5D8B73AC, + 0x03A0A617,0xA5D7ADA3,0x943FB73E,0x3248BC8A,0xF7EF8204,0x519889B0,0x6070932D,0xC6079899, + 0x304FE870,0x9638E3C4,0xA7D0F959,0x01A7F2ED,0xC400CC63,0x6277C7D7,0x539FDD4A,0xF5E8D6FE, + 0x647E3AD9,0xC209316D,0xF3E12BF0,0x55962044,0x90311ECA,0x3646157E,0x07AE0FE3,0xA1D90457, + 0x579174BE,0xF1E67F0A,0xC00E6597,0x66796E23,0xA3DE50AD,0x05A95B19,0x34414184,0x92364A30, + }, + { + 0x00000000,0xCCAA009E,0x4225077D,0x8E8F07E3,0x844A0EFA,0x48E00E64,0xC66F0987,0x0AC50919, + 0xD3E51BB5,0x1F4F1B2B,0x91C01CC8,0x5D6A1C56,0x57AF154F,0x9B0515D1,0x158A1232,0xD92012AC, + 0x7CBB312B,0xB01131B5,0x3E9E3656,0xF23436C8,0xF8F13FD1,0x345B3F4F,0xBAD438AC,0x767E3832, + 
0xAF5E2A9E,0x63F42A00,0xED7B2DE3,0x21D12D7D,0x2B142464,0xE7BE24FA,0x69312319,0xA59B2387, + 0xF9766256,0x35DC62C8,0xBB53652B,0x77F965B5,0x7D3C6CAC,0xB1966C32,0x3F196BD1,0xF3B36B4F, + 0x2A9379E3,0xE639797D,0x68B67E9E,0xA41C7E00,0xAED97719,0x62737787,0xECFC7064,0x205670FA, + 0x85CD537D,0x496753E3,0xC7E85400,0x0B42549E,0x01875D87,0xCD2D5D19,0x43A25AFA,0x8F085A64, + 0x562848C8,0x9A824856,0x140D4FB5,0xD8A74F2B,0xD2624632,0x1EC846AC,0x9047414F,0x5CED41D1, + 0x299DC2ED,0xE537C273,0x6BB8C590,0xA712C50E,0xADD7CC17,0x617DCC89,0xEFF2CB6A,0x2358CBF4, + 0xFA78D958,0x36D2D9C6,0xB85DDE25,0x74F7DEBB,0x7E32D7A2,0xB298D73C,0x3C17D0DF,0xF0BDD041, + 0x5526F3C6,0x998CF358,0x1703F4BB,0xDBA9F425,0xD16CFD3C,0x1DC6FDA2,0x9349FA41,0x5FE3FADF, + 0x86C3E873,0x4A69E8ED,0xC4E6EF0E,0x084CEF90,0x0289E689,0xCE23E617,0x40ACE1F4,0x8C06E16A, + 0xD0EBA0BB,0x1C41A025,0x92CEA7C6,0x5E64A758,0x54A1AE41,0x980BAEDF,0x1684A93C,0xDA2EA9A2, + 0x030EBB0E,0xCFA4BB90,0x412BBC73,0x8D81BCED,0x8744B5F4,0x4BEEB56A,0xC561B289,0x09CBB217, + 0xAC509190,0x60FA910E,0xEE7596ED,0x22DF9673,0x281A9F6A,0xE4B09FF4,0x6A3F9817,0xA6959889, + 0x7FB58A25,0xB31F8ABB,0x3D908D58,0xF13A8DC6,0xFBFF84DF,0x37558441,0xB9DA83A2,0x7570833C, + 0x533B85DA,0x9F918544,0x111E82A7,0xDDB48239,0xD7718B20,0x1BDB8BBE,0x95548C5D,0x59FE8CC3, + 0x80DE9E6F,0x4C749EF1,0xC2FB9912,0x0E51998C,0x04949095,0xC83E900B,0x46B197E8,0x8A1B9776, + 0x2F80B4F1,0xE32AB46F,0x6DA5B38C,0xA10FB312,0xABCABA0B,0x6760BA95,0xE9EFBD76,0x2545BDE8, + 0xFC65AF44,0x30CFAFDA,0xBE40A839,0x72EAA8A7,0x782FA1BE,0xB485A120,0x3A0AA6C3,0xF6A0A65D, + 0xAA4DE78C,0x66E7E712,0xE868E0F1,0x24C2E06F,0x2E07E976,0xE2ADE9E8,0x6C22EE0B,0xA088EE95, + 0x79A8FC39,0xB502FCA7,0x3B8DFB44,0xF727FBDA,0xFDE2F2C3,0x3148F25D,0xBFC7F5BE,0x736DF520, + 0xD6F6D6A7,0x1A5CD639,0x94D3D1DA,0x5879D144,0x52BCD85D,0x9E16D8C3,0x1099DF20,0xDC33DFBE, + 0x0513CD12,0xC9B9CD8C,0x4736CA6F,0x8B9CCAF1,0x8159C3E8,0x4DF3C376,0xC37CC495,0x0FD6C40B, + 0x7AA64737,0xB60C47A9,0x3883404A,0xF42940D4,0xFEEC49CD,0x32464953,0xBCC94EB0,0x70634E2E, 
+ 0xA9435C82,0x65E95C1C,0xEB665BFF,0x27CC5B61,0x2D095278,0xE1A352E6,0x6F2C5505,0xA386559B, + 0x061D761C,0xCAB77682,0x44387161,0x889271FF,0x825778E6,0x4EFD7878,0xC0727F9B,0x0CD87F05, + 0xD5F86DA9,0x19526D37,0x97DD6AD4,0x5B776A4A,0x51B26353,0x9D1863CD,0x1397642E,0xDF3D64B0, + 0x83D02561,0x4F7A25FF,0xC1F5221C,0x0D5F2282,0x079A2B9B,0xCB302B05,0x45BF2CE6,0x89152C78, + 0x50353ED4,0x9C9F3E4A,0x121039A9,0xDEBA3937,0xD47F302E,0x18D530B0,0x965A3753,0x5AF037CD, + 0xFF6B144A,0x33C114D4,0xBD4E1337,0x71E413A9,0x7B211AB0,0xB78B1A2E,0x39041DCD,0xF5AE1D53, + 0x2C8E0FFF,0xE0240F61,0x6EAB0882,0xA201081C,0xA8C40105,0x646E019B,0xEAE10678,0x264B06E6, + }, + { + 0x00000000,0x177B1443,0x2EF62886,0x398D3CC5,0x5DEC510C,0x4A97454F,0x731A798A,0x64616DC9, + 0xBBD8A218,0xACA3B65B,0x952E8A9E,0x82559EDD,0xE634F314,0xF14FE757,0xC8C2DB92,0xDFB9CFD1, + 0xACC04271,0xBBBB5632,0x82366AF7,0x954D7EB4,0xF12C137D,0xE657073E,0xDFDA3BFB,0xC8A12FB8, + 0x1718E069,0x0063F42A,0x39EEC8EF,0x2E95DCAC,0x4AF4B165,0x5D8FA526,0x640299E3,0x73798DA0, + 0x82F182A3,0x958A96E0,0xAC07AA25,0xBB7CBE66,0xDF1DD3AF,0xC866C7EC,0xF1EBFB29,0xE690EF6A, + 0x392920BB,0x2E5234F8,0x17DF083D,0x00A41C7E,0x64C571B7,0x73BE65F4,0x4A335931,0x5D484D72, + 0x2E31C0D2,0x394AD491,0x00C7E854,0x17BCFC17,0x73DD91DE,0x64A6859D,0x5D2BB958,0x4A50AD1B, + 0x95E962CA,0x82927689,0xBB1F4A4C,0xAC645E0F,0xC80533C6,0xDF7E2785,0xE6F31B40,0xF1880F03, + 0xDE920307,0xC9E91744,0xF0642B81,0xE71F3FC2,0x837E520B,0x94054648,0xAD887A8D,0xBAF36ECE, + 0x654AA11F,0x7231B55C,0x4BBC8999,0x5CC79DDA,0x38A6F013,0x2FDDE450,0x1650D895,0x012BCCD6, + 0x72524176,0x65295535,0x5CA469F0,0x4BDF7DB3,0x2FBE107A,0x38C50439,0x014838FC,0x16332CBF, + 0xC98AE36E,0xDEF1F72D,0xE77CCBE8,0xF007DFAB,0x9466B262,0x831DA621,0xBA909AE4,0xADEB8EA7, + 0x5C6381A4,0x4B1895E7,0x7295A922,0x65EEBD61,0x018FD0A8,0x16F4C4EB,0x2F79F82E,0x3802EC6D, + 0xE7BB23BC,0xF0C037FF,0xC94D0B3A,0xDE361F79,0xBA5772B0,0xAD2C66F3,0x94A15A36,0x83DA4E75, + 
0xF0A3C3D5,0xE7D8D796,0xDE55EB53,0xC92EFF10,0xAD4F92D9,0xBA34869A,0x83B9BA5F,0x94C2AE1C, + 0x4B7B61CD,0x5C00758E,0x658D494B,0x72F65D08,0x169730C1,0x01EC2482,0x38611847,0x2F1A0C04, + 0x6655004F,0x712E140C,0x48A328C9,0x5FD83C8A,0x3BB95143,0x2CC24500,0x154F79C5,0x02346D86, + 0xDD8DA257,0xCAF6B614,0xF37B8AD1,0xE4009E92,0x8061F35B,0x971AE718,0xAE97DBDD,0xB9ECCF9E, + 0xCA95423E,0xDDEE567D,0xE4636AB8,0xF3187EFB,0x97791332,0x80020771,0xB98F3BB4,0xAEF42FF7, + 0x714DE026,0x6636F465,0x5FBBC8A0,0x48C0DCE3,0x2CA1B12A,0x3BDAA569,0x025799AC,0x152C8DEF, + 0xE4A482EC,0xF3DF96AF,0xCA52AA6A,0xDD29BE29,0xB948D3E0,0xAE33C7A3,0x97BEFB66,0x80C5EF25, + 0x5F7C20F4,0x480734B7,0x718A0872,0x66F11C31,0x029071F8,0x15EB65BB,0x2C66597E,0x3B1D4D3D, + 0x4864C09D,0x5F1FD4DE,0x6692E81B,0x71E9FC58,0x15889191,0x02F385D2,0x3B7EB917,0x2C05AD54, + 0xF3BC6285,0xE4C776C6,0xDD4A4A03,0xCA315E40,0xAE503389,0xB92B27CA,0x80A61B0F,0x97DD0F4C, + 0xB8C70348,0xAFBC170B,0x96312BCE,0x814A3F8D,0xE52B5244,0xF2504607,0xCBDD7AC2,0xDCA66E81, + 0x031FA150,0x1464B513,0x2DE989D6,0x3A929D95,0x5EF3F05C,0x4988E41F,0x7005D8DA,0x677ECC99, + 0x14074139,0x037C557A,0x3AF169BF,0x2D8A7DFC,0x49EB1035,0x5E900476,0x671D38B3,0x70662CF0, + 0xAFDFE321,0xB8A4F762,0x8129CBA7,0x9652DFE4,0xF233B22D,0xE548A66E,0xDCC59AAB,0xCBBE8EE8, + 0x3A3681EB,0x2D4D95A8,0x14C0A96D,0x03BBBD2E,0x67DAD0E7,0x70A1C4A4,0x492CF861,0x5E57EC22, + 0x81EE23F3,0x969537B0,0xAF180B75,0xB8631F36,0xDC0272FF,0xCB7966BC,0xF2F45A79,0xE58F4E3A, + 0x96F6C39A,0x818DD7D9,0xB800EB1C,0xAF7BFF5F,0xCB1A9296,0xDC6186D5,0xE5ECBA10,0xF297AE53, + 0x2D2E6182,0x3A5575C1,0x03D84904,0x14A35D47,0x70C2308E,0x67B924CD,0x5E341808,0x494F0C4B, + }, + { + 0x00000000,0xEFC26B3E,0x04F5D03D,0xEB37BB03,0x09EBA07A,0xE629CB44,0x0D1E7047,0xE2DC1B79, + 0x13D740F4,0xFC152BCA,0x172290C9,0xF8E0FBF7,0x1A3CE08E,0xF5FE8BB0,0x1EC930B3,0xF10B5B8D, + 0x27AE81E8,0xC86CEAD6,0x235B51D5,0xCC993AEB,0x2E452192,0xC1874AAC,0x2AB0F1AF,0xC5729A91, + 
0x3479C11C,0xDBBBAA22,0x308C1121,0xDF4E7A1F,0x3D926166,0xD2500A58,0x3967B15B,0xD6A5DA65, + 0x4F5D03D0,0xA09F68EE,0x4BA8D3ED,0xA46AB8D3,0x46B6A3AA,0xA974C894,0x42437397,0xAD8118A9, + 0x5C8A4324,0xB348281A,0x587F9319,0xB7BDF827,0x5561E35E,0xBAA38860,0x51943363,0xBE56585D, + 0x68F38238,0x8731E906,0x6C065205,0x83C4393B,0x61182242,0x8EDA497C,0x65EDF27F,0x8A2F9941, + 0x7B24C2CC,0x94E6A9F2,0x7FD112F1,0x901379CF,0x72CF62B6,0x9D0D0988,0x763AB28B,0x99F8D9B5, + 0x9EBA07A0,0x71786C9E,0x9A4FD79D,0x758DBCA3,0x9751A7DA,0x7893CCE4,0x93A477E7,0x7C661CD9, + 0x8D6D4754,0x62AF2C6A,0x89989769,0x665AFC57,0x8486E72E,0x6B448C10,0x80733713,0x6FB15C2D, + 0xB9148648,0x56D6ED76,0xBDE15675,0x52233D4B,0xB0FF2632,0x5F3D4D0C,0xB40AF60F,0x5BC89D31, + 0xAAC3C6BC,0x4501AD82,0xAE361681,0x41F47DBF,0xA32866C6,0x4CEA0DF8,0xA7DDB6FB,0x481FDDC5, + 0xD1E70470,0x3E256F4E,0xD512D44D,0x3AD0BF73,0xD80CA40A,0x37CECF34,0xDCF97437,0x333B1F09, + 0xC2304484,0x2DF22FBA,0xC6C594B9,0x2907FF87,0xCBDBE4FE,0x24198FC0,0xCF2E34C3,0x20EC5FFD, + 0xF6498598,0x198BEEA6,0xF2BC55A5,0x1D7E3E9B,0xFFA225E2,0x10604EDC,0xFB57F5DF,0x14959EE1, + 0xE59EC56C,0x0A5CAE52,0xE16B1551,0x0EA97E6F,0xEC756516,0x03B70E28,0xE880B52B,0x0742DE15, + 0xE6050901,0x09C7623F,0xE2F0D93C,0x0D32B202,0xEFEEA97B,0x002CC245,0xEB1B7946,0x04D91278, + 0xF5D249F5,0x1A1022CB,0xF12799C8,0x1EE5F2F6,0xFC39E98F,0x13FB82B1,0xF8CC39B2,0x170E528C, + 0xC1AB88E9,0x2E69E3D7,0xC55E58D4,0x2A9C33EA,0xC8402893,0x278243AD,0xCCB5F8AE,0x23779390, + 0xD27CC81D,0x3DBEA323,0xD6891820,0x394B731E,0xDB976867,0x34550359,0xDF62B85A,0x30A0D364, + 0xA9580AD1,0x469A61EF,0xADADDAEC,0x426FB1D2,0xA0B3AAAB,0x4F71C195,0xA4467A96,0x4B8411A8, + 0xBA8F4A25,0x554D211B,0xBE7A9A18,0x51B8F126,0xB364EA5F,0x5CA68161,0xB7913A62,0x5853515C, + 0x8EF68B39,0x6134E007,0x8A035B04,0x65C1303A,0x871D2B43,0x68DF407D,0x83E8FB7E,0x6C2A9040, + 0x9D21CBCD,0x72E3A0F3,0x99D41BF0,0x761670CE,0x94CA6BB7,0x7B080089,0x903FBB8A,0x7FFDD0B4, + 0x78BF0EA1,0x977D659F,0x7C4ADE9C,0x9388B5A2,0x7154AEDB,0x9E96C5E5,0x75A17EE6,0x9A6315D8, 
+ 0x6B684E55,0x84AA256B,0x6F9D9E68,0x805FF556,0x6283EE2F,0x8D418511,0x66763E12,0x89B4552C, + 0x5F118F49,0xB0D3E477,0x5BE45F74,0xB426344A,0x56FA2F33,0xB938440D,0x520FFF0E,0xBDCD9430, + 0x4CC6CFBD,0xA304A483,0x48331F80,0xA7F174BE,0x452D6FC7,0xAAEF04F9,0x41D8BFFA,0xAE1AD4C4, + 0x37E20D71,0xD820664F,0x3317DD4C,0xDCD5B672,0x3E09AD0B,0xD1CBC635,0x3AFC7D36,0xD53E1608, + 0x24354D85,0xCBF726BB,0x20C09DB8,0xCF02F686,0x2DDEEDFF,0xC21C86C1,0x292B3DC2,0xC6E956FC, + 0x104C8C99,0xFF8EE7A7,0x14B95CA4,0xFB7B379A,0x19A72CE3,0xF66547DD,0x1D52FCDE,0xF29097E0, + 0x039BCC6D,0xEC59A753,0x076E1C50,0xE8AC776E,0x0A706C17,0xE5B20729,0x0E85BC2A,0xE147D714, + }, + { + 0x00000000,0xC18EDFC0,0x586CB9C1,0x99E26601,0xB0D97382,0x7157AC42,0xE8B5CA43,0x293B1583, + 0xBAC3E145,0x7B4D3E85,0xE2AF5884,0x23218744,0x0A1A92C7,0xCB944D07,0x52762B06,0x93F8F4C6, + 0xAEF6C4CB,0x6F781B0B,0xF69A7D0A,0x3714A2CA,0x1E2FB749,0xDFA16889,0x46430E88,0x87CDD148, + 0x1435258E,0xD5BBFA4E,0x4C599C4F,0x8DD7438F,0xA4EC560C,0x656289CC,0xFC80EFCD,0x3D0E300D, + 0x869C8FD7,0x47125017,0xDEF03616,0x1F7EE9D6,0x3645FC55,0xF7CB2395,0x6E294594,0xAFA79A54, + 0x3C5F6E92,0xFDD1B152,0x6433D753,0xA5BD0893,0x8C861D10,0x4D08C2D0,0xD4EAA4D1,0x15647B11, + 0x286A4B1C,0xE9E494DC,0x7006F2DD,0xB1882D1D,0x98B3389E,0x593DE75E,0xC0DF815F,0x01515E9F, + 0x92A9AA59,0x53277599,0xCAC51398,0x0B4BCC58,0x2270D9DB,0xE3FE061B,0x7A1C601A,0xBB92BFDA, + 0xD64819EF,0x17C6C62F,0x8E24A02E,0x4FAA7FEE,0x66916A6D,0xA71FB5AD,0x3EFDD3AC,0xFF730C6C, + 0x6C8BF8AA,0xAD05276A,0x34E7416B,0xF5699EAB,0xDC528B28,0x1DDC54E8,0x843E32E9,0x45B0ED29, + 0x78BEDD24,0xB93002E4,0x20D264E5,0xE15CBB25,0xC867AEA6,0x09E97166,0x900B1767,0x5185C8A7, + 0xC27D3C61,0x03F3E3A1,0x9A1185A0,0x5B9F5A60,0x72A44FE3,0xB32A9023,0x2AC8F622,0xEB4629E2, + 0x50D49638,0x915A49F8,0x08B82FF9,0xC936F039,0xE00DE5BA,0x21833A7A,0xB8615C7B,0x79EF83BB, + 0xEA17777D,0x2B99A8BD,0xB27BCEBC,0x73F5117C,0x5ACE04FF,0x9B40DB3F,0x02A2BD3E,0xC32C62FE, + 
0xFE2252F3,0x3FAC8D33,0xA64EEB32,0x67C034F2,0x4EFB2171,0x8F75FEB1,0x169798B0,0xD7194770, + 0x44E1B3B6,0x856F6C76,0x1C8D0A77,0xDD03D5B7,0xF438C034,0x35B61FF4,0xAC5479F5,0x6DDAA635, + 0x77E1359F,0xB66FEA5F,0x2F8D8C5E,0xEE03539E,0xC738461D,0x06B699DD,0x9F54FFDC,0x5EDA201C, + 0xCD22D4DA,0x0CAC0B1A,0x954E6D1B,0x54C0B2DB,0x7DFBA758,0xBC757898,0x25971E99,0xE419C159, + 0xD917F154,0x18992E94,0x817B4895,0x40F59755,0x69CE82D6,0xA8405D16,0x31A23B17,0xF02CE4D7, + 0x63D41011,0xA25ACFD1,0x3BB8A9D0,0xFA367610,0xD30D6393,0x1283BC53,0x8B61DA52,0x4AEF0592, + 0xF17DBA48,0x30F36588,0xA9110389,0x689FDC49,0x41A4C9CA,0x802A160A,0x19C8700B,0xD846AFCB, + 0x4BBE5B0D,0x8A3084CD,0x13D2E2CC,0xD25C3D0C,0xFB67288F,0x3AE9F74F,0xA30B914E,0x62854E8E, + 0x5F8B7E83,0x9E05A143,0x07E7C742,0xC6691882,0xEF520D01,0x2EDCD2C1,0xB73EB4C0,0x76B06B00, + 0xE5489FC6,0x24C64006,0xBD242607,0x7CAAF9C7,0x5591EC44,0x941F3384,0x0DFD5585,0xCC738A45, + 0xA1A92C70,0x6027F3B0,0xF9C595B1,0x384B4A71,0x11705FF2,0xD0FE8032,0x491CE633,0x889239F3, + 0x1B6ACD35,0xDAE412F5,0x430674F4,0x8288AB34,0xABB3BEB7,0x6A3D6177,0xF3DF0776,0x3251D8B6, + 0x0F5FE8BB,0xCED1377B,0x5733517A,0x96BD8EBA,0xBF869B39,0x7E0844F9,0xE7EA22F8,0x2664FD38, + 0xB59C09FE,0x7412D63E,0xEDF0B03F,0x2C7E6FFF,0x05457A7C,0xC4CBA5BC,0x5D29C3BD,0x9CA71C7D, + 0x2735A3A7,0xE6BB7C67,0x7F591A66,0xBED7C5A6,0x97ECD025,0x56620FE5,0xCF8069E4,0x0E0EB624, + 0x9DF642E2,0x5C789D22,0xC59AFB23,0x041424E3,0x2D2F3160,0xECA1EEA0,0x754388A1,0xB4CD5761, + 0x89C3676C,0x484DB8AC,0xD1AFDEAD,0x1021016D,0x391A14EE,0xF894CB2E,0x6176AD2F,0xA0F872EF, + 0x33008629,0xF28E59E9,0x6B6C3FE8,0xAAE2E028,0x83D9F5AB,0x42572A6B,0xDBB54C6A,0x1A3B93AA, + }, + { + 0x00000000,0x9BA54C6F,0xEC3B9E9F,0x779ED2F0,0x03063B7F,0x98A37710,0xEF3DA5E0,0x7498E98F, + 0x060C76FE,0x9DA93A91,0xEA37E861,0x7192A40E,0x050A4D81,0x9EAF01EE,0xE931D31E,0x72949F71, + 0x0C18EDFC,0x97BDA193,0xE0237363,0x7B863F0C,0x0F1ED683,0x94BB9AEC,0xE325481C,0x78800473, + 
0x0A149B02,0x91B1D76D,0xE62F059D,0x7D8A49F2,0x0912A07D,0x92B7EC12,0xE5293EE2,0x7E8C728D, + 0x1831DBF8,0x83949797,0xF40A4567,0x6FAF0908,0x1B37E087,0x8092ACE8,0xF70C7E18,0x6CA93277, + 0x1E3DAD06,0x8598E169,0xF2063399,0x69A37FF6,0x1D3B9679,0x869EDA16,0xF10008E6,0x6AA54489, + 0x14293604,0x8F8C7A6B,0xF812A89B,0x63B7E4F4,0x172F0D7B,0x8C8A4114,0xFB1493E4,0x60B1DF8B, + 0x122540FA,0x89800C95,0xFE1EDE65,0x65BB920A,0x11237B85,0x8A8637EA,0xFD18E51A,0x66BDA975, + 0x3063B7F0,0xABC6FB9F,0xDC58296F,0x47FD6500,0x33658C8F,0xA8C0C0E0,0xDF5E1210,0x44FB5E7F, + 0x366FC10E,0xADCA8D61,0xDA545F91,0x41F113FE,0x3569FA71,0xAECCB61E,0xD95264EE,0x42F72881, + 0x3C7B5A0C,0xA7DE1663,0xD040C493,0x4BE588FC,0x3F7D6173,0xA4D82D1C,0xD346FFEC,0x48E3B383, + 0x3A772CF2,0xA1D2609D,0xD64CB26D,0x4DE9FE02,0x3971178D,0xA2D45BE2,0xD54A8912,0x4EEFC57D, + 0x28526C08,0xB3F72067,0xC469F297,0x5FCCBEF8,0x2B545777,0xB0F11B18,0xC76FC9E8,0x5CCA8587, + 0x2E5E1AF6,0xB5FB5699,0xC2658469,0x59C0C806,0x2D582189,0xB6FD6DE6,0xC163BF16,0x5AC6F379, + 0x244A81F4,0xBFEFCD9B,0xC8711F6B,0x53D45304,0x274CBA8B,0xBCE9F6E4,0xCB772414,0x50D2687B, + 0x2246F70A,0xB9E3BB65,0xCE7D6995,0x55D825FA,0x2140CC75,0xBAE5801A,0xCD7B52EA,0x56DE1E85, + 0x60C76FE0,0xFB62238F,0x8CFCF17F,0x1759BD10,0x63C1549F,0xF86418F0,0x8FFACA00,0x145F866F, + 0x66CB191E,0xFD6E5571,0x8AF08781,0x1155CBEE,0x65CD2261,0xFE686E0E,0x89F6BCFE,0x1253F091, + 0x6CDF821C,0xF77ACE73,0x80E41C83,0x1B4150EC,0x6FD9B963,0xF47CF50C,0x83E227FC,0x18476B93, + 0x6AD3F4E2,0xF176B88D,0x86E86A7D,0x1D4D2612,0x69D5CF9D,0xF27083F2,0x85EE5102,0x1E4B1D6D, + 0x78F6B418,0xE353F877,0x94CD2A87,0x0F6866E8,0x7BF08F67,0xE055C308,0x97CB11F8,0x0C6E5D97, + 0x7EFAC2E6,0xE55F8E89,0x92C15C79,0x09641016,0x7DFCF999,0xE659B5F6,0x91C76706,0x0A622B69, + 0x74EE59E4,0xEF4B158B,0x98D5C77B,0x03708B14,0x77E8629B,0xEC4D2EF4,0x9BD3FC04,0x0076B06B, + 0x72E22F1A,0xE9476375,0x9ED9B185,0x057CFDEA,0x71E41465,0xEA41580A,0x9DDF8AFA,0x067AC695, + 0x50A4D810,0xCB01947F,0xBC9F468F,0x273A0AE0,0x53A2E36F,0xC807AF00,0xBF997DF0,0x243C319F, 
+ 0x56A8AEEE,0xCD0DE281,0xBA933071,0x21367C1E,0x55AE9591,0xCE0BD9FE,0xB9950B0E,0x22304761, + 0x5CBC35EC,0xC7197983,0xB087AB73,0x2B22E71C,0x5FBA0E93,0xC41F42FC,0xB381900C,0x2824DC63, + 0x5AB04312,0xC1150F7D,0xB68BDD8D,0x2D2E91E2,0x59B6786D,0xC2133402,0xB58DE6F2,0x2E28AA9D, + 0x489503E8,0xD3304F87,0xA4AE9D77,0x3F0BD118,0x4B933897,0xD03674F8,0xA7A8A608,0x3C0DEA67, + 0x4E997516,0xD53C3979,0xA2A2EB89,0x3907A7E6,0x4D9F4E69,0xD63A0206,0xA1A4D0F6,0x3A019C99, + 0x448DEE14,0xDF28A27B,0xA8B6708B,0x33133CE4,0x478BD56B,0xDC2E9904,0xABB04BF4,0x3015079B, + 0x428198EA,0xD924D485,0xAEBA0675,0x351F4A1A,0x4187A395,0xDA22EFFA,0xADBC3D0A,0x36197165, + }, + { + 0x00000000,0xDD96D985,0x605CB54B,0xBDCA6CCE,0xC0B96A96,0x1D2FB313,0xA0E5DFDD,0x7D730658, + 0x5A03D36D,0x87950AE8,0x3A5F6626,0xE7C9BFA3,0x9ABAB9FB,0x472C607E,0xFAE60CB0,0x2770D535, + 0xB407A6DA,0x69917F5F,0xD45B1391,0x09CDCA14,0x74BECC4C,0xA92815C9,0x14E27907,0xC974A082, + 0xEE0475B7,0x3392AC32,0x8E58C0FC,0x53CE1979,0x2EBD1F21,0xF32BC6A4,0x4EE1AA6A,0x937773EF, + 0xB37E4BF5,0x6EE89270,0xD322FEBE,0x0EB4273B,0x73C72163,0xAE51F8E6,0x139B9428,0xCE0D4DAD, + 0xE97D9898,0x34EB411D,0x89212DD3,0x54B7F456,0x29C4F20E,0xF4522B8B,0x49984745,0x940E9EC0, + 0x0779ED2F,0xDAEF34AA,0x67255864,0xBAB381E1,0xC7C087B9,0x1A565E3C,0xA79C32F2,0x7A0AEB77, + 0x5D7A3E42,0x80ECE7C7,0x3D268B09,0xE0B0528C,0x9DC354D4,0x40558D51,0xFD9FE19F,0x2009381A, + 0xBD8D91AB,0x601B482E,0xDDD124E0,0x0047FD65,0x7D34FB3D,0xA0A222B8,0x1D684E76,0xC0FE97F3, + 0xE78E42C6,0x3A189B43,0x87D2F78D,0x5A442E08,0x27372850,0xFAA1F1D5,0x476B9D1B,0x9AFD449E, + 0x098A3771,0xD41CEEF4,0x69D6823A,0xB4405BBF,0xC9335DE7,0x14A58462,0xA96FE8AC,0x74F93129, + 0x5389E41C,0x8E1F3D99,0x33D55157,0xEE4388D2,0x93308E8A,0x4EA6570F,0xF36C3BC1,0x2EFAE244, + 0x0EF3DA5E,0xD36503DB,0x6EAF6F15,0xB339B690,0xCE4AB0C8,0x13DC694D,0xAE160583,0x7380DC06, + 0x54F00933,0x8966D0B6,0x34ACBC78,0xE93A65FD,0x944963A5,0x49DFBA20,0xF415D6EE,0x29830F6B, + 
0xBAF47C84,0x6762A501,0xDAA8C9CF,0x073E104A,0x7A4D1612,0xA7DBCF97,0x1A11A359,0xC7877ADC, + 0xE0F7AFE9,0x3D61766C,0x80AB1AA2,0x5D3DC327,0x204EC57F,0xFDD81CFA,0x40127034,0x9D84A9B1, + 0xA06A2517,0x7DFCFC92,0xC036905C,0x1DA049D9,0x60D34F81,0xBD459604,0x008FFACA,0xDD19234F, + 0xFA69F67A,0x27FF2FFF,0x9A354331,0x47A39AB4,0x3AD09CEC,0xE7464569,0x5A8C29A7,0x871AF022, + 0x146D83CD,0xC9FB5A48,0x74313686,0xA9A7EF03,0xD4D4E95B,0x094230DE,0xB4885C10,0x691E8595, + 0x4E6E50A0,0x93F88925,0x2E32E5EB,0xF3A43C6E,0x8ED73A36,0x5341E3B3,0xEE8B8F7D,0x331D56F8, + 0x13146EE2,0xCE82B767,0x7348DBA9,0xAEDE022C,0xD3AD0474,0x0E3BDDF1,0xB3F1B13F,0x6E6768BA, + 0x4917BD8F,0x9481640A,0x294B08C4,0xF4DDD141,0x89AED719,0x54380E9C,0xE9F26252,0x3464BBD7, + 0xA713C838,0x7A8511BD,0xC74F7D73,0x1AD9A4F6,0x67AAA2AE,0xBA3C7B2B,0x07F617E5,0xDA60CE60, + 0xFD101B55,0x2086C2D0,0x9D4CAE1E,0x40DA779B,0x3DA971C3,0xE03FA846,0x5DF5C488,0x80631D0D, + 0x1DE7B4BC,0xC0716D39,0x7DBB01F7,0xA02DD872,0xDD5EDE2A,0x00C807AF,0xBD026B61,0x6094B2E4, + 0x47E467D1,0x9A72BE54,0x27B8D29A,0xFA2E0B1F,0x875D0D47,0x5ACBD4C2,0xE701B80C,0x3A976189, + 0xA9E01266,0x7476CBE3,0xC9BCA72D,0x142A7EA8,0x695978F0,0xB4CFA175,0x0905CDBB,0xD493143E, + 0xF3E3C10B,0x2E75188E,0x93BF7440,0x4E29ADC5,0x335AAB9D,0xEECC7218,0x53061ED6,0x8E90C753, + 0xAE99FF49,0x730F26CC,0xCEC54A02,0x13539387,0x6E2095DF,0xB3B64C5A,0x0E7C2094,0xD3EAF911, + 0xF49A2C24,0x290CF5A1,0x94C6996F,0x495040EA,0x342346B2,0xE9B59F37,0x547FF3F9,0x89E92A7C, + 0x1A9E5993,0xC7088016,0x7AC2ECD8,0xA754355D,0xDA273305,0x07B1EA80,0xBA7B864E,0x67ED5FCB, + 0x409D8AFE,0x9D0B537B,0x20C13FB5,0xFD57E630,0x8024E068,0x5DB239ED,0xE0785523,0x3DEE8CA6, + }, + { + 0x00000000,0x9D0FE176,0xE16EC4AD,0x7C6125DB,0x19AC8F1B,0x84A36E6D,0xF8C24BB6,0x65CDAAC0, + 0x33591E36,0xAE56FF40,0xD237DA9B,0x4F383BED,0x2AF5912D,0xB7FA705B,0xCB9B5580,0x5694B4F6, + 0x66B23C6C,0xFBBDDD1A,0x87DCF8C1,0x1AD319B7,0x7F1EB377,0xE2115201,0x9E7077DA,0x037F96AC, + 
0x55EB225A,0xC8E4C32C,0xB485E6F7,0x298A0781,0x4C47AD41,0xD1484C37,0xAD2969EC,0x3026889A, + 0xCD6478D8,0x506B99AE,0x2C0ABC75,0xB1055D03,0xD4C8F7C3,0x49C716B5,0x35A6336E,0xA8A9D218, + 0xFE3D66EE,0x63328798,0x1F53A243,0x825C4335,0xE791E9F5,0x7A9E0883,0x06FF2D58,0x9BF0CC2E, + 0xABD644B4,0x36D9A5C2,0x4AB88019,0xD7B7616F,0xB27ACBAF,0x2F752AD9,0x53140F02,0xCE1BEE74, + 0x988F5A82,0x0580BBF4,0x79E19E2F,0xE4EE7F59,0x8123D599,0x1C2C34EF,0x604D1134,0xFD42F042, + 0x41B9F7F1,0xDCB61687,0xA0D7335C,0x3DD8D22A,0x581578EA,0xC51A999C,0xB97BBC47,0x24745D31, + 0x72E0E9C7,0xEFEF08B1,0x938E2D6A,0x0E81CC1C,0x6B4C66DC,0xF64387AA,0x8A22A271,0x172D4307, + 0x270BCB9D,0xBA042AEB,0xC6650F30,0x5B6AEE46,0x3EA74486,0xA3A8A5F0,0xDFC9802B,0x42C6615D, + 0x1452D5AB,0x895D34DD,0xF53C1106,0x6833F070,0x0DFE5AB0,0x90F1BBC6,0xEC909E1D,0x719F7F6B, + 0x8CDD8F29,0x11D26E5F,0x6DB34B84,0xF0BCAAF2,0x95710032,0x087EE144,0x741FC49F,0xE91025E9, + 0xBF84911F,0x228B7069,0x5EEA55B2,0xC3E5B4C4,0xA6281E04,0x3B27FF72,0x4746DAA9,0xDA493BDF, + 0xEA6FB345,0x77605233,0x0B0177E8,0x960E969E,0xF3C33C5E,0x6ECCDD28,0x12ADF8F3,0x8FA21985, + 0xD936AD73,0x44394C05,0x385869DE,0xA55788A8,0xC09A2268,0x5D95C31E,0x21F4E6C5,0xBCFB07B3, + 0x8373EFE2,0x1E7C0E94,0x621D2B4F,0xFF12CA39,0x9ADF60F9,0x07D0818F,0x7BB1A454,0xE6BE4522, + 0xB02AF1D4,0x2D2510A2,0x51443579,0xCC4BD40F,0xA9867ECF,0x34899FB9,0x48E8BA62,0xD5E75B14, + 0xE5C1D38E,0x78CE32F8,0x04AF1723,0x99A0F655,0xFC6D5C95,0x6162BDE3,0x1D039838,0x800C794E, + 0xD698CDB8,0x4B972CCE,0x37F60915,0xAAF9E863,0xCF3442A3,0x523BA3D5,0x2E5A860E,0xB3556778, + 0x4E17973A,0xD318764C,0xAF795397,0x3276B2E1,0x57BB1821,0xCAB4F957,0xB6D5DC8C,0x2BDA3DFA, + 0x7D4E890C,0xE041687A,0x9C204DA1,0x012FACD7,0x64E20617,0xF9EDE761,0x858CC2BA,0x188323CC, + 0x28A5AB56,0xB5AA4A20,0xC9CB6FFB,0x54C48E8D,0x3109244D,0xAC06C53B,0xD067E0E0,0x4D680196, + 0x1BFCB560,0x86F35416,0xFA9271CD,0x679D90BB,0x02503A7B,0x9F5FDB0D,0xE33EFED6,0x7E311FA0, + 0xC2CA1813,0x5FC5F965,0x23A4DCBE,0xBEAB3DC8,0xDB669708,0x4669767E,0x3A0853A5,0xA707B2D3, 
+ 0xF1930625,0x6C9CE753,0x10FDC288,0x8DF223FE,0xE83F893E,0x75306848,0x09514D93,0x945EACE5, + 0xA478247F,0x3977C509,0x4516E0D2,0xD81901A4,0xBDD4AB64,0x20DB4A12,0x5CBA6FC9,0xC1B58EBF, + 0x97213A49,0x0A2EDB3F,0x764FFEE4,0xEB401F92,0x8E8DB552,0x13825424,0x6FE371FF,0xF2EC9089, + 0x0FAE60CB,0x92A181BD,0xEEC0A466,0x73CF4510,0x1602EFD0,0x8B0D0EA6,0xF76C2B7D,0x6A63CA0B, + 0x3CF77EFD,0xA1F89F8B,0xDD99BA50,0x40965B26,0x255BF1E6,0xB8541090,0xC435354B,0x593AD43D, + 0x691C5CA7,0xF413BDD1,0x8872980A,0x157D797C,0x70B0D3BC,0xEDBF32CA,0x91DE1711,0x0CD1F667, + 0x5A454291,0xC74AA3E7,0xBB2B863C,0x2624674A,0x43E9CD8A,0xDEE62CFC,0xA2870927,0x3F88E851, + }, + { + 0x00000000,0xB9FBDBE8,0xA886B191,0x117D6A79,0x8A7C6563,0x3387BE8B,0x22FAD4F2,0x9B010F1A, + 0xCF89CC87,0x7672176F,0x670F7D16,0xDEF4A6FE,0x45F5A9E4,0xFC0E720C,0xED731875,0x5488C39D, + 0x44629F4F,0xFD9944A7,0xECE42EDE,0x551FF536,0xCE1EFA2C,0x77E521C4,0x66984BBD,0xDF639055, + 0x8BEB53C8,0x32108820,0x236DE259,0x9A9639B1,0x019736AB,0xB86CED43,0xA911873A,0x10EA5CD2, + 0x88C53E9E,0x313EE576,0x20438F0F,0x99B854E7,0x02B95BFD,0xBB428015,0xAA3FEA6C,0x13C43184, + 0x474CF219,0xFEB729F1,0xEFCA4388,0x56319860,0xCD30977A,0x74CB4C92,0x65B626EB,0xDC4DFD03, + 0xCCA7A1D1,0x755C7A39,0x64211040,0xDDDACBA8,0x46DBC4B2,0xFF201F5A,0xEE5D7523,0x57A6AECB, + 0x032E6D56,0xBAD5B6BE,0xABA8DCC7,0x1253072F,0x89520835,0x30A9D3DD,0x21D4B9A4,0x982F624C, + 0xCAFB7B7D,0x7300A095,0x627DCAEC,0xDB861104,0x40871E1E,0xF97CC5F6,0xE801AF8F,0x51FA7467, + 0x0572B7FA,0xBC896C12,0xADF4066B,0x140FDD83,0x8F0ED299,0x36F50971,0x27886308,0x9E73B8E0, + 0x8E99E432,0x37623FDA,0x261F55A3,0x9FE48E4B,0x04E58151,0xBD1E5AB9,0xAC6330C0,0x1598EB28, + 0x411028B5,0xF8EBF35D,0xE9969924,0x506D42CC,0xCB6C4DD6,0x7297963E,0x63EAFC47,0xDA1127AF, + 0x423E45E3,0xFBC59E0B,0xEAB8F472,0x53432F9A,0xC8422080,0x71B9FB68,0x60C49111,0xD93F4AF9, + 0x8DB78964,0x344C528C,0x253138F5,0x9CCAE31D,0x07CBEC07,0xBE3037EF,0xAF4D5D96,0x16B6867E, + 
0x065CDAAC,0xBFA70144,0xAEDA6B3D,0x1721B0D5,0x8C20BFCF,0x35DB6427,0x24A60E5E,0x9D5DD5B6, + 0xC9D5162B,0x702ECDC3,0x6153A7BA,0xD8A87C52,0x43A97348,0xFA52A8A0,0xEB2FC2D9,0x52D41931, + 0x4E87F0BB,0xF77C2B53,0xE601412A,0x5FFA9AC2,0xC4FB95D8,0x7D004E30,0x6C7D2449,0xD586FFA1, + 0x810E3C3C,0x38F5E7D4,0x29888DAD,0x90735645,0x0B72595F,0xB28982B7,0xA3F4E8CE,0x1A0F3326, + 0x0AE56FF4,0xB31EB41C,0xA263DE65,0x1B98058D,0x80990A97,0x3962D17F,0x281FBB06,0x91E460EE, + 0xC56CA373,0x7C97789B,0x6DEA12E2,0xD411C90A,0x4F10C610,0xF6EB1DF8,0xE7967781,0x5E6DAC69, + 0xC642CE25,0x7FB915CD,0x6EC47FB4,0xD73FA45C,0x4C3EAB46,0xF5C570AE,0xE4B81AD7,0x5D43C13F, + 0x09CB02A2,0xB030D94A,0xA14DB333,0x18B668DB,0x83B767C1,0x3A4CBC29,0x2B31D650,0x92CA0DB8, + 0x8220516A,0x3BDB8A82,0x2AA6E0FB,0x935D3B13,0x085C3409,0xB1A7EFE1,0xA0DA8598,0x19215E70, + 0x4DA99DED,0xF4524605,0xE52F2C7C,0x5CD4F794,0xC7D5F88E,0x7E2E2366,0x6F53491F,0xD6A892F7, + 0x847C8BC6,0x3D87502E,0x2CFA3A57,0x9501E1BF,0x0E00EEA5,0xB7FB354D,0xA6865F34,0x1F7D84DC, + 0x4BF54741,0xF20E9CA9,0xE373F6D0,0x5A882D38,0xC1892222,0x7872F9CA,0x690F93B3,0xD0F4485B, + 0xC01E1489,0x79E5CF61,0x6898A518,0xD1637EF0,0x4A6271EA,0xF399AA02,0xE2E4C07B,0x5B1F1B93, + 0x0F97D80E,0xB66C03E6,0xA711699F,0x1EEAB277,0x85EBBD6D,0x3C106685,0x2D6D0CFC,0x9496D714, + 0x0CB9B558,0xB5426EB0,0xA43F04C9,0x1DC4DF21,0x86C5D03B,0x3F3E0BD3,0x2E4361AA,0x97B8BA42, + 0xC33079DF,0x7ACBA237,0x6BB6C84E,0xD24D13A6,0x494C1CBC,0xF0B7C754,0xE1CAAD2D,0x583176C5, + 0x48DB2A17,0xF120F1FF,0xE05D9B86,0x59A6406E,0xC2A74F74,0x7B5C949C,0x6A21FEE5,0xD3DA250D, + 0x8752E690,0x3EA93D78,0x2FD45701,0x962F8CE9,0x0D2E83F3,0xB4D5581B,0xA5A83262,0x1C53E98A, + }, + { + 0x00000000,0xAE689191,0x87A02563,0x29C8B4F2,0xD4314C87,0x7A59DD16,0x539169E4,0xFDF9F875, + 0x73139F4F,0xDD7B0EDE,0xF4B3BA2C,0x5ADB2BBD,0xA722D3C8,0x094A4259,0x2082F6AB,0x8EEA673A, + 0xE6273E9E,0x484FAF0F,0x61871BFD,0xCFEF8A6C,0x32167219,0x9C7EE388,0xB5B6577A,0x1BDEC6EB, + 
0x9534A1D1,0x3B5C3040,0x129484B2,0xBCFC1523,0x4105ED56,0xEF6D7CC7,0xC6A5C835,0x68CD59A4, + 0x173F7B7D,0xB957EAEC,0x909F5E1E,0x3EF7CF8F,0xC30E37FA,0x6D66A66B,0x44AE1299,0xEAC68308, + 0x642CE432,0xCA4475A3,0xE38CC151,0x4DE450C0,0xB01DA8B5,0x1E753924,0x37BD8DD6,0x99D51C47, + 0xF11845E3,0x5F70D472,0x76B86080,0xD8D0F111,0x25290964,0x8B4198F5,0xA2892C07,0x0CE1BD96, + 0x820BDAAC,0x2C634B3D,0x05ABFFCF,0xABC36E5E,0x563A962B,0xF85207BA,0xD19AB348,0x7FF222D9, + 0x2E7EF6FA,0x8016676B,0xA9DED399,0x07B64208,0xFA4FBA7D,0x54272BEC,0x7DEF9F1E,0xD3870E8F, + 0x5D6D69B5,0xF305F824,0xDACD4CD6,0x74A5DD47,0x895C2532,0x2734B4A3,0x0EFC0051,0xA09491C0, + 0xC859C864,0x663159F5,0x4FF9ED07,0xE1917C96,0x1C6884E3,0xB2001572,0x9BC8A180,0x35A03011, + 0xBB4A572B,0x1522C6BA,0x3CEA7248,0x9282E3D9,0x6F7B1BAC,0xC1138A3D,0xE8DB3ECF,0x46B3AF5E, + 0x39418D87,0x97291C16,0xBEE1A8E4,0x10893975,0xED70C100,0x43185091,0x6AD0E463,0xC4B875F2, + 0x4A5212C8,0xE43A8359,0xCDF237AB,0x639AA63A,0x9E635E4F,0x300BCFDE,0x19C37B2C,0xB7ABEABD, + 0xDF66B319,0x710E2288,0x58C6967A,0xF6AE07EB,0x0B57FF9E,0xA53F6E0F,0x8CF7DAFD,0x229F4B6C, + 0xAC752C56,0x021DBDC7,0x2BD50935,0x85BD98A4,0x784460D1,0xD62CF140,0xFFE445B2,0x518CD423, + 0x5CFDEDF4,0xF2957C65,0xDB5DC897,0x75355906,0x88CCA173,0x26A430E2,0x0F6C8410,0xA1041581, + 0x2FEE72BB,0x8186E32A,0xA84E57D8,0x0626C649,0xFBDF3E3C,0x55B7AFAD,0x7C7F1B5F,0xD2178ACE, + 0xBADAD36A,0x14B242FB,0x3D7AF609,0x93126798,0x6EEB9FED,0xC0830E7C,0xE94BBA8E,0x47232B1F, + 0xC9C94C25,0x67A1DDB4,0x4E696946,0xE001F8D7,0x1DF800A2,0xB3909133,0x9A5825C1,0x3430B450, + 0x4BC29689,0xE5AA0718,0xCC62B3EA,0x620A227B,0x9FF3DA0E,0x319B4B9F,0x1853FF6D,0xB63B6EFC, + 0x38D109C6,0x96B99857,0xBF712CA5,0x1119BD34,0xECE04541,0x4288D4D0,0x6B406022,0xC528F1B3, + 0xADE5A817,0x038D3986,0x2A458D74,0x842D1CE5,0x79D4E490,0xD7BC7501,0xFE74C1F3,0x501C5062, + 0xDEF63758,0x709EA6C9,0x5956123B,0xF73E83AA,0x0AC77BDF,0xA4AFEA4E,0x8D675EBC,0x230FCF2D, + 0x72831B0E,0xDCEB8A9F,0xF5233E6D,0x5B4BAFFC,0xA6B25789,0x08DAC618,0x211272EA,0x8F7AE37B, 
+ 0x01908441,0xAFF815D0,0x8630A122,0x285830B3,0xD5A1C8C6,0x7BC95957,0x5201EDA5,0xFC697C34, + 0x94A42590,0x3ACCB401,0x130400F3,0xBD6C9162,0x40956917,0xEEFDF886,0xC7354C74,0x695DDDE5, + 0xE7B7BADF,0x49DF2B4E,0x60179FBC,0xCE7F0E2D,0x3386F658,0x9DEE67C9,0xB426D33B,0x1A4E42AA, + 0x65BC6073,0xCBD4F1E2,0xE21C4510,0x4C74D481,0xB18D2CF4,0x1FE5BD65,0x362D0997,0x98459806, + 0x16AFFF3C,0xB8C76EAD,0x910FDA5F,0x3F674BCE,0xC29EB3BB,0x6CF6222A,0x453E96D8,0xEB560749, + 0x839B5EED,0x2DF3CF7C,0x043B7B8E,0xAA53EA1F,0x57AA126A,0xF9C283FB,0xD00A3709,0x7E62A698, + 0xF088C1A2,0x5EE05033,0x7728E4C1,0xD9407550,0x24B98D25,0x8AD11CB4,0xA319A846,0x0D7139D7, + } + }; + + /* Crc32: computes a CRC-32 of the size bytes starting at src using the 16-slice Crc32Table. The accumulator starts at 0xFFFFFFFF and the result is its bitwise complement. With size equal to 0 all three loops are skipped and the function returns ~0xFFFFFFFF, i.e. 0. */uint32_t Crc32(const void* src, size_t size) + { + const uint8_t* p8 = (const uint8_t*)src; + uint32_t crc = 0xFFFFFFFF; + + /* Head: consume bytes one at a time through table 0 until p8 is aligned to sizeof(uint32_t) or the input is exhausted. */for (; ((uintptr_t)p8 & (sizeof(uint32_t) - 1)) != 0 && size > 0; ++p8, --size) + crc = Crc32Table[0][(crc ^ *p8) & 0xFF] ^ (crc >> 8); + + const uint32_t* p32 = (const uint32_t*)p8; + /* Main loop: each pass reads four 32-bit words (16 bytes) through p32 and folds them with one table lookup per byte. */for (; size >= 16; size -= 16) + { +#ifdef SIMD_BIG_ENDIAN + /* Big-endian build: the running CRC goes through Reorder32 (defined elsewhere, presumably a 32-bit byte swap - confirm) before being XORed into the first word. */uint32_t v0 = *p32++ ^ Reorder32(crc); + uint32_t v1 = *p32++; + uint32_t v2 = *p32++; + uint32_t v3 = *p32++; + crc = + Crc32Table[0x0][v3 & 0xFF] ^ + Crc32Table[0x1][(v3 >> 8) & 0xFF] ^ + Crc32Table[0x2][(v3 >> 16) & 0xFF] ^ + Crc32Table[0x3][(v3 >> 24) & 0xFF] ^ + Crc32Table[0x4][v2 & 0xFF] ^ + Crc32Table[0x5][(v2 >> 8) & 0xFF] ^ + Crc32Table[0x6][(v2 >> 16) & 0xFF] ^ + Crc32Table[0x7][(v2 >> 24) & 0xFF] ^ + Crc32Table[0x8][v1 & 0xFF] ^ + Crc32Table[0x9][(v1 >> 8) & 0xFF] ^ + Crc32Table[0xA][(v1 >> 16) & 0xFF] ^ + Crc32Table[0xB][(v1 >> 24) & 0xFF] ^ + Crc32Table[0xC][v0 & 0xFF] ^ + Crc32Table[0xD][(v0 >> 8) & 0xFF] ^ + Crc32Table[0xE][(v0 >> 16) & 0xFF] ^ + Crc32Table[0xF][(v0 >> 24) & 0xFF]; +#else + /* Default build: the CRC is XORed into the first word unchanged, and within each word the byte-to-table order is the mirror image of the SIMD_BIG_ENDIAN branch. */uint32_t v0 = *p32++ ^ crc; + uint32_t v1 = *p32++; + uint32_t v2 = *p32++; + uint32_t v3 = *p32++; + crc = + Crc32Table[0x0][(v3 >> 24) & 0xFF] ^ + Crc32Table[0x1][(v3 >> 16) & 0xFF] ^ + Crc32Table[0x2][(v3 >> 8) & 0xFF] ^ + Crc32Table[0x3][v3 & 0xFF] ^ +
Crc32Table[0x4][(v2 >> 24) & 0xFF] ^ + Crc32Table[0x5][(v2 >> 16) & 0xFF] ^ + Crc32Table[0x6][(v2 >> 8) & 0xFF] ^ + Crc32Table[0x7][v2 & 0xFF] ^ + Crc32Table[0x8][(v1 >> 24) & 0xFF] ^ + Crc32Table[0x9][(v1 >> 16) & 0xFF] ^ + Crc32Table[0xA][(v1 >> 8) & 0xFF] ^ + Crc32Table[0xB][v1 & 0xFF] ^ + Crc32Table[0xC][(v0 >> 24) & 0xFF] ^ + Crc32Table[0xD][(v0 >> 16) & 0xFF] ^ + Crc32Table[0xE][(v0 >> 8) & 0xFF] ^ + Crc32Table[0xF][v0 & 0xFF]; +#endif + } + + /* Tail: fewer than 16 bytes remain here; they are processed one at a time with the same table 0 update as the head loop. */for (p8 = (const uint8_t*)p32; size > 0; ++p8, size--) + crc = Crc32Table[0][(crc ^ *p8) & 0xFF] ^ (crc >> 8); + + /* Final inversion of the accumulator. */return (~crc); + } + + //--------------------------------------------------------------------- + + // Precalculated CRC32c lookup table for polynomial 0x1EDC6F41 (castagnoli-crc). + static const uint32_t Crc32cTable[8][256] = + { + { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362,
0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 + }, + { + 0x00000000, 0x13a29877, 0x274530ee, 
0x34e7a899, 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, + 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, + 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, + 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, + 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, + 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, + 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, + 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, + 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, + 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, + 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, + 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, + 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, + 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, + 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, 0xcac27827, 0xd960e050, 
0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, + 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, + 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, + 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, + 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, + 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, + 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, + 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 + }, + { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, + 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, + 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, + 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, + 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, + 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, + 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 
0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, + 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, + 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, + 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, + 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, + 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, + 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, + 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, + 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, + 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, + 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, + 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, + 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, + 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, + 0x9557324b, 0x3016a035, 0xda386046, 
0x7f79f238, 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, + 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 + }, + { + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, + 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, + 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, + 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, + 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, + 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, + 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, + 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, + 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, + 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, + 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, + 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, + 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, 0xdfc69b2a, 
0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, + 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, + 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, + 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, + 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, + 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, + 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, + 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, + 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, + 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 + }, + { + 0x00000000, 0x38116fac, 0x7022df58, 0x4833b0f4, 0xe045beb0, 0xd854d11c, 0x906761e8, 0xa8760e44, + 0xc5670b91, 0xfd76643d, 0xb545d4c9, 0x8d54bb65, 0x2522b521, 0x1d33da8d, 0x55006a79, 0x6d1105d5, + 0x8f2261d3, 0xb7330e7f, 0xff00be8b, 0xc711d127, 0x6f67df63, 0x5776b0cf, 0x1f45003b, 0x27546f97, + 0x4a456a42, 0x725405ee, 0x3a67b51a, 0x0276dab6, 0xaa00d4f2, 0x9211bb5e, 0xda220baa, 0xe2336406, + 0x1ba8b557, 0x23b9dafb, 0x6b8a6a0f, 0x539b05a3, 0xfbed0be7, 0xc3fc644b, 0x8bcfd4bf, 0xb3debb13, + 0xdecfbec6, 0xe6ded16a, 0xaeed619e, 0x96fc0e32, 0x3e8a0076, 0x069b6fda, 0x4ea8df2e, 
0x76b9b082, + 0x948ad484, 0xac9bbb28, 0xe4a80bdc, 0xdcb96470, 0x74cf6a34, 0x4cde0598, 0x04edb56c, 0x3cfcdac0, + 0x51eddf15, 0x69fcb0b9, 0x21cf004d, 0x19de6fe1, 0xb1a861a5, 0x89b90e09, 0xc18abefd, 0xf99bd151, + 0x37516aae, 0x0f400502, 0x4773b5f6, 0x7f62da5a, 0xd714d41e, 0xef05bbb2, 0xa7360b46, 0x9f2764ea, + 0xf236613f, 0xca270e93, 0x8214be67, 0xba05d1cb, 0x1273df8f, 0x2a62b023, 0x625100d7, 0x5a406f7b, + 0xb8730b7d, 0x806264d1, 0xc851d425, 0xf040bb89, 0x5836b5cd, 0x6027da61, 0x28146a95, 0x10050539, + 0x7d1400ec, 0x45056f40, 0x0d36dfb4, 0x3527b018, 0x9d51be5c, 0xa540d1f0, 0xed736104, 0xd5620ea8, + 0x2cf9dff9, 0x14e8b055, 0x5cdb00a1, 0x64ca6f0d, 0xccbc6149, 0xf4ad0ee5, 0xbc9ebe11, 0x848fd1bd, + 0xe99ed468, 0xd18fbbc4, 0x99bc0b30, 0xa1ad649c, 0x09db6ad8, 0x31ca0574, 0x79f9b580, 0x41e8da2c, + 0xa3dbbe2a, 0x9bcad186, 0xd3f96172, 0xebe80ede, 0x439e009a, 0x7b8f6f36, 0x33bcdfc2, 0x0badb06e, + 0x66bcb5bb, 0x5eadda17, 0x169e6ae3, 0x2e8f054f, 0x86f90b0b, 0xbee864a7, 0xf6dbd453, 0xcecabbff, + 0x6ea2d55c, 0x56b3baf0, 0x1e800a04, 0x269165a8, 0x8ee76bec, 0xb6f60440, 0xfec5b4b4, 0xc6d4db18, + 0xabc5decd, 0x93d4b161, 0xdbe70195, 0xe3f66e39, 0x4b80607d, 0x73910fd1, 0x3ba2bf25, 0x03b3d089, + 0xe180b48f, 0xd991db23, 0x91a26bd7, 0xa9b3047b, 0x01c50a3f, 0x39d46593, 0x71e7d567, 0x49f6bacb, + 0x24e7bf1e, 0x1cf6d0b2, 0x54c56046, 0x6cd40fea, 0xc4a201ae, 0xfcb36e02, 0xb480def6, 0x8c91b15a, + 0x750a600b, 0x4d1b0fa7, 0x0528bf53, 0x3d39d0ff, 0x954fdebb, 0xad5eb117, 0xe56d01e3, 0xdd7c6e4f, + 0xb06d6b9a, 0x887c0436, 0xc04fb4c2, 0xf85edb6e, 0x5028d52a, 0x6839ba86, 0x200a0a72, 0x181b65de, + 0xfa2801d8, 0xc2396e74, 0x8a0ade80, 0xb21bb12c, 0x1a6dbf68, 0x227cd0c4, 0x6a4f6030, 0x525e0f9c, + 0x3f4f0a49, 0x075e65e5, 0x4f6dd511, 0x777cbabd, 0xdf0ab4f9, 0xe71bdb55, 0xaf286ba1, 0x9739040d, + 0x59f3bff2, 0x61e2d05e, 0x29d160aa, 0x11c00f06, 0xb9b60142, 0x81a76eee, 0xc994de1a, 0xf185b1b6, + 0x9c94b463, 0xa485dbcf, 0xecb66b3b, 0xd4a70497, 0x7cd10ad3, 0x44c0657f, 0x0cf3d58b, 0x34e2ba27, + 0xd6d1de21, 0xeec0b18d, 
0xa6f30179, 0x9ee26ed5, 0x36946091, 0x0e850f3d, 0x46b6bfc9, 0x7ea7d065, + 0x13b6d5b0, 0x2ba7ba1c, 0x63940ae8, 0x5b856544, 0xf3f36b00, 0xcbe204ac, 0x83d1b458, 0xbbc0dbf4, + 0x425b0aa5, 0x7a4a6509, 0x3279d5fd, 0x0a68ba51, 0xa21eb415, 0x9a0fdbb9, 0xd23c6b4d, 0xea2d04e1, + 0x873c0134, 0xbf2d6e98, 0xf71ede6c, 0xcf0fb1c0, 0x6779bf84, 0x5f68d028, 0x175b60dc, 0x2f4a0f70, + 0xcd796b76, 0xf56804da, 0xbd5bb42e, 0x854adb82, 0x2d3cd5c6, 0x152dba6a, 0x5d1e0a9e, 0x650f6532, + 0x081e60e7, 0x300f0f4b, 0x783cbfbf, 0x402dd013, 0xe85bde57, 0xd04ab1fb, 0x9879010f, 0xa0686ea3 + }, + { + 0x00000000, 0xef306b19, 0xdb8ca0c3, 0x34bccbda, 0xb2f53777, 0x5dc55c6e, 0x697997b4, 0x8649fcad, + 0x6006181f, 0x8f367306, 0xbb8ab8dc, 0x54bad3c5, 0xd2f32f68, 0x3dc34471, 0x097f8fab, 0xe64fe4b2, + 0xc00c303e, 0x2f3c5b27, 0x1b8090fd, 0xf4b0fbe4, 0x72f90749, 0x9dc96c50, 0xa975a78a, 0x4645cc93, + 0xa00a2821, 0x4f3a4338, 0x7b8688e2, 0x94b6e3fb, 0x12ff1f56, 0xfdcf744f, 0xc973bf95, 0x2643d48c, + 0x85f4168d, 0x6ac47d94, 0x5e78b64e, 0xb148dd57, 0x370121fa, 0xd8314ae3, 0xec8d8139, 0x03bdea20, + 0xe5f20e92, 0x0ac2658b, 0x3e7eae51, 0xd14ec548, 0x570739e5, 0xb83752fc, 0x8c8b9926, 0x63bbf23f, + 0x45f826b3, 0xaac84daa, 0x9e748670, 0x7144ed69, 0xf70d11c4, 0x183d7add, 0x2c81b107, 0xc3b1da1e, + 0x25fe3eac, 0xcace55b5, 0xfe729e6f, 0x1142f576, 0x970b09db, 0x783b62c2, 0x4c87a918, 0xa3b7c201, + 0x0e045beb, 0xe13430f2, 0xd588fb28, 0x3ab89031, 0xbcf16c9c, 0x53c10785, 0x677dcc5f, 0x884da746, + 0x6e0243f4, 0x813228ed, 0xb58ee337, 0x5abe882e, 0xdcf77483, 0x33c71f9a, 0x077bd440, 0xe84bbf59, + 0xce086bd5, 0x213800cc, 0x1584cb16, 0xfab4a00f, 0x7cfd5ca2, 0x93cd37bb, 0xa771fc61, 0x48419778, + 0xae0e73ca, 0x413e18d3, 0x7582d309, 0x9ab2b810, 0x1cfb44bd, 0xf3cb2fa4, 0xc777e47e, 0x28478f67, + 0x8bf04d66, 0x64c0267f, 0x507ceda5, 0xbf4c86bc, 0x39057a11, 0xd6351108, 0xe289dad2, 0x0db9b1cb, + 0xebf65579, 0x04c63e60, 0x307af5ba, 0xdf4a9ea3, 0x5903620e, 0xb6330917, 0x828fc2cd, 0x6dbfa9d4, + 0x4bfc7d58, 0xa4cc1641, 0x9070dd9b, 0x7f40b682, 
0xf9094a2f, 0x16392136, 0x2285eaec, 0xcdb581f5, + 0x2bfa6547, 0xc4ca0e5e, 0xf076c584, 0x1f46ae9d, 0x990f5230, 0x763f3929, 0x4283f2f3, 0xadb399ea, + 0x1c08b7d6, 0xf338dccf, 0xc7841715, 0x28b47c0c, 0xaefd80a1, 0x41cdebb8, 0x75712062, 0x9a414b7b, + 0x7c0eafc9, 0x933ec4d0, 0xa7820f0a, 0x48b26413, 0xcefb98be, 0x21cbf3a7, 0x1577387d, 0xfa475364, + 0xdc0487e8, 0x3334ecf1, 0x0788272b, 0xe8b84c32, 0x6ef1b09f, 0x81c1db86, 0xb57d105c, 0x5a4d7b45, + 0xbc029ff7, 0x5332f4ee, 0x678e3f34, 0x88be542d, 0x0ef7a880, 0xe1c7c399, 0xd57b0843, 0x3a4b635a, + 0x99fca15b, 0x76ccca42, 0x42700198, 0xad406a81, 0x2b09962c, 0xc439fd35, 0xf08536ef, 0x1fb55df6, + 0xf9fab944, 0x16cad25d, 0x22761987, 0xcd46729e, 0x4b0f8e33, 0xa43fe52a, 0x90832ef0, 0x7fb345e9, + 0x59f09165, 0xb6c0fa7c, 0x827c31a6, 0x6d4c5abf, 0xeb05a612, 0x0435cd0b, 0x308906d1, 0xdfb96dc8, + 0x39f6897a, 0xd6c6e263, 0xe27a29b9, 0x0d4a42a0, 0x8b03be0d, 0x6433d514, 0x508f1ece, 0xbfbf75d7, + 0x120cec3d, 0xfd3c8724, 0xc9804cfe, 0x26b027e7, 0xa0f9db4a, 0x4fc9b053, 0x7b757b89, 0x94451090, + 0x720af422, 0x9d3a9f3b, 0xa98654e1, 0x46b63ff8, 0xc0ffc355, 0x2fcfa84c, 0x1b736396, 0xf443088f, + 0xd200dc03, 0x3d30b71a, 0x098c7cc0, 0xe6bc17d9, 0x60f5eb74, 0x8fc5806d, 0xbb794bb7, 0x544920ae, + 0xb206c41c, 0x5d36af05, 0x698a64df, 0x86ba0fc6, 0x00f3f36b, 0xefc39872, 0xdb7f53a8, 0x344f38b1, + 0x97f8fab0, 0x78c891a9, 0x4c745a73, 0xa344316a, 0x250dcdc7, 0xca3da6de, 0xfe816d04, 0x11b1061d, + 0xf7fee2af, 0x18ce89b6, 0x2c72426c, 0xc3422975, 0x450bd5d8, 0xaa3bbec1, 0x9e87751b, 0x71b71e02, + 0x57f4ca8e, 0xb8c4a197, 0x8c786a4d, 0x63480154, 0xe501fdf9, 0x0a3196e0, 0x3e8d5d3a, 0xd1bd3623, + 0x37f2d291, 0xd8c2b988, 0xec7e7252, 0x034e194b, 0x8507e5e6, 0x6a378eff, 0x5e8b4525, 0xb1bb2e3c + }, + { + 0x00000000, 0x68032cc8, 0xd0065990, 0xb8057558, 0xa5e0c5d1, 0xcde3e919, 0x75e69c41, 0x1de5b089, + 0x4e2dfd53, 0x262ed19b, 0x9e2ba4c3, 0xf628880b, 0xebcd3882, 0x83ce144a, 0x3bcb6112, 0x53c84dda, + 0x9c5bfaa6, 0xf458d66e, 0x4c5da336, 0x245e8ffe, 0x39bb3f77, 0x51b813bf, 
0xe9bd66e7, 0x81be4a2f, + 0xd27607f5, 0xba752b3d, 0x02705e65, 0x6a7372ad, 0x7796c224, 0x1f95eeec, 0xa7909bb4, 0xcf93b77c, + 0x3d5b83bd, 0x5558af75, 0xed5dda2d, 0x855ef6e5, 0x98bb466c, 0xf0b86aa4, 0x48bd1ffc, 0x20be3334, + 0x73767eee, 0x1b755226, 0xa370277e, 0xcb730bb6, 0xd696bb3f, 0xbe9597f7, 0x0690e2af, 0x6e93ce67, + 0xa100791b, 0xc90355d3, 0x7106208b, 0x19050c43, 0x04e0bcca, 0x6ce39002, 0xd4e6e55a, 0xbce5c992, + 0xef2d8448, 0x872ea880, 0x3f2bddd8, 0x5728f110, 0x4acd4199, 0x22ce6d51, 0x9acb1809, 0xf2c834c1, + 0x7ab7077a, 0x12b42bb2, 0xaab15eea, 0xc2b27222, 0xdf57c2ab, 0xb754ee63, 0x0f519b3b, 0x6752b7f3, + 0x349afa29, 0x5c99d6e1, 0xe49ca3b9, 0x8c9f8f71, 0x917a3ff8, 0xf9791330, 0x417c6668, 0x297f4aa0, + 0xe6ecfddc, 0x8eefd114, 0x36eaa44c, 0x5ee98884, 0x430c380d, 0x2b0f14c5, 0x930a619d, 0xfb094d55, + 0xa8c1008f, 0xc0c22c47, 0x78c7591f, 0x10c475d7, 0x0d21c55e, 0x6522e996, 0xdd279cce, 0xb524b006, + 0x47ec84c7, 0x2fefa80f, 0x97eadd57, 0xffe9f19f, 0xe20c4116, 0x8a0f6dde, 0x320a1886, 0x5a09344e, + 0x09c17994, 0x61c2555c, 0xd9c72004, 0xb1c40ccc, 0xac21bc45, 0xc422908d, 0x7c27e5d5, 0x1424c91d, + 0xdbb77e61, 0xb3b452a9, 0x0bb127f1, 0x63b20b39, 0x7e57bbb0, 0x16549778, 0xae51e220, 0xc652cee8, + 0x959a8332, 0xfd99affa, 0x459cdaa2, 0x2d9ff66a, 0x307a46e3, 0x58796a2b, 0xe07c1f73, 0x887f33bb, + 0xf56e0ef4, 0x9d6d223c, 0x25685764, 0x4d6b7bac, 0x508ecb25, 0x388de7ed, 0x808892b5, 0xe88bbe7d, + 0xbb43f3a7, 0xd340df6f, 0x6b45aa37, 0x034686ff, 0x1ea33676, 0x76a01abe, 0xcea56fe6, 0xa6a6432e, + 0x6935f452, 0x0136d89a, 0xb933adc2, 0xd130810a, 0xccd53183, 0xa4d61d4b, 0x1cd36813, 0x74d044db, + 0x27180901, 0x4f1b25c9, 0xf71e5091, 0x9f1d7c59, 0x82f8ccd0, 0xeafbe018, 0x52fe9540, 0x3afdb988, + 0xc8358d49, 0xa036a181, 0x1833d4d9, 0x7030f811, 0x6dd54898, 0x05d66450, 0xbdd31108, 0xd5d03dc0, + 0x8618701a, 0xee1b5cd2, 0x561e298a, 0x3e1d0542, 0x23f8b5cb, 0x4bfb9903, 0xf3feec5b, 0x9bfdc093, + 0x546e77ef, 0x3c6d5b27, 0x84682e7f, 0xec6b02b7, 0xf18eb23e, 0x998d9ef6, 0x2188ebae, 0x498bc766, + 0x1a438abc, 
0x7240a674, 0xca45d32c, 0xa246ffe4, 0xbfa34f6d, 0xd7a063a5, 0x6fa516fd, 0x07a63a35, + 0x8fd9098e, 0xe7da2546, 0x5fdf501e, 0x37dc7cd6, 0x2a39cc5f, 0x423ae097, 0xfa3f95cf, 0x923cb907, + 0xc1f4f4dd, 0xa9f7d815, 0x11f2ad4d, 0x79f18185, 0x6414310c, 0x0c171dc4, 0xb412689c, 0xdc114454, + 0x1382f328, 0x7b81dfe0, 0xc384aab8, 0xab878670, 0xb66236f9, 0xde611a31, 0x66646f69, 0x0e6743a1, + 0x5daf0e7b, 0x35ac22b3, 0x8da957eb, 0xe5aa7b23, 0xf84fcbaa, 0x904ce762, 0x2849923a, 0x404abef2, + 0xb2828a33, 0xda81a6fb, 0x6284d3a3, 0x0a87ff6b, 0x17624fe2, 0x7f61632a, 0xc7641672, 0xaf673aba, + 0xfcaf7760, 0x94ac5ba8, 0x2ca92ef0, 0x44aa0238, 0x594fb2b1, 0x314c9e79, 0x8949eb21, 0xe14ac7e9, + 0x2ed97095, 0x46da5c5d, 0xfedf2905, 0x96dc05cd, 0x8b39b544, 0xe33a998c, 0x5b3fecd4, 0x333cc01c, + 0x60f48dc6, 0x08f7a10e, 0xb0f2d456, 0xd8f1f89e, 0xc5144817, 0xad1764df, 0x15121187, 0x7d113d4f + }, + { + 0x00000000, 0x493c7d27, 0x9278fa4e, 0xdb448769, 0x211d826d, 0x6821ff4a, 0xb3657823, 0xfa590504, + 0x423b04da, 0x0b0779fd, 0xd043fe94, 0x997f83b3, 0x632686b7, 0x2a1afb90, 0xf15e7cf9, 0xb86201de, + 0x847609b4, 0xcd4a7493, 0x160ef3fa, 0x5f328edd, 0xa56b8bd9, 0xec57f6fe, 0x37137197, 0x7e2f0cb0, + 0xc64d0d6e, 0x8f717049, 0x5435f720, 0x1d098a07, 0xe7508f03, 0xae6cf224, 0x7528754d, 0x3c14086a, + 0x0d006599, 0x443c18be, 0x9f789fd7, 0xd644e2f0, 0x2c1de7f4, 0x65219ad3, 0xbe651dba, 0xf759609d, + 0x4f3b6143, 0x06071c64, 0xdd439b0d, 0x947fe62a, 0x6e26e32e, 0x271a9e09, 0xfc5e1960, 0xb5626447, + 0x89766c2d, 0xc04a110a, 0x1b0e9663, 0x5232eb44, 0xa86bee40, 0xe1579367, 0x3a13140e, 0x732f6929, + 0xcb4d68f7, 0x827115d0, 0x593592b9, 0x1009ef9e, 0xea50ea9a, 0xa36c97bd, 0x782810d4, 0x31146df3, + 0x1a00cb32, 0x533cb615, 0x8878317c, 0xc1444c5b, 0x3b1d495f, 0x72213478, 0xa965b311, 0xe059ce36, + 0x583bcfe8, 0x1107b2cf, 0xca4335a6, 0x837f4881, 0x79264d85, 0x301a30a2, 0xeb5eb7cb, 0xa262caec, + 0x9e76c286, 0xd74abfa1, 0x0c0e38c8, 0x453245ef, 0xbf6b40eb, 0xf6573dcc, 0x2d13baa5, 0x642fc782, + 0xdc4dc65c, 0x9571bb7b, 0x4e353c12, 
0x07094135, 0xfd504431, 0xb46c3916, 0x6f28be7f, 0x2614c358, + 0x1700aeab, 0x5e3cd38c, 0x857854e5, 0xcc4429c2, 0x361d2cc6, 0x7f2151e1, 0xa465d688, 0xed59abaf, + 0x553baa71, 0x1c07d756, 0xc743503f, 0x8e7f2d18, 0x7426281c, 0x3d1a553b, 0xe65ed252, 0xaf62af75, + 0x9376a71f, 0xda4ada38, 0x010e5d51, 0x48322076, 0xb26b2572, 0xfb575855, 0x2013df3c, 0x692fa21b, + 0xd14da3c5, 0x9871dee2, 0x4335598b, 0x0a0924ac, 0xf05021a8, 0xb96c5c8f, 0x6228dbe6, 0x2b14a6c1, + 0x34019664, 0x7d3deb43, 0xa6796c2a, 0xef45110d, 0x151c1409, 0x5c20692e, 0x8764ee47, 0xce589360, + 0x763a92be, 0x3f06ef99, 0xe44268f0, 0xad7e15d7, 0x572710d3, 0x1e1b6df4, 0xc55fea9d, 0x8c6397ba, + 0xb0779fd0, 0xf94be2f7, 0x220f659e, 0x6b3318b9, 0x916a1dbd, 0xd856609a, 0x0312e7f3, 0x4a2e9ad4, + 0xf24c9b0a, 0xbb70e62d, 0x60346144, 0x29081c63, 0xd3511967, 0x9a6d6440, 0x4129e329, 0x08159e0e, + 0x3901f3fd, 0x703d8eda, 0xab7909b3, 0xe2457494, 0x181c7190, 0x51200cb7, 0x8a648bde, 0xc358f6f9, + 0x7b3af727, 0x32068a00, 0xe9420d69, 0xa07e704e, 0x5a27754a, 0x131b086d, 0xc85f8f04, 0x8163f223, + 0xbd77fa49, 0xf44b876e, 0x2f0f0007, 0x66337d20, 0x9c6a7824, 0xd5560503, 0x0e12826a, 0x472eff4d, + 0xff4cfe93, 0xb67083b4, 0x6d3404dd, 0x240879fa, 0xde517cfe, 0x976d01d9, 0x4c2986b0, 0x0515fb97, + 0x2e015d56, 0x673d2071, 0xbc79a718, 0xf545da3f, 0x0f1cdf3b, 0x4620a21c, 0x9d642575, 0xd4585852, + 0x6c3a598c, 0x250624ab, 0xfe42a3c2, 0xb77edee5, 0x4d27dbe1, 0x041ba6c6, 0xdf5f21af, 0x96635c88, + 0xaa7754e2, 0xe34b29c5, 0x380faeac, 0x7133d38b, 0x8b6ad68f, 0xc256aba8, 0x19122cc1, 0x502e51e6, + 0xe84c5038, 0xa1702d1f, 0x7a34aa76, 0x3308d751, 0xc951d255, 0x806daf72, 0x5b29281b, 0x1215553c, + 0x230138cf, 0x6a3d45e8, 0xb179c281, 0xf845bfa6, 0x021cbaa2, 0x4b20c785, 0x906440ec, 0xd9583dcb, + 0x613a3c15, 0x28064132, 0xf342c65b, 0xba7ebb7c, 0x4027be78, 0x091bc35f, 0xd25f4436, 0x9b633911, + 0xa777317b, 0xee4b4c5c, 0x350fcb35, 0x7c33b612, 0x866ab316, 0xcf56ce31, 0x14124958, 0x5d2e347f, + 0xe54c35a1, 0xac704886, 0x7734cfef, 0x3e08b2c8, 0xc451b7cc, 0x8d6dcaeb, 
0x56294d82, 0x1f1530a5
+            }
+        };
+
+        // Computes the Crc32c checksum of the `size` bytes starting at `src`.
+        //
+        // The running value starts at 0xFFFFFFFF and is bit-inverted on return.
+        // Bytes are consumed one at a time until the read pointer is aligned to
+        // sizeof(uint32_t); the bulk is then folded 8 bytes per step through the
+        // eight Crc32cTable sub-tables; whatever is left is handled byte-wise again.
+        uint32_t Crc32c(const void* src, size_t size)
+        {
+            const uint8_t* bytes = (const uint8_t*)src;
+            uint32_t state = 0xFFFFFFFF;
+
+            // Unaligned head (also covers inputs shorter than the alignment gap).
+            while (size > 0 && ((uintptr_t)bytes & (sizeof(uint32_t) - 1)) != 0)
+            {
+                state = Crc32cTable[0][(state ^ *bytes) & 0xFF] ^ (state >> 8);
+                ++bytes;
+                --size;
+            }
+
+            // Aligned body: two 32-bit words per iteration.
+            const uint32_t* words = (const uint32_t*)bytes;
+            while (size >= 8)
+            {
+#ifdef SIMD_BIG_ENDIAN
+                // The running value is byte-swapped before being mixed into the first word.
+                const uint32_t first = *words++ ^ Reorder32(state);
+                const uint32_t second = *words++;
+                state =
+                    Crc32cTable[0x0][second & 0xFF] ^
+                    Crc32cTable[0x1][(second >> 8) & 0xFF] ^
+                    Crc32cTable[0x2][(second >> 16) & 0xFF] ^
+                    Crc32cTable[0x3][(second >> 24) & 0xFF] ^
+                    Crc32cTable[0x4][first & 0xFF] ^
+                    Crc32cTable[0x5][(first >> 8) & 0xFF] ^
+                    Crc32cTable[0x6][(first >> 16) & 0xFF] ^
+                    Crc32cTable[0x7][(first >> 24) & 0xFF];
+#else
+                const uint32_t first = *words++ ^ state;
+                const uint32_t second = *words++;
+                state =
+                    Crc32cTable[0x0][(second >> 24) & 0xFF] ^
+                    Crc32cTable[0x1][(second >> 16) & 0xFF] ^
+                    Crc32cTable[0x2][(second >> 8) & 0xFF] ^
+                    Crc32cTable[0x3][second & 0xFF] ^
+                    Crc32cTable[0x4][(first >> 24) & 0xFF] ^
+                    Crc32cTable[0x5][(first >> 16) & 0xFF] ^
+                    Crc32cTable[0x6][(first >> 8) & 0xFF] ^
+                    Crc32cTable[0x7][first & 0xFF];
+#endif
+                size -= 8;
+            }
+
+            // Tail: fewer than 8 bytes remain.
+            bytes = (const uint8_t*)words;
+            while (size > 0)
+            {
+                state = Crc32cTable[0][(state ^ *bytes) & 0xFF] ^ (state >> 8);
+                ++bytes;
+                size--;
+            }
+
+            return (~state);
+        }
+    }
+}
diff --git a/3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp b/3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp
new file mode 100644
index 0000000000..b064ca50a2
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp
@@ -0,0 +1,371 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdArray.h"
+#include "Simd/SimdCpu.h"
+#include "Simd/SimdBase.h"
+
+// NOTE(review): the header name of the system #include that stood here was lost
+// (the source text shows a bare "#include"). <stdio.h> and <string.h> declare the
+// fopen/fseek/ftell/fread/fclose and memcmp calls used below; <memory> is presumably
+// what upstream had -- confirm against the upstream file.
+#include <memory>
+#include <stdio.h>
+#include <string.h>
+
+#if defined(_MSC_VER)
+#pragma warning (push)
+#pragma warning (disable: 4996)
+#endif
+
+namespace Simd
+{
+    // Reads the whole file at `path` into memory and hands the bytes to `loader`.
+    //
+    // loader  - decoder callback; receives the file bytes and the out-parameters below.
+    // path    - file to open in binary mode.
+    // stride, width, height, format - forwarded unchanged to `loader`.
+    //
+    // Returns whatever `loader` returns, or NULL when the file cannot be opened,
+    // its size cannot be determined, or it cannot be read completely.
+    uint8_t* ImageLoadFromFile(const ImageLoadFromMemoryPtr loader, const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+    {
+        uint8_t* data = NULL;
+        ::FILE* file = ::fopen(path, "rb");
+        if (file)
+        {
+            // ftell() reports failure as -1; converting that to a buffer size would
+            // request an enormous allocation, so every positioning call is checked.
+            long length = -1;
+            if (::fseek(file, 0, SEEK_END) == 0)
+                length = ::ftell(file);
+            if (length >= 0 && ::fseek(file, 0, SEEK_SET) == 0)
+            {
+                Array8u buffer((size_t)length);
+                if (::fread(buffer.data, 1, buffer.size, file) == buffer.size)
+                    data = loader(buffer.data, buffer.size, stride, width, height, format);
+            }
+            ::fclose(file);
+        }
+        return data;
+    }
+
+    //-------------------------------------------------------------------------
+
+    // Stores the encoded bytes (`d`, `s`) and the requested output pixel format (`f`);
+    // the container type stays SimdImageFileUndefined until Validate() is called.
+    ImageLoaderParam::ImageLoaderParam(const uint8_t* d, size_t s, SimdPixelFormatType f)
+        : data(d)
+        , size(s)
+        , format(f)
+        , file(SimdImageFileUndefined)
+    {
+    }
+
+    // Detects the container type from the leading bytes and records it in `file`.
+    //
+    // Recognised signatures: "P2"/"P3"/"P5"/"P6" followed by '\n' (PGM/PPM, text or
+    // binary), the 8-byte PNG signature, and the two bytes 0xFF 0xD8 (JPEG).
+    // Returns true when a type was recognised and `format` is one of
+    // None, Gray8, Bgr24, Bgra32, Rgb24 or Rgba32.
+    bool ImageLoaderParam::Validate()
+    {
+        if (size >= 3 && data[0] == 'P' && data[2] == '\n')
+        {
+            switch (data[1])
+            {
+            case '2': file = SimdImageFilePgmTxt; break;
+            case '3': file = SimdImageFilePpmTxt; break;
+            case '5': file = SimdImageFilePgmBin; break;
+            case '6': file = SimdImageFilePpmBin; break;
+            default: break;
+            }
+        }
+        if (size >= 8)
+        {
+            const uint8_t SIGNATURE[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+            if (memcmp(data, SIGNATURE, 8) == 0)
+                file = SimdImageFilePng;
+        }
+        if (size >= 2)
+        {
+            if (data[0] == 0xFF && data[1] == 0xD8)
+                file = SimdImageFileJpeg;
+        }
+        return
+            file != SimdImageFileUndefined &&
+            (format == SimdPixelFormatNone || format == SimdPixelFormatGray8 ||
+                format == SimdPixelFormatBgr24 || format == SimdPixelFormatBgra32 ||
+                format == SimdPixelFormatRgb24 || format == SimdPixelFormatRgba32);
+    }
+
+    namespace Base
+    {
+        // Common base of the PGM/PPM loaders; both converter pointers start out
+        // unset and are assigned later by SetConverters().
+        ImagePxmLoader::ImagePxmLoader(const ImageLoaderParam& param)
+            : ImageLoader(param)
+            , _toAny(NULL)
+            , _toBgra(NULL)
+        {
+        }
+
+        // Parses the textual PGM/PPM header that precedes the pixel data.
+        //
+        // version - digit expected after the leading 'P' (2, 3, 5 or 6 in the callers).
+        //
+        // Expects "P<version>\n", then width, height and maximum value (which must
+        // be 255) followed by a single '\n'. On success the output image is
+        // allocated, _size holds the number of source bytes per row, and -- when the
+        // requested format differs from the file's native one -- an intermediate
+        // buffer of _block rows is allocated. Returns false on any mismatch.
+        bool ImagePxmLoader::ReadHeader(size_t version)
+        {
+            const bool magicOk =
+                _stream.Size() >= 3 &&
+                _stream.Data()[0] == 'P' &&
+                _stream.Data()[1] == '0' + version &&
+                _stream.Data()[2] == '\n';
+            if (!magicOk)
+                return false;
+            _stream.Seek(3);
+
+            uint32_t width, height, max;
+            if (!_stream.ReadUnsigned(width) || !_stream.ReadUnsigned(height) || !_stream.ReadUnsigned(max))
+                return false;
+            if (width == 0 || height == 0 || max != 255)
+                return false;
+
+            uint8_t byte;
+            if (!_stream.Read(byte) || byte != '\n')
+                return false;
+
+            _image.Recreate(width, height, (Image::Format)_param.format);
+            _block = height;
+
+            // Channel count and pixel format the file stores natively.
+            uint32_t channels;
+            SimdPixelFormatType native;
+            if (_param.file == SimdImageFilePgmTxt || _param.file == SimdImageFilePgmBin)
+            {
+                channels = 1;
+                native = SimdPixelFormatGray8;
+            }
+            else if (_param.file == SimdImageFilePpmTxt || _param.file == SimdImageFilePpmBin)
+            {
+                channels = 3;
+                native = SimdPixelFormatRgb24;
+            }
+            else
+                return false;
+
+            _size = width * channels;
+            if (_param.format != native)
+            {
+                // A conversion is needed: decode in blocks sized from AlgCacheL1().
+                _block = Simd::RestrictRange(Base::AlgCacheL1() / _size, 1, height);
+                _buffer.Resize(_block * _size);
+            }
+            SetConverters();
+            return true;
+        }
+
+        //-------------------------------------------------------------------------
+
+        // Text PGM loader; an unspecified output format defaults to Gray8.
+        ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param)
+            : ImagePxmLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatGray8;
+        }
+
+        // Decodes a text PGM ("P2") stream into _image.
+        // Gray8 output is parsed straight into the image rows; other formats are
+        // parsed into _buffer block by block and converted. Returns false on a bad
+        // header or when a sample cannot be read.
+        bool ImagePgmTxtLoader::FromStream()
+        {
+            if (!ReadHeader(2))
+                return false;
+            const bool direct = _param.format == SimdPixelFormatGray8;
+            const size_t step = direct ? _image.stride : _size;
+            size_t row = 0;
+            while (row < _image.height)
+            {
+                const size_t rows = Simd::Min(row + _block, _image.height) - row;
+                uint8_t* dst = direct ? _image.Row(row) : _buffer.data;
+                for (size_t b = 0; b < rows; ++b, dst += step)
+                {
+                    for (size_t i = 0; i < _size; ++i)
+                    {
+                        if (!_stream.ReadUnsigned(dst[i]))
+                            return false;
+                    }
+                }
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24:
+                case SimdPixelFormatRgb24:
+                    _toAny(_buffer.data, _image.width, rows, _size, _image.Row(row), _image.stride);
+                    break;
+                case SimdPixelFormatBgra32:
+                case SimdPixelFormatRgba32:
+                    _toBgra(_buffer.data, _image.width, rows, _size, _image.Row(row), _image.stride, 0xFF);
+                    break;
+                default:
+                    break;
+                }
+                row += rows;
+            }
+            return true;
+        }
+
+        // Selects the gray -> output converter for the requested format.
+        void ImagePgmTxtLoader::SetConverters()
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24:
+            case SimdPixelFormatRgb24:
+                _toAny = Base::GrayToBgr;
+                break;
+            case SimdPixelFormatBgra32:
+            case SimdPixelFormatRgba32:
+                _toBgra = Base::GrayToBgra;
+                break;
+            default:
+                break;
+            }
+        }
+
+        //-------------------------------------------------------------------------
+
+        // Binary PGM loader; an unspecified output format defaults to Gray8.
+        ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param)
+            : ImagePxmLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatGray8;
+        }
+
+        // Decodes a binary PGM ("P5") stream into _image.
+        // Same flow as the text variant, but each row is read as _size raw bytes.
+        bool ImagePgmBinLoader::FromStream()
+        {
+            if (!ReadHeader(5))
+                return false;
+            const bool direct = _param.format == SimdPixelFormatGray8;
+            const size_t step = direct ? _image.stride : _size;
+            size_t row = 0;
+            while (row < _image.height)
+            {
+                const size_t rows = Simd::Min(row + _block, _image.height) - row;
+                uint8_t* dst = direct ? _image.Row(row) : _buffer.data;
+                for (size_t b = 0; b < rows; ++b, dst += step)
+                {
+                    if (_stream.Read(_size, dst) != _size)
+                        return false;
+                }
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24:
+                case SimdPixelFormatRgb24:
+                    _toAny(_buffer.data, _image.width, rows, _size, _image.Row(row), _image.stride);
+                    break;
+                case SimdPixelFormatBgra32:
+                case SimdPixelFormatRgba32:
+                    _toBgra(_buffer.data, _image.width, rows, _size, _image.Row(row), _image.stride, 0xFF);
+                    break;
+                default:
+                    break;
+                }
+                row += rows;
+            }
+            return true;
+        }
+
+        // Selects the gray -> output converter for the requested format.
+        void ImagePgmBinLoader::SetConverters()
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24:
+            case SimdPixelFormatRgb24:
+                _toAny = Base::GrayToBgr;
+                break;
+            case SimdPixelFormatBgra32:
+            case SimdPixelFormatRgba32:
+                _toBgra = Base::GrayToBgra;
+                break;
+            default:
+                break;
+            }
+        }
+
+        //-------------------------------------------------------------------------
+
+        // Text PPM loader; an unspecified output format defaults to Rgb24.
+        ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param)
+            : ImagePxmLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatRgb24;
+        }
+
+        // Decodes a text PPM ("P3") stream into _image.
+        // Rgb24 output is parsed straight into the image rows; other formats are
+        // parsed into _buffer block by block and converted.
+        bool ImagePpmTxtLoader::FromStream()
+        {
+            if (!ReadHeader(3))
+                return false;
+            const bool direct = _param.format == SimdPixelFormatRgb24;
+            const size_t step = direct ? _image.stride : _size;
+            size_t row = 0;
+            while (row < _image.height)
+            {
+                const size_t rows = Simd::Min(row + _block, _image.height) - row;
+                uint8_t* dst = direct ? _image.Row(row) : _buffer.data;
+                for (size_t b = 0; b < rows; ++b, dst += step)
+                {
+                    for (size_t i = 0; i < _size; ++i)
+                    {
+                        if (!_stream.ReadUnsigned(dst[i]))
+                            return false;
+                    }
+                }
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8:
+                case SimdPixelFormatBgr24:
+                    _toAny(_buffer.data, _image.width, rows, _size, _image.Row(row), _image.stride);
+                    break;
+                case SimdPixelFormatBgra32:
+                case SimdPixelFormatRgba32:
+                    _toBgra(_buffer.data, _image.width, rows, _size, _image.Row(row), _image.stride, 0xFF);
+                    break;
+                default:
+                    break;
+                }
+                row += rows;
+            }
+            return true;
+        }
+
+        // Selects the RGB -> output converter for the requested format.
+        void ImagePpmTxtLoader::SetConverters()
+        {
+            if (_param.format == SimdPixelFormatGray8)
+                _toAny = Base::RgbToGray;
+            else if (_param.format == SimdPixelFormatBgr24)
+                _toAny = Base::BgrToRgb;
+            else if (_param.format == SimdPixelFormatBgra32)
+                _toBgra = Base::RgbToBgra;
+            else if (_param.format == SimdPixelFormatRgba32)
+                _toBgra = Base::BgrToBgra;
+        }
+
+        //-------------------------------------------------------------------------
+
+        // Binary PPM loader; an unspecified output format defaults to Rgb24.
+        ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param)
+            : ImagePxmLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatRgb24;
+        }
+
+        // Decodes a binary PPM ("P6") stream into _image.
+        // Same flow as the text variant, but each row is read as _size raw bytes.
+        bool ImagePpmBinLoader::FromStream()
+        {
+            if (!ReadHeader(6))
+                return false;
+            const bool direct = _param.format == SimdPixelFormatRgb24;
+            const size_t step = direct ? _image.stride : _size;
+            size_t row = 0;
+            while (row < _image.height)
+            {
+                const size_t rows = Simd::Min(row + _block, _image.height) - row;
+                uint8_t* dst = direct ? _image.Row(row) : _buffer.data;
+                for (size_t b = 0; b < rows; ++b, dst += step)
+                {
+                    if (_stream.Read(_size, dst) != _size)
+                        return false;
+                }
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8:
+                case SimdPixelFormatBgr24:
+                    _toAny(_buffer.data, _image.width, rows, _size, _image.Row(row), _image.stride);
+                    break;
+                case SimdPixelFormatBgra32:
+                case SimdPixelFormatRgba32:
+                    _toBgra(_buffer.data, _image.width, rows, _size, _image.Row(row), _image.stride, 0xFF);
+                    break;
+                default:
+                    break;
+                }
+                row += rows;
+            }
+            return true;
+        }
+
+        // Selects the RGB -> output converter for the requested format.
+        void ImagePpmBinLoader::SetConverters()
+        {
+            if (_param.format == SimdPixelFormatGray8)
+                _toAny = Base::RgbToGray;
+            else if (_param.format == SimdPixelFormatBgr24)
+                _toAny = Base::BgrToRgb;
+            else if (_param.format == SimdPixelFormatBgra32)
+                _toBgra = Base::RgbToBgra;
+            else if (_param.format == SimdPixelFormatRgba32)
+                _toBgra = Base::BgrToBgra;
+        }
+
+        //-------------------------------------------------------------------------
+
+        // Allocates the loader matching param.file; NULL for an unrecognised type.
+        // The caller owns the returned object.
+        ImageLoader* CreateImageLoader(const ImageLoaderParam& param)
+        {
+            ImageLoader* created = NULL;
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: created = new ImagePgmTxtLoader(param); break;
+            case SimdImageFilePgmBin: created = new ImagePgmBinLoader(param); break;
+            case SimdImageFilePpmTxt: created = new ImagePpmTxtLoader(param); break;
+            case SimdImageFilePpmBin: created = new ImagePpmBinLoader(param); break;
+            case SimdImageFilePng: created = new ImagePngLoader(param); break;
+            case SimdImageFileJpeg: created = new ImageJpegLoader(param); break;
+            default: break;
+            }
+            return created;
+        }
+
+        // Decodes an encoded image held in memory.
+        //
+        // data, size - encoded bytes.
+        // format     - in: requested pixel format (read through the pointer);
+        //              passed on to the loader's Release() together with
+        //              stride/width/height.
+        //
+        // Returns the loader's released pixel data, or NULL when the input is not
+        // recognised, no loader exists for it, or decoding fails.
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+        {
+            ImageLoaderParam param(data, size, *format);
+            if (!param.Validate())
+                return NULL;
+            Holder loader(CreateImageLoader(param));
+            if (loader && loader->FromStream())
+                return loader->Release(stride, width, height, format);
+            return NULL;
+        }
+    }
+}
+
+#if defined(_MSC_VER)
+#pragma warning (pop)
+#endif
diff --git 
a/3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp b/3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp new file mode 100644 index 0000000000..88c5da73d0 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp @@ -0,0 +1,2456 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdArray.h"
+#include "Simd/SimdCpu.h"
+#include "Simd/SimdBase.h"
+
+namespace Simd
+{
+    namespace Base
+    {
+        // JPEG_SSE2 / JPEG_NEON are defined here from the SIMD_* configuration macros.
+#if defined(SIMD_X64_ENABLE) && !defined(SIMD_SSE2_DISABLE)
+#define JPEG_SSE2
+        // Always reports SSE2 as available (this branch is only compiled for x64 builds).
+        static int jpeg__sse2_available(void)
+        {
+            return 1;
+        }
+#endif
+
+#if defined(SIMD_ARM64_ENABLE) && !defined(SIMD_NEON_DISABLE)
+#define JPEG_NEON
+#endif
+
+        // Fixed-purpose integer aliases used throughout the decoder.
+        typedef unsigned char jpeg_uc;
+        typedef unsigned short jpeg_us;
+        typedef unsigned short jpeg__uint16;
+        typedef signed short jpeg__int16;
+        typedef unsigned int jpeg__uint32;
+        typedef signed int jpeg__int32;
+
+        // User-supplied I/O callbacks for streaming input.
+        typedef struct
+        {
+            int (*read) (void* user, char* data, int size); // fill 'data' with 'size' bytes. return number of bytes actually read
+            void (*skip) (void* user, int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+            int (*eof) (void* user); // returns nonzero if we are at end of file/data
+        } jpeg_io_callbacks;
+
+#define jpeg_inline SIMD_INLINE
+#define JPEG_ASSERT assert
+
+        // Marks a value as intentionally unused.
+#ifdef _MSC_VER
+#define JPEG_NOTUSED(v) (void)(v)
+#else
+#define JPEG_NOTUSED(v) (void)sizeof(v)
+#endif
+
+        // Input state: image dimensions, optional I/O callbacks, and the current
+        // read window [img_buffer, img_buffer_end) over either caller memory or
+        // the internal buffer_start staging area.
+        typedef struct
+        {
+            jpeg__uint32 img_x, img_y;
+            int img_n, img_out_n;
+
+            jpeg_io_callbacks io;
+            void* io_user_data;
+
+            int read_from_callbacks;
+            int buflen;
+            jpeg_uc buffer_start[128];
+            int callback_already_read;
+
+            jpeg_uc* img_buffer, * img_buffer_end;
+            jpeg_uc* img_buffer_original, * img_buffer_original_end;
+        } jpeg__context;
+
+        // Error helper: the message is currently discarded; always returns 0.
+        static int jpeg__err(const char* str)
+        {
+            //jpeg__g_failure_reason = str;
+            return 0;
+        }
+
+        // Two-message overload of the error helper; also always returns 0.
+        static int jpeg__err(const char* str1, const char* str2)
+        {
+            //jpeg__g_failure_reason = str;
+            return 0;
+        }
+
+        // Reports an error and evaluates to a NULL unsigned char pointer.
+#define jpeg__errpuc(x,y) ((unsigned char *)(size_t) (jpeg__err(x,y)?NULL:NULL))
+
+        // Refills buffer_start through the read callback and resets the read window.
+        // When the callback returns 0 bytes, callback reading is switched off and the
+        // window is set to a single zero byte so later reads stay inside valid memory.
+        static void jpeg__refill_buffer(jpeg__context* s)
+        {
+            int n = (s->io.read)(s->io_user_data, (char*)s->buffer_start, s->buflen);
+            s->callback_already_read += (int)(s->img_buffer - s->img_buffer_original);
+            if (n == 0) {
+                // at end of file, treat same as if from memory, but need to handle case
+                // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
+                s->read_from_callbacks = 0;
+                s->img_buffer = s->buffer_start;
+                s->img_buffer_end = s->buffer_start + 1;
+                *s->img_buffer = 0;
+            }
+            else {
+                s->img_buffer = s->buffer_start;
+                s->img_buffer_end = s->buffer_start + n;
+            }
+        }
+
+        // Returns the next input byte, refilling through the callbacks when the
+        // window is exhausted; returns 0 once no more data is available.
+        jpeg_inline static jpeg_uc jpeg__get8(jpeg__context* s)
+        {
+            if (s->img_buffer < s->img_buffer_end)
+                return *s->img_buffer++;
+            if (s->read_from_callbacks) {
+                jpeg__refill_buffer(s);
+                return *s->img_buffer++;
+            }
+            return 0;
+        }
+
+        // Rotates x left by y bits, assuming a 32-bit operand.
+#define jpeg_lrot(x,y) (((x) << (y)) | ((x) >> (32 - (y))))
+
+        // Declares `name` of `type` with 16-byte alignment.
+#define JPEG_SIMD_ALIGN(type, name) SIMD_ALIGNED(16) type name
+
+        // Reads two bytes and combines them with the first byte as the high byte.
+        static int jpeg__get16be(jpeg__context* s)
+        {
+            int z = jpeg__get8(s);
+            return (z << 8) + jpeg__get8(s);
+        }
+
+        // Advances the input by n bytes. A negative n moves to the end of the
+        // current window; with callbacks installed, the part of the skip that lies
+        // beyond the window is delegated to the skip callback.
+        static void jpeg__skip(jpeg__context* s, int n)
+        {
+            if (n == 0) return; // already there!
+            if (n < 0) {
+                s->img_buffer = s->img_buffer_end;
+                return;
+            }
+            if (s->io.read) {
+                int blen = (int)(s->img_buffer_end - s->img_buffer);
+                if (blen < n) {
+                    s->img_buffer = s->img_buffer_end;
+                    (s->io.skip)(s->io_user_data, n - blen);
+                    return;
+                }
+            }
+            s->img_buffer += n;
+        }
+
+        // Returns nonzero when no more input is available.
+        jpeg_inline static int jpeg__at_eof(jpeg__context* s)
+        {
+            if (s->io.read) {
+                if (!(s->io.eof)(s->io_user_data)) return 0;
+                // if feof() is true, check if buffer = end
+                // special case: we've only got the special 0 character at the end
+                if (s->read_from_callbacks == 0) return 1;
+            }
+
+            return s->img_buffer >= s->img_buffer_end;
+        }
+
+        // Allocation hooks; they map directly to the C allocator.
+#define JPEG_MALLOC(sz) malloc(sz)
+#define JPEG_REALLOC(p,newsz) realloc(p,newsz)
+#define JPEG_FREE(p) free(p)
+
+#define JPEG_MAX_DIMENSIONS (1 << 24)
+
+        // Scan modes (how they are used is not visible in this part of the file).
+        enum
+        {
+            JPEG__SCAN_load = 0,
+            JPEG__SCAN_type,
+            JPEG__SCAN_header
+        };
+
+        // Thin wrapper over JPEG_MALLOC.
+        static void* jpeg__malloc(size_t size)
+        {
+            return JPEG_MALLOC(size);
+        }
+
+        // Returns 1 when b is non-negative and a + b does not exceed INT_MAX.
+        static int jpeg__addsizes_valid(int a, int b)
+        {
+            if (b < 0) return 0;
+            // now 0 <= b <= INT_MAX, hence also
+            // 0 <= INT_MAX - b <= INT_MAX.
+            // And "a + b <= INT_MAX" (which might overflow) is the
+            // same as a <= INT_MAX - b (no overflow)
+            return a <= INT_MAX - b;
+        }
+
+        // Returns 1 when a and b are non-negative and a * b does not exceed INT_MAX.
+        static int jpeg__mul2sizes_valid(int a, int b)
+        {
+            if (a < 0 || b < 0) return 0;
+            if (b == 0) return 1; // mul-by-0 is always safe
+            // portable way to check for no overflows in a*b
+            return a <= INT_MAX / b;
+        }
+
+        // returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow.
+        // The && chain matters: a * b is only evaluated after it was proven safe.
+        static int jpeg__mad2sizes_valid(int a, int b, int add)
+        {
+            return jpeg__mul2sizes_valid(a, b) && jpeg__addsizes_valid(a * b, add);
+        }
+
+        // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
+        static int jpeg__mad3sizes_valid(int a, int b, int c, int add)
+        {
+            return jpeg__mul2sizes_valid(a, b) && jpeg__mul2sizes_valid(a * b, c) &&
+                jpeg__addsizes_valid(a * b * c, add);
+        }
+
+        // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
+        static int jpeg__mad4sizes_valid(int a, int b, int c, int d, int add)
+        {
+            return jpeg__mul2sizes_valid(a, b) && jpeg__mul2sizes_valid(a * b, c) &&
+                jpeg__mul2sizes_valid(a * b * c, d) && jpeg__addsizes_valid(a * b * c * d, add);
+        }
+
+        // Allocates a*b + add bytes, or returns NULL when that size is not valid.
+        static void* jpeg__malloc_mad2(int a, int b, int add)
+        {
+            if (!jpeg__mad2sizes_valid(a, b, add)) return NULL;
+            return jpeg__malloc(a * b + add);
+        }
+
+        // Allocates a*b*c + add bytes, or returns NULL when that size is not valid.
+        static void* jpeg__malloc_mad3(int a, int b, int c, int add)
+        {
+            if (!jpeg__mad3sizes_valid(a, b, c, add)) return NULL;
+            return jpeg__malloc(a * b * c + add);
+        }
+
+        // Weighted sum of r, g, b (weights 77, 150, 29; divided by 256), as a byte.
+        static jpeg_uc jpeg__compute_y(int r, int g, int b)
+        {
+            return (jpeg_uc)(((r * 77) + (g * 150) + (29 * b)) >> 8);
+        }
+
+        // Description of a decoded result.
+        typedef struct
+        {
+            int bits_per_channel;
+            int num_channels;
+            int channel_order;
+        } jpeg__result_info;
+
+        // Resets the read window to the originally supplied buffer.
+        static void jpeg__rewind(jpeg__context* s)
+        {
+            // conceptually rewind SHOULD rewind to the beginning of the stream,
+            // but we just rewind to the beginning of the initial buffer, because
+            // we only use it after doing 'test', which only ever looks at at most 92 bytes
+            s->img_buffer = s->img_buffer_original;
+            s->img_buffer_end = s->img_buffer_original_end;
+        }
+
+        //------------------------------------------------------------------------------
+
+        // huffman decoding acceleration
+#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache
+
+        // One Huffman table: per-symbol code/value/size arrays plus lookup aids.
+        typedef struct
+        {
+            jpeg_uc fast[1 << FAST_BITS];
+            // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
+            jpeg__uint16 code[256];
+            jpeg_uc values[256];
+            jpeg_uc size[257];
+            unsigned int maxcode[18];
+            int delta[17]; // old 'firstsymbol' - old 'firstcode'
+        } jpeg__huffman;
+
+        // Complete decoder state for one JPEG image.
+        typedef struct
+        {
+            jpeg__context* s;
+            jpeg__huffman huff_dc[4];
+            jpeg__huffman huff_ac[4];
+            jpeg__uint16 dequant[4][64];
+            jpeg__int16 fast_ac[4][1 << FAST_BITS];
+
+            // sizes for components, interleaved MCUs
+            int img_h_max, img_v_max;
+            int img_mcu_x, img_mcu_y;
+            int img_mcu_w, img_mcu_h;
+
+            // definition of jpeg image component
+            struct
+            {
+                int id;
+                int h, v;
+                int tq;
+                int hd, ha;
+                int dc_pred;
+
+                int x, y, w2, h2;
+                jpeg_uc* data;
+                void* raw_data, * raw_coeff;
+                jpeg_uc* linebuf;
+                short* coeff; // progressive only
+                int coeff_w, coeff_h; // number of 8x8 coefficient blocks
+            } img_comp[4];
+
+            jpeg__uint32 code_buffer; // jpeg entropy-coded buffer
+            int code_bits; // number of valid bits
+            unsigned char marker; // marker seen while filling entropy buffer
+            int nomore; // flag if we saw a marker so must stop
+
+            int progressive;
+            int spec_start;
+            int spec_end;
+            int succ_high;
+            int succ_low;
+            int eob_run;
+            int jfif;
+            int app14_color_transform; // Adobe APP14 tag
+            int rgb;
+
+            int scan_n, order[4];
+            int restart_interval, todo;
+
+            // kernels
+            void (*idct_block_kernel)(jpeg_uc* out, int out_stride, short data[64]);
+            void (*YCbCr_to_RGB_kernel)(jpeg_uc* out, const jpeg_uc* y, const jpeg_uc* pcb, const jpeg_uc* pcr, int count, int step);
+            jpeg_uc* (*resample_row_hv_2_kernel)(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs);
+        } jpeg__jpeg;
+
+        // Builds the size/code/maxcode/delta arrays and the FAST_BITS lookup table
+        // of `h` from `count`, where count[i] is the number of codes of length i + 1
+        // (16 entries are read).
+        //
+        // Returns 1 on success, 0 (via jpeg__err) when the counts describe more
+        // symbols than the table can hold or the code lengths are inconsistent.
+        static int jpeg__build_huffman(jpeg__huffman* h, int* count)
+        {
+            int i, j, k = 0;
+            unsigned int code;
+            // build size list for each symbol (from JPEG spec)
+            for (i = 0; i < 16; ++i)
+                for (j = 0; j < count[i]; ++j) {
+                    // size[] has 257 entries and the last one is reserved for the
+                    // terminator written below; code[]/values[] have 256 entries.
+                    // Counts from a corrupt stream must not run past that.
+                    if (k >= 256) return jpeg__err("bad size list", "Corrupt JPEG");
+                    h->size[k++] = (jpeg_uc)(i + 1);
+                }
+            h->size[k] = 0;
+
+            // compute actual symbols (from jpeg spec)
+            code = 0;
+            k = 0;
+            for (j = 1; j <= 16; ++j) {
+                // compute delta to add to code to compute symbol id
+                h->delta[j] = k - code;
+                if (h->size[k] == j) {
+                    while (h->size[k] == j)
+                        h->code[k++] = (jpeg__uint16)(code++);
+                    if (code - 1 >= (1u << j)) return jpeg__err("bad code lengths", "Corrupt JPEG");
+                }
+                // compute largest code + 1 for this size, preshifted as needed later
+                h->maxcode[j] = code << (16 - j);
+                code <<= 1;
+            }
+            // j == 17 here: sentinel that stops the linear search in the decoder
+            h->maxcode[j] = 0xffffffff;
+
+            // build non-spec acceleration table; 255 is flag for not-accelerated
+            memset(h->fast, 255, 1 << FAST_BITS);
+            for (i = 0; i < k; ++i) {
+                int s = h->size[i];
+                if (s <= FAST_BITS) {
+                    int c = h->code[i] << (FAST_BITS - s);
+                    int m = 1 << (FAST_BITS - s);
+                    for (j = 0; j < m; ++j) {
+                        h->fast[c + j] = (jpeg_uc)i;
+                    }
+                }
+            }
+            return 1;
+        }
+
+        // build a table that decodes both magnitude and value of small ACs in
+        // one go.
+        // Each non-zero entry packs (value * 256) + (run * 16) + (total bit length).
+        static void jpeg__build_fast_ac(jpeg__int16* fast_ac, jpeg__huffman* h)
+        {
+            int i;
+            for (i = 0; i < (1 << FAST_BITS); ++i) {
+                jpeg_uc fast = h->fast[i];
+                fast_ac[i] = 0;
+                if (fast < 255) {
+                    int rs = h->values[fast];
+                    int run = (rs >> 4) & 15;
+                    int magbits = rs & 15;
+                    int len = h->size[fast];
+
+                    if (magbits && len + magbits <= FAST_BITS) {
+                        // magnitude code followed by receive_extend code
+                        int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
+                        int m = 1 << (magbits - 1);
+                        if (k < m) k += (~0U << magbits) + 1;
+                        // if the result is small enough, we can fit it in fast_ac table
+                        if (k >= -128 && k <= 127)
+                            fast_ac[i] = (jpeg__int16)((k * 256) + (run * 16) + (len + magbits));
+                    }
+                }
+            }
+        }
+
+        // Tops up code_buffer one byte at a time until more than 24 bits are valid.
+        // A 0xff byte followed by a non-zero byte is recorded as a marker and stops
+        // further reading (nomore); from then on zero bits are fed instead.
+        static void jpeg__grow_buffer_unsafe(jpeg__jpeg* j)
+        {
+            do {
+                unsigned int b = j->nomore ? 0 : jpeg__get8(j->s);
+                if (b == 0xff) {
+                    int c = jpeg__get8(j->s);
+                    while (c == 0xff) c = jpeg__get8(j->s); // consume fill bytes
+                    if (c != 0) {
+                        j->marker = (unsigned char)c;
+                        j->nomore = 1;
+                        return;
+                    }
+                }
+                j->code_buffer |= b << (24 - j->code_bits);
+                j->code_bits += 8;
+            } while (j->code_bits <= 24);
+        }
+
+        // (1 << n) - 1
+        static const jpeg__uint32 jpeg__bmask[17] = { 0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535 };
+
+        // decode a jpeg huffman value from the bitstream
+        jpeg_inline static int jpeg__jpeg_huff_decode(jpeg__jpeg* j, jpeg__huffman* h)
+        {
+            unsigned int temp;
+            int c, k;
+
+            if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j);
+
+            // look at the top FAST_BITS and determine what symbol ID it is,
+            // if the code is <= FAST_BITS
+            c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
+            k = h->fast[c];
+            if (k < 255) {
+                int s = h->size[k];
+                if (s > j->code_bits)
+                    return -1;
+                j->code_buffer <<= s;
+                j->code_bits -= s;
+                return h->values[k];
+            }
+
+            // naive test is to shift the code_buffer down so k bits are
+            // valid, then test against maxcode. To speed this up, we've
+            // preshifted maxcode left so that it has (16-k) 0s at the
+            // end; in other words, regardless of the number of bits, it
+            // wants to be compared against something shifted to have 16;
+            // that way we don't need to shift inside the loop.
+            temp = j->code_buffer >> 16;
+            for (k = FAST_BITS + 1; ; ++k)
+                if (temp < h->maxcode[k])
+                    break;
+            if (k == 17) {
+                // error! 
code not found + j->code_bits -= 16; + return -1; + } + + if (k > j->code_bits) + return -1; + + // convert the huffman code to the symbol id + c = ((j->code_buffer >> (32 - k)) & jpeg__bmask[k]) + h->delta[k]; + JPEG_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & jpeg__bmask[h->size[c]]) == h->code[c]); + + // convert the id to a symbol + j->code_bits -= k; + j->code_buffer <<= k; + return h->values[c]; + } + + // bias[n] = (-1<code_bits < n) jpeg__grow_buffer_unsafe(j); + + sgn = (jpeg__int32)j->code_buffer >> 31; // sign bit is always in MSB + k = jpeg_lrot(j->code_buffer, n); + if (n < 0 || n >= (int)(sizeof(jpeg__bmask) / sizeof(*jpeg__bmask))) return 0; + j->code_buffer = k & ~jpeg__bmask[n]; + k &= jpeg__bmask[n]; + j->code_bits -= n; + return k + (jpeg__jbias[n] & ~sgn); + } + + // get some unsigned bits + jpeg_inline static int jpeg__jpeg_get_bits(jpeg__jpeg* j, int n) + { + unsigned int k; + if (j->code_bits < n) jpeg__grow_buffer_unsafe(j); + k = jpeg_lrot(j->code_buffer, n); + j->code_buffer = k & ~jpeg__bmask[n]; + k &= jpeg__bmask[n]; + j->code_bits -= n; + return k; + } + + jpeg_inline static int jpeg__jpeg_get_bit(jpeg__jpeg* j) + { + unsigned int k; + if (j->code_bits < 1) jpeg__grow_buffer_unsafe(j); + k = j->code_buffer; + j->code_buffer <<= 1; + --j->code_bits; + return k & 0x80000000; + } + + // given a value that's at position X in the zigzag stream, + // where does it appear in the 8x8 matrix coded as row-major? 
+ static const jpeg_uc jpeg__jpeg_dezigzag[64 + 15] = + { + 0, 1, 8, 16, 9, 2, 3, 10, + 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, + 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, + 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, + 53, 60, 61, 54, 47, 55, 62, 63, + // let corrupt input sample past end + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63 + }; + + // decode one 64-entry block-- + static int jpeg__jpeg_decode_block(jpeg__jpeg* j, short data[64], jpeg__huffman* hdc, jpeg__huffman* hac, jpeg__int16* fac, int b, jpeg__uint16* dequant) + { + int diff, dc, k; + int t; + + if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j); + t = jpeg__jpeg_huff_decode(j, hdc); + if (t < 0) return jpeg__err("bad huffman code", "Corrupt JPEG"); + + // 0 all the ac values now so we can do it 32-bits at a time + memset(data, 0, 64 * sizeof(data[0])); + + diff = t ? jpeg__extend_receive(j, t) : 0; + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + data[0] = (short)(dc * dequant[0]); + + // decode AC components, see JPEG spec + k = 1; + do { + unsigned int zig; + int c, r, s; + if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + j->code_buffer <<= s; + j->code_bits -= s; + // decode into unzigzag'd location + zig = jpeg__jpeg_dezigzag[k++]; + data[zig] = (short)((r >> 8) * dequant[zig]); + } + else { + int rs = jpeg__jpeg_huff_decode(j, hac); + if (rs < 0) return jpeg__err("bad huffman code", "Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (rs != 0xf0) break; // end block + k += 16; + } + else { + k += r; + // decode into unzigzag'd location + zig = jpeg__jpeg_dezigzag[k++]; + data[zig] = (short)(jpeg__extend_receive(j, s) * dequant[zig]); + } + } + } while (k < 64); + return 1; + } + + static int 
jpeg__jpeg_decode_block_prog_dc(jpeg__jpeg* j, short data[64], jpeg__huffman* hdc, int b) + { + int diff, dc; + int t; + if (j->spec_end != 0) return jpeg__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j); + + if (j->succ_high == 0) { + // first scan for DC coefficient, must be first + memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now + t = jpeg__jpeg_huff_decode(j, hdc); + if (t == -1) return jpeg__err("can't merge dc and ac", "Corrupt JPEG"); + diff = t ? jpeg__extend_receive(j, t) : 0; + + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + data[0] = (short)(dc << j->succ_low); + } + else { + // refinement scan for DC coefficient + if (jpeg__jpeg_get_bit(j)) + data[0] += (short)(1 << j->succ_low); + } + return 1; + } + + // @OPTIMIZE: store non-zigzagged during the decode passes, + // and only de-zigzag when dequantizing + static int jpeg__jpeg_decode_block_prog_ac(jpeg__jpeg* j, short data[64], jpeg__huffman* hac, jpeg__int16* fac) + { + int k; + if (j->spec_start == 0) return jpeg__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->succ_high == 0) { + int shift = j->succ_low; + + if (j->eob_run) { + --j->eob_run; + return 1; + } + + k = j->spec_start; + do { + unsigned int zig; + int c, r, s; + if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + j->code_buffer <<= s; + j->code_bits -= s; + zig = jpeg__jpeg_dezigzag[k++]; + data[zig] = (short)((r >> 8) << shift); + } + else { + int rs = jpeg__jpeg_huff_decode(j, hac); + if (rs < 0) return jpeg__err("bad huffman code", "Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r); + if (r) + j->eob_run += jpeg__jpeg_get_bits(j, r); + --j->eob_run; + break; + } + k += 16; + } + else { + k += r; + zig = 
jpeg__jpeg_dezigzag[k++]; + data[zig] = (short)(jpeg__extend_receive(j, s) << shift); + } + } + } while (k <= j->spec_end); + } + else { + // refinement scan for these AC coefficients + + short bit = (short)(1 << j->succ_low); + + if (j->eob_run) { + --j->eob_run; + for (k = j->spec_start; k <= j->spec_end; ++k) { + short* p = &data[jpeg__jpeg_dezigzag[k]]; + if (*p != 0) + if (jpeg__jpeg_get_bit(j)) + if ((*p & bit) == 0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } + } + else { + k = j->spec_start; + do { + int r, s; + int rs = jpeg__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh + if (rs < 0) return jpeg__err("bad huffman code", "Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r) - 1; + if (r) + j->eob_run += jpeg__jpeg_get_bits(j, r); + r = 64; // force end of block + } + else { + // r=15 s=0 should write 16 0s, so we just do + // a run of 15 0s and then write s (which is 0), + // so we don't have to do anything special here + } + } + else { + if (s != 1) return jpeg__err("bad huffman code", "Corrupt JPEG"); + // sign bit + if (jpeg__jpeg_get_bit(j)) + s = bit; + else + s = -bit; + } + + // advance by r + while (k <= j->spec_end) { + short* p = &data[jpeg__jpeg_dezigzag[k++]]; + if (*p != 0) { + if (jpeg__jpeg_get_bit(j)) + if ((*p & bit) == 0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } + else { + if (r == 0) { + *p = (short)s; + break; + } + --r; + } + } + } while (k <= j->spec_end); + } + } + return 1; + } + + // take a -128..127 value and jpeg__clamp it and convert to 0..255 + jpeg_inline static jpeg_uc jpeg__clamp(int x) + { + // trick to use a single test to catch both cases + if ((unsigned int)x > 255) { + if (x < 0) return 0; + if (x > 255) return 255; + } + return (jpeg_uc)x; + } + +#define jpeg__f2f(x) ((int) (((x) * 4096 + 0.5))) +#define jpeg__fsh(x) ((x) * 4096) + + // derived from jidctint -- DCT_ISLOW +#define 
JPEG__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \ + int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \ + p2 = s2; \ + p3 = s6; \ + p1 = (p2+p3) * jpeg__f2f(0.5411961f); \ + t2 = p1 + p3*jpeg__f2f(-1.847759065f); \ + t3 = p1 + p2*jpeg__f2f( 0.765366865f); \ + p2 = s0; \ + p3 = s4; \ + t0 = jpeg__fsh(p2+p3); \ + t1 = jpeg__fsh(p2-p3); \ + x0 = t0+t3; \ + x3 = t0-t3; \ + x1 = t1+t2; \ + x2 = t1-t2; \ + t0 = s7; \ + t1 = s5; \ + t2 = s3; \ + t3 = s1; \ + p3 = t0+t2; \ + p4 = t1+t3; \ + p1 = t0+t3; \ + p2 = t1+t2; \ + p5 = (p3+p4)*jpeg__f2f( 1.175875602f); \ + t0 = t0*jpeg__f2f( 0.298631336f); \ + t1 = t1*jpeg__f2f( 2.053119869f); \ + t2 = t2*jpeg__f2f( 3.072711026f); \ + t3 = t3*jpeg__f2f( 1.501321110f); \ + p1 = p5 + p1*jpeg__f2f(-0.899976223f); \ + p2 = p5 + p2*jpeg__f2f(-2.562915447f); \ + p3 = p3*jpeg__f2f(-1.961570560f); \ + p4 = p4*jpeg__f2f(-0.390180644f); \ + t3 += p1+p4; \ + t2 += p2+p3; \ + t1 += p2+p4; \ + t0 += p1+p3; + + static void jpeg__idct_block(jpeg_uc* out, int out_stride, short data[64]) + { + int i, val[64], * v = val; + jpeg_uc* o; + short* d = data; + + // columns + for (i = 0; i < 8; ++i, ++d, ++v) { + // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing + if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 + && d[40] == 0 && d[48] == 0 && d[56] == 0) { + // no shortcut 0 seconds + // (1|2|3|4|5|6|7)==0 0 seconds + // all separate -0.047 seconds + // 1 && 2|3 && 4|5 && 6|7: -0.047 seconds + int dcterm = d[0] * 4; + v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm; + } + else { + JPEG__IDCT_1D(d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56]) + // constants scaled things up by 1<<12; let's bring them back + // down, but keep 2 extra bits of precision + x0 += 512; x1 += 512; x2 += 512; x3 += 512; + v[0] = (x0 + t3) >> 10; + v[56] = (x0 - t3) >> 10; + v[8] = (x1 + t2) >> 10; + v[48] = (x1 - t2) >> 10; + v[16] = (x2 + t1) >> 10; + v[40] = (x2 - t1) >> 10; + v[24] = (x3 + t0) >> 10; + v[32] = (x3 - t0) >> 10; + } + } + 
+ for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) { + // no fast case since the first 1D IDCT spread components out + JPEG__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]) + // constants scaled things up by 1<<12, plus we had 1<<2 from first + // loop, plus horizontal and vertical each scale by sqrt(8) so together + // we've got an extra 1<<3, so 1<<17 total we need to remove. + // so we want to round that, which means adding 0.5 * 1<<17, + // aka 65536. Also, we'll end up with -128 to 127 that we want + // to encode as 0..255 by adding 128, so we'll add that before the shift + x0 += 65536 + (128 << 17); + x1 += 65536 + (128 << 17); + x2 += 65536 + (128 << 17); + x3 += 65536 + (128 << 17); + // tried computing the shifts into temps, or'ing the temps to see + // if any were out of range, but that was slower + o[0] = jpeg__clamp((x0 + t3) >> 17); + o[7] = jpeg__clamp((x0 - t3) >> 17); + o[1] = jpeg__clamp((x1 + t2) >> 17); + o[6] = jpeg__clamp((x1 - t2) >> 17); + o[2] = jpeg__clamp((x2 + t1) >> 17); + o[5] = jpeg__clamp((x2 - t1) >> 17); + o[3] = jpeg__clamp((x3 + t0) >> 17); + o[4] = jpeg__clamp((x3 - t0) >> 17); + } + } + +#ifdef JPEG_SSE2 + // sse2 integer IDCT. not the fastest possible implementation but it + // produces bit-identical results to the generic C version so it's + // fully "transparent". + static void jpeg__idct_simd(jpeg_uc* out, int out_stride, short data[64]) + { + // This is constructed to match our regular (generic) integer IDCT exactly. 
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i tmp; + + // dot product constant: even elems=x, odd elems=y +#define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y)) + +// out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit) +// out(1) = c1[even]*x + c1[odd]*y +#define dct_rot(out0,out1, x,y,c0,c1) \ + __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \ + __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \ + __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \ + __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \ + __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \ + __m128i out1##_h = _mm_madd_epi16(c0##hi, c1) + + // out = in << 12 (in 16-bit, out 32-bit) +#define dct_widen(out, in) \ + __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \ + __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4) + + // wide add +#define dct_wadd(out, a, b) \ + __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_add_epi32(a##_h, b##_h) + + // wide sub +#define dct_wsub(out, a, b) \ + __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_sub_epi32(a##_h, b##_h) + + // butterfly a/b, add bias, then shift by "s" and pack +#define dct_bfly32o(out0, out1, a,b,bias,s) \ + { \ + __m128i abiased_l = _mm_add_epi32(a##_l, bias); \ + __m128i abiased_h = _mm_add_epi32(a##_h, bias); \ + dct_wadd(sum, abiased, b); \ + dct_wsub(dif, abiased, b); \ + out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \ + out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \ + } + + // 8-bit interleave step (for transposes) +#define dct_interleave8(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi8(a, b); \ + b = _mm_unpackhi_epi8(tmp, b) + + // 16-bit interleave step (for transposes) +#define dct_interleave16(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi16(a, b); \ + b = _mm_unpackhi_epi16(tmp, b) + +#define dct_pass(bias,shift) \ + { \ + /* even 
part */ \ + dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \ + __m128i sum04 = _mm_add_epi16(row0, row4); \ + __m128i dif04 = _mm_sub_epi16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \ + dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \ + __m128i sum17 = _mm_add_epi16(row1, row7); \ + __m128i sum35 = _mm_add_epi16(row3, row5); \ + dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \ + dct_wadd(x4, y0o, y4o); \ + dct_wadd(x5, y1o, y5o); \ + dct_wadd(x6, y2o, y5o); \ + dct_wadd(x7, y3o, y4o); \ + dct_bfly32o(row0,row7, x0,x7,bias,shift); \ + dct_bfly32o(row1,row6, x1,x6,bias,shift); \ + dct_bfly32o(row2,row5, x2,x5,bias,shift); \ + dct_bfly32o(row3,row4, x3,x4,bias,shift); \ + } + + __m128i rot0_0 = dct_const(jpeg__f2f(0.5411961f), jpeg__f2f(0.5411961f) + jpeg__f2f(-1.847759065f)); + __m128i rot0_1 = dct_const(jpeg__f2f(0.5411961f) + jpeg__f2f(0.765366865f), jpeg__f2f(0.5411961f)); + __m128i rot1_0 = dct_const(jpeg__f2f(1.175875602f) + jpeg__f2f(-0.899976223f), jpeg__f2f(1.175875602f)); + __m128i rot1_1 = dct_const(jpeg__f2f(1.175875602f), jpeg__f2f(1.175875602f) + jpeg__f2f(-2.562915447f)); + __m128i rot2_0 = dct_const(jpeg__f2f(-1.961570560f) + jpeg__f2f(0.298631336f), jpeg__f2f(-1.961570560f)); + __m128i rot2_1 = dct_const(jpeg__f2f(-1.961570560f), jpeg__f2f(-1.961570560f) + jpeg__f2f(3.072711026f)); + __m128i rot3_0 = dct_const(jpeg__f2f(-0.390180644f) + jpeg__f2f(2.053119869f), jpeg__f2f(-0.390180644f)); + __m128i rot3_1 = dct_const(jpeg__f2f(-0.390180644f), jpeg__f2f(-0.390180644f) + jpeg__f2f(1.501321110f)); + + // rounding biases in column/row passes, see jpeg__idct_block for explanation. 
+ __m128i bias_0 = _mm_set1_epi32(512); + __m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17)); + + // load + row0 = _mm_load_si128((const __m128i*) (data + 0 * 8)); + row1 = _mm_load_si128((const __m128i*) (data + 1 * 8)); + row2 = _mm_load_si128((const __m128i*) (data + 2 * 8)); + row3 = _mm_load_si128((const __m128i*) (data + 3 * 8)); + row4 = _mm_load_si128((const __m128i*) (data + 4 * 8)); + row5 = _mm_load_si128((const __m128i*) (data + 5 * 8)); + row6 = _mm_load_si128((const __m128i*) (data + 6 * 8)); + row7 = _mm_load_si128((const __m128i*) (data + 7 * 8)); + + // column pass + dct_pass(bias_0, 10); + + { + // 16bit 8x8 transpose pass 1 + dct_interleave16(row0, row4); + dct_interleave16(row1, row5); + dct_interleave16(row2, row6); + dct_interleave16(row3, row7); + + // transpose pass 2 + dct_interleave16(row0, row2); + dct_interleave16(row1, row3); + dct_interleave16(row4, row6); + dct_interleave16(row5, row7); + + // transpose pass 3 + dct_interleave16(row0, row1); + dct_interleave16(row2, row3); + dct_interleave16(row4, row5); + dct_interleave16(row6, row7); + } + + // row pass + dct_pass(bias_1, 17); + + { + // pack + __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 + __m128i p1 = _mm_packus_epi16(row2, row3); + __m128i p2 = _mm_packus_epi16(row4, row5); + __m128i p3 = _mm_packus_epi16(row6, row7); + + // 8bit 8x8 transpose pass 1 + dct_interleave8(p0, p2); // a0e0a1e1... + dct_interleave8(p1, p3); // c0g0c1g1... + + // transpose pass 2 + dct_interleave8(p0, p1); // a0c0e0g0... + dct_interleave8(p2, p3); // b0d0f0h0... + + // transpose pass 3 + dct_interleave8(p0, p2); // a0b0c0d0... + dct_interleave8(p1, p3); // a4b4c4d4... 
+ + // store + _mm_storel_epi64((__m128i*) out, p0); out += out_stride; + _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i*) out, p2); out += out_stride; + _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i*) out, p1); out += out_stride; + _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i*) out, p3); out += out_stride; + _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p3, 0x4e)); + } + +#undef dct_const +#undef dct_rot +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_interleave8 +#undef dct_interleave16 +#undef dct_pass + } + +#endif // JPEG_SSE2 + +#ifdef JPEG_NEON + + // NEON integer IDCT. should produce bit-identical + // results to the generic C version. + static void jpeg__idct_simd(jpeg_uc* out, int out_stride, short data[64]) + { + int16x8_t row0, row1, row2, row3, row4, row5, row6, row7; + + int16x4_t rot0_0 = vdup_n_s16(jpeg__f2f(0.5411961f)); + int16x4_t rot0_1 = vdup_n_s16(jpeg__f2f(-1.847759065f)); + int16x4_t rot0_2 = vdup_n_s16(jpeg__f2f(0.765366865f)); + int16x4_t rot1_0 = vdup_n_s16(jpeg__f2f(1.175875602f)); + int16x4_t rot1_1 = vdup_n_s16(jpeg__f2f(-0.899976223f)); + int16x4_t rot1_2 = vdup_n_s16(jpeg__f2f(-2.562915447f)); + int16x4_t rot2_0 = vdup_n_s16(jpeg__f2f(-1.961570560f)); + int16x4_t rot2_1 = vdup_n_s16(jpeg__f2f(-0.390180644f)); + int16x4_t rot3_0 = vdup_n_s16(jpeg__f2f(0.298631336f)); + int16x4_t rot3_1 = vdup_n_s16(jpeg__f2f(2.053119869f)); + int16x4_t rot3_2 = vdup_n_s16(jpeg__f2f(3.072711026f)); + int16x4_t rot3_3 = vdup_n_s16(jpeg__f2f(1.501321110f)); + +#define dct_long_mul(out, inq, coeff) \ + int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff) + +#define dct_long_mac(out, acc, inq, coeff) \ + int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), 
coeff); \ + int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff) + +#define dct_widen(out, inq) \ + int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \ + int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12) + + // wide add +#define dct_wadd(out, a, b) \ + int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vaddq_s32(a##_h, b##_h) + +// wide sub +#define dct_wsub(out, a, b) \ + int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vsubq_s32(a##_h, b##_h) + +// butterfly a/b, then shift using "shiftop" by "s" and pack +#define dct_bfly32o(out0,out1, a,b,shiftop,s) \ + { \ + dct_wadd(sum, a, b); \ + dct_wsub(dif, a, b); \ + out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \ + out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \ + } + +#define dct_pass(shiftop, shift) \ + { \ + /* even part */ \ + int16x8_t sum26 = vaddq_s16(row2, row6); \ + dct_long_mul(p1e, sum26, rot0_0); \ + dct_long_mac(t2e, p1e, row6, rot0_1); \ + dct_long_mac(t3e, p1e, row2, rot0_2); \ + int16x8_t sum04 = vaddq_s16(row0, row4); \ + int16x8_t dif04 = vsubq_s16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + int16x8_t sum15 = vaddq_s16(row1, row5); \ + int16x8_t sum17 = vaddq_s16(row1, row7); \ + int16x8_t sum35 = vaddq_s16(row3, row5); \ + int16x8_t sum37 = vaddq_s16(row3, row7); \ + int16x8_t sumodd = vaddq_s16(sum17, sum35); \ + dct_long_mul(p5o, sumodd, rot1_0); \ + dct_long_mac(p1o, p5o, sum17, rot1_1); \ + dct_long_mac(p2o, p5o, sum35, rot1_2); \ + dct_long_mul(p3o, sum37, rot2_0); \ + dct_long_mul(p4o, sum15, rot2_1); \ + dct_wadd(sump13o, p1o, p3o); \ + dct_wadd(sump24o, p2o, p4o); \ + dct_wadd(sump23o, p2o, p3o); \ + dct_wadd(sump14o, p1o, p4o); \ + dct_long_mac(x4, sump13o, row7, rot3_0); \ + dct_long_mac(x5, sump24o, row5, rot3_1); \ + dct_long_mac(x6, sump23o, 
row3, rot3_2); \ + dct_long_mac(x7, sump14o, row1, rot3_3); \ + dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \ + dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \ + dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \ + dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \ + } + + // load + row0 = vld1q_s16(data + 0 * 8); + row1 = vld1q_s16(data + 1 * 8); + row2 = vld1q_s16(data + 2 * 8); + row3 = vld1q_s16(data + 3 * 8); + row4 = vld1q_s16(data + 4 * 8); + row5 = vld1q_s16(data + 5 * 8); + row6 = vld1q_s16(data + 6 * 8); + row7 = vld1q_s16(data + 7 * 8); + + // add DC bias + row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0)); + + // column pass + dct_pass(vrshrn_n_s32, 10); + + // 16bit 8x8 transpose + { + // these three map to a single VTRN.16, VTRN.32, and VSWP, respectively. + // whether compilers actually get this is another story, sadly. +#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); } +#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); } + + // pass 1 + dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 + dct_trn16(row2, row3); + dct_trn16(row4, row5); + dct_trn16(row6, row7); + + // pass 2 + dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 + dct_trn32(row1, row3); + dct_trn32(row4, row6); + dct_trn32(row5, row7); + + // pass 3 + dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 + dct_trn64(row1, row5); + dct_trn64(row2, row6); + dct_trn64(row3, row7); + +#undef dct_trn16 +#undef dct_trn32 +#undef dct_trn64 + } + + // row pass + // vrshrn_n_s32 only supports shifts up to 16, we need + // 17. so do a non-rounding shift of 16 first then follow + // up with a rounding shift by 1. 
+ dct_pass(vshrn_n_s32, 16); + + { + // pack and round + uint8x8_t p0 = vqrshrun_n_s16(row0, 1); + uint8x8_t p1 = vqrshrun_n_s16(row1, 1); + uint8x8_t p2 = vqrshrun_n_s16(row2, 1); + uint8x8_t p3 = vqrshrun_n_s16(row3, 1); + uint8x8_t p4 = vqrshrun_n_s16(row4, 1); + uint8x8_t p5 = vqrshrun_n_s16(row5, 1); + uint8x8_t p6 = vqrshrun_n_s16(row6, 1); + uint8x8_t p7 = vqrshrun_n_s16(row7, 1); + + // again, these can translate into one instruction, but often don't. +#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); } +#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); } + + // sadly can't use interleaved stores here since we only write + // 8 bytes to each scan line! 
+ + // 8x8 8-bit transpose pass 1 + dct_trn8_8(p0, p1); + dct_trn8_8(p2, p3); + dct_trn8_8(p4, p5); + dct_trn8_8(p6, p7); + + // pass 2 + dct_trn8_16(p0, p2); + dct_trn8_16(p1, p3); + dct_trn8_16(p4, p6); + dct_trn8_16(p5, p7); + + // pass 3 + dct_trn8_32(p0, p4); + dct_trn8_32(p1, p5); + dct_trn8_32(p2, p6); + dct_trn8_32(p3, p7); + + // store + vst1_u8(out, p0); out += out_stride; + vst1_u8(out, p1); out += out_stride; + vst1_u8(out, p2); out += out_stride; + vst1_u8(out, p3); out += out_stride; + vst1_u8(out, p4); out += out_stride; + vst1_u8(out, p5); out += out_stride; + vst1_u8(out, p6); out += out_stride; + vst1_u8(out, p7); + +#undef dct_trn8_8 +#undef dct_trn8_16 +#undef dct_trn8_32 + } + +#undef dct_long_mul +#undef dct_long_mac +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_pass + } + +#endif // JPEG_NEON + +#define JPEG__MARKER_none 0xff + // if there's a pending marker from the entropy stream, return that + // otherwise, fetch from the stream and get a marker. if there's no + // marker, return 0xff, which is never a valid marker value + static jpeg_uc jpeg__get_marker(jpeg__jpeg* j) + { + jpeg_uc x; + if (j->marker != JPEG__MARKER_none) { x = j->marker; j->marker = JPEG__MARKER_none; return x; } + x = jpeg__get8(j->s); + if (x != 0xff) return JPEG__MARKER_none; + while (x == 0xff) + x = jpeg__get8(j->s); // consume repeated 0xff fill bytes + return x; + } + + // in each scan, we'll have scan_n components, and the order + // of the components is specified by order[] +#define JPEG__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7) + +// after a restart interval, jpeg__jpeg_reset the entropy decoder and +// the dc prediction + static void jpeg__jpeg_reset(jpeg__jpeg* j) + { + j->code_bits = 0; + j->code_buffer = 0; + j->nomore = 0; + j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0; + j->marker = JPEG__MARKER_none; + j->todo = j->restart_interval ? 
j->restart_interval : 0x7fffffff; + j->eob_run = 0; + // no more than 1<<31 MCUs if no restart_interal? that's plenty safe, + // since we don't even allow 1<<30 pixels + } + + static int jpeg__parse_entropy_coded_data(jpeg__jpeg* z) + { + jpeg__jpeg_reset(z); + if (!z->progressive) { + if (z->scan_n == 1) { + int i, j; + JPEG_SIMD_ALIGN(short, data[64]); + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x + 7) >> 3; + int h = (z->img_comp[n].y + 7) >> 3; + for (j = 0; j < h; ++j) { + for (i = 0; i < w; ++i) { + int ha = z->img_comp[n].ha; + if (!jpeg__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data); + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z); + // if it's NOT a restart, then just bail, so we get corrupt data + // rather than no data + if (!JPEG__RESTART(z->marker)) return 1; + jpeg__jpeg_reset(z); + } + } + } + return 1; + } + else { // interleaved + int i, j, k, x, y; + JPEG_SIMD_ALIGN(short, data[64]); + for (j = 0; j < z->img_mcu_y; ++j) { + for (i = 0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... 
process scan_n components in order + for (k = 0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y = 0; y < z->img_comp[n].v; ++y) { + for (x = 0; x < z->img_comp[n].h; ++x) { + int x2 = (i * z->img_comp[n].h + x) * 8; + int y2 = (j * z->img_comp[n].v + y) * 8; + int ha = z->img_comp[n].ha; + if (!jpeg__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2, z->img_comp[n].w2, data); + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z); + if (!JPEG__RESTART(z->marker)) return 1; + jpeg__jpeg_reset(z); + } + } + } + return 1; + } + } + else { + if (z->scan_n == 1) { + int i, j; + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x + 7) >> 3; + int h = (z->img_comp[n].y + 7) >> 3; + for (j = 0; j < h; ++j) { + for (i = 0; i < w; ++i) { + short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + if (z->spec_start == 0) { + if (!jpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } + else { + int ha = z->img_comp[n].ha; + if (!jpeg__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) + return 0; + } + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z); + if (!JPEG__RESTART(z->marker)) return 1; + jpeg__jpeg_reset(z); + } + } + } + return 1; + } 
+ else { // interleaved + int i, j, k, x, y; + for (j = 0; j < z->img_mcu_y; ++j) { + for (i = 0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... process scan_n components in order + for (k = 0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y = 0; y < z->img_comp[n].v; ++y) { + for (x = 0; x < z->img_comp[n].h; ++x) { + int x2 = (i * z->img_comp[n].h + x); + int y2 = (j * z->img_comp[n].v + y); + short* data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w); + if (!jpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z); + if (!JPEG__RESTART(z->marker)) return 1; + jpeg__jpeg_reset(z); + } + } + } + return 1; + } + } + } + + static void jpeg__jpeg_dequantize(short* data, jpeg__uint16* dequant) + { + int i; + for (i = 0; i < 64; ++i) + data[i] *= dequant[i]; + } + + static void jpeg__jpeg_finish(jpeg__jpeg* z) + { + if (z->progressive) { + // dequantize and idct the data + int i, j, n; + for (n = 0; n < z->s->img_n; ++n) { + int w = (z->img_comp[n].x + 7) >> 3; + int h = (z->img_comp[n].y + 7) >> 3; + for (j = 0; j < h; ++j) { + for (i = 0; i < w; ++i) { + short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + jpeg__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); + z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data); + } + } + } + } + } + + static int jpeg__process_marker(jpeg__jpeg* z, int m) + { + int L; + switch (m) { + case JPEG__MARKER_none: // no marker found + return jpeg__err("expected marker", "Corrupt JPEG"); + + case 0xDD: // DRI - specify restart interval + if (jpeg__get16be(z->s) != 4) return 
jpeg__err("bad DRI len", "Corrupt JPEG"); + z->restart_interval = jpeg__get16be(z->s); + return 1; + + case 0xDB: // DQT - define quantization table + L = jpeg__get16be(z->s) - 2; + while (L > 0) { + int q = jpeg__get8(z->s); + int p = q >> 4, sixteen = (p != 0); + int t = q & 15, i; + if (p != 0 && p != 1) return jpeg__err("bad DQT type", "Corrupt JPEG"); + if (t > 3) return jpeg__err("bad DQT table", "Corrupt JPEG"); + + for (i = 0; i < 64; ++i) + z->dequant[t][jpeg__jpeg_dezigzag[i]] = (jpeg__uint16)(sixteen ? jpeg__get16be(z->s) : jpeg__get8(z->s)); + L -= (sixteen ? 129 : 65); + } + return L == 0; + + case 0xC4: // DHT - define huffman table + L = jpeg__get16be(z->s) - 2; + while (L > 0) { + jpeg_uc* v; + int sizes[16], i, n = 0; + int q = jpeg__get8(z->s); + int tc = q >> 4; + int th = q & 15; + if (tc > 1 || th > 3) return jpeg__err("bad DHT header", "Corrupt JPEG"); + for (i = 0; i < 16; ++i) { + sizes[i] = jpeg__get8(z->s); + n += sizes[i]; + } + L -= 17; + if (tc == 0) { + if (!jpeg__build_huffman(z->huff_dc + th, sizes)) return 0; + v = z->huff_dc[th].values; + } + else { + if (!jpeg__build_huffman(z->huff_ac + th, sizes)) return 0; + v = z->huff_ac[th].values; + } + for (i = 0; i < n; ++i) + v[i] = jpeg__get8(z->s); + if (tc != 0) + jpeg__build_fast_ac(z->fast_ac[th], z->huff_ac + th); + L -= n; + } + return L == 0; + } + + // check for comment block or APP blocks + if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { + L = jpeg__get16be(z->s); + if (L < 2) { + if (m == 0xFE) + return jpeg__err("bad COM len", "Corrupt JPEG"); + else + return jpeg__err("bad APP len", "Corrupt JPEG"); + } + L -= 2; + + if (m == 0xE0 && L >= 5) { // JFIF APP0 segment + static const unsigned char tag[5] = { 'J','F','I','F','\0' }; + int ok = 1; + int i; + for (i = 0; i < 5; ++i) + if (jpeg__get8(z->s) != tag[i]) + ok = 0; + L -= 5; + if (ok) + z->jfif = 1; + } + else if (m == 0xEE && L >= 12) { // Adobe APP14 segment + static const unsigned char tag[6] = { 'A','d','o','b','e','\0' 
}; + int ok = 1; + int i; + for (i = 0; i < 6; ++i) + if (jpeg__get8(z->s) != tag[i]) + ok = 0; + L -= 6; + if (ok) { + jpeg__get8(z->s); // version + jpeg__get16be(z->s); // flags0 + jpeg__get16be(z->s); // flags1 + z->app14_color_transform = jpeg__get8(z->s); // color transform + L -= 6; + } + } + + jpeg__skip(z->s, L); + return 1; + } + + return jpeg__err("unknown marker", "Corrupt JPEG"); + } + + // after we see SOS + static int jpeg__process_scan_header(jpeg__jpeg* z) + { + int i; + int Ls = jpeg__get16be(z->s); + z->scan_n = jpeg__get8(z->s); + if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n) return jpeg__err("bad SOS component count", "Corrupt JPEG"); + if (Ls != 6 + 2 * z->scan_n) return jpeg__err("bad SOS len", "Corrupt JPEG"); + for (i = 0; i < z->scan_n; ++i) { + int id = jpeg__get8(z->s), which; + int q = jpeg__get8(z->s); + for (which = 0; which < z->s->img_n; ++which) + if (z->img_comp[which].id == id) + break; + if (which == z->s->img_n) return 0; // no match + z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3) return jpeg__err("bad DC huff", "Corrupt JPEG"); + z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3) return jpeg__err("bad AC huff", "Corrupt JPEG"); + z->order[i] = which; + } + + { + int aa; + z->spec_start = jpeg__get8(z->s); + z->spec_end = jpeg__get8(z->s); // should be 63, but might be 0 + aa = jpeg__get8(z->s); + z->succ_high = (aa >> 4); + z->succ_low = (aa & 15); + if (z->progressive) { + if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) + return jpeg__err("bad SOS", "Corrupt JPEG"); + } + else { + if (z->spec_start != 0) return jpeg__err("bad SOS", "Corrupt JPEG"); + if (z->succ_high != 0 || z->succ_low != 0) return jpeg__err("bad SOS", "Corrupt JPEG"); + z->spec_end = 63; + } + } + + return 1; + } + + static int jpeg__free_jpeg_components(jpeg__jpeg* z, int ncomp, int why) + { + int i; + for (i = 0; i < ncomp; ++i) { + if 
(z->img_comp[i].raw_data) { + JPEG_FREE(z->img_comp[i].raw_data); + z->img_comp[i].raw_data = NULL; + z->img_comp[i].data = NULL; + } + if (z->img_comp[i].raw_coeff) { + JPEG_FREE(z->img_comp[i].raw_coeff); + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].coeff = 0; + } + if (z->img_comp[i].linebuf) { + JPEG_FREE(z->img_comp[i].linebuf); + z->img_comp[i].linebuf = NULL; + } + } + return why; + } + + static int jpeg__process_frame_header(jpeg__jpeg* z, int scan) + { + jpeg__context* s = z->s; + int Lf, p, i, q, h_max = 1, v_max = 1, c; + Lf = jpeg__get16be(s); if (Lf < 11) return jpeg__err("bad SOF len", "Corrupt JPEG"); // JPEG + p = jpeg__get8(s); if (p != 8) return jpeg__err("only 8-bit", "JPEG format not supported: 8-bit only"); // JPEG baseline + s->img_y = jpeg__get16be(s); if (s->img_y == 0) return jpeg__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG + s->img_x = jpeg__get16be(s); if (s->img_x == 0) return jpeg__err("0 width", "Corrupt JPEG"); // JPEG requires + if (s->img_y > JPEG_MAX_DIMENSIONS) return jpeg__err("too large", "Very large image (corrupt?)"); + if (s->img_x > JPEG_MAX_DIMENSIONS) return jpeg__err("too large", "Very large image (corrupt?)"); + c = jpeg__get8(s); + if (c != 3 && c != 1 && c != 4) return jpeg__err("bad component count", "Corrupt JPEG"); + s->img_n = c; + for (i = 0; i < c; ++i) { + z->img_comp[i].data = NULL; + z->img_comp[i].linebuf = NULL; + } + + if (Lf != 8 + 3 * s->img_n) return jpeg__err("bad SOF len", "Corrupt JPEG"); + + z->rgb = 0; + for (i = 0; i < s->img_n; ++i) { + static const unsigned char rgb[3] = { 'R', 'G', 'B' }; + z->img_comp[i].id = jpeg__get8(s); + if (s->img_n == 3 && z->img_comp[i].id == rgb[i]) + ++z->rgb; + q = jpeg__get8(s); + z->img_comp[i].h = (q >> 4); if (!z->img_comp[i].h || z->img_comp[i].h > 4) return jpeg__err("bad H", "Corrupt JPEG"); + z->img_comp[i].v = q & 15; if (!z->img_comp[i].v || z->img_comp[i].v > 4) return 
jpeg__err("bad V", "Corrupt JPEG"); + z->img_comp[i].tq = jpeg__get8(s); if (z->img_comp[i].tq > 3) return jpeg__err("bad TQ", "Corrupt JPEG"); + } + + if (scan != JPEG__SCAN_load) return 1; + + if (!jpeg__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return jpeg__err("too large", "Image too large to decode"); + + for (i = 0; i < s->img_n; ++i) { + if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h; + if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v; + } + + // compute interleaved mcu info + z->img_h_max = h_max; + z->img_v_max = v_max; + z->img_mcu_w = h_max * 8; + z->img_mcu_h = v_max * 8; + // these sizes can't be more than 17 bits + z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w; + z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h; + + for (i = 0; i < s->img_n; ++i) { + // number of effective pixels (e.g. for non-interleaved MCU) + z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max; + z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max - 1) / v_max; + // to simplify generation, we'll allocate enough memory to decode + // the bogus oversized data from using interleaved MCUs and their + // big blocks (e.g. 
a 16x16 iMCU on an image of width 33); we won't + // discard the extra data until colorspace conversion + // + // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier) + // so these muls can't overflow with 32-bit ints (which we require) + z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8; + z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8; + z->img_comp[i].coeff = 0; + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].linebuf = NULL; + z->img_comp[i].raw_data = jpeg__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); + if (z->img_comp[i].raw_data == NULL) + return jpeg__free_jpeg_components(z, i + 1, jpeg__err("outofmem", "Out of memory")); + // align blocks for idct using mmx/sse + z->img_comp[i].data = (jpeg_uc*)(((size_t)z->img_comp[i].raw_data + 15) & ~15); + if (z->progressive) { + // w2, h2 are multiples of 8 (see above) + z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8; + z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8; + z->img_comp[i].raw_coeff = jpeg__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15); + if (z->img_comp[i].raw_coeff == NULL) + return jpeg__free_jpeg_components(z, i + 1, jpeg__err("outofmem", "Out of memory")); + z->img_comp[i].coeff = (short*)(((size_t)z->img_comp[i].raw_coeff + 15) & ~15); + } + } + + return 1; + } + + // use comparisons since in some cases we handle more than one case (e.g. 
SOF) +#define jpeg__DNL(x) ((x) == 0xdc) +#define jpeg__SOI(x) ((x) == 0xd8) +#define jpeg__EOI(x) ((x) == 0xd9) +#define jpeg__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) +#define jpeg__SOS(x) ((x) == 0xda) + +#define jpeg__SOF_progressive(x) ((x) == 0xc2) + + static int jpeg__decode_jpeg_header(jpeg__jpeg* z, int scan) + { + int m; + z->jfif = 0; + z->app14_color_transform = -1; // valid values are 0,1,2 + z->marker = JPEG__MARKER_none; // initialize cached marker to empty + m = jpeg__get_marker(z); + if (!jpeg__SOI(m)) return jpeg__err("no SOI", "Corrupt JPEG"); + if (scan == JPEG__SCAN_type) return 1; + m = jpeg__get_marker(z); + while (!jpeg__SOF(m)) { + if (!jpeg__process_marker(z, m)) return 0; + m = jpeg__get_marker(z); + while (m == JPEG__MARKER_none) { + // some files have extra padding after their blocks, so ok, we'll scan + if (jpeg__at_eof(z->s)) return jpeg__err("no SOF", "Corrupt JPEG"); + m = jpeg__get_marker(z); + } + } + z->progressive = jpeg__SOF_progressive(m); + if (!jpeg__process_frame_header(z, scan)) return 0; + return 1; + } + + // decode image to YCbCr format + static int jpeg__decode_jpeg_image(jpeg__jpeg* j) + { + int m; + for (m = 0; m < 4; m++) { + j->img_comp[m].raw_data = NULL; + j->img_comp[m].raw_coeff = NULL; + } + j->restart_interval = 0; + if (!jpeg__decode_jpeg_header(j, JPEG__SCAN_load)) return 0; + m = jpeg__get_marker(j); + while (!jpeg__EOI(m)) { + if (jpeg__SOS(m)) { + if (!jpeg__process_scan_header(j)) return 0; + if (!jpeg__parse_entropy_coded_data(j)) return 0; + if (j->marker == JPEG__MARKER_none) { + // handle 0s at the end of image data from IP Kamera 9060 + while (!jpeg__at_eof(j->s)) { + int x = jpeg__get8(j->s); + if (x == 255) { + j->marker = jpeg__get8(j->s); + break; + } + } + // if we reach eof without hitting a marker, jpeg__get_marker() below will fail and we'll eventually return 0 + } + } + else if (jpeg__DNL(m)) { + int Ld = jpeg__get16be(j->s); + jpeg__uint32 NL = jpeg__get16be(j->s); + if (Ld != 
4) return jpeg__err("bad DNL len", "Corrupt JPEG"); + if (NL != j->s->img_y) return jpeg__err("bad DNL height", "Corrupt JPEG"); + } + else { + if (!jpeg__process_marker(j, m)) return 0; + } + m = jpeg__get_marker(j); + } + if (j->progressive) + jpeg__jpeg_finish(j); + return 1; + } + + // static jfif-centered resampling (across block boundaries) + + typedef jpeg_uc* (*resample_row_func)(jpeg_uc* out, jpeg_uc* in0, jpeg_uc* in1, + int w, int hs); + +#define jpeg__div4(x) ((jpeg_uc) ((x) >> 2)) + + static jpeg_uc* resample_row_1(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs) + { + JPEG_NOTUSED(out); + JPEG_NOTUSED(in_far); + JPEG_NOTUSED(w); + JPEG_NOTUSED(hs); + return in_near; + } + + static jpeg_uc* jpeg__resample_row_v_2(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs) + { + // need to generate two samples vertically for every one in input + int i; + JPEG_NOTUSED(hs); + for (i = 0; i < w; ++i) + out[i] = jpeg__div4(3 * in_near[i] + in_far[i] + 2); + return out; + } + + static jpeg_uc* jpeg__resample_row_h_2(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs) + { + // need to generate two samples horizontally for every one in input + int i; + jpeg_uc* input = in_near; + + if (w == 1) { + // if only one sample, can't do any interpolation + out[0] = out[1] = input[0]; + return out; + } + + out[0] = input[0]; + out[1] = jpeg__div4(input[0] * 3 + input[1] + 2); + for (i = 1; i < w - 1; ++i) { + int n = 3 * input[i] + 2; + out[i * 2 + 0] = jpeg__div4(n + input[i - 1]); + out[i * 2 + 1] = jpeg__div4(n + input[i + 1]); + } + out[i * 2 + 0] = jpeg__div4(input[w - 2] * 3 + input[w - 1] + 2); + out[i * 2 + 1] = input[w - 1]; + + JPEG_NOTUSED(in_far); + JPEG_NOTUSED(hs); + + return out; + } + +#define jpeg__div16(x) ((jpeg_uc) ((x) >> 4)) + + static jpeg_uc* jpeg__resample_row_hv_2(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs) + { + // need to generate 2x2 samples for every one in input + int i, t0, t1; + if (w 
== 1) { + out[0] = out[1] = jpeg__div4(3 * in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3 * in_near[0] + in_far[0]; + out[0] = jpeg__div4(t1 + 2); + for (i = 1; i < w; ++i) { + t0 = t1; + t1 = 3 * in_near[i] + in_far[i]; + out[i * 2 - 1] = jpeg__div16(3 * t0 + t1 + 8); + out[i * 2] = jpeg__div16(3 * t1 + t0 + 8); + } + out[w * 2 - 1] = jpeg__div4(t1 + 2); + + JPEG_NOTUSED(hs); + + return out; + } + +#if defined(JPEG_SSE2) || defined(JPEG_NEON) + static jpeg_uc* jpeg__resample_row_hv_2_simd(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs) + { + // need to generate 2x2 samples for every one in input + int i = 0, t0, t1; + + if (w == 1) { + out[0] = out[1] = jpeg__div4(3 * in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3 * in_near[0] + in_far[0]; + // process groups of 8 pixels for as long as we can. + // note we can't handle the last pixel in a row in this loop + // because we need to handle the filter boundary conditions. + for (; i < ((w - 1) & ~7); i += 8) { +#if defined(JPEG_SSE2) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + __m128i zero = _mm_setzero_si128(); + __m128i farb = _mm_loadl_epi64((__m128i*) (in_far + i)); + __m128i nearb = _mm_loadl_epi64((__m128i*) (in_near + i)); + __m128i farw = _mm_unpacklo_epi8(farb, zero); + __m128i nearw = _mm_unpacklo_epi8(nearb, zero); + __m128i diff = _mm_sub_epi16(farw, nearw); + __m128i nears = _mm_slli_epi16(nearw, 2); + __m128i curr = _mm_add_epi16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. 
+ __m128i prv0 = _mm_slli_si128(curr, 2); + __m128i nxt0 = _mm_srli_si128(curr, 2); + __m128i prev = _mm_insert_epi16(prv0, t1, 0); + __m128i next = _mm_insert_epi16(nxt0, 3 * in_near[i + 8] + in_far[i + 8], 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. + __m128i bias = _mm_set1_epi16(8); + __m128i curs = _mm_slli_epi16(curr, 2); + __m128i prvd = _mm_sub_epi16(prev, curr); + __m128i nxtd = _mm_sub_epi16(next, curr); + __m128i curb = _mm_add_epi16(curs, bias); + __m128i even = _mm_add_epi16(prvd, curb); + __m128i odd = _mm_add_epi16(nxtd, curb); + + // interleave even and odd pixels, then undo scaling. + __m128i int0 = _mm_unpacklo_epi16(even, odd); + __m128i int1 = _mm_unpackhi_epi16(even, odd); + __m128i de0 = _mm_srli_epi16(int0, 4); + __m128i de1 = _mm_srli_epi16(int1, 4); + + // pack and write output + __m128i outv = _mm_packus_epi16(de0, de1); + _mm_storeu_si128((__m128i*) (out + i * 2), outv); +#elif defined(JPEG_NEON) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + uint8x8_t farb = vld1_u8(in_far + i); + uint8x8_t nearb = vld1_u8(in_near + i); + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); + int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); + int16x8_t curr = vaddq_s16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. 
+ int16x8_t prv0 = vextq_s16(curr, curr, 7); + int16x8_t nxt0 = vextq_s16(curr, curr, 1); + int16x8_t prev = vsetq_lane_s16(t1, prv0, 0); + int16x8_t next = vsetq_lane_s16(3 * in_near[i + 8] + in_far[i + 8], nxt0, 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. + int16x8_t curs = vshlq_n_s16(curr, 2); + int16x8_t prvd = vsubq_s16(prev, curr); + int16x8_t nxtd = vsubq_s16(next, curr); + int16x8_t even = vaddq_s16(curs, prvd); + int16x8_t odd = vaddq_s16(curs, nxtd); + + // undo scaling and round, then store with even/odd phases interleaved + uint8x8x2_t o; + o.val[0] = vqrshrun_n_s16(even, 4); + o.val[1] = vqrshrun_n_s16(odd, 4); + vst2_u8(out + i * 2, o); +#endif + + // "previous" value for next iter + t1 = 3 * in_near[i + 7] + in_far[i + 7]; + } + + t0 = t1; + t1 = 3 * in_near[i] + in_far[i]; + out[i * 2] = jpeg__div16(3 * t1 + t0 + 8); + + for (++i; i < w; ++i) { + t0 = t1; + t1 = 3 * in_near[i] + in_far[i]; + out[i * 2 - 1] = jpeg__div16(3 * t0 + t1 + 8); + out[i * 2] = jpeg__div16(3 * t1 + t0 + 8); + } + out[w * 2 - 1] = jpeg__div4(t1 + 2); + + JPEG_NOTUSED(hs); + + return out; + } +#endif + + static jpeg_uc* jpeg__resample_row_generic(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs) + { + // resample with nearest-neighbor + int i, j; + JPEG_NOTUSED(in_far); + for (i = 0; i < w; ++i) + for (j = 0; j < hs; ++j) + out[i * hs + j] = in_near[i]; + return out; + } + + // this is a reduced-precision calculation of YCbCr-to-RGB introduced + // to make sure the code produces the same results in both SIMD and scalar +#define jpeg__float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8) + static void jpeg__YCbCr_to_RGB_row(jpeg_uc* out, const jpeg_uc* y, const jpeg_uc* pcb, const jpeg_uc* pcr, int count, int step) + { + int i; + for (i = 0; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1 << 
19); // rounding + int r, g, b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr * jpeg__float2fixed(1.40200f); + g = y_fixed + (cr * -jpeg__float2fixed(0.71414f)) + ((cb * -jpeg__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb * jpeg__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned)r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned)g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned)b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (jpeg_uc)r; + out[1] = (jpeg_uc)g; + out[2] = (jpeg_uc)b; + out[3] = 255; + out += step; + } + } + +#if defined(JPEG_SSE2) || defined(JPEG_NEON) + static void jpeg__YCbCr_to_RGB_simd(jpeg_uc* out, jpeg_uc const* y, jpeg_uc const* pcb, jpeg_uc const* pcr, int count, int step) + { + int i = 0; + +#ifdef JPEG_SSE2 + // step == 3 is pretty ugly on the final interleave, and i'm not convinced + // it's useful in practice (you wouldn't use it for textures, for example). + // so just accelerate step == 4 case. + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. 
+ __m128i signflip = _mm_set1_epi8(-0x80); + __m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f)); + __m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f)); + __m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f)); + __m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f)); + __m128i y_bias = _mm_set1_epi8((char)(unsigned char)128); + __m128i xw = _mm_set1_epi16(255); // alpha channel + + for (; i + 7 < count; i += 8) { + // load + __m128i y_bytes = _mm_loadl_epi64((__m128i*) (y + i)); + __m128i cr_bytes = _mm_loadl_epi64((__m128i*) (pcr + i)); + __m128i cb_bytes = _mm_loadl_epi64((__m128i*) (pcb + i)); + __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 + __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 + + // unpack to short (and left-shift cr, cb by 8) + __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); + __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased); + __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased); + + // color transform + __m128i yws = _mm_srli_epi16(yw, 4); + __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw); + __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw); + __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1); + __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1); + __m128i rws = _mm_add_epi16(cr0, yws); + __m128i gwt = _mm_add_epi16(cb0, yws); + __m128i bws = _mm_add_epi16(yws, cb1); + __m128i gws = _mm_add_epi16(gwt, cr1); + + // descale + __m128i rw = _mm_srai_epi16(rws, 4); + __m128i bw = _mm_srai_epi16(bws, 4); + __m128i gw = _mm_srai_epi16(gws, 4); + + // back to byte, set up for transpose + __m128i brb = _mm_packus_epi16(rw, bw); + __m128i gxb = _mm_packus_epi16(gw, xw); + + // transpose to interleave channels + __m128i t0 = _mm_unpacklo_epi8(brb, gxb); + __m128i t1 = _mm_unpackhi_epi8(brb, gxb); + __m128i o0 = _mm_unpacklo_epi16(t0, t1); + __m128i o1 = _mm_unpackhi_epi16(t0, t1); + + // store + _mm_storeu_si128((__m128i*) (out + 0), o0); + 
_mm_storeu_si128((__m128i*) (out + 16), o1); + out += 32; + } + } +#endif + +#ifdef JPEG_NEON + // in this version, step=3 support would be easy to add. but is there demand? + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. + uint8x8_t signflip = vdup_n_u8(0x80); + int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f)); + int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f)); + int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f)); + int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f)); + + for (; i + 7 < count; i += 8) { + // load + uint8x8_t y_bytes = vld1_u8(y + i); + uint8x8_t cr_bytes = vld1_u8(pcr + i); + uint8x8_t cb_bytes = vld1_u8(pcb + i); + int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip)); + int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip)); + + // expand to s16 + int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4)); + int16x8_t crw = vshll_n_s8(cr_biased, 7); + int16x8_t cbw = vshll_n_s8(cb_biased, 7); + + // color transform + int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0); + int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0); + int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1); + int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1); + int16x8_t rws = vaddq_s16(yws, cr0); + int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1); + int16x8_t bws = vaddq_s16(yws, cb1); + + // undo scaling, round, convert to byte + uint8x8x4_t o; + o.val[0] = vqrshrun_n_s16(rws, 4); + o.val[1] = vqrshrun_n_s16(gws, 4); + o.val[2] = vqrshrun_n_s16(bws, 4); + o.val[3] = vdup_n_u8(255); + + // store, interleaving r/g/b/a + vst4_u8(out, o); + out += 8 * 4; + } + } +#endif + + for (; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1 << 19); // rounding + int r, g, b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr * jpeg__float2fixed(1.40200f); + g = y_fixed + cr * -jpeg__float2fixed(0.71414f) + ((cb * 
-jpeg__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb * jpeg__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned)r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned)g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned)b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (jpeg_uc)r; + out[1] = (jpeg_uc)g; + out[2] = (jpeg_uc)b; + out[3] = 255; + out += step; + } + } +#endif + + // set up the kernels + static void jpeg__setup_jpeg(jpeg__jpeg* j) + { + j->idct_block_kernel = jpeg__idct_block; + j->YCbCr_to_RGB_kernel = jpeg__YCbCr_to_RGB_row; + j->resample_row_hv_2_kernel = jpeg__resample_row_hv_2; + +#ifdef JPEG_SSE2 + if (jpeg__sse2_available()) { + j->idct_block_kernel = jpeg__idct_simd; + j->YCbCr_to_RGB_kernel = jpeg__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = jpeg__resample_row_hv_2_simd; + } +#endif + +#ifdef JPEG_NEON + j->idct_block_kernel = jpeg__idct_simd; + j->YCbCr_to_RGB_kernel = jpeg__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = jpeg__resample_row_hv_2_simd; +#endif + } + + // clean up the temporary component buffers + static void jpeg__cleanup_jpeg(jpeg__jpeg* j) + { + jpeg__free_jpeg_components(j, j->s->img_n, 0); + } + + typedef struct + { + resample_row_func resample; + jpeg_uc* line0, * line1; + int hs, vs; // expansion factor in each axis + int w_lores; // horizontal pixels pre-expansion + int ystep; // how far through vertical expansion we are + int ypos; // which pre-expansion row we're on + } jpeg__resample; + + // fast 0..255 * 0..255 => 0..255 rounded multiplication + static jpeg_uc jpeg__blinn_8x8(jpeg_uc x, jpeg_uc y) + { + unsigned int t = x * y + 128; + return (jpeg_uc)((t + (t >> 8)) >> 8); + } + + static jpeg_uc* load_jpeg_image(jpeg__jpeg* z, int* out_x, int* out_y, int* comp, int req_comp) + { + int n, decode_n, is_rgb; + z->s->img_n = 0; // make jpeg__cleanup_jpeg safe + + // validate req_comp + if (req_comp < 0 || req_comp > 4) return jpeg__errpuc("bad req_comp", 
"Internal error"); + + // load a jpeg image from whichever source, but leave in YCbCr format + if (!jpeg__decode_jpeg_image(z)) { jpeg__cleanup_jpeg(z); return NULL; } + + // determine actual number of components to generate + n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1; + + is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif)); + + if (z->s->img_n == 3 && n < 3 && !is_rgb) + decode_n = 1; + else + decode_n = z->s->img_n; + + // resample and color-convert + { + int k; + unsigned int i, j; + jpeg_uc* output; + jpeg_uc* coutput[4] = { NULL, NULL, NULL, NULL }; + + jpeg__resample res_comp[4]; + + for (k = 0; k < decode_n; ++k) { + jpeg__resample* r = &res_comp[k]; + + // allocate line buffer big enough for upsampling off the edges + // with upsample factor of 4 + z->img_comp[k].linebuf = (jpeg_uc*)jpeg__malloc(z->s->img_x + 3); + if (!z->img_comp[k].linebuf) { jpeg__cleanup_jpeg(z); return jpeg__errpuc("outofmem", "Out of memory"); } + + r->hs = z->img_h_max / z->img_comp[k].h; + r->vs = z->img_v_max / z->img_comp[k].v; + r->ystep = r->vs >> 1; + r->w_lores = (z->s->img_x + r->hs - 1) / r->hs; + r->ypos = 0; + r->line0 = r->line1 = z->img_comp[k].data; + + if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1; + else if (r->hs == 1 && r->vs == 2) r->resample = jpeg__resample_row_v_2; + else if (r->hs == 2 && r->vs == 1) r->resample = jpeg__resample_row_h_2; + else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel; + else r->resample = jpeg__resample_row_generic; + } + + // can't error after this so, this is safe + output = (jpeg_uc*)jpeg__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); + if (!output) { jpeg__cleanup_jpeg(z); return jpeg__errpuc("outofmem", "Out of memory"); } + + // now go ahead and resample + for (j = 0; j < z->s->img_y; ++j) { + jpeg_uc* out = output + n * z->s->img_x * j; + for (k = 0; k < decode_n; ++k) { + jpeg__resample* r = &res_comp[k]; + int y_bot = r->ystep >= (r->vs >> 1); + 
coutput[k] = r->resample(z->img_comp[k].linebuf, + y_bot ? r->line1 : r->line0, + y_bot ? r->line0 : r->line1, + r->w_lores, r->hs); + if (++r->ystep >= r->vs) { + r->ystep = 0; + r->line0 = r->line1; + if (++r->ypos < z->img_comp[k].y) + r->line1 += z->img_comp[k].w2; + } + } + if (n >= 3) { + jpeg_uc* y = coutput[0]; + if (z->s->img_n == 3) { + if (is_rgb) { + for (i = 0; i < z->s->img_x; ++i) { + out[0] = y[i]; + out[1] = coutput[1][i]; + out[2] = coutput[2][i]; + out[3] = 255; + out += n; + } + } + else { + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } + else if (z->s->img_n == 4) { + if (z->app14_color_transform == 0) { // CMYK + for (i = 0; i < z->s->img_x; ++i) { + jpeg_uc m = coutput[3][i]; + out[0] = jpeg__blinn_8x8(coutput[0][i], m); + out[1] = jpeg__blinn_8x8(coutput[1][i], m); + out[2] = jpeg__blinn_8x8(coutput[2][i], m); + out[3] = 255; + out += n; + } + } + else if (z->app14_color_transform == 2) { // YCCK + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + for (i = 0; i < z->s->img_x; ++i) { + jpeg_uc m = coutput[3][i]; + out[0] = jpeg__blinn_8x8(255 - out[0], m); + out[1] = jpeg__blinn_8x8(255 - out[1], m); + out[2] = jpeg__blinn_8x8(255 - out[2], m); + out += n; + } + } + else { // YCbCr + alpha? 
Ignore the fourth channel for now + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } + else + for (i = 0; i < z->s->img_x; ++i) { + out[0] = out[1] = out[2] = y[i]; + out[3] = 255; // not used if n==3 + out += n; + } + } + else { + if (is_rgb) { + if (n == 1) + for (i = 0; i < z->s->img_x; ++i) + *out++ = jpeg__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + else { + for (i = 0; i < z->s->img_x; ++i, out += 2) { + out[0] = jpeg__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + out[1] = 255; + } + } + } + else if (z->s->img_n == 4 && z->app14_color_transform == 0) { + for (i = 0; i < z->s->img_x; ++i) { + jpeg_uc m = coutput[3][i]; + jpeg_uc r = jpeg__blinn_8x8(coutput[0][i], m); + jpeg_uc g = jpeg__blinn_8x8(coutput[1][i], m); + jpeg_uc b = jpeg__blinn_8x8(coutput[2][i], m); + out[0] = jpeg__compute_y(r, g, b); + out[1] = 255; + out += n; + } + } + else if (z->s->img_n == 4 && z->app14_color_transform == 2) { + for (i = 0; i < z->s->img_x; ++i) { + out[0] = jpeg__blinn_8x8(255 - coutput[0][i], coutput[3][i]); + out[1] = 255; + out += n; + } + } + else { + jpeg_uc* y = coutput[0]; + if (n == 1) + for (i = 0; i < z->s->img_x; ++i) out[i] = y[i]; + else + for (i = 0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; } + } + } + } + jpeg__cleanup_jpeg(z); + *out_x = z->s->img_x; + *out_y = z->s->img_y; + if (comp) *comp = z->s->img_n >= 3 ? 
3 : 1; // report original components, not output + return output; + } + } + + static void* jpeg__jpeg_load(jpeg__context* s, int* x, int* y, int* comp, int req_comp, jpeg__result_info* ri) + { + unsigned char* result; + jpeg__jpeg* j = (jpeg__jpeg*)jpeg__malloc(sizeof(jpeg__jpeg)); + JPEG_NOTUSED(ri); + j->s = s; + jpeg__setup_jpeg(j); + result = load_jpeg_image(j, x, y, comp, req_comp); + JPEG_FREE(j); + return result; + } + + static int jpeg__jpeg_test(jpeg__context* s) + { + int r; + jpeg__jpeg* j = (jpeg__jpeg*)jpeg__malloc(sizeof(jpeg__jpeg)); + j->s = s; + jpeg__setup_jpeg(j); + r = jpeg__decode_jpeg_header(j, JPEG__SCAN_type); + jpeg__rewind(s); + JPEG_FREE(j); + return r; + } + + static int jpeg__jpeg_info_raw(jpeg__jpeg* j, int* x, int* y, int* comp) + { + if (!jpeg__decode_jpeg_header(j, JPEG__SCAN_header)) { + jpeg__rewind(j->s); + return 0; + } + if (x) *x = j->s->img_x; + if (y) *y = j->s->img_y; + if (comp) *comp = j->s->img_n >= 3 ? 3 : 1; + return 1; + } + + static int jpeg__jpeg_info(jpeg__context* s, int* x, int* y, int* comp) + { + int result; + jpeg__jpeg* j = (jpeg__jpeg*)(jpeg__malloc(sizeof(jpeg__jpeg))); + j->s = s; + result = jpeg__jpeg_info_raw(j, x, y, comp); + JPEG_FREE(j); + return result; + } + + //------------------------------------------------------------------------ + + static int jpeg__stdio_read(void* user, char* data, int size) + { + InputMemoryStream* stream = (InputMemoryStream*)user; + return (int)stream->Read(size, data); + } + + static void jpeg__stdio_skip(void* user, int n) + { + InputMemoryStream* stream = (InputMemoryStream*)user; + stream->Skip(n); + } + + static int jpeg__stdio_eof(void* user) + { + InputMemoryStream* stream = (InputMemoryStream*)user; + return stream->Pos() == stream->Size() ? 
1 : 0; + } + + //--------------------------------------------------------------------- + + ImageJpegLoader::ImageJpegLoader(const ImageLoaderParam& param) + : ImageLoader(param) + { + if (_param.format == SimdPixelFormatNone) + _param.format = SimdPixelFormatRgb24; + } + + bool ImageJpegLoader::FromStream() + { + int x, y, comp; + jpeg__context s; + s.io.eof = jpeg__stdio_eof; + s.io.read = jpeg__stdio_read; + s.io.skip = jpeg__stdio_skip; + s.io_user_data = &_stream; + s.buflen = sizeof(s.buffer_start); + s.read_from_callbacks = 1; + s.callback_already_read = 0; + s.img_buffer = s.img_buffer_original = s.buffer_start; + jpeg__refill_buffer(&s); + s.img_buffer_original_end = s.img_buffer_end; + jpeg__result_info ri; + uint8_t * data = (uint8_t*)jpeg__jpeg_load(&s, &x, &y, &comp, 3, &ri); + if (data) + { + size_t stride = 3 * x; + _image.Recreate(x, y, (Image::Format)_param.format); + switch (_param.format) + { + case SimdPixelFormatGray8: + Base::RgbToGray(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgr24: + Base::BgrToRgb(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgra32: + Base::RgbToBgra(data, x, y, stride, _image.data, _image.stride, 0xFF); + break; + case SimdPixelFormatRgb24: + Base::Copy(data, stride, x, y, 3, _image.data, _image.stride); + break; + case SimdPixelFormatRgba32: + Base::BgrToBgra(data, x, y, stride, _image.data, _image.stride, 0xFF); + break; + default: + break; + } + JPEG_FREE(data); + return true; + } + return false; + } + } +} diff --git a/3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp b/3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp new file mode 100644 index 0000000000..03ae0fab6f --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp @@ -0,0 +1,1317 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdImageLoad.h" +#include "Simd/SimdImageSavePng.h" +#include "Simd/SimdArray.h" +#include "Simd/SimdCpu.h" +#include "Simd/SimdBase.h" + +namespace Simd +{ + namespace Base + { +#define PNG_MALLOC(sz) malloc(sz) +#define PNG_REALLOC(p,newsz) realloc(p,newsz) +#define PNG_FREE(p) free(p) + +#define PNG__BYTECAST(x) ((uint8_t) ((x) & 255)) // truncate int to byte without warnings + + SIMD_INLINE int PngError(const char* str, const char* stub) + { + std::cout << "PNG load error: " << str << ", " << stub << "!" << std::endl; + return 0; + } + + SIMD_INLINE uint8_t * PngErrorPtr(const char* str, const char* stub) + { + return (uint8_t*)(size_t)(PngError(str, stub) ? 
// Returns 1 when "a + b" stays within INT_MAX, 0 otherwise. A negative b is rejected.
static int png__addsizes_valid(int a, int b)
{
    // With 0 <= b <= INT_MAX the difference INT_MAX - b cannot overflow,
    // so test against it instead of forming a + b (which might).
    return (b >= 0 && a <= INT_MAX - b) ? 1 : 0;
}

// Returns 1 when "a * b" is representable as int, 0 on overflow.
// Negative factors are treated as invalid.
static int png__mul2sizes_valid(int a, int b)
{
    if (a < 0 || b < 0)
        return 0;
    // Multiplying by zero is always safe; otherwise a * b <= INT_MAX  <=>  a <= INT_MAX / b.
    return (b == 0 || a <= INT_MAX / b) ? 1 : 0;
}

// Returns 1 when "a*b + add" has no negative terms/factors and does not overflow.
static int png__mad2sizes_valid(int a, int b, int add)
{
    if (!png__mul2sizes_valid(a, b))
        return 0;
    return png__addsizes_valid(a * b, add);
}

// Returns 1 when "a*b*c + add" has no negative terms/factors and does not overflow.
static int png__mad3sizes_valid(int a, int b, int c, int add)
{
    if (!png__mul2sizes_valid(a, b))
        return 0;
    const int ab = a * b; // safe: checked just above
    if (!png__mul2sizes_valid(ab, c))
        return 0;
    return png__addsizes_valid(ab * c, add);
}

// Returns 1 when "a*b*c*d + add" has no negative terms/factors and does not overflow.
static int png__mad4sizes_valid(int a, int b, int c, int d, int add)
{
    if (!png__mul2sizes_valid(a, b))
        return 0;
    const int ab = a * b; // safe: checked just above
    if (!png__mul2sizes_valid(ab, c))
        return 0;
    const int abc = ab * c; // safe: checked just above
    if (!png__mul2sizes_valid(abc, d))
        return 0;
    return png__addsizes_valid(abc * d, add);
}
// Integer luma: r, g and b are weighted 77/150/29 (the weights sum to 256)
// and the weighted sum is divided by 256 with a right shift.
static uint8_t png__compute_y(int r, int g, int b)
{
    const int weighted = 77 * r + 150 * g + 29 * b;
    return (uint8_t)(weighted >> 8);
}
dest[2] = src[2]; } break; + default: assert(0); PNG_FREE(data); PNG_FREE(good); return PngErrorPtr("unsupported", "Unsupported format conversion"); + } +#undef PNG__CASE + } + + PNG_FREE(data); + return good; + } + + static uint16_t png__compute_y_16(int r, int g, int b) + { + return (uint16_t)(((r * 77) + (g * 150) + (29 * b)) >> 8); + } + + static uint16_t* png__convert_format16(uint16_t* data, int img_n, int req_comp, unsigned int x, unsigned int y) + { + int i, j; + uint16_t* good; + + if (req_comp == img_n) + return data; + assert(req_comp >= 1 && req_comp <= 4); + + good = (uint16_t*)png__malloc(req_comp * x * y * 2); + if (good == NULL) + { + PNG_FREE(data); + return (uint16_t*)PngErrorPtr("outofmem", "Out of memory"); + } + + for (j = 0; j < (int)y; ++j) + { + uint16_t* src = data + j * x * img_n; + uint16_t* dest = good + j * x * req_comp; + +#define PNG__COMBO(a,b) ((a)*8+(b)) +#define PNG__CASE(a,b) case PNG__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (PNG__COMBO(img_n, req_comp)) { + PNG__CASE(1, 2) { dest[0] = src[0]; dest[1] = 0xffff; } break; + PNG__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break; + PNG__CASE(1, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = 0xffff; } break; + PNG__CASE(2, 1) { dest[0] = src[0]; } break; + PNG__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break; + PNG__CASE(2, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = src[1]; } break; + PNG__CASE(3, 4) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; dest[3] = 0xffff; } break; + PNG__CASE(3, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break; + PNG__CASE(3, 2) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); dest[1] = 0xffff; } break; + PNG__CASE(4, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break; + PNG__CASE(4, 2) { dest[0] = 
png__compute_y_16(src[0], src[1], src[2]); dest[1] = src[3]; } break; + PNG__CASE(4, 3) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; } break; + default: assert(0); PNG_FREE(data); PNG_FREE(good); return (uint16_t*)PngErrorPtr("unsupported", "Unsupported format conversion"); + } +#undef PNG__CASE + } + + PNG_FREE(data); + return good; + } + + namespace Zlib + { + const size_t ZFAST_BITS = 9; + const size_t ZFAST_SIZE = 1 << ZFAST_BITS; + const size_t ZFAST_MASK = ZFAST_SIZE - 1; + + struct Zhuffman + { + uint16_t fast[ZFAST_SIZE]; + uint16_t firstCode[16]; + int maxCode[17]; + uint16_t firstSymbol[16]; + uint8_t size[288]; + uint16_t value[288]; + + bool Build(const uint8_t* sizelist, int num) + { + int i, k = 0; + int code, nextCode[16], sizes[17]; + + memset(sizes, 0, sizeof(sizes)); + memset(fast, 0, sizeof(fast)); + for (i = 0; i < num; ++i) + ++sizes[sizelist[i]]; + sizes[0] = 0; + for (i = 1; i < 16; ++i) + if (sizes[i] > (1 << i)) + return PngError("bad sizes", "Corrupt PNG"); + code = 0; + for (i = 1; i < 16; ++i) + { + nextCode[i] = code; + firstCode[i] = (uint16_t)code; + firstSymbol[i] = (uint16_t)k; + code = (code + sizes[i]); + if (sizes[i] && code - 1 >= (1 << i)) + return PngError("bad codelengths", "Corrupt PNG"); + maxCode[i] = code << (16 - i); // preshift for inner loop + code <<= 1; + k += sizes[i]; + } + maxCode[16] = 0x10000; // sentinel + for (i = 0; i < num; ++i) + { + int s = sizelist[i]; + if (s) + { + int c = nextCode[s] - firstCode[s] + firstSymbol[s]; + uint16_t fastv = (uint16_t)((s << 9) | i); + size[c] = (uint8_t)s; + value[c] = (uint16_t)i; + if (s <= (int)ZFAST_BITS) + { + int j = ZlibBitRev(nextCode[s], s); + while (j < (1 << ZFAST_BITS)) + { + fast[j] = fastv; + j += (1 << s); + } + } + ++nextCode[s]; + } + } + return 1; + } + }; + + SIMD_INLINE static int BitRev16(int n) + { + n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1); + n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2); + n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4); 
+ n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8); + return n; + } + + static int ZhuffmanDecode(InputMemoryStream& is, const Zhuffman& z) + { + int b, s; + if (is.BitCount() < 16) + { + if (is.Eof()) + return -1; + is.FillBits(); + } + b = z.fast[is.BitBuffer() & ZFAST_MASK]; + if (b) + { + s = b >> 9; + is.BitBuffer() >>= s; + is.BitCount() -= s; + return b & 511; + } + else + { + int k; + k = BitRev16(is.BitBuffer()); + for (s = ZFAST_BITS + 1; k >= z.maxCode[s]; ++s); + if (s >= 16) + return -1; + b = (k >> (16 - s)) - z.firstCode[s] + z.firstSymbol[s]; + if (b >= sizeof(z.size) || z.size[b] != s) + return -1; + is.BitBuffer() >>= s; + is.BitCount() -= s; + return z.value[b]; + } + } + + static int ParseHuffmanBlock(InputMemoryStream& is, const Zhuffman& zLength, const Zhuffman& zDistance, OutputMemoryStream& os) + { + static const int zlengthBase[31] = { 3,4,5,6,7,8,9,10,11,13, 15,17,19,23,27,31,35,43,51,59, 67,83,99,115,131,163,195,227,258,0,0 }; + static const int zlengthExtra[31] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 }; + static const int zdistBase[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, 257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0 }; + static const int zdistExtra[32] = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 }; + + uint8_t* beg = os.Data(), * dst = os.Current(), * end = beg + os.Capacity(); + for (;;) + { + ptrdiff_t z = ZhuffmanDecode(is, zLength); + if (z < 256) + { + if (z < 0) + return PngError("bad huffman code", "Corrupt PNG"); + if (dst >= end) + { + os.Reserve(end - beg + 1); + beg = os.Data(); + dst = os.Current(); + end = beg + os.Capacity(); + } + *dst++ = (uint8_t)z; + } + else + { + uint8_t* p; + ptrdiff_t len, dist; + if (z == 256) + { + os.Seek(dst - beg); + return 1; + } + z -= 257; + len = zlengthBase[z]; + if (zlengthExtra[z]) + len += is.ReadBits(zlengthExtra[z]); + z = ZhuffmanDecode(is, zDistance); + if (z < 0) + return PngError("bad huffman 
code", "Corrupt PNG"); + dist = zdistBase[z]; + if (zdistExtra[z]) + dist += is.ReadBits(zdistExtra[z]); + if (dst - beg < dist) + return PngError("bad dist", "Corrupt PNG"); + if (dst + len > end) + { + os.Reserve(end - beg + 1); + beg = os.Data(); + dst = os.Current(); + end = beg + os.Capacity(); + } + uint8_t* src = dst - dist; + if (dist == 1) + { + memset(dst, *src, len); + dst += len; + } + else if (dist < len || len < 16) + { + for (; len; len--) + *dst++ = *src++; + } + else + { + memcpy(dst, src, len); + dst += len; + } + } + } + } + + static int ComputeHuffmanCodes(InputMemoryStream& is, Zhuffman& zLength, Zhuffman& zDistance) + { + static const uint8_t length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 }; + Zhuffman z_codelength; + uint8_t lencodes[286 + 32 + 137]; + uint8_t codelength_sizes[19]; + int i, n; + + int hlit = is.ReadBits(5) + 257; + int hdist = is.ReadBits(5) + 1; + int hclen = is.ReadBits(4) + 4; + int ntot = hlit + hdist; + + memset(codelength_sizes, 0, sizeof(codelength_sizes)); + for (i = 0; i < hclen; ++i) + { + int s = is.ReadBits(3); + codelength_sizes[length_dezigzag[i]] = (uint8_t)s; + } + if (!z_codelength.Build(codelength_sizes, 19)) + return 0; + n = 0; + while (n < ntot) + { + int c = ZhuffmanDecode(is, z_codelength); + if (c < 0 || c >= 19) + return PngError("bad codelengths", "Corrupt PNG"); + if (c < 16) + lencodes[n++] = (uint8_t)c; + else + { + uint8_t fill = 0; + if (c == 16) + { + c = is.ReadBits(2) + 3; + if (n == 0) return PngError("bad codelengths", "Corrupt PNG"); + fill = lencodes[n - 1]; + } + else if (c == 17) + c = is.ReadBits(3) + 3; + else if (c == 18) + c = is.ReadBits(7) + 11; + else + return PngError("bad codelengths", "Corrupt PNG"); + if (ntot - n < c) + return PngError("bad codelengths", "Corrupt PNG"); + memset(lencodes + n, fill, c); + n += c; + } + } + if (n != ntot) + return PngError("bad codelengths", "Corrupt PNG"); + if (!zLength.Build(lencodes, hlit)) + return 0; + if 
(!zDistance.Build(lencodes + hlit, hdist)) + return 0; + return 1; + } + + static int ParseUncompressedBlock(InputMemoryStream& is, OutputMemoryStream& os) + { + is.ClearBits(); + uint16_t len, nlen; + if (!is.Read16u(len) || !is.Read16u(nlen) || nlen != (len ^ 0xffff)) + return PngError("zlib corrupt", "Corrupt PNG"); + if (!os.Write(is, len)) + return PngError("read past buffer", "Corrupt PNG"); + return 1; + } + + static int ParseHeader(InputMemoryStream& is) + { + uint8_t cmf, flg; + if (!(is.Read8u(cmf) && is.Read8u(flg))) + return PngError("bad zlib header", "Corrupt PNG"); + if ((int(cmf) * 256 + flg) % 31 != 0) + return PngError("bad zlib header", "Corrupt PNG"); + if (flg & 32) + return PngError("no preset dict", "Corrupt PNG"); + if ((cmf & 15) != 8) + return PngError("bad compression", "Corrupt PNG"); + return 1; + } + + bool Decode(InputMemoryStream& is, OutputMemoryStream& os, bool parseHeader) + { + static const uint8_t ZdefaultLength[288] = { + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8 + }; + static const uint8_t ZdefaultDistance[32] = { + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 + }; + + Zhuffman zLength, zDistance; + int final, type; + if (parseHeader) + { + if (!ParseHeader(is)) + return false; + } + do + { + final = is.ReadBits(1); + type = is.ReadBits(2); + if (type == 0) + { + if (!ParseUncompressedBlock(is, os)) + return false; + } + else if (type == 3) + return 
// Paeth predictor: forms the estimate a + b - c and returns whichever of a, b, c
// is closest to it. Ties are resolved in the order a, then b, then c.
static int png__paeth(int a, int b, int c)
{
    const int estimate = a + b - c;
    const int distA = abs(estimate - a);
    const int distB = abs(estimate - b);
    const int distC = abs(estimate - c);
    if (distA > distB || distA > distC)
        return (distB <= distC) ? b : c;
    return a;
}
2 : 1); + PngContext* s = a->s; + uint32_t i, j, stride = x * out_n * bytes; + uint32_t img_len, img_width_bytes; + int k; + int img_n = s->img_n; // copy it into a local for later + + int output_bytes = out_n * bytes; + int filter_bytes = img_n * bytes; + int width = x; + + assert(out_n == s->img_n || out_n == s->img_n + 1); + a->out = (uint8_t*)png__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into + if (!a->out) return PngError("outofmem", "Out of memory"); + + if (!png__mad3sizes_valid(img_n, x, depth, 7)) return PngError("too large", "Corrupt PNG"); + img_width_bytes = (((img_n * x * depth) + 7) >> 3); + img_len = (img_width_bytes + 1) * y; + + // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, + // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros), + // so just check for raw_len < img_len always. + if (raw_len < img_len) + return PngError("not enough pixels", "Corrupt PNG"); + + for (j = 0; j < y; ++j) + { + uint8_t* cur = a->out + stride * j; + uint8_t* prior; + int filter = *raw++; + + if (filter > 4) + return PngError("invalid filter", "Corrupt PNG"); + + if (depth < 8) + { + if (img_width_bytes > x) + return PngError("invalid width", "Corrupt PNG"); + cur += x * out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place + filter_bytes = 1; + width = img_width_bytes; + } + prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above + + // if first row, use special filter that doesn't sample previous row + if (j == 0) filter = first_row_filter[filter]; + + // handle first byte explicitly + for (k = 0; k < filter_bytes; ++k) + { + switch (filter) { + case PNG__F_none: cur[k] = raw[k]; break; + case PNG__F_sub: cur[k] = raw[k]; break; + case PNG__F_up: cur[k] = PNG__BYTECAST(raw[k] + prior[k]); break; + case PNG__F_avg: cur[k] = PNG__BYTECAST(raw[k] + (prior[k] >> 1)); break; + case PNG__F_paeth: 
cur[k] = PNG__BYTECAST(raw[k] + png__paeth(0, prior[k], 0)); break; + case PNG__F_avg_first: cur[k] = raw[k]; break; + case PNG__F_paeth_first: cur[k] = raw[k]; break; + } + } + + if (depth == 8) + { + if (img_n != out_n) + cur[img_n] = 255; // first pixel + raw += img_n; + cur += out_n; + prior += out_n; + } + else if (depth == 16) + { + if (img_n != out_n) + { + cur[filter_bytes] = 255; // first pixel top byte + cur[filter_bytes + 1] = 255; // first pixel bottom byte + } + raw += filter_bytes; + cur += output_bytes; + prior += output_bytes; + } + else + { + raw += 1; + cur += 1; + prior += 1; + } + + // this is a little gross, so that we don't switch per-pixel or per-component + if (depth < 8 || img_n == out_n) + { + int nk = (width - 1) * filter_bytes; +#define PNG__CASE(f) \ + case f: \ + for (k=0; k < nk; ++k) + switch (filter) { + // "none" filter turns into a memcpy here; make that explicit. + case PNG__F_none: memcpy(cur, raw, nk); break; + PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - filter_bytes]); } break; + PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break; + PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1)); } break; + PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes])); } break; + PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); } break; + PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], 0, 0)); } break; + } +#undef PNG__CASE + raw += nk; + } + else + { + assert(img_n + 1 == out_n); +#define PNG__CASE(f) \ + case f: \ + for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \ + for (k=0; k < filter_bytes; ++k) + switch (filter) { + PNG__CASE(PNG__F_none) { cur[k] = raw[k]; } break; + PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - 
output_bytes]); } break; + PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break; + PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - output_bytes]) >> 1)); } break; + PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes])); } break; + PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - output_bytes] >> 1)); } break; + PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], 0, 0)); } break; + } +#undef PNG__CASE + + // the loop above sets the high byte of the pixels' alpha, but for + // 16 bit png files we also need the low byte set. we'll do that here. + if (depth == 16) + { + cur = a->out + stride * j; // start at the beginning of the row again + for (i = 0; i < x; ++i, cur += output_bytes) + cur[filter_bytes + 1] = 255; + } + } + } + + // we make a separate pass to expand bits to pixels; for performance, + // this could run two scanlines behind the above code, so it won't + // intefere with filtering but will still be in the cache. + if (depth < 8) + { + for (j = 0; j < y; ++j) + { + uint8_t* cur = a->out + stride * j; + uint8_t* in = a->out + stride * j + x * out_n - img_width_bytes; + // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit + // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop + uint8_t scale = (color == 0) ? png__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range + + // note that the final byte might overshoot and write more data than desired. + // we can allocate enough data that this never writes out of memory, but it + // could also overwrite the next scanline. can it overwrite non-empty data + // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel. 
+ // so we need to explicitly clamp the final ones + + if (depth == 4) + { + for (k = x * img_n; k >= 2; k -= 2, ++in) + { + *cur++ = scale * ((*in >> 4)); + *cur++ = scale * ((*in) & 0x0f); + } + if (k > 0) + *cur++ = scale * ((*in >> 4)); + } + else if (depth == 2) + { + for (k = x * img_n; k >= 4; k -= 4, ++in) + { + *cur++ = scale * ((*in >> 6)); + *cur++ = scale * ((*in >> 4) & 0x03); + *cur++ = scale * ((*in >> 2) & 0x03); + *cur++ = scale * ((*in) & 0x03); + } + if (k > 0) + *cur++ = scale * ((*in >> 6)); + if (k > 1) + *cur++ = scale * ((*in >> 4) & 0x03); + if (k > 2) + *cur++ = scale * ((*in >> 2) & 0x03); + } + else if (depth == 1) + { + for (k = x * img_n; k >= 8; k -= 8, ++in) + { + *cur++ = scale * ((*in >> 7)); + *cur++ = scale * ((*in >> 6) & 0x01); + *cur++ = scale * ((*in >> 5) & 0x01); + *cur++ = scale * ((*in >> 4) & 0x01); + *cur++ = scale * ((*in >> 3) & 0x01); + *cur++ = scale * ((*in >> 2) & 0x01); + *cur++ = scale * ((*in >> 1) & 0x01); + *cur++ = scale * ((*in) & 0x01); + } + if (k > 0) *cur++ = scale * ((*in >> 7)); + if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01); + if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01); + if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01); + if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01); + if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01); + if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01); + } + if (img_n != out_n) + { + int q; + // insert alpha = 255 + cur = a->out + stride * j; + if (img_n == 1) + { + for (q = x - 1; q >= 0; --q) + { + cur[q * 2 + 1] = 255; + cur[q * 2 + 0] = cur[q]; + } + } + else + { + assert(img_n == 3); + for (q = x - 1; q >= 0; --q) + { + cur[q * 4 + 3] = 255; + cur[q * 4 + 2] = cur[q * 3 + 2]; + cur[q * 4 + 1] = cur[q * 3 + 1]; + cur[q * 4 + 0] = cur[q * 3 + 0]; + } + } + } + } + } + else if (depth == 16) + { + // force the image data from big-endian to platform-native. 
+ // this is done in a separate pass due to the decoding relying + // on the data being untouched, but could probably be done + // per-line during decode if care is taken. + uint8_t* cur = a->out; + uint16_t* cur16 = (uint16_t*)cur; + + for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) + *cur16 = (cur[0] << 8) | cur[1]; + } + + return 1; + } + + static int png__create_png_image(png__png* a, uint8_t* image_data, uint32_t image_data_len, int out_n, int depth, int color, int interlaced) + { + int bytes = (depth == 16 ? 2 : 1); + int out_bytes = out_n * bytes; + uint8_t* final; + int p; + if (!interlaced) + return png__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); + + // de-interlacing + final = (uint8_t*)png__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + for (p = 0; p < 7; ++p) + { + int xorig[] = { 0,4,0,2,0,1,0 }; + int yorig[] = { 0,0,4,0,2,0,1 }; + int xspc[] = { 8,8,4,4,2,2,1 }; + int yspc[] = { 8,8,8,4,4,2,2 }; + int i, j, x, y; + // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 + x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p]; + y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p]; + if (x && y) + { + uint32_t img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; + if (!png__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) + { + PNG_FREE(final); + return 0; + } + for (j = 0; j < y; ++j) + { + for (i = 0; i < x; ++i) + { + int out_y = j * yspc[p] + yorig[p]; + int out_x = i * xspc[p] + xorig[p]; + memcpy(final + out_y * a->s->img_x * out_bytes + out_x * out_bytes, + a->out + (j * x + i) * out_bytes, out_bytes); + } + } + PNG_FREE(a->out); + image_data += img_len; + image_data_len -= img_len; + } + } + a->out = final; + + return 1; + } + + static int png__compute_transparency(png__png* z, uint8_t tc[3], int out_n) + { + PngContext* s = z->s; + uint32_t i, pixel_count = s->img_x * s->img_y; + uint8_t* p = z->out; + + // compute color-based transparency, 
assuming we've + // already got 255 as the alpha value in the output + assert(out_n == 2 || out_n == 4); + + if (out_n == 2) + { + for (i = 0; i < pixel_count; ++i) + { + p[1] = (p[0] == tc[0] ? 0 : 255); + p += 2; + } + } + else + { + for (i = 0; i < pixel_count; ++i) + { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; + } + + static int png__compute_transparency16(png__png* z, uint16_t tc[3], int out_n) + { + PngContext* s = z->s; + uint32_t i, pixel_count = s->img_x * s->img_y; + uint16_t* p = (uint16_t*)z->out; + + // compute color-based transparency, assuming we've + // already got 65535 as the alpha value in the output + assert(out_n == 2 || out_n == 4); + + if (out_n == 2) + { + for (i = 0; i < pixel_count; ++i) + { + p[1] = (p[0] == tc[0] ? 0 : 65535); + p += 2; + } + } + else + { + for (i = 0; i < pixel_count; ++i) + { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; + } + + static int png__expand_png_palette(png__png* a, uint8_t* palette, int len, int pal_img_n) + { + uint32_t i, pixel_count = a->s->img_x * a->s->img_y; + uint8_t* p, * temp_out, * orig = a->out; + + p = (uint8_t*)png__malloc_mad2(pixel_count, pal_img_n, 0); + if (p == NULL) + return PngError("outofmem", "Out of memory"); + + // between here and free(out) below, exitting would leak + temp_out = p; + + if (pal_img_n == 3) + { + for (i = 0; i < pixel_count; ++i) + { + int n = orig[i] * 4; + p[0] = palette[n]; + p[1] = palette[n + 1]; + p[2] = palette[n + 2]; + p += 3; + } + } + else + { + for (i = 0; i < pixel_count; ++i) + { + int n = orig[i] * 4; + p[0] = palette[n]; + p[1] = palette[n + 1]; + p[2] = palette[n + 2]; + p[3] = palette[n + 3]; + p += 4; + } + } + PNG_FREE(a->out); + a->out = temp_out; + + return 1; + } + + //--------------------------------------------------------------------- + + ImagePngLoader::ImagePngLoader(const ImageLoaderParam& param) + : ImageLoader(param) + , 
_toAny8(NULL) + , _toBgra8(NULL) + , _toAny16(NULL) + , _toBgra16(NULL) + { + if (_param.format == SimdPixelFormatNone) + _param.format = SimdPixelFormatRgba32; + } + + void ImagePngLoader::SetConverters() + { + _bgrToBgra = Base::BgrToBgra; + } + + SIMD_INLINE constexpr uint32_t ChunkType(char a, char b, char c, char d) + { + return ((uint32_t(a) << 24) + (uint32_t(b) << 16) + (uint32_t(c) << 8) + uint32_t(d)); + } + + bool ImagePngLoader::FromStream() + { + const int req_comp = 4; + PngContext context; + png__png p; + p.s = &context; + png__png* z = &p; + + PngContext* s = z->s; + + z->out = NULL; + + if (!ParseFile()) + return false; + + s->img_x = _width; + s->img_y = _height; + z->depth = _depth; + s->img_n = _channels; + + InputMemoryStream zSrc = MergedDataStream(); + OutputMemoryStream zDst(AlignHi(size_t(_width) * _depth, 8) * _height * _channels + _height); + if(!Zlib::Decode(zSrc, zDst, !_iPhone)) + return false; + + if ((req_comp == s->img_n + 1 && req_comp != 3 && !_paletteChannels) || _hasTrans) + s->img_out_n = s->img_n + 1; + else + s->img_out_n = s->img_n; + if (!png__create_png_image(z, zDst.Data(), zDst.Size(), s->img_out_n, z->depth, _color, _interlace)) + return 0; + if (_hasTrans) + { + if (z->depth == 16) + { + if (!png__compute_transparency16(z, _tc16, s->img_out_n)) + return false; + } + else + { + if (!png__compute_transparency(z, _tc, s->img_out_n)) + return false; + } + } + if (_paletteChannels) + { + s->img_n = _paletteChannels; // record the actual colors we had + s->img_out_n = _paletteChannels; + if (req_comp >= 3) + s->img_out_n = req_comp; + if (!png__expand_png_palette(z, _palette.data, (int)_palette.size, s->img_out_n)) + return false; + } + else if (_hasTrans) + ++s->img_n; + + if (!(p.depth <= 8 || p.depth == 16)) + return false; + uint8_t* data = p.out; + p.out = NULL; + if (req_comp && req_comp != p.s->img_out_n) + { + if (p.depth <= 8) + data = png__convert_format((uint8_t*)data, p.s->img_out_n, req_comp, _width, _height); + 
else + data = (uint8_t*)png__convert_format16((uint16_t*)data, p.s->img_out_n, req_comp, _width, _height); + p.s->img_out_n = req_comp; + if (data == NULL) + return false; + } + if (p.depth == 16) // narrow 16-bit samples to 8 bits by keeping the high byte of each sample + { + size_t size = size_t(context.img_x) * context.img_y * req_comp; // widened first so the sample count is computed in size_t + const uint16_t* src = (uint16_t*)data; + uint8_t* dst = (uint8_t*)PNG_MALLOC(size); + for (size_t i = 0; dst && i < size; ++i) // dst == NULL (failed allocation, assuming PNG_MALLOC is malloc-like): skip the copy; data becomes NULL below and false is returned + dst[i] = uint8_t(src[i] >> 8); + PNG_FREE(data); + data = dst; + } + PNG_FREE(p.out); + if (data) + { + size_t stride = size_t(4) * context.img_x; // source row pitch: 4 bytes per pixel, matching req_comp + _image.Recreate(context.img_x, context.img_y, (Image::Format)_param.format); + switch (_param.format) // the Rgba32 case is a plain copy, so 'data' is treated as R,G,B,A; the Bgra* helpers are used on it to swap or drop channels + { + case SimdPixelFormatGray8: + Base::RgbaToGray(data, context.img_x, context.img_y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgr24: + Base::BgraToRgb(data, context.img_x, context.img_y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgra32: + Base::BgraToRgba(data, context.img_x, context.img_y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatRgb24: + Base::BgraToBgr(data, context.img_x, context.img_y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatRgba32: + Base::Copy(data, stride, context.img_x, context.img_y, 4, _image.data, _image.stride); + break; + default: + break; + } + PNG_FREE(data); + return true; + } + return false; + } + + bool ImagePngLoader::ParseFile() // checks the signature, then dispatches CgBI/IHDR/PLTE/tRNS/IDAT/IEND chunks; other chunks are skipped unless bit 29 of their type is clear (then parsing fails); the 4-byte CRC after each chunk is read but not verified here; succeeds if at least one IDAT was recorded + { + _first = true, _iPhone = false, _hasTrans = false; + if (!CheckHeader()) + return false; + for (bool run = true; run;) + { + Chunk chunk; + if (!ReadChunk(chunk)) + return false; + if (chunk.type == ChunkType('C', 'g', 'B', 'I')) + { + _iPhone = true; + _stream.Skip(chunk.size); + } + else if (chunk.type == ChunkType('I', 'H', 'D', 'R')) + { + if (!ReadHeader(chunk)) + return false; + SetConverters(); + } + else if (chunk.type == ChunkType('P', 'L', 'T', 'E')) + { + if (!ReadPalette(chunk)) + return false; + } + else if (chunk.type == ChunkType('t', 'R', 'N', 'S')) + { + if 
(!ReadTransparency(chunk)) + return false; + } + else if (chunk.type == ChunkType('I', 'D', 'A', 'T')) + { + if (!ReadData(chunk)) + return false; + } + else if (chunk.type == ChunkType('I', 'E', 'N', 'D')) + { + if (_first) + return false; + run = false; + } + else + { + if (_first || (chunk.type & (1 << 29)) == 0) + return false; + _stream.Skip(chunk.size); + } + uint32_t crc32; + if (!_stream.ReadBe32u(crc32)) + return false; + } + return _idats.size() != 0; + } + + bool ImagePngLoader::CheckHeader() + { + const size_t size = 8; + const uint8_t control[size] = { 137, 80, 78, 71, 13, 10, 26, 10 }; + uint8_t buffer[size]; + return _stream.Read(size, buffer) == size && memcmp(buffer, control, size) == 0; + } + + SIMD_INLINE bool ImagePngLoader::ReadChunk(Chunk& chunk) + { + if (_stream.ReadBe32u(chunk.size) && _stream.ReadBe32u(chunk.type)) + { + chunk.offs = (uint32_t)_stream.Pos(); + return true; + } + return false; + } + + bool ImagePngLoader::ReadHeader(const Chunk& chunk) + { + const int MAX_SIZE = 1 << 24; + if (!_first) + return false; + _first = false; + if (!(chunk.size == 13 && _stream.CanRead(13))) + return false; + uint8_t comp, filter; + if (!(_stream.ReadBe32u(_width) && _stream.ReadBe32u(_height) && + _stream.Read8u(_depth) && _stream.Read8u(_color) && _stream.Read8u(comp) && + _stream.Read8u(filter) && _stream.Read8u(_interlace))) + return false; + if (_width == 0 || _width > MAX_SIZE || _height == 0 || _height > MAX_SIZE) + return false; + if (_depth != 1 && _depth != 2 && _depth != 4 && _depth != 8 && _depth != 16) + return false; + if (_color > 6 || (_color == 3 && _depth == 16)) + return false; + _paletteChannels = 0; + if (_color == 3) + _paletteChannels = 3; + else if (_color & 1) + return false; + if (comp != 0 || filter != 0 || _interlace > 1) + return false; + if (!_paletteChannels) + { + _channels = (_color & 2 ? 3 : 1) + (_color & 4 ? 
1 : 0); + if ((1 << 30) / _width / _channels < _height) + return false; + } + else + { + _channels = 1; + if ((1 << 30) / _width / 4 < _height) + return false; + } + return true; + } + + bool ImagePngLoader::ReadPalette(const Chunk& chunk) // PLTE: accepts at most 256 three-byte entries and expands them through _bgrToBgra into _palette, 4 bytes per entry (last argument 0xFF, presumably the alpha fill) + { + if (_first || chunk.size > 256 * 3) + return false; + size_t length = chunk.size / 3; + if (length * 3 != chunk.size) + return false; + if (_stream.CanRead(chunk.size)) + { + _palette.Resize(length * 4); + _bgrToBgra(_stream.Current(), length, 1, length, _palette.data, _palette.size, 0xFF); + _stream.Skip(chunk.size); + return true; + } + else + return false; + } + + bool ImagePngLoader::ReadTransparency(const Chunk& chunk) // tRNS: for palette images copies one alpha byte per palette entry into _palette; otherwise reads one 16-bit transparent sample per channel into _tc16 (plus a depth-scaled 8-bit copy in _tc); rejected before IHDR or after any IDAT + { + if (_first) + return false; + if (_idats.size()) + return false; + if (_paletteChannels) + { + if (_palette.size == 0 || chunk.size > _palette.size / 4 || !_stream.CanRead(chunk.size)) // ReadPalette sizes _palette as 4 elements per entry and the loop below writes element i * 4 + 3, so at most size / 4 alpha bytes fit + return false; + _paletteChannels = 4; + for (size_t i = 0; i < chunk.size; ++i) + _palette.data[i * 4 + 3] = _stream.Current()[i]; + _stream.Skip(chunk.size); + } + else + { + if (!(_channels & 1) || chunk.size != _channels * 2) + return false; + _hasTrans = true; + for (size_t k = 0; k < _channels; ++k) + if (!_stream.ReadBe16u(_tc16[k])) + return false; + if (_depth != 16) + { + for (size_t k = 0; k < _channels; ++k) + _tc[k] = uint8_t(_tc16[k]) * png__depth_scale_table[_depth]; + } + } + return true; + } + + bool ImagePngLoader::ReadData(const Chunk& chunk) // IDAT: records the chunk in _idats and skips its payload; fails while _first is still set, for a palette image without palette data, or when the payload is not fully readable + { + if (_first) + return false; + if (_paletteChannels && !_palette.size) + return false; + if (!_stream.CanRead(chunk.size)) + return false; + _idats.push_back(chunk); + _stream.Skip(chunk.size); + return true; + } + + InputMemoryStream ImagePngLoader::MergedDataStream() // returns a stream over the IDAT payload: directly over the source buffer for a single chunk, otherwise over a concatenated copy held in _idat + { + if (_idats.size() == 1) + return InputMemoryStream((uint8_t*)_stream.Data() + _idats[0].offs, _idats[0].size); + else + { + size_t size = 0; + for (size_t i = 0; i < _idats.size(); ++i) + size += _idats[i].size; + _idat.Resize(size); + for (size_t i = 0, offset = 0; i < _idats.size(); 
++i) + { + memcpy(_idat.data + offset, _stream.Data() + _idats[i].offs, _idats[i].size); + offset += _idats[i].size; + } + return InputMemoryStream(_idat.data, _idat.size); + } + } + } +} diff --git a/3rdparty/simdlib/Simd/SimdBaseImageSave.cpp b/3rdparty/simdlib/Simd/SimdBaseImageSave.cpp new file mode 100644 index 0000000000..fb5a8eacef --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdBaseImageSave.cpp @@ -0,0 +1,340 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdCpu.h" +#include "Simd/SimdBase.h" + +#include + +#include +#include + +#if defined(_MSC_VER) +#pragma warning (push) +#pragma warning (disable: 4996) +#endif + +namespace Simd +{ + SimdBool ImageSaveToFile(const ImageSaveToMemoryPtr saver, const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char* path) + { + SimdBool result = SimdFalse; + size_t size; + uint8_t * data = saver(src, stride, width, height, format, file, quality, &size); + if (data) + { + ::FILE* file = ::fopen(path, "wb"); + if (file) + { + if (::fwrite(data, 1, size, file) == size) + result = SimdTrue; + ::fclose(file); + } + Simd::Free(data); + } + return result; + } + + //------------------------------------------------------------------------- + + namespace Base + { + ImagePxmSaver::ImagePxmSaver(const ImageSaverParam& param) + : ImageSaver(param) + , _convert(NULL) + { + _block = _param.height; + if (_param.file == SimdImageFilePgmTxt || _param.file == SimdImageFilePgmBin) + { + _size = _param.width * 1; + if (_param.format != SimdPixelFormatGray8) + { + _block = Simd::RestrictRange(Base::AlgCacheL1() / _size, 1, _param.height); + _buffer.Resize(_block * _size); + } + } + else if (_param.file == SimdImageFilePpmTxt || _param.file == SimdImageFilePpmBin) + { + _size = _param.width * 3; + if (_param.format != SimdPixelFormatRgb24) + { + _block = Simd::RestrictRange(Base::AlgCacheL1() / _size, 1, _param.height); + _buffer.Resize(_block * _size); + } + } + else + assert(0); + } + + void ImagePxmSaver::WriteHeader(size_t version) + { + std::stringstream header; + header << "P" << version << "\n" << _param.width << " " << _param.height << "\n255\n"; + _stream.Write(header.str().c_str(), header.str().size()); + } + + uint8_t g_pxmPrint[256][4]; + bool PxmPrintInit() + { + for (int i = 0; i < 256; ++i) + { + int d0 = i / 100; + int d1 = 
(i / 10) % 10; + int d2 = i % 10; + g_pxmPrint[i][0] = d0 ? '0' + d0 : ' '; + g_pxmPrint[i][1] = (d1 || d0) ? '0' + d1 : ' '; + g_pxmPrint[i][2] = '0' + d2; + g_pxmPrint[i][3] = ' '; + } + return true; + } + bool g_pxmPrintInited = PxmPrintInit(); + + //--------------------------------------------------------------------- + + ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param) + : ImagePxmSaver(param) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _convert = Base::BgrToGray; break; + case SimdPixelFormatBgra32: _convert = Base::BgraToGray; break; + case SimdPixelFormatRgb24: _convert = Base::RgbToGray; break; + case SimdPixelFormatRgba32: _convert = Base::RgbaToGray; break; + default: break; + } + } + + bool ImagePgmTxtSaver::ToStream(const uint8_t* src, size_t stride) + { + size_t grayStride = _param.format == SimdPixelFormatGray8 ? stride : _size; + _stream.Reserve(32 + _param.height * (_param.width * 4 + DivHi(_param.width, 17))); + WriteHeader(2); + for (size_t row = 0; row < _param.height;) + { + size_t block = Simd::Min(row + _block, _param.height) - row; + const uint8_t* gray = src; + if (_param.format != SimdPixelFormatGray8) + { + _convert(src, _param.width, block, stride, _buffer.data, grayStride); + gray = _buffer.data; + } + for (size_t b = 0; b < block; ++b) + { + uint8_t string[70]; + for (size_t col = 0, offset = 0; col < _param.width; ++col) + { + *(uint32_t*)(string + offset) = *(uint32_t*)g_pxmPrint[gray[col]]; + offset += 4; + if (offset >= 68 || col == _param.width - 1) + { + string[offset++] = '\n'; + _stream.Write(string, offset); + offset = 0; + } + } + gray += grayStride; + } + src += stride * block; + row += block; + } + return true; + } + + //--------------------------------------------------------------------- + + ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param) + : ImagePxmSaver(param) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _convert = Base::BgrToGray; break; + case 
SimdPixelFormatBgra32: _convert = Base::BgraToGray; break; + case SimdPixelFormatRgb24: _convert = Base::RgbToGray; break; + case SimdPixelFormatRgba32: _convert = Base::RgbaToGray; break; + default: break; + } + } + + bool ImagePgmBinSaver::ToStream(const uint8_t* src, size_t stride) + { + size_t grayStride = _param.format == SimdPixelFormatGray8 ? stride : _size; + _stream.Reserve(32 + _param.height * _size); + WriteHeader(5); + for (size_t row = 0; row < _param.height;) + { + size_t block = Simd::Min(row + _block, _param.height) - row; + const uint8_t* gray = src; + if (_param.format != SimdPixelFormatGray8) + { + _convert(src, _param.width, block, stride, _buffer.data, grayStride); + gray = _buffer.data; + } + for (size_t b = 0; b < block; ++b) + { + _stream.Write(gray, _size); + gray += grayStride; + } + src += stride * block; + row += block; + } + return true; + } + + //--------------------------------------------------------------------- + + ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param) + : ImagePxmSaver(param) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _convert = Base::GrayToBgr; break; + case SimdPixelFormatBgr24: _convert = Base::BgrToRgb; break; + case SimdPixelFormatBgra32: _convert = Base::BgraToRgb; break; + case SimdPixelFormatRgba32: _convert = Base::BgraToBgr; break; + default: break; + } + } + + bool ImagePpmTxtSaver::ToStream(const uint8_t* src, size_t stride) + { + size_t rgbStride = _param.format == SimdPixelFormatRgb24 ? 
stride : _size; + _stream.Reserve(32 + _param.height * (_param.width * 13 + DivHi(_param.width, 5))); + WriteHeader(3); + for (size_t row = 0; row < _param.height;) + { + size_t block = Simd::Min(row + _block, _param.height) - row; + const uint8_t* rgb = src; + if (_param.format != SimdPixelFormatRgb24) + { + _convert(src, _param.width, block, stride, _buffer.data, rgbStride); + rgb = _buffer.data; + } + for (size_t b = 0; b < block; ++b) + { + uint8_t string[70]; + for (size_t col = 0, offset = 0; col < _size; col += 3) + { + ((uint32_t*)(string + offset))[0] = *(uint32_t*)g_pxmPrint[rgb[col + 0]]; + ((uint32_t*)(string + offset))[1] = *(uint32_t*)g_pxmPrint[rgb[col + 1]]; + ((uint32_t*)(string + offset))[2] = *(uint32_t*)g_pxmPrint[rgb[col + 2]]; + offset += 12; + if (offset >= 68 || col == _size - 3) + { + string[offset++] = '\n'; + _stream.Write(string, offset); + offset = 0; + } + else + { + string[offset++] = ' '; + string[offset++] = ' '; + } + } + rgb += rgbStride; + } + src += stride * block; + row += block; + } + return true; + } + + //--------------------------------------------------------------------- + + ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param) + : ImagePxmSaver(param) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _convert = Base::GrayToBgr; break; + case SimdPixelFormatBgr24: _convert = Base::BgrToRgb; break; + case SimdPixelFormatBgra32: _convert = Base::BgraToRgb; break; + case SimdPixelFormatRgba32: _convert = Base::BgraToBgr; break; + default: break; + } + } + + bool ImagePpmBinSaver::ToStream(const uint8_t* src, size_t stride) + { + size_t rgbStride = _param.format == SimdPixelFormatRgb24 ? 
stride : _size; + _stream.Reserve(32 + _param.height * _size); + WriteHeader(6); + for (size_t row = 0; row < _param.height;) + { + size_t block = Simd::Min(row + _block, _param.height) - row; + const uint8_t* rgb = src; + if (_param.format != SimdPixelFormatRgb24) + { + _convert(src, _param.width, block, stride, _buffer.data, rgbStride); + rgb = _buffer.data; + } + for (size_t b = 0; b < block; ++b) + { + _stream.Write(rgb, _size); + rgb += rgbStride; + } + src += stride * block; + row += block; + } + return true; + } + + //--------------------------------------------------------------------- + + ImageSaver* CreateImageSaver(const ImageSaverParam& param) + { + switch (param.file) + { + case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param); + case SimdImageFilePgmBin: return new ImagePgmBinSaver(param); + case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param); + case SimdImageFilePpmBin: return new ImagePpmBinSaver(param); + case SimdImageFilePng: return new ImagePngSaver(param); + case SimdImageFileJpeg: return new ImageJpegSaver(param); + default: + return NULL; + } + } + + uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size) + { + ImageSaverParam param(width, height, format, file, quality); + if (param.Validate()) + { + Holder saver(CreateImageSaver(param)); + if (saver) + { + if (saver->ToStream(src, stride)) + return saver->Release(size); + } + } + return NULL; + } + } +} + +#if defined(_MSC_VER) +#pragma warning (pop) +#endif diff --git a/3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp b/3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp new file mode 100644 index 0000000000..f7ba583247 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp @@ -0,0 +1,451 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdImageSaveJpeg.h" +#include "Simd/SimdBase.h" + +namespace Simd +{ + namespace Base + { + const uint8_t JpegZigZagD[64] = { + 0, 1, 5, 6, 14, 15, 27, 28, + 2, 4, 7, 13, 16, 26, 29, 42, + 3, 8, 12, 17, 25, 30, 41, 43, + 9, 11, 18, 24, 31, 40, 44, 53, + 10, 19, 23, 32, 39, 45, 52, 54, + 20, 22, 33, 38, 46, 51, 55, 60, + 21, 34, 37, 47, 50, 56, 59, 61, + 35, 36, 48, 49, 57, 58, 62, 63 }; + + const uint8_t JpegZigZagT[64] = { + 0, 2, 3, 9, 10, 20, 21, 35, + 1, 4, 8, 11, 19, 22, 34, 36, + 5, 7, 12, 18, 23, 33, 37, 48, + 6, 13, 17, 24, 32, 38, 47, 49, + 14, 16, 25, 31, 39, 46, 50, 57, + 15, 26, 30, 40, 45, 51, 56, 58, + 27, 29, 41, 44, 52, 55, 59, 62, + 28, 42, 43, 53, 54, 60, 61, 63 }; + + const uint16_t HuffmanYdc[256][2] = { {0, 2}, {2, 3}, {3, 3}, {4, 3}, {5, 3}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9} }; + const uint16_t HuffmanUVdc[256][2] = { {0, 2}, {1, 2}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}, {1022, 10}, {2046, 11} }; + const uint16_t HuffmanYac[256][2] = { + {10, 4}, {0, 2}, {1, 2}, {4, 3}, {11, 4}, {26, 5}, {120, 7}, {248, 8}, {1014, 10}, {65410, 16}, {65411, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {12, 4}, {27, 5}, {121, 7}, {502, 9}, {2038, 11}, {65412, 16}, {65413, 16}, {65414, 16}, {65415, 16}, {65416, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {28, 5}, {249, 8}, {1015, 10}, {4084, 12}, {65417, 16}, {65418, 16}, {65419, 16}, {65420, 16}, {65421, 16}, {65422, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {58, 6}, {503, 9}, {4085, 12}, {65423, 16}, {65424, 16}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {59, 6}, {1016, 10}, {65430, 16}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {122, 7}, {2039, 11}, {65438, 16}, 
{65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {123, 7}, {4086, 12}, {65446, 16}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {250, 8}, {4087, 12}, {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {504, 9}, {32704, 15}, {65462, 16}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {505, 9}, {65470, 16}, {65471, 16}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {506, 9}, {65479, 16}, {65480, 16}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {1017, 10}, {65488, 16}, {65489, 16}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {1018, 10}, {65497, 16}, {65498, 16}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {2040, 11}, {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {2041, 11}, {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0} + }; + const uint16_t HuffmanUVac[256][2] = { + {0, 2}, {1, 2}, {4, 3}, {10, 4}, {24, 
5}, {25, 5}, {56, 6}, {120, 7}, {500, 9}, {1014, 10}, {4084, 12}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {11, 4}, {57, 6}, {246, 8}, {501, 9}, {2038, 11}, {4085, 12}, {65416, 16}, {65417, 16}, {65418, 16}, {65419, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {26, 5}, {247, 8}, {1015, 10}, {4086, 12}, {32706, 15}, {65420, 16}, {65421, 16}, {65422, 16}, {65423, 16}, {65424, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {27, 5}, {248, 8}, {1016, 10}, {4087, 12}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {65430, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {58, 6}, {502, 9}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {65438, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {59, 6}, {1017, 10}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {65446, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {121, 7}, {2039, 11}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {65454, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {122, 7}, {2040, 11}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {65462, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {249, 8}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {65470, 16}, {65471, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {503, 9}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {65479, 16}, {65480, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {504, 9}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {65488, 16}, {65489, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {505, 9}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, {65497, 16}, {65498, 16}, {0, 0}, {0, 0}, {0, 
0}, {0, 0}, {0, 0}, {0, 0}, + {506, 9}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {65506, 16}, {65507, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {2041, 11}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {65515, 16}, {65516, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {16352, 14}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {65525, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {1018, 10}, {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0} + }; + +#if defined(SIMD_JPEG_CALC_BITS_TABLE) + uint16_t JpegCalcBitsTable[JpegCalcBitsRange * 2][2]; + bool JpegCalcBitsTableInit() + { + for (int i = 0, n = JpegCalcBitsRange * 2; i < n; ++i) + { + int val = i - JpegCalcBitsRange; + int tmp = val < 0 ? -val : val; + val = val < 0 ? 
val - 1 : val; + int cnt = 1; + while (tmp >>= 1) + ++cnt; + JpegCalcBitsTable[i][0] = val & ((1 << cnt) - 1); + JpegCalcBitsTable[i][1] = cnt; + } + return true; + } + bool JpegCalcBitsTableInited = JpegCalcBitsTableInit(); +#endif + + SIMD_INLINE void JpegDct(float* d0p, float* d1p, float* d2p, float* d3p, float* d4p, float* d5p, float* d6p, float* d7p) + { + float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p; + float z1, z2, z3, z4, z5, z11, z13; + float tmp0 = d0 + d7; + float tmp7 = d0 - d7; + float tmp1 = d1 + d6; + float tmp6 = d1 - d6; + float tmp2 = d2 + d5; + float tmp5 = d2 - d5; + float tmp3 = d3 + d4; + float tmp4 = d3 - d4; + + float tmp10 = tmp0 + tmp3; + float tmp13 = tmp0 - tmp3; + float tmp11 = tmp1 + tmp2; + float tmp12 = tmp1 - tmp2; + + d0 = tmp10 + tmp11; + d4 = tmp10 - tmp11; + + z1 = (tmp12 + tmp13) * 0.707106781f; + d2 = tmp13 + z1; + d6 = tmp13 - z1; + + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + z5 = (tmp10 - tmp12) * 0.382683433f; + z2 = tmp10 * 0.541196100f + z5; + z4 = tmp12 * 1.306562965f + z5; + z3 = tmp11 * 0.707106781f; + + z11 = tmp7 + z3; + z13 = tmp7 - z3; + + *d5p = z13 + z2; + *d3p = z13 - z2; + *d1p = z11 + z4; + *d7p = z11 - z4; + + *d0p = d0; *d2p = d2; *d4p = d4; *d6p = d6; + } + + static int JpegProcessDu(Base::BitBuf& bitBuf, float* CDU, int stride, const float* fdtbl, int DC, const uint16_t HTDC[256][2], const uint16_t HTAC[256][2]) + { + int offs, i, j, n, diff, end0pos, x, y; + for (offs = 0; offs < 8; ++offs) + JpegDct(&CDU[offs], &CDU[offs + stride], &CDU[offs + stride * 2], &CDU[offs + stride * 3], &CDU[offs + stride * 4], + &CDU[offs + stride * 5], &CDU[offs + stride * 6], &CDU[offs + stride * 7]); + for (offs = 0, n = stride * 8; offs < n; offs += stride) + JpegDct(&CDU[offs], &CDU[offs + 1], &CDU[offs + 2], &CDU[offs + 3], &CDU[offs + 4], &CDU[offs + 5], &CDU[offs + 6], &CDU[offs + 7]); + int DU[64]; + for (y = 0, j = 0; y < 8; ++y) + { + for 
(x = 0; x < 8; ++x, ++j) + { + i = y * stride + x; + float v = CDU[i] * fdtbl[j]; + DU[JpegZigZagD[j]] = Round(v); + } + } + diff = DU[0] - DC; + if (diff == 0) + bitBuf.Push(HTDC[0]); + else + { + uint16_t bits[2]; + JpegCalcBits(diff, bits); + bitBuf.Push(HTDC[bits[1]]); + bitBuf.Push(bits); + } + end0pos = 63; + for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos); + if (end0pos == 0) + { + bitBuf.Push(HTAC[0x00]); + return DU[0]; + } + for (i = 1; i <= end0pos; ++i) + { + int startpos = i; + int nrzeroes; + uint16_t bits[2]; + for (; DU[i] == 0 && i <= end0pos; ++i); + nrzeroes = i - startpos; + if (nrzeroes >= 16) + { + int lng = nrzeroes >> 4; + int nrmarker; + for (nrmarker = 1; nrmarker <= lng; ++nrmarker) + bitBuf.Push(HTAC[0xF0]); + nrzeroes &= 15; + } + JpegCalcBits(DU[i], bits); + bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]); + bitBuf.Push(bits); + } + if (end0pos != 63) + bitBuf.Push(HTAC[0x00]); + return DU[0]; + } + + void JpegWriteBlockSubs(OutputMemoryStream & stream, int width, int height, const uint8_t * red, + const uint8_t* green, const uint8_t* blue, int stride, const float * fY, const float* fUv, int dc[3]) + { + int & DCY = dc[0], & DCU = dc[1], & DCV = dc[2]; + float Y[256], U[256], V[256]; + float subU[64], subV[64]; + bool gray = red == green && red == blue; + Base::BitBuf bitBuf; + for (int y = 0; y < height; y += 16) + { + for (int x = 0; x < width; x += 16) + { + if (gray) + Base::GrayToY(red + x, stride, height - y, width - x, Y, 16); + else + Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 16); + DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + if (gray) + Base::JpegProcessDuGrayUv(bitBuf); + else 
+ { + for (int yy = 0, pos = 0; yy < 8; ++yy) + { + for (int xx = 0; xx < 8; ++xx, ++pos) + { + int j = yy * 32 + xx * 2; + subU[pos] = (U[j + 0] + U[j + 1] + U[j + 16] + U[j + 17]) * 0.25f; + subV[pos] = (V[j + 0] + V[j + 1] + V[j + 16] + V[j + 17]) * 0.25f; + } + } + DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac); + DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac); + } + if (bitBuf.Full()) + { + Base::WriteBits(stream, bitBuf.data, bitBuf.size); + bitBuf.Clear(); + } + } + } + Base::WriteBits(stream, bitBuf.data, bitBuf.size); + bitBuf.Clear(); + } + + void JpegWriteBlockFull(OutputMemoryStream& stream, int width, int height, const uint8_t* red, + const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3]) /* Encodes one band of rows as 8x8 blocks with no chroma subsampling. dc[] holds the running Y, U and V DC values and is updated in place. */ + { + int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2]; + float Y[64], U[64], V[64]; + bool gray = red == green && red == blue; /* the three plane pointers alias one buffer: input is treated as gray */ + Base::BitBuf bitBuf; + for (int y = 0; y < height; y += 8) + { + for (int x = 0; x < width; x += 8) + { + if (gray) + Base::GrayToY(red + x, stride, height - y, width - x, Y, 8); /* NOTE(review): the plane pointers are offset by x only, never by y. ToStream passes at most _block rows (8 when this writer is selected), so the y loop is expected to run once - confirm before calling with taller bands. */ + else + Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 8); + DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + if (gray) + Base::JpegProcessDuGrayUv(bitBuf); + else + { + DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac); + DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac); + } + if (bitBuf.Full()) + { + Base::WriteBits(stream, bitBuf.data, bitBuf.size); + bitBuf.Clear(); + } + } + } + Base::WriteBits(stream, bitBuf.data, bitBuf.size); /* write out whatever is still buffered */ + bitBuf.Clear(); + } + + //--------------------------------------------------------------------- + + ImageJpegSaver::ImageJpegSaver(const ImageSaverParam& param) + : ImageSaver(param) + , _deintBgra(NULL) + , _deintBgr(NULL) /* NOTE(review): SimdImageSave.h declares _deintBgr before _deintBgra, so these two initializers are not listed in declaration order. */ + { + } + + void ImageJpegSaver::Init() /* Builds the tables via InitParams, then picks the deinterleave routine for the pixel format and the block writer for the subsampling mode. */ + { + 
InitParams(false); + switch (_param.format) + { + case SimdPixelFormatBgr24: + case SimdPixelFormatRgb24: + _deintBgr = Base::DeinterleaveBgr; + break; + case SimdPixelFormatBgra32: + case SimdPixelFormatRgba32: + _deintBgra = Base::DeinterleaveBgra; + break; + default: + break; + } + _writeBlock = _subSample ? JpegWriteBlockSubs : JpegWriteBlockFull; + } + + void ImageJpegSaver::InitParams(bool trans) /* Derives the byte quantization tables (_uY, _uUv), their reciprocal float tables (_fY, _fUv), the subsampling flag and the band size / plane buffer from _param. 'trans' selects JpegZigZagT instead of JpegZigZagD for the float tables. */ + { + static const int YQT[] = { 16, 11, 10, 16, 24, 40, 51, 61, 12, 12, 14, 19, 26, 58, 60, 55, 14, 13, + 16, 24, 40, 57, 69, 56, 14, 17, 22, 29, 51, 87, 80, 62, 18, 22, 37, 56, 68, 109, 103, 77, 24, + 35, 55, 64, 81, 104, 113, 92, 49, 64, 78, 87, 103, 121, 120, 101, 72, 92, 95, 98, 112, 100, 103, 99 }; + static const int UVQT[] = { 17, 18, 24, 47, 99, 99, 99, 99, 18, 21, 26, 66, 99, 99, 99, 99, 24, + 26, 56, 99, 99, 99, 99, 99, 47, 66, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99 }; + static const float AASF[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, + 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f, 1.0f * 2.828427125f, + 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f }; + _quality = _param.quality; + _quality = _quality ? _quality : 90; /* quality 0 selects the default, 90 */ + _subSample = _quality <= 90 ? 1 : 0; /* chroma is subsampled for quality <= 90 */ + _quality = _quality < 1 ? 1 : _quality > 100 ? 100 : _quality; + _quality = _quality < 50 ? 5000 / _quality : 200 - _quality * 2; /* from here on _quality is a percentage scale applied to YQT / UVQT */ + for (size_t i = 0; i < 64; ++i) + { + int uvti, yti = (YQT[i] * _quality + 50) / 100; + _uY[Base::JpegZigZagD[i]] = uint8_t(yti < 1 ? 1 : yti > 255 ? 255 : yti); + uvti = (UVQT[i] * _quality + 50) / 100; + _uUv[Base::JpegZigZagD[i]] = uint8_t(uvti < 1 ? 1 : uvti > 255 ? 255 : uvti); + } + const uint8_t *ZigZag = trans ? 
Base::JpegZigZagT : Base::JpegZigZagD; + for (size_t y = 0, i = 0; y < 8; ++y) + { + for (size_t x = 0; x < 8; ++x, ++i) + { + _fY[i] = 1.0f / (_uY[ZigZag[i]] * AASF[y] * AASF[x]); + _fUv[i] = 1.0f / (_uUv[ZigZag[i]] * AASF[y] * AASF[x]); + } + } + _block = _subSample ? 16 : 8; /* rows handled per band */ + _width = (int)AlignHi(_param.width, _block); /* plane row pitch: width rounded up to a multiple of _block */ + if (_param.format != SimdPixelFormatGray8) + _buffer.Resize(_width * _block * 3); /* three planes of _block rows each; gray input is read from the source directly */ + } + + void ImageJpegSaver::WriteHeader() /* Writes everything that precedes the entropy-coded data: fixed leading bytes, both quantization tables, the frame header carrying the image size, four Huffman tables and the scan header. */ + { + static const uint8_t DC_LUM_COD[] = { 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 }; + static const uint8_t DC_LUM_VAL[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; + static const uint8_t AC_LUM_COD[] = { 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d }; + static const uint8_t AC_LUM_VAL[] = { + 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08, + 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0, 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28, + 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, + 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, + 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2, + 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa + }; + static const uint8_t DC_CHR_COD[] = { 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 }; + static const uint8_t DC_CHR_VAL[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; + static const uint8_t AC_CHR_COD[] = { 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 
5, 4, 4, 0, 1, 2, 0x77 }; + static const uint8_t AC_CHR_VAL[] = { + 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71, 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, + 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0, 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34, 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26, + 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, + 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, + 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, + 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa + }; + static const uint8_t head0[] = { 0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F', 'I', 'F', 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0xFF, 0xDB, 0, 0x84, 0 }; + static const uint8_t head2[] = { 0xFF, 0xDA, 0, 0xC, 3, 1, 0, 2, 0x11, 3, 0x11, 0, 0x3F, 0 }; + const uint8_t head1[] = { 0xFF, 0xC0, 0, 0x11, 8, uint8_t(_param.height >> 8), uint8_t(_param.height), uint8_t(_param.width >> 8), + uint8_t(_param.width), 3, 1, uint8_t(_subSample ? 
0x22 : 0x11), 0, 2, 0x11, 1, 3, 0x11, 1, 0xFF, 0xC4, 0x01, 0xA2, 0 }; /* NOTE(review): height and width are written as two bytes each, so values above 65535 do not fit in this header. */ + _stream.Write(head0, sizeof(head0)); + _stream.Write(_uY, 64); + _stream.Write8u(1); + _stream.Write(_uUv, 64); + _stream.Write(head1, sizeof(head1)); + _stream.Write(DC_LUM_COD + 1, sizeof(DC_LUM_COD) - 1); /* the leading element of each *_COD array is skipped */ + _stream.Write(DC_LUM_VAL, sizeof(DC_LUM_VAL)); + _stream.Write8u(0x10); // HTYACinfo + _stream.Write(AC_LUM_COD + 1, sizeof(AC_LUM_COD) - 1); + _stream.Write(AC_LUM_VAL, sizeof(AC_LUM_VAL)); + _stream.Write8u(1); // HTUDCinfo + _stream.Write(DC_CHR_COD + 1, sizeof(DC_CHR_COD) - 1); + _stream.Write(DC_CHR_VAL, sizeof(DC_CHR_VAL)); + _stream.Write8u(0x11); // HTUACinfo + _stream.Write(AC_CHR_COD + 1, sizeof(AC_CHR_COD) - 1); + _stream.Write(AC_CHR_VAL, sizeof(AC_CHR_VAL)); + _stream.Write(head2, sizeof(head2)); + } + + bool ImageJpegSaver::ToStream(const uint8_t* src, size_t stride) /* Encodes the whole image into _stream: header, then one band of up to _block rows at a time, then the closing marker bytes. Always returns true. */ + { + Init(); + WriteHeader(); + uint8_t* r = _buffer.data, * g = r + _width * _block,* b = g + _width * _block; /* three planes laid out back to back in _buffer */ + int dc[3] = { 0, 0, 0 }; + for (int row = 0; row < (int)_param.height; row += _block) + { + int block = Simd::Min(row + _block, (int)_param.height) - row; /* rows in this band; the last band may be shorter */ + switch (_param.format) + { + case SimdPixelFormatBgr24: + _deintBgr(src, stride, _param.width, block, b, _width, g, _width, r, _width); + break; + case SimdPixelFormatBgra32: + _deintBgra(src, stride, _param.width, block, b, _width, g, _width, r, _width, NULL, 0); + break; + case SimdPixelFormatRgb24: + _deintBgr(src, stride, _param.width, block, r, _width, g, _width, b, _width); /* same routine as for BGR, with the destination planes swapped */ + break; + case SimdPixelFormatRgba32: + _deintBgra(src, stride, _param.width, block, r, _width, g, _width, b, _width, NULL, 0); + break; + default: + break; + } + if(_param.format == SimdPixelFormatGray8) + _writeBlock(_stream, (int)_param.width, block, src, src, src, (int)stride, _fY, _fUv, dc); /* gray input: the source rows serve as all three planes */ + else + _writeBlock(_stream, (int)_param.width, block, r, g, b, _width, _fY, _fUv, dc); + src += block * stride; + } + static const uint16_t FILL_BITS[] = { 0x7F, 7 }; + 
Base::WriteBits(_stream, FILL_BITS); /* presumably pads the last partial byte with set bits - confirm against Base::WriteBits */ + _stream.Write8u(0xFF); + _stream.Write8u(0xD9); /* 0xFF 0xD9 closes the file */ + return true; + } + } +} diff --git a/3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp b/3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp new file mode 100644 index 0000000000..dcb8f2efbb --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp @@ -0,0 +1,379 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdImageSavePng.h" +#include "Simd/SimdBase.h" +#include "Simd/SimdCpu.h" + +namespace Simd +{ + namespace Base + { + const uint16_t ZlibLenC[30] = { 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 259 }; + const uint8_t ZlibLenEb[29] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0 }; + const uint16_t ZlibDistC[31] = { 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 32768 }; + const uint8_t ZlibDistEb[30] = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13 }; + +#if defined(SIMD_PNG_ZLIB_BIT_REV_TABLE) + int ZlibBitRevTable[512]; + static bool ZlibBitRevTableInit() /* Fills ZlibBitRevTable[i] with the 9-bit reversal of i. */ + { + for (int i = 0; i < 512; i++) + { + int rev = 0, val = i; + for (size_t b = 0; b < 9; b++) + { + rev = (rev << 1) | (val & 1); + val >>= 1; + } + ZlibBitRevTable[i] = rev; + } + return true; + } + bool ZlibBitRevTableInited = ZlibBitRevTableInit(); + +#endif + + uint32_t ZlibAdler32(uint8_t* data, int size) /* Adler-32 checksum of data[0..size): two running sums reduced modulo 65521 after every chunk of at most 5552 bytes; returns (hi << 16) | lo. */ + { + uint32_t lo = 1, hi = 0; + for (int b = 0, n = (int)(size % 5552); b < size;) /* the first chunk takes the remainder so that all later chunks are exactly 5552 bytes */ + { + for (int i = 0; i < n; ++i) + { + lo += data[b + i]; + hi += lo; + } + lo %= 65521; + hi %= 65521; + b += n; + n = 5552; + } + return (hi << 16) | lo; + } + + void ZlibCompress(uint8_t* data, int size, int quality, OutputMemoryStream& stream) /* Writes a zlib stream for data[0..size) into 'stream': two header bytes, a single final block coded through ZlibHuff / ZlibHuffB with a hashed match search, then the big-endian Adler-32 of the input. 'quality' (raised to at least 5) sets the bucket depth: each hash bucket keeps 2 * quality candidate positions. */ + { + const int ZHASH = 16384; + if (quality < 5) + quality = 5; + const int basket = quality * 2; + Array32i hashTable(ZHASH * basket); + memset(hashTable.data, -1, hashTable.RawSize()); /* -1 marks an empty slot */ + + stream.Write(uint8_t(0x78)); + stream.Write(uint8_t(0x5e)); + stream.WriteBits(1, 1); /* final-block flag */ + stream.WriteBits(1, 2); /* block type 1: fixed Huffman codes */ + + int i = 0, j; + while (i < size - 3) + { + int h = ZlibHash(data + i) & (ZHASH - 1), best = 3; + uint8_t* bestLoc = 0; + int* hList = 
hashTable.data + h * basket; + for (j = 0; j < basket && hList[j] != -1; ++j) /* index is tested first: hList[basket] lies outside this bucket (and outside the table for the last bucket) */ + { + if (hList[j] > i - 32768) + { + int d = ZlibCount(data + hList[j], data + i, size - i); + if (d >= best) + { + best = d; + bestLoc = data + hList[j]; + } + } + } + if (j == basket) /* bucket is full: keep its second half (the most recently added positions) and clear the rest */ + { + memcpy(hList, hList + quality, quality * sizeof(int)); + memset(hList + quality, -1, quality * sizeof(int)); + j = quality; + } + hList[j] = i; + + if (bestLoc) /* lazy matching: give up this match if position i + 1 offers a longer one */ + { + h = ZlibHash(data + i + 1) & (ZHASH - 1); + int* hList = hashTable.data + h * basket; + for (j = 0; j < basket && hList[j] != -1; ++j) /* same bounds-first ordering as in the scan above */ + { + if (hList[j] > i - 32767) + { + int e = ZlibCount(data + hList[j], data + i + 1, size - i - 1); + if (e > best) + { + bestLoc = NULL; + break; + } + } + } + } + + if (bestLoc) /* emit a length/distance pair */ + { + int d = (int)(data + i - bestLoc); + assert(d <= 32767 && best <= 258); + for (j = 0; best > Base::ZlibLenC[j + 1] - 1; ++j); + Base::ZlibHuff(j + 257, stream); + if (Base::ZlibLenEb[j]) + stream.WriteBits(best - Base::ZlibLenC[j], Base::ZlibLenEb[j]); + for (j = 0; d > Base::ZlibDistC[j + 1] - 1; ++j); + stream.WriteBits(Base::ZlibBitRev(j, 5), 5); + if (Base::ZlibDistEb[j]) + stream.WriteBits(d - Base::ZlibDistC[j], Base::ZlibDistEb[j]); + i += best; + } + else /* emit a literal byte */ + { + ZlibHuffB(data[i], stream); + ++i; + } + } + for (; i < size; ++i) /* the last bytes are always written as literals */ + ZlibHuffB(data[i], stream); + ZlibHuff(256, stream); /* end-of-block symbol */ + stream.FlushBits(); + stream.WriteBe32u(ZlibAdler32(data, size)); + } + + uint32_t EncodeLine0(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) /* Copies the row unchanged. Like every EncodeLineN, returns the sum of |dst[i]|, which ToStream uses to rank the candidates. */ + { + uint32_t sum = 0; + for (size_t i = 0; i < size; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine1(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) /* Left-neighbour difference: each byte minus the byte n positions earlier (ToStream passes the channel count as n); the first n bytes are copied. */ + { + uint32_t sum = 0; + for (size_t i = 0; i < n; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + for (size_t i = n; i < size; ++i) + { + dst[i] = src[i] - src[i - n]; + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t 
EncodeLine2(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) /* Difference to the byte directly above (src[i - stride]); must not be used for the first row. */ + { + uint32_t sum = 0; + for (size_t i = 0; i < n; ++i) + { + dst[i] = src[i] - src[i - stride]; + sum += ::abs(dst[i]); + } + for (size_t i = n; i < size; ++i) + { + dst[i] = src[i] - src[i - stride]; + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine3(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) /* Difference to the average of the left and the above byte; the first n bytes have no left neighbour and use half of the above byte. */ + { + uint32_t sum = 0; + for (size_t i = 0; i < n; ++i) + { + dst[i] = src[i] - (src[i - stride] >> 1); + sum += ::abs(dst[i]); + } + for (size_t i = n; i < size; ++i) + { + dst[i] = src[i] - ((src[i - n] + src[i - stride]) >> 1); + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine4(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) /* Difference to the Paeth predictor of the left, above and upper-left bytes; the first n bytes use the above byte only. */ + { + uint32_t sum = 0; + for (size_t i = 0; i < n; ++i) + { + dst[i] = (int8_t)(src[i] - src[i - stride]); + sum += ::abs(dst[i]); + } + for (size_t i = n; i < size; ++i) + { + dst[i] = src[i] - Paeth(src[i - n], src[i - stride], src[i - stride - n]); + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine5(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) /* First-row variant of EncodeLine3: there is no row above, so only half of the left byte is subtracted. */ + { + uint32_t sum = 0; + for (size_t i = 0; i < n; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + for (size_t i = n; i < size; ++i) + { + dst[i] = src[i] - (src[i - n] >> 1); + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine6(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) /* First-row variant of EncodeLine4: with no row above it performs the same left-neighbour difference as EncodeLine1. */ + { + uint32_t sum = 0; + for (size_t i = 0; i < n; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + for (size_t i = n; i < size; ++i) + { + dst[i] = src[i] - src[i - n]; + sum += ::abs(dst[i]); + } + return sum; + } + + ImagePngSaver::ImagePngSaver(const ImageSaverParam& param) /* Derives the channel count from the pixel format, sizes the work buffers and wires in the Base row encoders and compressor. */ + : ImageSaver(param) + , _channels(0) + , _size(0) + , _convert(NULL) + { + switch (_param.format) + { + case SimdPixelFormatGray8: + _channels = 1; + break; + case 
SimdPixelFormatBgr24: + _channels = 3; + break; + case SimdPixelFormatBgra32: + _channels = 4; + break; + case SimdPixelFormatRgb24: + _channels = 3; + break; + case SimdPixelFormatRgba32: + _channels = 4; + break; + default: + break; + } + _size = _param.width * _channels; /* bytes per image row */ + if (_param.format == SimdPixelFormatBgr24) /* BGR(A) input is converted to RGB(A) into _buff before filtering */ + { + _convert = Base::BgrToRgb; + _buff.Resize(_param.height * _size); + } + else if (_param.format == SimdPixelFormatBgra32) + { + _convert = Base::BgraToRgba; + _buff.Resize(_param.height * _size); + } + _filt.Resize((_size + 1) * _param.height); /* one filter-type byte plus _size data bytes per row */ + _line.Resize(_size * FILTERS); /* one candidate row per filter */ + _encode[0] = Base::EncodeLine0; + _encode[1] = Base::EncodeLine1; + _encode[2] = Base::EncodeLine2; + _encode[3] = Base::EncodeLine3; + _encode[4] = Base::EncodeLine4; + _encode[5] = Base::EncodeLine5; + _encode[6] = Base::EncodeLine6; + _compress = Base::ZlibCompress; + } + + bool ImagePngSaver::ToStream(const uint8_t* src, size_t stride) /* Filters every row with whichever of the FILTERS candidates has the smallest absolute sum, compresses the filtered rows and writes the PNG file into _stream. Always returns true. */ + { + if (_convert) + { + _convert(src, _param.width, _param.height, stride, _buff.data, _size); + src = _buff.data; + stride = _size; + } + for (size_t row = 0; row < _param.height; ++row) + { + int bestFilter = 0, bestSum = INT_MAX; + for (int filter = 0; filter < FILTERS; filter++) + { + static const int TYPES[] = { 0, 1, 0, 5, 6, 0, 1, 2, 3, 4 }; /* filter -> _encode index: the first FILTERS entries serve row 0 (no row above), the rest serve every other row */ + int type = TYPES[filter + (row ? 
1 : 0) * FILTERS]; + int sum = _encode[type](src + stride * row, stride, _channels, _size, _line.data + _size * filter); + if (sum < bestSum) + { + bestSum = sum; + bestFilter = filter; + } + } + _filt[row * (_size + 1)] = (uint8_t)bestFilter; + memcpy(_filt.data + row * (_size + 1) + 1, _line.data + _size * bestFilter, _size); + } + OutputMemoryStream zlib(Min(_param.width * _param.height, Base::AlgCacheL1())); + _compress(_filt.data, (int)_filt.size, COMPRESSION, zlib); + WriteToStream(zlib.Data(), zlib.Size()); + return true; + } + + SIMD_INLINE void WriteCrc32(OutputMemoryStream& stream, size_t size) /* Appends the big-endian CRC-32 of the last size + 4 bytes written, i.e. the chunk type plus its data. */ + { + stream.WriteBe32u(Base::Crc32(stream.Current() - size - 4, size + 4)); + } + + void ImagePngSaver::WriteToStream(const uint8_t* zlib, size_t zlen) /* Emits the signature, an IHDR chunk, a single IDAT chunk holding the zlib data, and the IEND chunk. */ + { + const uint8_t SIGNATURE[8] = { 137, 80, 78, 71, 13, 10, 26, 10 }; + const int8_t CTYPE[5] = { -1, 0, 4, 2, 6 }; /* colour-type byte indexed by channel count */ + _stream.Reserve(8 + 12 + 13 + 12 + zlen + 12); + _stream.Write(SIGNATURE, 8); + _stream.WriteBe32u(13); + _stream.Write("IHDR", 4); + _stream.WriteBe32u((uint32_t)_param.width); + _stream.WriteBe32u((uint32_t)_param.height); + _stream.Write8u(8); + _stream.Write8u(CTYPE[_channels]); + _stream.Write8u(0); + _stream.Write8u(0); + _stream.Write8u(0); + WriteCrc32(_stream, 13); + _stream.WriteBe32u((uint32_t)zlen); + _stream.Write("IDAT", 4); + _stream.Write(zlib, zlen); + WriteCrc32(_stream, zlen); + _stream.WriteBe32u(0); + _stream.Write("IEND", 4); + WriteCrc32(_stream, 0); + } + } +} diff --git a/3rdparty/simdlib/Simd/SimdImageLoad.h b/3rdparty/simdlib/Simd/SimdImageLoad.h new file mode 100644 index 0000000000..43e44961e6 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdImageLoad.h @@ -0,0 +1,396 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdImageLoad_h__ +#define __SimdImageLoad_h__ + +#include "Simd/SimdMemoryStream.h" +#include "Simd/SimdArray.h" +#include "Simd/SimdAlignment.h" + +#include "Simd/SimdView.hpp" + +#include <vector> + +namespace Simd +{ + typedef uint8_t* (*ImageLoadFromMemoryPtr)(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format); + + uint8_t* ImageLoadFromFile(const ImageLoadFromMemoryPtr loader, const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format); + + //------------------------------------------------------------------------- + + struct ImageLoaderParam /* Input of a load request: the encoded bytes, the detected file type and the requested pixel format. */ + { + const uint8_t* data; + size_t size; + SimdImageFileType file; + SimdPixelFormatType format; + + ImageLoaderParam(const uint8_t* d, size_t s, SimdPixelFormatType f); + + bool Validate(); + }; + + class ImageLoader /* Base of all format loaders: owns the input stream over _param.data and the decoded image. */ + { + protected: + typedef Simd::View<Simd::Allocator> Image; + + ImageLoaderParam _param; + InputMemoryStream _stream; + Image _image; + + public: + ImageLoader(const ImageLoaderParam& param) + : _param(param) + , _stream(_param.data, _param.size) + { + } + + virtual ~ImageLoader() + { + } + + virtual bool FromStream() = 0; /* decodes _stream into _image; implemented per file format */ + + SIMD_INLINE uint8_t* Release(size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format) /* Reports the geometry and format of the decoded image and returns what _image.Release() hands back. */ + { + *stride = _image.stride; + *width = _image.width; + *height = _image.height; + *format = (SimdPixelFormatType)_image.format; + return _image.Release(); + } + }; + + namespace Base + { + class ImagePxmLoader : public ImageLoader /* Shared part of the PGM/PPM loaders; SetConverters is supplied by each concrete loader. */ + { + public: + ImagePxmLoader(const ImageLoaderParam& param); + + protected: + typedef void (*ToAnyPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride); + typedef void (*ToBgraPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + ToAnyPtr _toAny; + ToBgraPtr _toBgra; + Array8u _buffer; + size_t _block, _size; + + bool ReadHeader(size_t version); + 
virtual void SetConverters() = 0; + }; + + class ImagePgmTxtLoader : public ImagePxmLoader + { + public: + ImagePgmTxtLoader(const ImageLoaderParam& param); + + virtual bool FromStream(); + + protected: + virtual void SetConverters(); + }; + + class ImagePgmBinLoader : public ImagePxmLoader + { + public: + ImagePgmBinLoader(const ImageLoaderParam& param); + + virtual bool FromStream(); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmTxtLoader : public ImagePxmLoader + { + public: + ImagePpmTxtLoader(const ImageLoaderParam& param); + + virtual bool FromStream(); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmBinLoader : public ImagePxmLoader + { + public: + ImagePpmBinLoader(const ImageLoaderParam& param); + + virtual bool FromStream(); + + protected: + virtual void SetConverters(); + }; + + class ImagePngLoader : public ImageLoader + { + public: + ImagePngLoader(const ImageLoaderParam& param); + + virtual bool FromStream(); + + protected: + typedef void (*ToAny8Ptr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride); + typedef void (*ToBgra8Ptr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + typedef void (*ToAny16Ptr)(const uint16_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride); + typedef void (*ToBgra16Ptr)(const uint16_t* src, size_t width, size_t height, size_t srcStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + ToAny8Ptr _toAny8; + ToBgra8Ptr _toBgra8, _bgrToBgra; + ToAny16Ptr _toAny16; + ToBgra16Ptr _toBgra16; + + virtual void SetConverters(); + private: + bool _first, _hasTrans, _iPhone; + uint32_t _width, _height, _channels; + uint16_t _tc16[3]; + uint8_t _depth, _color, _interlace, _paletteChannels, _tc[3]; + Array8u _palette, _idat; + + struct Chunk /* size, type and offset of one chunk of the file */ + { + uint32_t size; + uint32_t type; + uint32_t offs; + }; + typedef std::vector<Chunk> Chunks; + 
Chunks _idats; + + bool ParseFile(); + bool CheckHeader(); + bool ReadChunk(Chunk& chunk); + bool ReadHeader(const Chunk & chunk); + bool ReadPalette(const Chunk& chunk); + bool ReadTransparency(const Chunk& chunk); + bool ReadData(const Chunk& chunk); + InputMemoryStream MergedDataStream(); + }; + + class ImageJpegLoader : public ImageLoader + { + public: + ImageJpegLoader(const ImageLoaderParam& param); + + virtual bool FromStream(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format); + } + +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + class ImagePgmTxtLoader : public Base::ImagePgmTxtLoader + { + public: + ImagePgmTxtLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePgmBinLoader : public Base::ImagePgmBinLoader + { + public: + ImagePgmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmTxtLoader : public Base::ImagePpmTxtLoader + { + public: + ImagePpmTxtLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmBinLoader : public Base::ImagePpmBinLoader + { + public: + ImagePpmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePngLoader : public Base::ImagePngLoader + { + public: + ImagePngLoader(const ImageLoaderParam& param); + + virtual bool FromStream(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format); + } +#endif// SIMD_SSE41_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + class ImagePgmTxtLoader : public Sse41::ImagePgmTxtLoader + { + public: + ImagePgmTxtLoader(const 
ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePgmBinLoader : public Sse41::ImagePgmBinLoader + { + public: + ImagePgmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmTxtLoader : public Sse41::ImagePpmTxtLoader + { + public: + ImagePpmTxtLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmBinLoader : public Sse41::ImagePpmBinLoader + { + public: + ImagePpmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format); + } +#endif// SIMD_AVX2_ENABLE + +#ifdef SIMD_AVX512BW_ENABLE + namespace Avx512bw + { + class ImagePgmTxtLoader : public Avx2::ImagePgmTxtLoader + { + public: + ImagePgmTxtLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePgmBinLoader : public Avx2::ImagePgmBinLoader + { + public: + ImagePgmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmTxtLoader : public Avx2::ImagePpmTxtLoader + { + public: + ImagePpmTxtLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmBinLoader : public Avx2::ImagePpmBinLoader + { + public: + ImagePpmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format); + } +#endif// SIMD_AVX512BW_ENABLE + +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + class ImagePgmTxtLoader : public 
Base::ImagePgmTxtLoader + { + public: + ImagePgmTxtLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePgmBinLoader : public Base::ImagePgmBinLoader + { + public: + ImagePgmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmTxtLoader : public Base::ImagePpmTxtLoader + { + public: + ImagePpmTxtLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmBinLoader : public Base::ImagePpmBinLoader + { + public: + ImagePpmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format); + } +#endif// SIMD_NEON_ENABLE +} + +#endif//__SimdImageLoad_h__ diff --git a/3rdparty/simdlib/Simd/SimdImageSave.h b/3rdparty/simdlib/Simd/SimdImageSave.h new file mode 100644 index 0000000000..4e1945c077 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdImageSave.h @@ -0,0 +1,386 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#ifndef __SimdImageSave_h__ +#define __SimdImageSave_h__ + +#include "Simd/SimdMemoryStream.h" +#include "Simd/SimdArray.h" +#include "Simd/SimdPerformance.h" + +namespace Simd +{ + typedef uint8_t* (*ImageSaveToMemoryPtr)(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size); + + SimdBool ImageSaveToFile(const ImageSaveToMemoryPtr saver, const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char* path); + + //--------------------------------------------------------------------- + + struct ImageSaverParam /* Description of a save request: image size, pixel format, target file type and quality. */ + { + size_t width, height; + SimdPixelFormatType format; + SimdImageFileType file; + int quality; + + SIMD_INLINE ImageSaverParam(size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality) + { + this->width = width; + this->height = height; + this->format = format; + this->file = file; + this->quality = quality; + } + + bool Validate() /* Replaces an undefined file type with a default (binary PGM for gray input, binary PPM otherwise), then range-checks format, size and file type. */ + { + if (file == SimdImageFileUndefined) + { + if (format == SimdPixelFormatGray8) + file = SimdImageFilePgmBin; + else + file = SimdImageFilePpmBin; + } + if (format < SimdPixelFormatGray8 || format > SimdPixelFormatRgba32) + return false; + if (width == 0 || height == 0) + return false; + if (file <= SimdImageFileUndefined || file > SimdImageFileJpeg) + return false; + return true; + } + }; + + class ImageSaver /* Base of all format savers: holds the request and the output stream the encoded file is written to. */ + { + protected: + ImageSaverParam 
_param; + OutputMemoryStream _stream; + public: + ImageSaver(const ImageSaverParam& param) + : _param(param) + { + } + + virtual ~ImageSaver() + { + } + + virtual bool ToStream(const uint8_t* src, size_t stride) = 0; /* encodes the image at src into _stream; implemented per file format */ + + SIMD_INLINE uint8_t* Release(size_t* size) /* forwards to _stream.Release(size) */ + { + return _stream.Release(size); + } + }; + + namespace Base + { + class ImagePxmSaver : public ImageSaver /* Shared part of the PGM/PPM savers. */ + { + public: + ImagePxmSaver(const ImageSaverParam& param); + + protected: + typedef void (*ConvertPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride); + ConvertPtr _convert; + Array8u _buffer; + size_t _block, _size; + + void WriteHeader(size_t version); + }; + + class ImagePgmTxtSaver : public ImagePxmSaver + { + public: + ImagePgmTxtSaver(const ImageSaverParam& param); + + virtual bool ToStream(const uint8_t* src, size_t stride); + }; + + class ImagePgmBinSaver : public ImagePxmSaver + { + public: + ImagePgmBinSaver(const ImageSaverParam& param); + + virtual bool ToStream(const uint8_t* src, size_t stride); + }; + + class ImagePpmTxtSaver : public ImagePxmSaver + { + public: + ImagePpmTxtSaver(const ImageSaverParam& param); + + virtual bool ToStream(const uint8_t* src, size_t stride); + }; + + class ImagePpmBinSaver : public ImagePxmSaver + { + public: + ImagePpmBinSaver(const ImageSaverParam& param); + + virtual bool ToStream(const uint8_t* src, size_t stride); + }; + + class ImagePngSaver : public ImageSaver + { + public: + ImagePngSaver(const ImageSaverParam& param); + + virtual bool ToStream(const uint8_t* src, size_t stride); + protected: + static const int COMPRESSION = 8; /* quality value handed to _compress */ + static const int FILTERS = 5; /* number of candidate filters tried per row */ + static const int TYPES = 7; /* number of row encoders held in _encode */ + typedef void (*ConvertPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride); + typedef uint32_t (*EncodePtr)(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst); + typedef void (*CompressPtr)(uint8_t* data, int size, int quality, 
OutputMemoryStream& stream); + ConvertPtr _convert; + EncodePtr _encode[TYPES]; + CompressPtr _compress; + size_t _channels, _size; + Array8u _filt, _buff; + Array8i _line; + + void WriteToStream(const uint8_t* zlib, size_t zlen); + }; + + class ImageJpegSaver : public ImageSaver + { + public: + ImageJpegSaver(const ImageSaverParam& param); + + virtual bool ToStream(const uint8_t* src, size_t stride); + protected: + typedef void (*DeintBgrPtr)(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, + uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride); + typedef void (*DeintBgraPtr)(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height, + uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride); + typedef void (*WriteBlockPtr)(OutputMemoryStream& stream, int width, int height, const uint8_t* red, + const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3]); + + Array8u _buffer; + DeintBgrPtr _deintBgr; + DeintBgraPtr _deintBgra; + WriteBlockPtr _writeBlock; + bool _subSample; + int _quality, _block, _width; + float _fY[64], _fUv[64]; + uint8_t _uY[64], _uUv[64]; + + virtual void Init(); /* virtual so that the SIMD savers declared below can override it */ + + void InitParams(bool trans); + void WriteHeader(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size); + } + +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + class ImagePgmTxtSaver : public Base::ImagePgmTxtSaver + { + public: + ImagePgmTxtSaver(const ImageSaverParam& param); + }; + + class ImagePgmBinSaver : public Base::ImagePgmBinSaver + { + public: + ImagePgmBinSaver(const ImageSaverParam& param); + }; + + class ImagePpmTxtSaver : public Base::ImagePpmTxtSaver + { + public: + ImagePpmTxtSaver(const ImageSaverParam& param); 
+ }; + + class ImagePpmBinSaver : public Base::ImagePpmBinSaver + { + public: + ImagePpmBinSaver(const ImageSaverParam& param); + }; + + class ImagePngSaver : public Base::ImagePngSaver + { + public: + ImagePngSaver(const ImageSaverParam& param); + }; + + class ImageJpegSaver : public Base::ImageJpegSaver + { + public: + ImageJpegSaver(const ImageSaverParam& param); + + protected: + virtual void Init(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size); + } +#endif// SIMD_SSE41_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + class ImagePgmTxtSaver : public Sse41::ImagePgmTxtSaver + { + public: + ImagePgmTxtSaver(const ImageSaverParam& param); + }; + + class ImagePgmBinSaver : public Sse41::ImagePgmBinSaver + { + public: + ImagePgmBinSaver(const ImageSaverParam& param); + }; + + class ImagePpmTxtSaver : public Sse41::ImagePpmTxtSaver + { + public: + ImagePpmTxtSaver(const ImageSaverParam& param); + }; + + class ImagePpmBinSaver : public Sse41::ImagePpmBinSaver + { + public: + ImagePpmBinSaver(const ImageSaverParam& param); + }; + + class ImagePngSaver : public Sse41::ImagePngSaver + { + public: + ImagePngSaver(const ImageSaverParam& param); + }; + + class ImageJpegSaver : public Sse41::ImageJpegSaver + { + public: + ImageJpegSaver(const ImageSaverParam& param); + + protected: + virtual void Init(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size); + } +#endif// SIMD_AVX2_ENABLE + +#ifdef SIMD_AVX512BW_ENABLE + namespace Avx512bw + { + class ImagePgmTxtSaver : public Avx2::ImagePgmTxtSaver + { + public: + ImagePgmTxtSaver(const ImageSaverParam& param); + 
}; + + class ImagePgmBinSaver : public Avx2::ImagePgmBinSaver + { + public: + ImagePgmBinSaver(const ImageSaverParam& param); + }; + + class ImagePpmTxtSaver : public Avx2::ImagePpmTxtSaver + { + public: + ImagePpmTxtSaver(const ImageSaverParam& param); + }; + + class ImagePpmBinSaver : public Avx2::ImagePpmBinSaver + { + public: + ImagePpmBinSaver(const ImageSaverParam& param); + }; + + class ImagePngSaver : public Avx2::ImagePngSaver + { + public: + ImagePngSaver(const ImageSaverParam& param); + }; + + class ImageJpegSaver : public Avx2::ImageJpegSaver + { + public: + ImageJpegSaver(const ImageSaverParam& param); + + protected: + virtual void Init(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size); + } +#endif// SIMD_AVX512BW_ENABLE + +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + class ImagePgmTxtSaver : public Base::ImagePgmTxtSaver + { + public: + ImagePgmTxtSaver(const ImageSaverParam& param); + }; + + class ImagePgmBinSaver : public Base::ImagePgmBinSaver + { + public: + ImagePgmBinSaver(const ImageSaverParam& param); + }; + + class ImagePpmTxtSaver : public Base::ImagePpmTxtSaver + { + public: + ImagePpmTxtSaver(const ImageSaverParam& param); + }; + + class ImagePpmBinSaver : public Base::ImagePpmBinSaver + { + public: + ImagePpmBinSaver(const ImageSaverParam& param); + }; + + class ImagePngSaver : public Base::ImagePngSaver + { + public: + ImagePngSaver(const ImageSaverParam& param); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size); + } +#endif// SIMD_NEON_ENABLE +} + +#endif//__SimdImageSave_h__ diff --git a/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h 
b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h new file mode 100644 index 0000000000..d54164f7d4 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h @@ -0,0 +1,649 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdImageSaveJpeg_h__ +#define __SimdImageSaveJpeg_h__ + +#include "Simd/SimdImageSave.h" +#include "Simd/SimdMath.h" + +#define SIMD_JPEG_CALC_BITS_TABLE + +namespace Simd +{ + namespace Base + { + struct BitBuf + { + static const uint32_t capacity = 1024; + uint32_t size; + uint16_t data[1024][2]; + + SIMD_INLINE BitBuf() + : size(0) + { + } + + SIMD_INLINE void Push(const uint16_t* bits) + { + ((uint32_t*)data)[size++] = ((uint32_t*)bits)[0]; + } + + SIMD_INLINE bool Full(uint32_t tail = capacity / 2) const + { + return size + tail >= capacity; + } + + SIMD_INLINE uint32_t Capacity() const + { + return capacity; + } + + SIMD_INLINE void Clear() + { + size = 0; + } + }; + + extern const uint8_t JpegZigZagD[64]; + extern const uint8_t JpegZigZagT[64]; + + extern const uint16_t HuffmanYdc[256][2]; + extern const uint16_t HuffmanUVdc[256][2]; + extern const uint16_t HuffmanYac[256][2]; + extern const uint16_t HuffmanUVac[256][2]; + +#if defined(SIMD_JPEG_CALC_BITS_TABLE) + const int JpegCalcBitsRange = 2048; + extern uint16_t JpegCalcBitsTable[JpegCalcBitsRange * 2][2]; + SIMD_INLINE void JpegCalcBits(int val, uint16_t bits[2]) + { + assert(val >= -JpegCalcBitsRange && val < JpegCalcBitsRange); + ((uint32_t*)bits)[0] = ((uint32_t*)JpegCalcBitsTable)[val + JpegCalcBitsRange]; + } +#else + SIMD_INLINE void JpegCalcBits(int val, uint16_t bits[2]) + { + int tmp = val < 0 ? -val : val; + val = val < 0 ? val - 1 : val; + bits[1] = 1; + while (tmp >>= 1) + ++bits[1]; + bits[0] = val & ((1 << bits[1]) - 1); + } +#endif + + SIMD_INLINE void RgbToYuv(const uint8_t* r, const uint8_t* g, const uint8_t* b, int stride, int height, int width, float* y, float* u, float* v, int size) + { + for (int row = 0; row < size;) + { + for (int col = 0; col < size; col += 1) + { + int offs = (col < width ? 
col : width - 1); + float _r = r[offs], _g = g[offs], _b = b[offs]; + y[col] = +0.29900f * _r + 0.58700f * _g + 0.11400f * _b - 128.000f; + u[col] = -0.16874f * _r - 0.33126f * _g + 0.50000f * _b; + v[col] = +0.50000f * _r - 0.41869f * _g - 0.08131f * _b; + } + if (++row < height) + r += stride, g += stride, b += stride; + y += size, u += size, v += size; + } + } + + SIMD_INLINE void GrayToY(const uint8_t* g, int stride, int height, int width, float* y, int size) + { + for (int row = 0; row < size;) + { + for (int col = 0; col < size; col += 1) + { + int offs = (col < width ? col : width - 1); + y[col] = g[offs] - 128.000f; + } + if (++row < height) + g += stride; + y += size; + } + } + + SIMD_INLINE void JpegProcessDuGrayUv(BitBuf & bitBuf) + { + bitBuf.Push(Base::HuffmanUVdc[0]); + bitBuf.Push(Base::HuffmanUVac[0]); + bitBuf.Push(Base::HuffmanUVdc[0]); + bitBuf.Push(Base::HuffmanUVac[0]); + } + + SIMD_INLINE void WriteBits(OutputMemoryStream & stream, const uint16_t bits[2]) + { + stream.BitCount() += bits[1]; +#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) + stream.BitBuffer() |= uint64_t(bits[0]) << (64 - stream.BitCount()); + while (stream.BitCount() >= 8) + { + uint8_t byte = stream.BitBuffer() >> 56; + stream.Write8u(byte); + if (byte == 255) + stream.Write8u(0); + stream.BitBuffer() <<= 8; + stream.BitCount() -= 8; + } +#else + stream.BitBuffer() |= uint32_t(bits[0]) << (32 - stream.BitCount()); + while (stream.BitCount() >= 8) + { + uint8_t byte = stream.BitBuffer() >> 24; + stream.Write8u(byte); + if (byte == 255) + stream.Write8u(0); + stream.BitBuffer() <<= 8; + stream.BitCount() -= 8; + } +#endif + } + + SIMD_INLINE void WriteBits(OutputMemoryStream& stream, const uint16_t bits[][2], size_t size) + { + size_t pos = stream.Pos(); + stream.Reserve(pos + size * 2); + uint8_t* data = stream.Data(); + size_t & bitCount = stream.BitCount(); + size_t i = 0; +#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) + uint64_t &bitBuffer = 
stream.BitBuffer(); + for (size_t size3 = AlignLoAny(size, 3); i < size3; i += 3, bits += 3) + { + bitCount += bits[0][1]; + bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount); + bitCount += bits[1][1]; + bitBuffer |= uint64_t(bits[1][0]) << (64 - bitCount); + bitCount += bits[2][1]; + bitBuffer |= uint64_t(bits[2][0]) << (64 - bitCount); + assert(bitCount <= 64); + while (bitCount >= 16) + { + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + byte = uint8_t(bitBuffer >> 48); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 16; + bitCount -= 16; + } + } + if(bitCount >= 8) + { + assert(bitCount < 16); + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + for (; i < size; ++i, ++bits) + { + bitCount += bits[0][1]; + bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount); + while (bitCount >= 8) + { + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + } +#else + uint32_t &bitBuffer = stream.BitBuffer(); + for (; i < size; ++i, ++bits) + { + bitCount += bits[0][1]; + bitBuffer |= uint32_t(bits[0][0]) << (32 - bitCount); + while (bitCount >= 8) + { + uint8_t byte = uint8_t(bitBuffer >> 24); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + } +#endif + stream.Seek(pos); + } + } + +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + } +#endif// SIMD_SSE41_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + extern const uint32_t JpegZigZagTi32[64]; + + SIMD_INLINE void JpegDctV(const float* src, size_t srcStride, float* dst, size_t dstStride) + { + static const __m256 _0_707106781 = _mm256_set1_ps(0.707106781f); + static const __m256 _0_382683433 = _mm256_set1_ps(0.382683433f); + static const __m256 _0_541196100 = _mm256_set1_ps(0.541196100f); + static const 
__m256 _1_306562965 = _mm256_set1_ps(1.306562965f); + + __m256 d0 = _mm256_loadu_ps(src + 0 * srcStride); + __m256 d1 = _mm256_loadu_ps(src + 1 * srcStride); + __m256 d2 = _mm256_loadu_ps(src + 2 * srcStride); + __m256 d3 = _mm256_loadu_ps(src + 3 * srcStride); + __m256 d4 = _mm256_loadu_ps(src + 4 * srcStride); + __m256 d5 = _mm256_loadu_ps(src + 5 * srcStride); + __m256 d6 = _mm256_loadu_ps(src + 6 * srcStride); + __m256 d7 = _mm256_loadu_ps(src + 7 * srcStride); + + __m256 tmp0 = _mm256_add_ps(d0, d7); + __m256 tmp7 = _mm256_sub_ps(d0, d7); + __m256 tmp1 = _mm256_add_ps(d1, d6); + __m256 tmp6 = _mm256_sub_ps(d1, d6); + __m256 tmp2 = _mm256_add_ps(d2, d5); + __m256 tmp5 = _mm256_sub_ps(d2, d5); + __m256 tmp3 = _mm256_add_ps(d3, d4); + __m256 tmp4 = _mm256_sub_ps(d3, d4); + + __m256 tmp10 = _mm256_add_ps(tmp0, tmp3); + __m256 tmp13 = _mm256_sub_ps(tmp0, tmp3); + __m256 tmp11 = _mm256_add_ps(tmp1, tmp2); + __m256 tmp12 = _mm256_sub_ps(tmp1, tmp2); + + d0 = _mm256_add_ps(tmp10, tmp11); + d4 = _mm256_sub_ps(tmp10, tmp11); + + __m256 z1 = _mm256_mul_ps(_mm256_add_ps(tmp12, tmp13), _0_707106781); + d2 = _mm256_add_ps(tmp13, z1); + d6 = _mm256_sub_ps(tmp13, z1); + + tmp10 = _mm256_add_ps(tmp4, tmp5); + tmp11 = _mm256_add_ps(tmp5, tmp6); + tmp12 = _mm256_add_ps(tmp6, tmp7); + + __m256 z5 = _mm256_mul_ps(_mm256_sub_ps(tmp10, tmp12), _0_382683433); + __m256 z2 = _mm256_add_ps(_mm256_mul_ps(tmp10, _0_541196100), z5); + __m256 z4 = _mm256_add_ps(_mm256_mul_ps(tmp12, _1_306562965), z5); + __m256 z3 = _mm256_mul_ps(tmp11, _0_707106781); + + __m256 z11 = _mm256_add_ps(tmp7, z3); + __m256 z13 = _mm256_sub_ps(tmp7, z3); + + _mm256_storeu_ps(dst + 0 * dstStride, d0); + _mm256_storeu_ps(dst + 1 * dstStride, _mm256_add_ps(z11, z4)); + _mm256_storeu_ps(dst + 2 * dstStride, d2); + _mm256_storeu_ps(dst + 3 * dstStride, _mm256_sub_ps(z13, z2)); + _mm256_storeu_ps(dst + 4 * dstStride, d4); + _mm256_storeu_ps(dst + 5 * dstStride, _mm256_add_ps(z13, z2)); + _mm256_storeu_ps(dst + 6 * 
dstStride, d6); + _mm256_storeu_ps(dst + 7 * dstStride, _mm256_sub_ps(z11, z4)); + } + + SIMD_INLINE void JpegDct(const float* src, size_t stride, const float* fdt, int* dst) + { + static const __m256 _0_707106781 = _mm256_set1_ps(0.707106781f); + static const __m256 _0_382683433 = _mm256_set1_ps(0.382683433f); + static const __m256 _0_541196100 = _mm256_set1_ps(0.541196100f); + static const __m256 _1_306562965 = _mm256_set1_ps(1.306562965f); + + __m256 d0 = _mm256_loadu_ps(src + 0 * stride); + __m256 d1 = _mm256_loadu_ps(src + 1 * stride); + __m256 d2 = _mm256_loadu_ps(src + 2 * stride); + __m256 d3 = _mm256_loadu_ps(src + 3 * stride); + __m256 d4 = _mm256_loadu_ps(src + 4 * stride); + __m256 d5 = _mm256_loadu_ps(src + 5 * stride); + __m256 d6 = _mm256_loadu_ps(src + 6 * stride); + __m256 d7 = _mm256_loadu_ps(src + 7 * stride); + + __m256 tmp0 = _mm256_add_ps(d0, d7); + __m256 tmp7 = _mm256_sub_ps(d0, d7); + __m256 tmp1 = _mm256_add_ps(d1, d6); + __m256 tmp6 = _mm256_sub_ps(d1, d6); + __m256 tmp2 = _mm256_add_ps(d2, d5); + __m256 tmp5 = _mm256_sub_ps(d2, d5); + __m256 tmp3 = _mm256_add_ps(d3, d4); + __m256 tmp4 = _mm256_sub_ps(d3, d4); + + __m256 tmp10 = _mm256_add_ps(tmp0, tmp3); + __m256 tmp13 = _mm256_sub_ps(tmp0, tmp3); + __m256 tmp11 = _mm256_add_ps(tmp1, tmp2); + __m256 tmp12 = _mm256_sub_ps(tmp1, tmp2); + + d0 = _mm256_add_ps(tmp10, tmp11); + d4 = _mm256_sub_ps(tmp10, tmp11); + + __m256 z1 = _mm256_mul_ps(_mm256_add_ps(tmp12, tmp13), _0_707106781); + d2 = _mm256_add_ps(tmp13, z1); + d6 = _mm256_sub_ps(tmp13, z1); + + tmp10 = _mm256_add_ps(tmp4, tmp5); + tmp11 = _mm256_add_ps(tmp5, tmp6); + tmp12 = _mm256_add_ps(tmp6, tmp7); + + __m256 z5 = _mm256_mul_ps(_mm256_sub_ps(tmp10, tmp12), _0_382683433); + __m256 z2 = _mm256_add_ps(_mm256_mul_ps(tmp10, _0_541196100), z5); + __m256 z4 = _mm256_add_ps(_mm256_mul_ps(tmp12, _1_306562965), z5); + __m256 z3 = _mm256_mul_ps(tmp11, _0_707106781); + + __m256 z11 = _mm256_add_ps(tmp7, z3); + __m256 z13 = _mm256_sub_ps(tmp7, 
z3); + + d1 = _mm256_add_ps(z11, z4); + d3 = _mm256_sub_ps(z13, z2); + d5 = _mm256_add_ps(z13, z2); + d7 = _mm256_sub_ps(z11, z4); + + tmp10 = _mm256_permute2f128_ps(d0, d4, 0x20); + tmp11 = _mm256_permute2f128_ps(d1, d5, 0x20); + tmp12 = _mm256_permute2f128_ps(d2, d6, 0x20); + tmp13 = _mm256_permute2f128_ps(d3, d7, 0x20); + d4 = _mm256_permute2f128_ps(d0, d4, 0x31); + d5 = _mm256_permute2f128_ps(d1, d5, 0x31); + d6 = _mm256_permute2f128_ps(d2, d6, 0x31); + d7 = _mm256_permute2f128_ps(d3, d7, 0x31); + + tmp0 = _mm256_unpacklo_ps(tmp10, tmp12); + tmp1 = _mm256_unpackhi_ps(tmp10, tmp12); + tmp2 = _mm256_unpacklo_ps(tmp11, tmp13); + tmp3 = _mm256_unpackhi_ps(tmp11, tmp13); + d0 = _mm256_unpacklo_ps(tmp0, tmp2); + d1 = _mm256_unpackhi_ps(tmp0, tmp2); + d2 = _mm256_unpacklo_ps(tmp1, tmp3); + d3 = _mm256_unpackhi_ps(tmp1, tmp3); + + tmp0 = _mm256_unpacklo_ps(d4, d6); + tmp1 = _mm256_unpackhi_ps(d4, d6); + tmp2 = _mm256_unpacklo_ps(d5, d7); + tmp3 = _mm256_unpackhi_ps(d5, d7); + d4 = _mm256_unpacklo_ps(tmp0, tmp2); + d5 = _mm256_unpackhi_ps(tmp0, tmp2); + d6 = _mm256_unpacklo_ps(tmp1, tmp3); + d7 = _mm256_unpackhi_ps(tmp1, tmp3); + + tmp0 = _mm256_add_ps(d0, d7); + tmp1 = _mm256_add_ps(d1, d6); + tmp2 = _mm256_add_ps(d2, d5); + tmp3 = _mm256_add_ps(d3, d4); + tmp7 = _mm256_sub_ps(d0, d7); + tmp6 = _mm256_sub_ps(d1, d6); + tmp5 = _mm256_sub_ps(d2, d5); + tmp4 = _mm256_sub_ps(d3, d4); + + tmp10 = _mm256_add_ps(tmp0, tmp3); + tmp13 = _mm256_sub_ps(tmp0, tmp3); + tmp11 = _mm256_add_ps(tmp1, tmp2); + tmp12 = _mm256_sub_ps(tmp1, tmp2); + + d0 = _mm256_add_ps(tmp10, tmp11); + d4 = _mm256_sub_ps(tmp10, tmp11); + + z1 = _mm256_mul_ps(_mm256_add_ps(tmp12, tmp13), _0_707106781); + d2 = _mm256_add_ps(tmp13, z1); + d6 = _mm256_sub_ps(tmp13, z1); + + tmp10 = _mm256_add_ps(tmp4, tmp5); + tmp11 = _mm256_add_ps(tmp5, tmp6); + tmp12 = _mm256_add_ps(tmp6, tmp7); + + z5 = _mm256_mul_ps(_mm256_sub_ps(tmp10, tmp12), _0_382683433); + z2 = _mm256_add_ps(_mm256_mul_ps(tmp10, _0_541196100), z5); + 
z4 = _mm256_add_ps(_mm256_mul_ps(tmp12, _1_306562965), z5); + z3 = _mm256_mul_ps(tmp11, _0_707106781); + + z11 = _mm256_add_ps(tmp7, z3); + z13 = _mm256_sub_ps(tmp7, z3); + + d1 = _mm256_add_ps(z11, z4); + d3 = _mm256_sub_ps(z13, z2); + d5 = _mm256_add_ps(z13, z2); + d7 = _mm256_sub_ps(z11, z4); + + _mm256_storeu_si256((__m256i*)dst + 0, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 0), d0))); + _mm256_storeu_si256((__m256i*)dst + 1, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 1), d1))); + _mm256_storeu_si256((__m256i*)dst + 2, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 2), d2))); + _mm256_storeu_si256((__m256i*)dst + 3, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 3), d3))); + _mm256_storeu_si256((__m256i*)dst + 4, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 4), d4))); + _mm256_storeu_si256((__m256i*)dst + 5, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 5), d5))); + _mm256_storeu_si256((__m256i*)dst + 6, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 6), d6))); + _mm256_storeu_si256((__m256i*)dst + 7, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 7), d7))); + } + + const __m256i K32_PERM_LD = SIMD_MM256_SETR_EPI32(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1); + + const __m256i K8_SHFL_VS = SIMD_MM256_SETR_EPI8( + 0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1, + 0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1); + + const __m256i K8_SHFL_SH = SIMD_MM256_SETR_EPI8( + 0x2, 0x3, -1, -1, 0x6, 0x7, -1, -1, 0xA, 0xB, -1, -1, -1, -1, -1, -1, + 0x2, 0x3, -1, -1, 0x6, 0x7, -1, -1, 0xA, 0xB, -1, -1, -1, -1, -1, -1); + + const __m256i K32_32 = SIMD_MM256_SET1_EPI32(32); + +#if defined(SIMD_X64_ENABLE) + SIMD_INLINE void WriteBits(uint8_t* data, size_t & pos, uint64_t & bitBuffer, size_t &bitCount, uint64_t shift, uint64_t value, uint64_t mask) + { + bitCount += shift; + assert(bitCount <= 64); + bitBuffer |= 
_pext_u64(value, mask) << (64 - bitCount); + while (bitCount >= 16) + { + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + byte = uint8_t(bitBuffer >> 48); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 16; + bitCount -= 16; + } + } +#endif + + SIMD_INLINE void WriteBits(OutputMemoryStream& stream, const uint16_t bits[][2], size_t size) + { + size_t pos = stream.Pos(); + stream.Reserve(pos + size * 2); + uint8_t* data = stream.Data(); + size_t& bitCount = stream.BitCount(); + size_t i = 0; +#if defined(SIMD_X64_ENABLE) + uint64_t &bitBuffer = stream.BitBuffer(); + size_t size12 = AlignLoAny(size, 12); + for (; i < size12; i += 12, bits += 12) + { + __m256i b0 = _mm256_permutevar8x32_epi32(_mm256_loadu_si256((__m256i*)(bits + 0)), K32_PERM_LD); + __m256i b1 = _mm256_permutevar8x32_epi32(_mm256_loadu_si256((__m256i*)(bits + 6)), K32_PERM_LD); + __m256i vs0 = _mm256_shuffle_epi8(b0, K8_SHFL_VS); + __m256i vs1 = _mm256_shuffle_epi8(b1, K8_SHFL_VS); + __m256i vv = Shuffle64i<0x0>(vs0, vs1); + __m256i ss = Shuffle64i<0xF>(vs0, vs1); + SIMD_ALIGNED(32) uint64_t value[4], mask[4], shift[4]; + _mm256_storeu_si256((__m256i*)value, vv); + _mm256_storeu_si256((__m256i*)shift, _mm256_sad_epu8(ss, K_ZERO)); + __m256i s0 = _mm256_sub_epi32(K32_32, _mm256_shuffle_epi8(b0, K8_SHFL_SH)); + __m256i m0 = _mm256_srlv_epi32(K_INV_ZERO, s0); + __m256i s1 = _mm256_sub_epi32(K32_32, _mm256_shuffle_epi8(b1, K8_SHFL_SH)); + __m256i m1 = _mm256_srlv_epi32(K_INV_ZERO, s1); + __m256i ms0 = _mm256_shuffle_epi8(m0, K8_SHFL_VS); + __m256i ms1 = _mm256_shuffle_epi8(m1, K8_SHFL_VS); + _mm256_storeu_si256((__m256i*)mask, Shuffle64i<0x0>(ms0, ms1)); + WriteBits(data, pos, bitBuffer, bitCount, shift[0], value[0], mask[0]); + WriteBits(data, pos, bitBuffer, bitCount, shift[2], value[2], mask[2]); + WriteBits(data, pos, bitBuffer, bitCount, shift[1], value[1], mask[1]); + WriteBits(data, pos, bitBuffer, bitCount, shift[3], 
value[3], mask[3]); + } + if (bitCount >= 8) + { + assert(bitCount < 16); + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + for (; i < size; ++i, ++bits) + { + bitCount += bits[0][1]; + bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount); + while (bitCount >= 8) + { + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + } +#else + uint32_t& bitBuffer = stream.BitBuffer(); + for (; i < size; ++i, ++bits) + { + bitCount += bits[0][1]; + bitBuffer |= uint32_t(bits[0][0]) << (32 - bitCount); + while (bitCount >= 8) + { + uint8_t byte = uint8_t(bitBuffer >> 24); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + } +#endif + stream.Seek(pos); + } + } +#endif// SIMD_AVX2_ENABLE + +#ifdef SIMD_AVX512BW_ENABLE + namespace Avx512bw + { + const __m512i K32_PERM_LD = SIMD_MM512_SETR_EPI32(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); + + const __m512i K8_SHFL_VS = SIMD_MM512_SETR_EPI8( + 0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1, + 0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1, + 0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1, + 0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1); + + SIMD_INLINE void WriteBits(OutputMemoryStream& stream, const uint16_t bits[][2], size_t size) + { + size_t pos = stream.Pos(); + stream.Reserve(pos + size * 2); + uint8_t* data = stream.Data(); + size_t& bitCount = stream.BitCount(); + size_t i = 0; +#if defined(SIMD_X64_ENABLE) + uint64_t &bitBuffer = stream.BitBuffer(); + size_t size24 = AlignLoAny(size, 24); + for (; i < size24; i += 24, bits += 24) + { + __m512i b0 = _mm512_permutexvar_epi32(K32_PERM_LD, _mm512_loadu_si512((__m512i*)(bits + 00))); + __m512i b1 = 
_mm512_permutexvar_epi32(K32_PERM_LD, _mm512_loadu_si512((__m512i*)(bits + 12))); + __m512i vs0 = _mm512_shuffle_epi8(b0, K8_SHFL_VS); + __m512i vs1 = _mm512_shuffle_epi8(b1, K8_SHFL_VS); + __m512i vv = Shuffle64i<0x00>(vs0, vs1); + __m512i ss = Shuffle64i<0xFF>(vs0, vs1); + SIMD_ALIGNED(64) uint64_t value[8], mask[8], shift[8]; + _mm512_storeu_si512((__m512i*)value, vv); + _mm512_storeu_si512((__m512i*)shift, _mm512_sad_epu8(ss, K_ZERO)); + _mm512_storeu_si512((__m512i*)mask, _mm512_srlv_epi16(K_INV_ZERO, _mm512_sub_epi16(K16_0010, ss))); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[0], value[0], mask[0]); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[2], value[2], mask[2]); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[4], value[4], mask[4]); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[6], value[6], mask[6]); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[1], value[1], mask[1]); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[3], value[3], mask[3]); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[5], value[5], mask[5]); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[7], value[7], mask[7]); + } + if (bitCount >= 8) + { + assert(bitCount < 16); + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + for (; i < size; ++i, ++bits) + { + bitCount += bits[0][1]; + bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount); + while (bitCount >= 8) + { + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + } +#else + uint32_t& bitBuffer = stream.BitBuffer(); + for (; i < size; ++i, ++bits) + { + bitCount += bits[0][1]; + bitBuffer |= uint32_t(bits[0][0]) << (32 - bitCount); + while (bitCount >= 8) + { + uint8_t byte = uint8_t(bitBuffer >> 24); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + 
bitCount -= 8; + } + } +#endif + stream.Seek(pos); + } + } +#endif// SIMD_AVX512BW_ENABLE + +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + } +#endif// SIMD_NEON_ENABLE +} + +#endif//__SimdImageSaveJpeg_h__ diff --git a/3rdparty/simdlib/Simd/SimdImageSavePng.h b/3rdparty/simdlib/Simd/SimdImageSavePng.h new file mode 100644 index 0000000000..71efd1ca60 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdImageSavePng.h @@ -0,0 +1,235 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdImageSavePng_h__ +#define __SimdImageSavePng_h__ + +#include "Simd/SimdImageSave.h" +#include "Simd/SimdLoad.h" + +#define SIMD_PNG_ZLIB_BIT_REV_TABLE + +namespace Simd +{ + namespace Base + { + extern const uint16_t ZlibLenC[30]; + extern const uint8_t ZlibLenEb[29]; + extern const uint16_t ZlibDistC[31]; + extern const uint8_t ZlibDistEb[30]; + +#if defined(SIMD_PNG_ZLIB_BIT_REV_TABLE) + const int ZlibBitRevShift = 9; + const int ZlibBitRevSize = 1 << ZlibBitRevShift; + extern int ZlibBitRevTable[ZlibBitRevSize]; + SIMD_INLINE int ZlibBitRev(int bits, int count) + { + assert(bits < ZlibBitRevSize&& count <= ZlibBitRevShift); + return ZlibBitRevTable[bits] >> (ZlibBitRevShift - count); + } +#else + SIMD_INLINE int ZlibBitRev(int bits, int count) + { + int rev = 0; + for (size_t b = 0; b < count; b++) + { + rev = (rev << 1) | (bits & 1); + bits >>= 1; + } + return rev; + } +#endif + + SIMD_INLINE uint32_t ZlibHash(const uint8_t* data) + { + uint32_t hash = data[0] + (data[1] << 8) + (data[2] << 16); + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + return hash; + } + + SIMD_INLINE void ZlibHuffA(int bits, int count, OutputMemoryStream& stream) + { + stream.WriteBits(ZlibBitRev(bits, count), count); + } + + SIMD_INLINE void ZlibHuff1(int bits, OutputMemoryStream& stream) + { + ZlibHuffA(0x30 + bits, 8, stream); + } + + SIMD_INLINE void ZlibHuff2(int bits, OutputMemoryStream& stream) + { + ZlibHuffA(0x190 + bits - 144, 9, stream); + } + + SIMD_INLINE void ZlibHuff3(int bits, OutputMemoryStream& stream) + { + ZlibHuffA(0 + bits - 256, 7, stream); + } + + SIMD_INLINE void ZlibHuff4(int bits, OutputMemoryStream& stream) + { + ZlibHuffA(0xc0 + bits - 280, 8, stream); + } + + SIMD_INLINE void ZlibHuff(int bits, OutputMemoryStream& stream) + { + if (bits <= 143) + ZlibHuff1(bits, stream); + else if (bits <= 255) + ZlibHuff2(bits, stream); + else if (bits <= 279) + 
ZlibHuff3(bits, stream); + else + ZlibHuff4(bits, stream); + } + + SIMD_INLINE void ZlibHuffB(int bits, OutputMemoryStream& stream) + { + if (bits <= 143) + ZlibHuff1(bits, stream); + else + ZlibHuff2(bits, stream); + } + + SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit) + { + limit = Min(limit, 258); + int i = 0; +#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) + int limit8 = limit & (~7); + for (; i < limit8; i += 8) + if (*(uint64_t*)(a + i) != *(uint64_t*)(b + i)) + break; +#else + int limit4 = limit & (~3); + for (; i < limit4; i += 4) + if (*(uint32_t*)(a + i) != *(uint32_t*)(b + i)) + break; +#endif + for (; i < limit; i += 1) + if (a[i] != b[i]) + break; + return i; + } + + SIMD_INLINE uint8_t Paeth(int a, int b, int c) + { + int p = a + b - c, pa = abs(p - a), pb = abs(p - b), pc = abs(p - c); + if (pa <= pb && pa <= pc) + return uint8_t(a); + if (pb <= pc) + return uint8_t(b); + return uint8_t(c); + } + } + +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit) + { + limit = Min(limit, 258); + int i = 0; + int limit16 = limit & (~15); + for (; i < limit16; i += 16) + if (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128((__m128i*)(a + i)), _mm_loadu_si128((__m128i*)(b + i)))) != 0xFFFF) + break; +#if defined(SIMD_X64_ENABLE) + int limit8 = limit & (~7); + for (; i < limit8; i += 8) + if (*(uint64_t*)(a + i) != *(uint64_t*)(b + i)) + break; +#else + int limit4 = limit & (~3); + for (; i < limit4; i += 4) + if (*(uint32_t*)(a + i) != *(uint32_t*)(b + i)) + break; +#endif + for (; i < limit; i += 1) + if (a[i] != b[i]) + break; + return i; + } + } +#endif// SIMD_SSE41_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit) + { + limit = Min(limit, 258); + int i = 0; + for (; i < limit; i += 32) + { + __m256i _a = _mm256_loadu_si256((__m256i*)(a + i)); + __m256i _b = 
_mm256_loadu_si256((__m256i*)(b + i)); + uint32_t mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(_a, _b)); + if (mask != 0xFFFFFFFF) + { + i += _tzcnt_u32(~mask); + break; + } + } + return Min(i, limit); + } + } +#endif// SIMD_AVX2_ENABLE + +#ifdef SIMD_AVX512BW_ENABLE + namespace Avx512bw + { + SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit) + { + limit = Min(limit, 258); + int i = 0; + for (; i < limit; i += 64) + { + __m512i _a = _mm512_loadu_si512(a + i); + __m512i _b = _mm512_loadu_si512(b + i); + uint64_t mask = _mm512_cmp_epi8_mask(_a, _b, _MM_CMPINT_NE); + if (mask != 0) + { + i += (int)FirstNotZero64(mask); + break; + } + } + return Min(i, limit); + } + } +#endif// SIMD_AVX512BW_ENABLE + +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + } +#endif// SIMD_NEON_ENABLE +} + +#endif//__SimdImageSavePng_h__ diff --git a/3rdparty/simdlib/Simd/SimdLib.cpp b/3rdparty/simdlib/Simd/SimdLib.cpp index 89718bb80e..c168701413 100755 --- a/3rdparty/simdlib/Simd/SimdLib.cpp +++ b/3rdparty/simdlib/Simd/SimdLib.cpp @@ -61,8 +61,10 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved) #include "Simd/SimdConst.h" #include "Simd/SimdLog.h" -#include "Simd/SimdResizer.h" #include "Simd/SimdGaussianBlur.h" +#include "Simd/SimdImageLoad.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdResizer.h" #include "Simd/SimdBase.h" #include "Simd/SimdSse2.h" @@ -451,6 +453,34 @@ SIMD_API void SimdGrayToBgra(const uint8_t * gray, size_t width, size_t height, Base::GrayToBgra(gray, width, height, grayStride, bgra, bgraStride, alpha); } +SIMD_API uint8_t* SimdImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size) +{ + const static Simd::ImageSaveToMemoryPtr imageSaveToMemory = SIMD_FUNC3(ImageSaveToMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC); + + return imageSaveToMemory(src, stride, width, height, format, file, 
quality, size); +} + +SIMD_API SimdBool SimdImageSaveToFile(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char* path) +{ + const static Simd::ImageSaveToMemoryPtr imageSaveToMemory = SIMD_FUNC3(ImageSaveToMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC); + + return ImageSaveToFile(imageSaveToMemory, src, stride, width, height, format, file, quality, path); +} + +SIMD_API uint8_t* SimdImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format) +{ + const static Simd::ImageLoadFromMemoryPtr imageLoadFromMemory = SIMD_FUNC3(ImageLoadFromMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC); + + return imageLoadFromMemory(data, size, stride, width, height, format); +} + +SIMD_API uint8_t* SimdImageLoadFromFile(const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format) +{ + const static Simd::ImageLoadFromMemoryPtr imageLoadFromMemory = SIMD_FUNC3(ImageLoadFromMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC); + + return ImageLoadFromFile(imageLoadFromMemory, path, stride, width, height, format); +} + SIMD_API void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride) { diff --git a/3rdparty/simdlib/Simd/SimdLib.h b/3rdparty/simdlib/Simd/SimdLib.h index 4838b82261..5441805969 100755 --- a/3rdparty/simdlib/Simd/SimdLib.h +++ b/3rdparty/simdlib/Simd/SimdLib.h @@ -116,6 +116,27 @@ typedef enum SimdCpuInfoNeon, /*!< Availability of NEON (ARM). */ } SimdCpuInfoType; +/*! @ingroup c_types + Describes formats of image file. It is used in functions ::SimdImageSaveToMemory and ::SimdImageSaveToFile. +*/ +typedef enum +{ + /*! An undefined image file format (format auto choice). */ + SimdImageFileUndefined = 0, + /*! 
A PGM (Portable Gray Map) text (P2) image file format. */ + SimdImageFilePgmTxt, + /*! A PGM (Portable Gray Map) binary (P5) image file format. */ + SimdImageFilePgmBin, + /*! A PPM (Portable Pixel Map) text (P3) image file format. */ + SimdImageFilePpmTxt, + /*! A PPM (Portable Pixel Map) binary (P6) image file format. */ + SimdImageFilePpmBin, + /*! A PNG (Portable Network Graphics) image file format. */ + SimdImageFilePng, + /*! A JPEG (Joint Photographic Experts Group) image file format. */ + SimdImageFileJpeg, +} SimdImageFileType; + /*! @ingroup c_types Describes types of binary operation between two images performed by function ::SimdOperationBinary8u. Images must have the same format (unsigned 8-bit integer for every channel). @@ -167,18 +188,6 @@ typedef enum SimdPixelFormatFloat, /*! A single channel 64-bit float point pixel format. */ SimdPixelFormatDouble, - /*! A 8-bit Bayer pixel format (GRBG). */ - SimdPixelFormatBayerGrbg, - /*! A 8-bit Bayer pixel format (GBRG). */ - SimdPixelFormatBayerGbrg, - /*! A 8-bit Bayer pixel format (RGGB). */ - SimdPixelFormatBayerRggb, - /*! A 8-bit Bayer pixel format (BGGR). */ - SimdPixelFormatBayerBggr, - /*! A 24-bit (3 8-bit channels) HSV (Hue, Saturation, Value) pixel format. */ - SimdPixelFormatHsv24, - /*! A 24-bit (3 8-bit channels) HSL (Hue, Saturation, Lightness) pixel format. */ - SimdPixelFormatHsl24, /*! A 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */ SimdPixelFormatRgb24, /*! A 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. */ @@ -753,6 +762,82 @@ extern "C" SIMD_API void SimdGrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha); + /*! 
@ingroup image_io + + \fn uint8_t* SimdImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t * size); + + \short Saves an image to memory in given image file format. + + \param [in] src - a pointer to pixels data of input image. + \param [in] stride - a row size of input image in bytes. + \param [in] width - a width of input image. + \param [in] height - a height of input image. + \param [in] format - a pixel format of input image. + Supported pixel formats: ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32. + \param [in] file - a format of output image file. To choose the format of the output file automatically, set this parameter to ::SimdImageFileUndefined. + \param [in] quality - a parameter of compression quality (if file format supports it). + \param [out] size - a pointer to the size of output image file in bytes. + \return a pointer to memory buffer with output image file. + It has to be deleted after use by function ::SimdFree. On error it returns NULL. + */ + SIMD_API uint8_t* SimdImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t * size); + + /*! @ingroup image_io + + \fn SimdBool SimdImageSaveToFile(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char * path); + + \short Saves an image to file in given image file format. + + \param [in] src - a pointer to pixels data of input image. + \param [in] stride - a row size of input image in bytes. + \param [in] width - a width of input image. + \param [in] height - a height of input image. + \param [in] format - a pixel format of input image. 
+ Supported pixel formats: ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32. + \param [in] file - a format of output image file. To choose the format of the output file automatically, set this parameter to ::SimdImageFileUndefined. + \param [in] quality - a parameter of compression quality (if file format supports it). + \param [in] path - a path to output image file. + \return result of the operation. + */ + SIMD_API SimdBool SimdImageSaveToFile(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char * path); + + /*! @ingroup image_io + + \fn uint8_t* SimdImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format); + + \short Loads an image from memory buffer. + + \param [in] data - a pointer to memory buffer with input image file. + \param [in] size - a size of input image file in bytes. + \param [out] stride - a pointer to row size of output image in bytes. + \param [out] width - a pointer to width of output image. + \param [out] height - a pointer to height of output image. + \param [in, out] format - a pointer to pixel format of output image. + Here you can set desired pixel format (it can be ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32). + Or set ::SimdPixelFormatNone and use pixel format of input image file. + \return a pointer to pixels data of output image. + It has to be deleted after use by function ::SimdFree. On error it returns NULL. + */ + SIMD_API uint8_t* SimdImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format); + + /*! 
@ingroup image_io + + \fn uint8_t* SimdImageLoadFromFile(const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format); + + \short Loads an image from file. + + \param [in] path - a path to input image file. + \param [out] stride - a pointer to row size of output image in bytes. + \param [out] width - a pointer to width of output image. + \param [out] height - a pointer to height of output image. + \param [in, out] format - a pointer to pixel format of output image. + Here you can set desired pixel format (it can be ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32). + Or set ::SimdPixelFormatNone and use pixel format of input image file. + \return a pointer to pixels data of output image. + It has to be deleted after use by function ::SimdFree. On error it returns NULL. + */ + SIMD_API uint8_t* SimdImageLoadFromFile(const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format); + /*! 
@ingroup other_conversion \fn void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride); diff --git a/3rdparty/simdlib/Simd/SimdMath.h b/3rdparty/simdlib/Simd/SimdMath.h index 0f7425f76e..f8c192a189 100755 --- a/3rdparty/simdlib/Simd/SimdMath.h +++ b/3rdparty/simdlib/Simd/SimdMath.h @@ -750,6 +750,11 @@ namespace Simd return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(lo), _mm256_castsi256_ps(hi), imm)); } + template <int imm> SIMD_INLINE __m256i Shuffle64i(__m256i lo, __m256i hi) /* 64-bit-lane shuffle of two integer vectors, done through the double-precision shuffle; imm is the compile-time selector passed to _mm256_shuffle_pd */ + { + return _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(lo), _mm256_castsi256_pd(hi), imm)); + } + template <int imm> SIMD_INLINE __m256 Permute4x64(__m256 a) { return _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(a), imm)); diff --git a/3rdparty/simdlib/Simd/SimdMemory.h b/3rdparty/simdlib/Simd/SimdMemory.h index d7772ffa3c..f0fca8840a 100755 --- a/3rdparty/simdlib/Simd/SimdMemory.h +++ b/3rdparty/simdlib/Simd/SimdMemory.h @@ -35,6 +35,18 @@ namespace Simd { + SIMD_INLINE size_t DivHi(size_t value, size_t divider) /* integer division rounded up */ + { + return (value + divider - 1) / divider; + } + + SIMD_INLINE size_t Pow2Hi(size_t value) /* smallest power of two that is >= value (1 for value <= 1) */ + { + size_t pow2 = 1; + for (; pow2 < value; pow2 *= 2); /* NOTE(review): pow2 wraps to 0 if value exceeds the largest representable power of two - confirm callers never pass such values */ + return pow2; + } + SIMD_INLINE size_t AlignHiAny(size_t size, size_t align) { return (size + align - 1) / align * align; @@ -108,6 +120,13 @@ namespace Simd return ptr; } + template <class T> T* Allocate(uint8_t*& buffer, size_t size, size_t align = SIMD_ALIGN) /* carves size elements of T out of a caller-provided buffer and advances the buffer pointer by the aligned byte count; nothing is allocated or bounds-checked here */ + { + T* ptr = (T*)buffer; + buffer = buffer + AlignHi(size * sizeof(T), align); + return ptr; + } + SIMD_INLINE void Free(void * ptr) { #ifdef SIMD_NO_MANS_LAND diff --git a/3rdparty/simdlib/Simd/SimdMemoryStream.h b/3rdparty/simdlib/Simd/SimdMemoryStream.h new file mode 100644 index 0000000000..9665f33d63 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdMemoryStream.h @@ -0,0 +1,510 @@ +/* +* Simd Library 
(http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdMemoryStream_h__ +#define __SimdMemoryStream_h__ + +#include "Simd/SimdMemory.h" +#include "Simd/SimdPerformance.h" + +namespace Simd +{ + class InputMemoryStream /* Read cursor over a caller-supplied byte buffer (never freed here), with byte-level and LSB-first bit-level readers. */ + { + const uint8_t* _data; + size_t _pos, _size, _bitCount; +#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) + uint64_t _bitBuffer; +#else + uint32_t _bitBuffer; +#endif + + public: + SIMD_INLINE InputMemoryStream(const uint8_t* data = NULL, size_t size = 0) + { + Init(data, size); + } + + SIMD_INLINE void Init(const uint8_t* data, size_t size) + { + _pos = 0; + _data = data; + _size = size; + _bitBuffer = 0; + _bitCount = 0; + } + + SIMD_INLINE bool Seek(size_t pos) + { + if (pos <= _size) + { + _pos = pos; + return true; + } + return false; + } + + SIMD_INLINE size_t Size() const + { + return _size; + } + + SIMD_INLINE const uint8_t* Data() const + { + return _data; + } + + SIMD_INLINE size_t Pos() const + { + return _pos; + } + + SIMD_INLINE const uint8_t* Current() const + { + return _data + _pos; + } + + SIMD_INLINE bool Eof() const + { + return _pos >= _size; + } + + SIMD_INLINE bool CanRead(size_t size) const + { + return size <= _size - _pos; /* same test as _pos + size <= _size, but cannot wrap for a huge size (_pos never exceeds _size) */ + } + + SIMD_INLINE size_t Read(size_t size, void* data) + { + size = Min(_size - _pos, size); + memcpy(data, _data + _pos, size); + _pos += size; + return size; + } + + template <class Value> SIMD_INLINE bool Read(Value & value) + { + return Read(sizeof(Value), &value) == sizeof(Value); + } + + SIMD_INLINE bool Read8u(uint8_t & value) + { + if (_pos < _size) + { + value = _data[_pos++]; + return true; + } + else + return false; + } + + SIMD_INLINE bool Read16u(uint16_t& value) + { + if (_pos + 2 <= _size) + { + memcpy(&value, _data + _pos, sizeof(value)); /* native byte order; memcpy because _data + _pos may be unaligned */ + _pos += 2; + return true; + } + else + return false; + } + + SIMD_INLINE bool Read32u(uint32_t& value) + { + if (_pos + 4 <= _size) + { + memcpy(&value, _data + _pos, sizeof(value)); /* native byte order; memcpy because _data + _pos may be unaligned */ + _pos += 4; + return true; + } + else + return false; + } + + SIMD_INLINE bool ReadBe16u(uint16_t& value) + { + if 
(Read16u(value)) + { +#if !defined(SIMD_BIG_ENDIAN) + value = + (value & 0x00FF) << 8 | + (value & 0xFF00) >> 8; +#endif + return true; + } + else + return false; + } + + SIMD_INLINE bool ReadBe32u(uint32_t& value) + { + if (Read32u(value)) + { +#if !defined(SIMD_BIG_ENDIAN) + value = + (value & 0x000000FF) << 24 | + (value & 0x0000FF00) << 8 | + (value & 0x00FF0000) >> 8 | + (value & 0xFF000000) >> 24; +#endif + return true; + } + else + return false; + } + + template <class Unsigned> SIMD_INLINE bool ReadUnsigned(Unsigned& value) /* skips leading gaps, then parses decimal digits up to the next gap or the end of data; fails on any other character */ + { + if (!SkipGap()) + return false; + value = 0; + while (_pos < _size && !IsGap(_data[_pos])) /* bounds check first: _data[_size] must not be read */ + { + if (_data[_pos] >= '0' && _data[_pos] <= '9') + value = value * 10 + Unsigned(_data[_pos] - '0'); + else + return false; + _pos++; + } + return true; + } + + SIMD_INLINE bool Skip(size_t size) /* strict '<': a skip that would land exactly on the end of data reports failure and leaves the position unchanged */ + { + if (_pos + size < _size) + { + _pos += size; + return true; + } + return false; + } + + SIMD_INLINE bool SkipValue(uint8_t value) + { + while (_pos < _size && _data[_pos] == value) /* bounds check first: _data[_size] must not be read */ + _pos++; + return _pos < _size; + } + + SIMD_INLINE bool SkipNotGap() + { + while (_pos < _size && !IsGap(_data[_pos])) /* bounds check first: _data[_size] must not be read */ + _pos++; + return _pos < _size; + } + + SIMD_INLINE bool SkipGap() + { + while (_pos < _size && IsGap(_data[_pos])) /* bounds check first: _data[_size] must not be read */ + _pos++; + return _pos < _size; + } + + static SIMD_INLINE bool IsGap(uint8_t value) + { + return value == ' ' || value == '\t' || value == '\n' || value == '\r'; + } + +#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) + SIMD_INLINE uint64_t& BitBuffer() + { + return _bitBuffer; + } +#else + SIMD_INLINE uint32_t& BitBuffer() + { + return _bitBuffer; + } +#endif + + SIMD_INLINE size_t& BitCount() + { + return _bitCount; + } + + SIMD_INLINE void FillBits() + { + static const size_t canReadByte = (sizeof(_bitBuffer) - 1) * 8; + while (_bitCount <= canReadByte && _pos < _size) + { + _bitBuffer |= (size_t)_data[_pos++] << _bitCount; + _bitCount += 8; + } + } + + SIMD_INLINE void ClearBits() /* gives the whole unread bytes held in the bit buffer back to the byte cursor */ + { + _pos -= _bitCount / 8; + _bitBuffer = 0; + 
_bitCount = 0; + } + + SIMD_INLINE bool ReadBits(size_t & bits, size_t count) + { + if (_bitCount < count) + FillBits(); + if (_bitCount < count) + return false; + bits = _bitBuffer & ((size_t(1) << count) - 1); + _bitBuffer >>= count; + _bitCount -= count; + return true; + } + + SIMD_INLINE size_t ReadBits(size_t count) /* unchecked variant: does not verify that count bits are actually available */ + { + if (_bitCount < count) + FillBits(); + size_t bits = _bitBuffer & ((size_t(1) << count) - 1); + _bitBuffer >>= count; + _bitCount -= count; + return bits; + } + }; + + //------------------------------------------------------------------------- + + class OutputMemoryStream /* Growable output byte buffer with byte- and bit-level writers; frees its storage on destruction unless Release() handed it to the caller. NOTE(review): no user-declared copy constructor, so copying an instance would free _data twice - confirm no caller copies it. */ + { + const size_t CAPACITY_MIN = 64; + + uint8_t * _data; + size_t _pos, _size, _capacity, _bitCount; +#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) + uint64_t _bitBuffer; +#else + uint32_t _bitBuffer; +#endif + + SIMD_INLINE void Reset(bool owner) + { + if (_data && owner) + Free(_data); + _data = NULL; + _pos = 0; + _size = 0; + _capacity = 0; + _bitBuffer = 0; + _bitCount = 0; + } + + public: + SIMD_INLINE OutputMemoryStream(size_t capacity = 0) + { + Reset(false); + if (capacity) + Reserve(capacity); + } + + SIMD_INLINE ~OutputMemoryStream() + { + Reset(true); + } + + SIMD_INLINE void Seek(size_t pos) + { + Reserve(pos); /* grow before touching _size: Reserve copies _size bytes out of the old buffer, so _size must still fit the old capacity */ + _pos = pos; + _size = Max(_size, _pos); + } + + SIMD_INLINE size_t Pos() const + { + return _pos; + } + + SIMD_INLINE size_t Size() const + { + return _size; + } + + SIMD_INLINE size_t Capacity() const + { + return _capacity; + } + + SIMD_INLINE uint8_t* Data() + { + return _data; + } + + SIMD_INLINE const uint8_t * Data() const + { + return _data; + } + + SIMD_INLINE uint8_t* Current() + { + return _data + _pos; + } + + SIMD_INLINE const uint8_t* Current() const + { + return _data + _pos; + } + + SIMD_INLINE void Write(const void * data, size_t size) + { + Reserve(_pos + size); + memcpy(_data + _pos, data, size); + _pos += size; + _size = Max(_size, _pos); + } + + SIMD_INLINE bool Write(InputMemoryStream & input, size_t size) + { + if 
(input.CanRead(size)) + { + Write(input.Current(), size); + input.Seek(input.Pos() + size); /* Seek, not Skip: Skip refuses to land exactly on the end of input, which would leave the input cursor behind after a copy of the final bytes */ + return true; + } + return false; + } + + SIMD_INLINE bool WriteSelf(ptrdiff_t offset, size_t size) /* appends size bytes copied from an earlier position of this same stream */ + { + if (offset < 0) + return false; + Reserve(_pos + size); + if (offset + size > _pos) + { + for (size_t i = 0; i < size; ++i) /* source and destination overlap: copy byte by byte so freshly written bytes are re-read */ + _data[_pos++] = _data[offset++]; + } + else + { + memcpy(_data + _pos, _data + offset, size); + _pos += size; + } + _size = Max(_size, _pos); + return true; + } + + template <class Value> SIMD_INLINE void Write(const Value& value) + { + Write(&value, sizeof(Value)); + } + + SIMD_INLINE void Write8u(uint8_t value) + { + Reserve(_pos + 1); + _data[_pos++] = value; + _size = Max(_size, _pos); + } + + SIMD_INLINE void Write8u(uint8_t value, size_t count) + { + Reserve(_pos + count); + memset(_data + _pos, value, count); + _pos += count; + _size = Max(_size, _pos); + } + + SIMD_INLINE void WriteBe32u(const uint32_t & value) + { +#if defined(SIMD_BIG_ENDIAN) + Write(value); +#else + Write( + (value & 0x000000FF) << 24 | + (value & 0x0000FF00) << 8 | + (value & 0x00FF0000) >> 8 | + (value & 0xFF000000) >> 24); +#endif + } + + SIMD_INLINE uint8_t* Release(size_t* size = NULL) /* hands the buffer to the caller and resets the stream without freeing it */ + { + uint8_t* data = _data; + if(size) + *size = _size; + Reset(false); + return data; + } + + SIMD_INLINE void Reserve(size_t size) + { + if (size > _capacity) + { + size_t capacity = Max(CAPACITY_MIN, Max(_capacity * 2, size)); + uint8_t* data = (uint8_t*)Allocate(capacity, SIMD_ALIGN); + if (_data) + { + memcpy(data, _data, _size); + Free(_data); + } + _data = data; + _capacity = capacity; + } + } + + SIMD_INLINE void WriteBits(const size_t bits, size_t count) + { + _bitBuffer |= (bits) << _bitCount; + _bitCount += count; + while (_bitCount >= 8) + { + Write8u((uint8_t)_bitBuffer); + _bitBuffer >>= 8; + _bitCount -= 8; + } + } + + SIMD_INLINE void FlushBits() + { + while (_bitCount >= 8) + { + Write8u((uint8_t)_bitBuffer); + _bitBuffer >>= 8; + _bitCount -= 8; + } + if (_bitCount) + { + 
Write8u((uint8_t)_bitBuffer); + _bitBuffer = 0; + _bitCount = 0; + } + } + +#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) + SIMD_INLINE uint64_t & BitBuffer() + { + return _bitBuffer; + } +#else + SIMD_INLINE uint32_t& BitBuffer() + { + return _bitBuffer; + } +#endif + + SIMD_INLINE size_t& BitCount() + { + return _bitCount; + } + }; +} + +#endif//__SimdMemoryStream_h__ diff --git a/3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp b/3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp new file mode 100644 index 0000000000..61c5d90359 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp @@ -0,0 +1,154 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageLoad.h" +#include "Simd/SimdNeon.h" + +#include <memory> /* header name was lost in extraction; <memory> is presumed - confirm against upstream */ + +namespace Simd +{ +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param) + : Base::ImagePgmTxtLoader(param) + { + } + + void ImagePgmTxtLoader::SetConverters() /* keeps the Base converters, then swaps in NEON ones when the image is at least A pixels wide */ + { + Base::ImagePgmTxtLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _toAny = Neon::GrayToBgr; break; + case SimdPixelFormatBgra32: _toBgra = Neon::GrayToBgra; break; + case SimdPixelFormatRgb24: _toAny = Neon::GrayToBgr; break; + case SimdPixelFormatRgba32: _toBgra = Neon::GrayToBgra; break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param) + : Base::ImagePgmBinLoader(param) + { + } + + void ImagePgmBinLoader::SetConverters() /* keeps the Base converters, then swaps in NEON ones when the image is at least A pixels wide */ + { + Base::ImagePgmBinLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _toAny = Neon::GrayToBgr; break; + case SimdPixelFormatBgra32: _toBgra = Neon::GrayToBgra; break; + case SimdPixelFormatRgb24: _toAny = Neon::GrayToBgr; break; + case SimdPixelFormatRgba32: _toBgra = Neon::GrayToBgra; break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param) + : Base::ImagePpmTxtLoader(param) + { + } + + void ImagePpmTxtLoader::SetConverters() /* keeps the Base converters, then swaps in NEON ones when the image is at least A pixels wide */ + { + Base::ImagePpmTxtLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _toAny = Neon::RgbToGray; break; + case SimdPixelFormatBgr24: _toAny = Neon::BgrToRgb; break; + case SimdPixelFormatBgra32: _toBgra = Neon::RgbToBgra; break; + case SimdPixelFormatRgba32: _toBgra = Neon::BgrToBgra; break; + } + } + } + + //--------------------------------------------------------------------- + + 
ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param) + : Base::ImagePpmBinLoader(param) + { + } + + void ImagePpmBinLoader::SetConverters() /* keeps the Base converters, then swaps in NEON ones when the image is at least A pixels wide */ + { + Base::ImagePpmBinLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _toAny = Neon::RgbToGray; break; + case SimdPixelFormatBgr24: _toAny = Neon::BgrToRgb; break; + case SimdPixelFormatBgra32: _toBgra = Neon::RgbToBgra; break; + case SimdPixelFormatRgba32: _toBgra = Neon::BgrToBgra; break; + } + } + } + + //--------------------------------------------------------------------- + + ImageLoader* CreateImageLoader(const ImageLoaderParam& param) /* returns a heap-allocated loader for param.file, or NULL for an unsupported file type; PNG and JPEG use the Base loaders */ + { + switch (param.file) + { + case SimdImageFilePgmTxt: return new ImagePgmTxtLoader(param); + case SimdImageFilePgmBin: return new ImagePgmBinLoader(param); + case SimdImageFilePpmTxt: return new ImagePpmTxtLoader(param); + case SimdImageFilePpmBin: return new ImagePpmBinLoader(param); + case SimdImageFilePng: return new Base::ImagePngLoader(param); + case SimdImageFileJpeg: return new Base::ImageJpegLoader(param); + default: + return NULL; + } + } + + uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format) /* format is dereferenced unconditionally, so it must not be NULL; returns NULL when validation, loader creation or decoding fails */ + { + ImageLoaderParam param(data, size, *format); + if (param.Validate()) + { + Holder<ImageLoader> loader(CreateImageLoader(param)); + if (loader) + { + if (loader->FromStream()) + return loader->Release(stride, width, height, format); + } + } + return NULL; + } + } +#endif// SIMD_NEON_ENABLE +} diff --git a/3rdparty/simdlib/Simd/SimdNeonImageSave.cpp b/3rdparty/simdlib/Simd/SimdNeonImageSave.cpp new file mode 100644 index 0000000000..a0fbbd071a --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdNeonImageSave.cpp @@ -0,0 +1,134 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdNeon.h" + +#include <memory> /* header name was lost in extraction; <memory> is presumed - confirm against upstream */ + +namespace Simd +{ +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param) /* swaps in a NEON to-gray converter when the image is at least A pixels wide */ + : Base::ImagePgmTxtSaver(param) + { + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _convert = Neon::BgrToGray; break; + case SimdPixelFormatBgra32: _convert = Neon::BgraToGray; break; + case SimdPixelFormatRgb24: _convert = Neon::RgbToGray; break; + case SimdPixelFormatRgba32: _convert = Neon::RgbaToGray; break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param) /* swaps in a NEON to-gray converter when the image is at least A pixels wide */ + : Base::ImagePgmBinSaver(param) + { + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _convert = Neon::BgrToGray; break; + case SimdPixelFormatBgra32: _convert = Neon::BgraToGray; break; + case SimdPixelFormatRgb24: _convert = Neon::RgbToGray; break; + case SimdPixelFormatRgba32: _convert = Neon::RgbaToGray; break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param) /* swaps in a NEON converter when the image is at least A pixels wide */ + : Base::ImagePpmTxtSaver(param) + { + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _convert = Neon::GrayToBgr; break; + case SimdPixelFormatBgr24: _convert = Neon::BgrToRgb; break; + case SimdPixelFormatBgra32: _convert = Neon::BgraToRgb; break; + case SimdPixelFormatRgba32: _convert = Neon::BgraToBgr; break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param) /* swaps in a NEON converter when the image is at least A pixels wide */ + : Base::ImagePpmBinSaver(param) + { + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _convert = Neon::GrayToBgr; break; + case SimdPixelFormatBgr24: _convert = Neon::BgrToRgb; break; 
+ case SimdPixelFormatBgra32: _convert = Neon::BgraToRgb; break; + case SimdPixelFormatRgba32: _convert = Neon::BgraToBgr; break; + } + } + } + + //--------------------------------------------------------------------- + + ImageSaver* CreateImageSaver(const ImageSaverParam& param) /* returns a heap-allocated saver for param.file, or NULL for an unsupported file type; JPEG uses the Base saver */ + { + switch (param.file) + { + case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param); + case SimdImageFilePgmBin: return new ImagePgmBinSaver(param); + case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param); + case SimdImageFilePpmBin: return new ImagePpmBinSaver(param); + case SimdImageFilePng: return new ImagePngSaver(param); + case SimdImageFileJpeg: return new Base::ImageJpegSaver(param); + default: + return NULL; + } + } + + uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size) /* returns NULL when validation, saver creation or encoding fails */ + { + ImageSaverParam param(width, height, format, file, quality); + if (param.Validate()) + { + Holder<ImageSaver> saver(CreateImageSaver(param)); + if (saver) + { + if (saver->ToStream(src, stride)) + return saver->Release(size); + } + } + return NULL; + } + } +#endif// SIMD_NEON_ENABLE +} diff --git a/3rdparty/simdlib/Simd/SimdPerformance.h b/3rdparty/simdlib/Simd/SimdPerformance.h new file mode 100644 index 0000000000..e695326a69 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdPerformance.h @@ -0,0 +1,197 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/
+#ifndef __SimdPerformance_h__
+#define __SimdPerformance_h__
+
+#include "Simd/SimdDefs.h"
+
+#include <string>
+#include <sstream>
+
+namespace Simd
+{
+    typedef std::string String;
+
+    template <class T> SIMD_INLINE String ToStr(const T & value) // formats any value that has a stream operator<<
+    {
+        std::stringstream ss;
+        ss << value;
+        return ss.str();
+    }
+}
+
+#if defined(SIMD_PERFORMANCE_STATISTIC) && (defined(NDEBUG) || defined(SIMD_PERF_STAT_IN_DEBUG))
+
+#include "Simd/SimdTime.h"
+
+#include <limits> // NOTE(review): header name lost in extraction, restored by inference - confirm against upstream
+#include <algorithm> // NOTE(review): inferred, confirm against upstream
+#include <string> // NOTE(review): inferred, confirm against upstream
+#include <iomanip> // NOTE(review): inferred, confirm against upstream
+#include <memory> // std::shared_ptr
+#include <map> // std::map
+#include <thread> // std::thread::id, std::this_thread::get_id
+#include <mutex> // std::mutex, std::lock_guard
+
+namespace Simd
+{
+    namespace Base
+    {
+        class PerformanceMeasurer // NOTE(review): member functions are defined outside this header; presumably accumulates per-call timings
+        {
+            String _name;
+            int64_t _start, _current, _total, _min, _max;
+            int64_t _count, _flop;
+            bool _entered, _paused;
+
+        public:
+            PerformanceMeasurer(const String& name = "Unknown", int64_t flop = 0);
+
+            PerformanceMeasurer(const PerformanceMeasurer& pm);
+
+            void Enter();
+
+            void Leave(bool pause = false);
+
+            String Statistic() const;
+
+            void Combine(const PerformanceMeasurer& other);
+
+        private:
+            double Average() const;
+            double GFlops() const;
+        };
+
+        class PerformanceMeasurerHolder // scope guard: optional Enter() on construction, Leave() on destruction
+        {
+            PerformanceMeasurer * _pm; // never deleted here; NULL turns every call into a no-op
+
+        public:
+            SIMD_INLINE PerformanceMeasurerHolder(PerformanceMeasurer * pm, bool enter = true)
+                : _pm(pm)
+            {
+                if (_pm && enter)
+                    _pm->Enter();
+            }
+
+            SIMD_INLINE void Enter()
+            {
+                if (_pm)
+                    _pm->Enter();
+            }
+
+            SIMD_INLINE void Leave(bool pause)
+            {
+                if (_pm)
+                    _pm->Leave(pause);
+            }
+
+            SIMD_INLINE ~PerformanceMeasurerHolder()
+            {
+                if (_pm)
+                    _pm->Leave();
+            }
+        };
+
+        class PerformanceMeasurerStorage // one name -> measurer map per thread, all kept in _map
+        {
+            typedef PerformanceMeasurer Pm;
+            typedef std::shared_ptr<Pm> PmPtr;
+            typedef std::map<String, PmPtr> FunctionMap;
+            typedef std::map<std::thread::id, FunctionMap> ThreadMap;
+
+            ThreadMap _map;
+            mutable std::mutex _mutex;
+            String _report;
+
+            SIMD_INLINE FunctionMap & ThisThread()
+            {
+                static thread_local FunctionMap * thread = NULL; // cached so _mutex is taken only on a thread's first call
+                if (thread == NULL)
+                {
+                    std::lock_guard<std::mutex> lock(_mutex);
+                    thread = &_map[std::this_thread::get_id()];
+                }
+                return *thread;
+            }
+
+        public:
+            static PerformanceMeasurerStorage s_storage;
+
+            PerformanceMeasurerStorage()
+            {
+            }
+
+            SIMD_INLINE PerformanceMeasurer * Get(const String & name, int64_t flop = 0) // returns the calling thread's measurer for name, creating it on first use
+            {
+                FunctionMap & thread = ThisThread();
+                PerformanceMeasurer * pm = NULL;
+                FunctionMap::iterator it = thread.find(name);
+                if (it == thread.end())
+                {
+                    pm = new PerformanceMeasurer(name, flop);
+                    thread[name].reset(pm);
+                }
+                else
+                    pm = it->second.get();
+                return pm;
+            }
+
+            SIMD_INLINE PerformanceMeasurer * Get(const String & func, const String & desc, int64_t flop = 0) // key is "func{ desc }"
+            {
+                return Get(func + "{ " + desc + " }", flop);
+            }
+
+            const char* PerformanceStatistic();
+        };
+    }
+}
+#define SIMD_PERF_FUNCF(flop) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)(Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, (int64_t)(flop)))
+#define SIMD_PERF_FUNC() SIMD_PERF_FUNCF(0)
+#define SIMD_PERF_BEGF(desc, flop) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)(Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc, (int64_t)(flop)))
+#define SIMD_PERF_BEG(desc) SIMD_PERF_BEGF(desc, 0)
+#define SIMD_PERF_IFF(cond, desc, flop) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)((cond) ? Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc, (int64_t)(flop)) : NULL)
+#define SIMD_PERF_IF(cond, desc) SIMD_PERF_IFF(cond, desc, 0)
+#define SIMD_PERF_END(desc) Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc)->Leave();
+#define SIMD_PERF_INITF(name, desc, flop) Simd::Base::PerformanceMeasurerHolder name(Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc, (int64_t)(flop)), false);
+#define SIMD_PERF_INIT(name, desc) SIMD_PERF_INITF(name, desc, 0);
+#define SIMD_PERF_START(name) name.Enter();
+#define SIMD_PERF_PAUSE(name) name.Leave(true);
+#define SIMD_PERF_EXT(ext) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)((ext)->Perf(SIMD_FUNCTION))
+#else//SIMD_PERFORMANCE_STATISTIC
+#define SIMD_PERF_FUNCF(flop)
+#define SIMD_PERF_FUNC()
+#define SIMD_PERF_BEGF(desc, flop)
+#define SIMD_PERF_BEG(desc)
+#define SIMD_PERF_IFF(cond, desc, flop)
+#define SIMD_PERF_IF(cond, desc)
+#define SIMD_PERF_END(desc)
+#define SIMD_PERF_INITF(name, desc, flop)
+#define SIMD_PERF_INIT(name, desc)
+#define SIMD_PERF_START(name)
+#define SIMD_PERF_PAUSE(name)
+#define SIMD_PERF_EXT(ext)
+#endif//SIMD_PERFORMANCE_STATISTIC
+
+#endif//__SimdPerformance_h__
diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp
new file mode 100644
index 0000000000..eca83c63ed
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp
@@ -0,0 +1,159 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdSse2.h"
+#include "Simd/SimdSse41.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
+    {
+        ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param)
+            : Base::ImagePgmTxtLoader(param)
+        {
+        }
+
+        void ImagePgmTxtLoader::SetConverters()
+        {
+            Base::ImagePgmTxtLoader::SetConverters();
+            if (_image.width >= A) // images narrower than A keep the converters chosen by Base
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Sse41::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Sse2::GrayToBgra; break; // NOTE(review): Sse2:: here, Sse41:: for Rgba32 below - confirm both are declared
+                case SimdPixelFormatRgb24: _toAny = Sse41::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Sse41::GrayToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param)
+            : Base::ImagePgmBinLoader(param)
+        {
+        }
+
+        void ImagePgmBinLoader::SetConverters()
+        {
+            Base::ImagePgmBinLoader::SetConverters();
+            if (_image.width >= A) // images narrower than A keep the converters chosen by Base
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Sse41::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Sse2::GrayToBgra; break; // NOTE(review): Sse2:: here, Sse41:: for Rgba32 below - confirm both are declared
+                case SimdPixelFormatRgb24: _toAny = Sse41::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Sse41::GrayToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param)
+            : Base::ImagePpmTxtLoader(param)
+        {
+        }
+
+        void ImagePpmTxtLoader::SetConverters()
+        {
+            Base::ImagePpmTxtLoader::SetConverters();
+            if (_image.width >= A) // images narrower than A keep the converters chosen by Base
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Sse41::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Sse41::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Sse41::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Sse41::BgrToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param)
+            : Base::ImagePpmBinLoader(param)
+        {
+        }
+
+        void ImagePpmBinLoader::SetConverters()
+        {
+            Base::ImagePpmBinLoader::SetConverters();
+            if (_image.width >= A) // images narrower than A keep the converters chosen by Base
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Sse41::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Sse41::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Sse41::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Sse41::BgrToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageLoader* CreateImageLoader(const ImageLoaderParam& param) // caller owns the returned loader; NULL for an unhandled file type
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtLoader(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinLoader(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtLoader(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinLoader(param);
+            case SimdImageFilePng: return new ImagePngLoader(param);
+            case SimdImageFileJpeg: return new Base::ImageJpegLoader(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+        {
+            ImageLoaderParam param(data, size, *format); // format is dereferenced unconditionally: callers must pass a non-null pointer
+            if (param.Validate())
+            {
+                std::unique_ptr<ImageLoader> loader(CreateImageLoader(param)); // loader is destroyed on every return path
+                if (loader)
+                {
+                    if (loader->FromStream())
+                        return loader->Release(stride, width, height, format);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_SSE41_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp
new file mode 100644
index 0000000000..1ec6ca0118
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp
@@ -0,0 +1,1805 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdImageLoad.h" +#include "Simd/SimdArray.h" +#include "Simd/SimdCpu.h" +#include "Simd/SimdBase.h" +#include "Simd/SimdSse2.h" +#include "Simd/SimdSse41.h" + +namespace Simd +{ +#if defined(SIMD_SSE41_ENABLE) + namespace Sse41 + { + typedef unsigned char png_uc; + typedef unsigned short png_us; + + typedef uint16_t png__uint16; + typedef uint32_t png__uint32; + +#define png_inline SIMD_INLINE +#define PNG_ASSERT assert +#define PNG_MALLOC(sz) malloc(sz) +#define PNG_REALLOC(p,newsz) realloc(p,newsz) +#define PNG_FREE(p) free(p) +#define PNG_REALLOC_SIZED(p,oldsz,newsz) PNG_REALLOC(p,newsz) +#define STBIDEF static + +#ifdef _MSC_VER +#define PNG_NOTUSED(v) (void)(v) +#else +#define PNG_NOTUSED(v) (void)sizeof(v) +#endif + +#define PNG__BYTECAST(x) ((png_uc) ((x) & 255)) // truncate int to byte without warnings +#define PNG_MAX_DIMENSIONS (1 << 24) + + static int png__err(const char* str, const char* stub) + { + return 0; + } + +#define png__errpuc(x,y) ((unsigned char *)(size_t) (png__err(x,y)?NULL:NULL)) + + static void* png__malloc(size_t size) + { + return PNG_MALLOC(size); + } + + typedef struct + { + int (*read) (void* user, char* data, int size); // fill 'data' with 'size' bytes. 
return number of bytes actually read + void (*skip) (void* user, int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative + int (*eof) (void* user); // returns nonzero if we are at end of file/data + } png_io_callbacks; + + typedef struct + { + png__uint32 img_x, img_y; + int img_n, img_out_n; + + png_io_callbacks io; + void* io_user_data; + + int read_from_callbacks; + int buflen; + png_uc buffer_start[128]; + int callback_already_read; + + png_uc* img_buffer, * img_buffer_end; + png_uc* img_buffer_original, * img_buffer_original_end; + } png__context; + + typedef struct + { + int bits_per_channel; + int num_channels; + int channel_order; + } png__result_info; + + enum + { + PNG__SCAN_load = 0, + PNG__SCAN_type, + PNG__SCAN_header + }; + + enum + { + PNG_ORDER_RGB, + PNG_ORDER_BGR + }; + + static void png__rewind(png__context* s) + { + // conceptually rewind SHOULD rewind to the beginning of the stream, + // but we just rewind to the beginning of the initial buffer, because + // we only use it after doing 'test', which only ever looks at at most 92 bytes + s->img_buffer = s->img_buffer_original; + s->img_buffer_end = s->img_buffer_original_end; + } + + static void png__refill_buffer(png__context* s) + { + int n = (s->io.read)(s->io_user_data, (char*)s->buffer_start, s->buflen); + s->callback_already_read += (int)(s->img_buffer - s->img_buffer_original); + if (n == 0) { + // at end of file, treat same as if from memory, but need to handle case + // where s->img_buffer isn't pointing to safe memory, e.g. 
0-byte file + s->read_from_callbacks = 0; + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + 1; + *s->img_buffer = 0; + } + else { + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + n; + } + } + + png_inline static png_uc png__get8(png__context* s) + { + if (s->img_buffer < s->img_buffer_end) + return *s->img_buffer++; + if (s->read_from_callbacks) { + png__refill_buffer(s); + return *s->img_buffer++; + } + return 0; + } + + static int png__get16be(png__context* s) + { + int z = png__get8(s); + return (z << 8) + png__get8(s); + } + + static png__uint32 png__get32be(png__context* s) + { + png__uint32 z = png__get16be(s); + return (z << 16) + png__get16be(s); + } + + png_inline static int png__at_eof(png__context* s) + { + if (s->io.read) { + if (!(s->io.eof)(s->io_user_data)) return 0; + // if feof() is true, check if buffer = end + // special case: we've only got the special 0 character at the end + if (s->read_from_callbacks == 0) return 1; + } + + return s->img_buffer >= s->img_buffer_end; + } + + static void png__skip(png__context* s, int n) + { + if (n == 0) return; // already there! 
+ if (n < 0) { + s->img_buffer = s->img_buffer_end; + return; + } + if (s->io.read) { + int blen = (int)(s->img_buffer_end - s->img_buffer); + if (blen < n) { + s->img_buffer = s->img_buffer_end; + (s->io.skip)(s->io_user_data, n - blen); + return; + } + } + s->img_buffer += n; + } + + static int png__getn(png__context* s, png_uc* buffer, int n) + { + if (s->io.read) { + int blen = (int)(s->img_buffer_end - s->img_buffer); + if (blen < n) { + int res, count; + + memcpy(buffer, s->img_buffer, blen); + + count = (s->io.read)(s->io_user_data, (char*)buffer + blen, n - blen); + res = (count == (n - blen)); + s->img_buffer = s->img_buffer_end; + return res; + } + } + + if (s->img_buffer + n <= s->img_buffer_end) { + memcpy(buffer, s->img_buffer, n); + s->img_buffer += n; + return 1; + } + else + return 0; + } + + static int png__addsizes_valid(int a, int b) + { + if (b < 0) return 0; + // now 0 <= b <= INT_MAX, hence also + // 0 <= INT_MAX - b <= INTMAX. + // And "a + b <= INT_MAX" (which might overflow) is the + // same as a <= INT_MAX - b (no overflow) + return a <= INT_MAX - b; + } + + // returns 1 if the product is valid, 0 on overflow. + // negative factors are considered invalid. 
+ static int png__mul2sizes_valid(int a, int b) + { + if (a < 0 || b < 0) return 0; + if (b == 0) return 1; // mul-by-0 is always safe + // portable way to check for no overflows in a*b + return a <= INT_MAX / b; + } + + // returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow + static int png__mad2sizes_valid(int a, int b, int add) + { + return png__mul2sizes_valid(a, b) && png__addsizes_valid(a * b, add); + } + + // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow + static int png__mad3sizes_valid(int a, int b, int c, int add) + { + return png__mul2sizes_valid(a, b) && png__mul2sizes_valid(a * b, c) && + png__addsizes_valid(a * b * c, add); + } + + // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow + static int png__mad4sizes_valid(int a, int b, int c, int d, int add) + { + return png__mul2sizes_valid(a, b) && png__mul2sizes_valid(a * b, c) && + png__mul2sizes_valid(a * b * c, d) && png__addsizes_valid(a * b * c * d, add); + } + + // mallocs with size overflow checking + static void* png__malloc_mad2(int a, int b, int add) + { + if (!png__mad2sizes_valid(a, b, add)) return NULL; + return png__malloc(a * b + add); + } + + static void* png__malloc_mad3(int a, int b, int c, int add) + { + if (!png__mad3sizes_valid(a, b, c, add)) return NULL; + return png__malloc(a * b * c + add); + } + + static void* png__malloc_mad4(int a, int b, int c, int d, int add) + { + if (!png__mad4sizes_valid(a, b, c, d, add)) return NULL; + return png__malloc(a * b * c * d + add); + } + + static png_uc png__compute_y(int r, int g, int b) + { + return (png_uc)(((r * 77) + (g * 150) + (29 * b)) >> 8); + } + + static unsigned char* png__convert_format(unsigned char* data, int img_n, int req_comp, unsigned int x, unsigned int y) + { + int i, j; + unsigned char* good; + + if (req_comp == img_n) return data; + PNG_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (unsigned char*)png__malloc_mad3(req_comp, x, y, 
0); + if (good == NULL) { + PNG_FREE(data); + return png__errpuc("outofmem", "Out of memory"); + } + + for (j = 0; j < (int)y; ++j) { + unsigned char* src = data + j * x * img_n; + unsigned char* dest = good + j * x * req_comp; + +#define PNG__COMBO(a,b) ((a)*8+(b)) +#define PNG__CASE(a,b) case PNG__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (PNG__COMBO(img_n, req_comp)) { + PNG__CASE(1, 2) { dest[0] = src[0]; dest[1] = 255; } break; + PNG__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break; + PNG__CASE(1, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = 255; } break; + PNG__CASE(2, 1) { dest[0] = src[0]; } break; + PNG__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break; + PNG__CASE(2, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = src[1]; } break; + PNG__CASE(3, 4) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; dest[3] = 255; } break; + PNG__CASE(3, 1) { dest[0] = png__compute_y(src[0], src[1], src[2]); } break; + PNG__CASE(3, 2) { dest[0] = png__compute_y(src[0], src[1], src[2]); dest[1] = 255; } break; + PNG__CASE(4, 1) { dest[0] = png__compute_y(src[0], src[1], src[2]); } break; + PNG__CASE(4, 2) { dest[0] = png__compute_y(src[0], src[1], src[2]); dest[1] = src[3]; } break; + PNG__CASE(4, 3) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; } break; + default: PNG_ASSERT(0); PNG_FREE(data); PNG_FREE(good); return png__errpuc("unsupported", "Unsupported format conversion"); + } +#undef PNG__CASE + } + + PNG_FREE(data); + return good; + } + + static png__uint16 png__compute_y_16(int r, int g, int b) + { + return (png__uint16)(((r * 77) + (g * 150) + (29 * b)) >> 8); + } + + static png__uint16* png__convert_format16(png__uint16* data, int img_n, int req_comp, unsigned int x, unsigned int y) + { + int i, j; + png__uint16* good; + + if (req_comp == 
img_n) return data; + PNG_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (png__uint16*)png__malloc(req_comp * x * y * 2); + if (good == NULL) { + PNG_FREE(data); + return (png__uint16*)png__errpuc("outofmem", "Out of memory"); + } + + for (j = 0; j < (int)y; ++j) { + png__uint16* src = data + j * x * img_n; + png__uint16* dest = good + j * x * req_comp; + +#define PNG__COMBO(a,b) ((a)*8+(b)) +#define PNG__CASE(a,b) case PNG__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (PNG__COMBO(img_n, req_comp)) { + PNG__CASE(1, 2) { dest[0] = src[0]; dest[1] = 0xffff; } break; + PNG__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break; + PNG__CASE(1, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = 0xffff; } break; + PNG__CASE(2, 1) { dest[0] = src[0]; } break; + PNG__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break; + PNG__CASE(2, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = src[1]; } break; + PNG__CASE(3, 4) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; dest[3] = 0xffff; } break; + PNG__CASE(3, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break; + PNG__CASE(3, 2) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); dest[1] = 0xffff; } break; + PNG__CASE(4, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break; + PNG__CASE(4, 2) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); dest[1] = src[3]; } break; + PNG__CASE(4, 3) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; } break; + default: PNG_ASSERT(0); PNG_FREE(data); PNG_FREE(good); return (png__uint16*)png__errpuc("unsupported", "Unsupported format conversion"); + } +#undef PNG__CASE + } + + PNG_FREE(data); + return good; + } + + // fast-way is faster to check than jpeg huffman, but slow way is slower +#define PNG__ZFAST_BITS 9 // accelerate all cases in default 
tables +#define PNG__ZFAST_MASK ((1 << PNG__ZFAST_BITS) - 1) + +// zlib-style huffman encoding +// (jpegs packs from left, zlib from right, so can't share code) + typedef struct + { + png__uint16 fast[1 << PNG__ZFAST_BITS]; + png__uint16 firstcode[16]; + int maxcode[17]; + png__uint16 firstsymbol[16]; + png_uc size[288]; + png__uint16 value[288]; + } png__zhuffman; + + png_inline static int png__bitreverse16(int n) + { + n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1); + n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2); + n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4); + n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8); + return n; + } + + png_inline static int png__bit_reverse(int v, int bits) + { + PNG_ASSERT(bits <= 16); + // to bit reverse n bits, reverse 16 and shift + // e.g. 11 bits, bit reverse and shift away 5 + return png__bitreverse16(v) >> (16 - bits); + } + + static int png__zbuild_huffman(png__zhuffman* z, const png_uc* sizelist, int num) + { + int i, k = 0; + int code, next_code[16], sizes[17]; + + // DEFLATE spec for generating codes + memset(sizes, 0, sizeof(sizes)); + memset(z->fast, 0, sizeof(z->fast)); + for (i = 0; i < num; ++i) + ++sizes[sizelist[i]]; + sizes[0] = 0; + for (i = 1; i < 16; ++i) + if (sizes[i] > (1 << i)) + return png__err("bad sizes", "Corrupt PNG"); + code = 0; + for (i = 1; i < 16; ++i) { + next_code[i] = code; + z->firstcode[i] = (png__uint16)code; + z->firstsymbol[i] = (png__uint16)k; + code = (code + sizes[i]); + if (sizes[i]) + if (code - 1 >= (1 << i)) return png__err("bad codelengths", "Corrupt PNG"); + z->maxcode[i] = code << (16 - i); // preshift for inner loop + code <<= 1; + k += sizes[i]; + } + z->maxcode[16] = 0x10000; // sentinel + for (i = 0; i < num; ++i) { + int s = sizelist[i]; + if (s) { + int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; + png__uint16 fastv = (png__uint16)((s << 9) | i); + z->size[c] = (png_uc)s; + z->value[c] = (png__uint16)i; + if (s <= PNG__ZFAST_BITS) { + int j = 
png__bit_reverse(next_code[s], s); + while (j < (1 << PNG__ZFAST_BITS)) { + z->fast[j] = fastv; + j += (1 << s); + } + } + ++next_code[s]; + } + } + return 1; + } + + // zlib-from-memory implementation for PNG reading + // because PNG allows splitting the zlib stream arbitrarily, + // and it's annoying structurally to have PNG call ZLIB call PNG, + // we require PNG read all the IDATs and combine them into a single + // memory buffer + + typedef struct + { + png_uc* zbuffer, * zbuffer_end; + int num_bits; + png__uint32 code_buffer; + + char* zout; + char* zout_start; + char* zout_end; + int z_expandable; + + png__zhuffman z_length, z_distance; + } png__zbuf; + + png_inline static int png__zeof(png__zbuf* z) + { + return (z->zbuffer >= z->zbuffer_end); + } + + png_inline static png_uc png__zget8(png__zbuf* z) + { + return png__zeof(z) ? 0 : *z->zbuffer++; + } + + static void png__fill_bits(png__zbuf* z) + { + do { + if (z->code_buffer >= (1U << z->num_bits)) { + z->zbuffer = z->zbuffer_end; /* treat this as EOF so we fail. */ + return; + } + z->code_buffer |= (unsigned int)png__zget8(z) << z->num_bits; + z->num_bits += 8; + } while (z->num_bits <= 24); + } + + png_inline static unsigned int png__zreceive(png__zbuf* z, int n) + { + unsigned int k; + if (z->num_bits < n) png__fill_bits(z); + k = z->code_buffer & ((1 << n) - 1); + z->code_buffer >>= n; + z->num_bits -= n; + return k; + } + + static int png__zhuffman_decode_slowpath(png__zbuf* a, png__zhuffman* z) + { + int b, s, k; + // not resolved by fast table, so compute it the slow way + // use jpeg approach, which requires MSbits at top + k = png__bit_reverse(a->code_buffer, 16); + for (s = PNG__ZFAST_BITS + 1; ; ++s) + if (k < z->maxcode[s]) + break; + if (s >= 16) return -1; // invalid code! + // code size is s, so: + b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s]; + if (b >= sizeof(z->size)) return -1; // some data was corrupt somewhere! 
+            if (z->size[b] != s) return -1; // was originally an assert, but report failure instead.
+            a->code_buffer >>= s;
+            a->num_bits -= s;
+            return z->value[b];
+        }
+
+        png_inline static int png__zhuffman_decode(png__zbuf* a, png__zhuffman* z) // returns the decoded symbol, or -1 on error
+        {
+            int b, s;
+            if (a->num_bits < 16) {
+                if (png__zeof(a)) {
+                    return -1; /* report error for unexpected end of data. */
+                }
+                png__fill_bits(a);
+            }
+            b = z->fast[a->code_buffer & PNG__ZFAST_MASK];
+            if (b) {
+                s = b >> 9; // fast-table entry packs (code size << 9) | symbol
+                a->code_buffer >>= s;
+                a->num_bits -= s;
+                return b & 511;
+            }
+            return png__zhuffman_decode_slowpath(a, z);
+        }
+
+        static int png__zexpand(png__zbuf* z, char* zout, int n) // need to make room for n bytes
+        {
+            char* q;
+            unsigned int cur, limit, old_limit;
+            z->zout = zout;
+            if (!z->z_expandable) return png__err("output buffer limit", "Corrupt PNG");
+            cur = (unsigned int)(z->zout - z->zout_start);
+            limit = old_limit = (unsigned)(z->zout_end - z->zout_start);
+            if (UINT_MAX - cur < (unsigned)n) return png__err("outofmem", "Out of memory");
+            while (cur + n > limit) { // double the capacity until cur + n fits
+                if (limit > UINT_MAX / 2) return png__err("outofmem", "Out of memory");
+                limit *= 2;
+            }
+            q = (char*)PNG_REALLOC_SIZED(z->zout_start, old_limit, limit);
+            PNG_NOTUSED(old_limit);
+            if (q == NULL) return png__err("outofmem", "Out of memory");
+            z->zout_start = q;
+            z->zout = q + cur;
+            z->zout_end = q + limit;
+            return 1;
+        }
+
+        static const int png__zlength_base[31] = {
+           3,4,5,6,7,8,9,10,11,13,
+           15,17,19,23,27,31,35,43,51,59,
+           67,83,99,115,131,163,195,227,258,0,0 };
+
+        static const int png__zlength_extra[31] =
+        { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
+
+        static const int png__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
+        257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0 };
+
+        static const int png__zdist_extra[32] =
+        { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
+
+        static int png__parse_huffman_block(png__zbuf* a) // inflates one compressed block into a->zout; returns 1 at end-of-block (symbol 256), 0 on error
+        {
+            char* zout = a->zout;
+            for (;;) {
+                int z = png__zhuffman_decode(a, &a->z_length);
+                if (z < 256) {
+                    if (z < 0) return png__err("bad huffman code", "Corrupt PNG"); // error in huffman codes
+                    if (zout >= a->zout_end) {
+                        if (!png__zexpand(a, zout, 1)) return 0;
+                        zout = a->zout;
+                    }
+                    *zout++ = (char)z;
+                }
+                else {
+                    png_uc* p;
+                    int len, dist;
+                    if (z == 256) {
+                        a->zout = zout;
+                        return 1;
+                    }
+                    z -= 257; // symbols 286/287 land on the zero entries of the length tables and yield len == 0
+                    len = png__zlength_base[z];
+                    if (png__zlength_extra[z]) len += png__zreceive(a, png__zlength_extra[z]);
+                    z = png__zhuffman_decode(a, &a->z_distance);
+                    if (z < 0 || z >= 30) return png__err("bad huffman code", "Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear; their base is 0, so the copy below would read unwritten output
+                    dist = png__zdist_base[z];
+                    if (png__zdist_extra[z]) dist += png__zreceive(a, png__zdist_extra[z]);
+                    if (zout - a->zout_start < dist) return png__err("bad dist", "Corrupt PNG");
+                    if (zout + len > a->zout_end) {
+                        if (!png__zexpand(a, zout, len)) return 0;
+                        zout = a->zout;
+                    }
+                    p = (png_uc*)(zout - dist);
+                    if (dist == 1) { // run of one byte; common in images.
+                        png_uc v = *p;
+                        if (len) { do *zout++ = v; while (--len); }
+                    }
+                    else {
+                        if (len) { do *zout++ = *p++; while (--len); }
+                    }
+                }
+            }
+        }
+
+        static int png__compute_huffman_codes(png__zbuf* a)
+        {
+            static const png_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
+            png__zhuffman z_codelength;
+            png_uc lencodes[286 + 32 + 137];//padding for maximum single op
+            png_uc codelength_sizes[19];
+            int i, n;
+
+            int hlit = png__zreceive(a, 5) + 257;
+            int hdist = png__zreceive(a, 5) + 1;
+            int hclen = png__zreceive(a, 4) + 4;
+            int ntot = hlit + hdist;
+
+            memset(codelength_sizes, 0, sizeof(codelength_sizes));
+            for (i = 0; i < hclen; ++i) {
+                int s = png__zreceive(a, 3);
+                codelength_sizes[length_dezigzag[i]] = (png_uc)s;
+            }
+            if (!png__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
+
+            n = 0;
+            while (n < ntot) {
+                int c = png__zhuffman_decode(a, &z_codelength);
+                if (c < 0 || c >= 19) return png__err("bad codelengths", "Corrupt PNG");
+                if (c < 16)
+ lencodes[n++] = (png_uc)c; + else { + png_uc fill = 0; + if (c == 16) { + c = png__zreceive(a, 2) + 3; + if (n == 0) return png__err("bad codelengths", "Corrupt PNG"); + fill = lencodes[n - 1]; + } + else if (c == 17) { + c = png__zreceive(a, 3) + 3; + } + else if (c == 18) { + c = png__zreceive(a, 7) + 11; + } + else { + return png__err("bad codelengths", "Corrupt PNG"); + } + if (ntot - n < c) return png__err("bad codelengths", "Corrupt PNG"); + memset(lencodes + n, fill, c); + n += c; + } + } + if (n != ntot) return png__err("bad codelengths", "Corrupt PNG"); + if (!png__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0; + if (!png__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist)) return 0; + return 1; + } + + static int png__parse_uncompressed_block(png__zbuf* a) + { + png_uc header[4]; + int len, nlen, k; + if (a->num_bits & 7) + png__zreceive(a, a->num_bits & 7); // discard + // drain the bit-packed data into header + k = 0; + while (a->num_bits > 0) { + header[k++] = (png_uc)(a->code_buffer & 255); // suppress MSVC run-time check + a->code_buffer >>= 8; + a->num_bits -= 8; + } + if (a->num_bits < 0) return png__err("zlib corrupt", "Corrupt PNG"); + // now fill header the normal way + while (k < 4) + header[k++] = png__zget8(a); + len = header[1] * 256 + header[0]; + nlen = header[3] * 256 + header[2]; + if (nlen != (len ^ 0xffff)) return png__err("zlib corrupt", "Corrupt PNG"); + if (a->zbuffer + len > a->zbuffer_end) return png__err("read past buffer", "Corrupt PNG"); + if (a->zout + len > a->zout_end) + if (!png__zexpand(a, a->zout, len)) return 0; + memcpy(a->zout, a->zbuffer, len); + a->zbuffer += len; + a->zout += len; + return 1; + } + + static int png__parse_zlib_header(png__zbuf* a) + { + int cmf = png__zget8(a); + int cm = cmf & 15; + /* int cinfo = cmf >> 4; */ + int flg = png__zget8(a); + if (png__zeof(a)) return png__err("bad zlib header", "Corrupt PNG"); // zlib spec + if ((cmf * 256 + flg) % 31 != 0) return png__err("bad zlib 
header", "Corrupt PNG"); // zlib spec + if (flg & 32) return png__err("no preset dict", "Corrupt PNG"); // preset dictionary not allowed in png + if (cm != 8) return png__err("bad compression", "Corrupt PNG"); // DEFLATE required for png + // window = 1 << (8 + cinfo)... but who cares, we fully buffer output + return 1; + } + + static const png_uc png__zdefault_length[288] = + { + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8 + }; + static const png_uc png__zdefault_distance[32] = + { + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 + }; + /* + Init algorithm: + { + int i; // use <= to match clearly with spec + for (i=0; i <= 143; ++i) png__zdefault_length[i] = 8; + for ( ; i <= 255; ++i) png__zdefault_length[i] = 9; + for ( ; i <= 279; ++i) png__zdefault_length[i] = 7; + for ( ; i <= 287; ++i) png__zdefault_length[i] = 8; + + for (i=0; i <= 31; ++i) png__zdefault_distance[i] = 5; + } + */ + + static int png__parse_zlib(png__zbuf* a, int parse_header) + { + int final, type; + if (parse_header) + if (!png__parse_zlib_header(a)) return 0; + a->num_bits = 0; + a->code_buffer = 0; + do { + final = png__zreceive(a, 1); + type = png__zreceive(a, 2); + if (type == 0) { + if (!png__parse_uncompressed_block(a)) return 0; + } + else if (type == 3) { + return 0; + } + else { + if (type == 1) { + // use fixed code lengths + if (!png__zbuild_huffman(&a->z_length, png__zdefault_length, 288)) return 0; + if 
(!png__zbuild_huffman(&a->z_distance, png__zdefault_distance, 32)) return 0; + } + else { + if (!png__compute_huffman_codes(a)) return 0; + } + if (!png__parse_huffman_block(a)) return 0; + } + } while (!final); + return 1; + } + + static int png__do_zlib(png__zbuf* a, char* obuf, int olen, int exp, int parse_header) + { + a->zout_start = obuf; + a->zout = obuf; + a->zout_end = obuf + olen; + a->z_expandable = exp; + + return png__parse_zlib(a, parse_header); + } + + STBIDEF char* png_zlib_decode_malloc_guesssize(const char* buffer, int len, int initial_size, int* outlen) + { + png__zbuf a; + char* p = (char*)png__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (png_uc*)buffer; + a.zbuffer_end = (png_uc*)buffer + len; + if (png__do_zlib(&a, p, initial_size, 1, 1)) { + if (outlen) *outlen = (int)(a.zout - a.zout_start); + return a.zout_start; + } + else { + PNG_FREE(a.zout_start); + return NULL; + } + } + + STBIDEF char* png_zlib_decode_malloc(char const* buffer, int len, int* outlen) + { + return png_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen); + } + + STBIDEF char* png_zlib_decode_malloc_guesssize_headerflag(const char* buffer, int len, int initial_size, int* outlen, int parse_header) + { + png__zbuf a; + char* p = (char*)png__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (png_uc*)buffer; + a.zbuffer_end = (png_uc*)buffer + len; + if (png__do_zlib(&a, p, initial_size, 1, parse_header)) { + if (outlen) *outlen = (int)(a.zout - a.zout_start); + return a.zout_start; + } + else { + PNG_FREE(a.zout_start); + return NULL; + } + } + + STBIDEF int png_zlib_decode_buffer(char* obuffer, int olen, char const* ibuffer, int ilen) + { + png__zbuf a; + a.zbuffer = (png_uc*)ibuffer; + a.zbuffer_end = (png_uc*)ibuffer + ilen; + if (png__do_zlib(&a, obuffer, olen, 0, 1)) + return (int)(a.zout - a.zout_start); + else + return -1; + } + + STBIDEF char* png_zlib_decode_noheader_malloc(char const* buffer, int len, int* outlen) + { + 
png__zbuf a; + char* p = (char*)png__malloc(16384); + if (p == NULL) return NULL; + a.zbuffer = (png_uc*)buffer; + a.zbuffer_end = (png_uc*)buffer + len; + if (png__do_zlib(&a, p, 16384, 1, 0)) { + if (outlen) *outlen = (int)(a.zout - a.zout_start); + return a.zout_start; + } + else { + PNG_FREE(a.zout_start); + return NULL; + } + } + + STBIDEF int png_zlib_decode_noheader_buffer(char* obuffer, int olen, const char* ibuffer, int ilen) + { + png__zbuf a; + a.zbuffer = (png_uc*)ibuffer; + a.zbuffer_end = (png_uc*)ibuffer + ilen; + if (png__do_zlib(&a, obuffer, olen, 0, 0)) + return (int)(a.zout - a.zout_start); + else + return -1; + } + + + // public domain "baseline" PNG decoder v0.10 Sean Barrett 2006-11-18 + // simple implementation + // - only 8-bit samples + // - no CRC checking + // - allocates lots of intermediate memory + // - avoids problem of streaming data between subsystems + // - avoids explicit window management + // performance + // - uses stb_zlib, a PD zlib implementation with fast huffman decoding + + typedef struct + { + png__uint32 length; + png__uint32 type; + } png__pngchunk; + + static png__pngchunk png__get_chunk_header(png__context* s) + { + png__pngchunk c; + c.length = png__get32be(s); + c.type = png__get32be(s); + return c; + } + + static int png__check_png_header(png__context* s) + { + static const png_uc png_sig[8] = { 137,80,78,71,13,10,26,10 }; + int i; + for (i = 0; i < 8; ++i) + if (png__get8(s) != png_sig[i]) return png__err("bad png sig", "Not a PNG"); + return 1; + } + + typedef struct + { + png__context* s; + png_uc* idata, * expanded, * out; + int depth; + } png__png; + + + enum { + PNG__F_none = 0, + PNG__F_sub = 1, + PNG__F_up = 2, + PNG__F_avg = 3, + PNG__F_paeth = 4, + // synthetic filters used for first scanline to avoid needing a dummy row of 0s + PNG__F_avg_first, + PNG__F_paeth_first + }; + + static png_uc first_row_filter[5] = + { + PNG__F_none, + PNG__F_sub, + PNG__F_none, + PNG__F_avg_first, + PNG__F_paeth_first + }; 
+ + static int png__paeth(int a, int b, int c) + { + int p = a + b - c; + int pa = abs(p - a); + int pb = abs(p - b); + int pc = abs(p - c); + if (pa <= pb && pa <= pc) return a; + if (pb <= pc) return b; + return c; + } + + static const png_uc png__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; + + // create the png data from post-deflated data + static int png__create_png_image_raw(png__png* a, png_uc* raw, png__uint32 raw_len, int out_n, png__uint32 x, png__uint32 y, int depth, int color) + { + int bytes = (depth == 16 ? 2 : 1); + png__context* s = a->s; + png__uint32 i, j, stride = x * out_n * bytes; + png__uint32 img_len, img_width_bytes; + int k; + int img_n = s->img_n; // copy it into a local for later + + int output_bytes = out_n * bytes; + int filter_bytes = img_n * bytes; + int width = x; + + PNG_ASSERT(out_n == s->img_n || out_n == s->img_n + 1); + a->out = (png_uc*)png__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into + if (!a->out) return png__err("outofmem", "Out of memory"); + + if (!png__mad3sizes_valid(img_n, x, depth, 7)) return png__err("too large", "Corrupt PNG"); + img_width_bytes = (((img_n * x * depth) + 7) >> 3); + img_len = (img_width_bytes + 1) * y; + + // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, + // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros), + // so just check for raw_len < img_len always. 
+ if (raw_len < img_len) return png__err("not enough pixels", "Corrupt PNG"); + + for (j = 0; j < y; ++j) { + png_uc* cur = a->out + stride * j; + png_uc* prior; + int filter = *raw++; + + if (filter > 4) + return png__err("invalid filter", "Corrupt PNG"); + + if (depth < 8) { + if (img_width_bytes > x) return png__err("invalid width", "Corrupt PNG"); + cur += x * out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place + filter_bytes = 1; + width = img_width_bytes; + } + prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above + + // if first row, use special filter that doesn't sample previous row + if (j == 0) filter = first_row_filter[filter]; + + // handle first byte explicitly + for (k = 0; k < filter_bytes; ++k) { + switch (filter) { + case PNG__F_none: cur[k] = raw[k]; break; + case PNG__F_sub: cur[k] = raw[k]; break; + case PNG__F_up: cur[k] = PNG__BYTECAST(raw[k] + prior[k]); break; + case PNG__F_avg: cur[k] = PNG__BYTECAST(raw[k] + (prior[k] >> 1)); break; + case PNG__F_paeth: cur[k] = PNG__BYTECAST(raw[k] + png__paeth(0, prior[k], 0)); break; + case PNG__F_avg_first: cur[k] = raw[k]; break; + case PNG__F_paeth_first: cur[k] = raw[k]; break; + } + } + + if (depth == 8) { + if (img_n != out_n) + cur[img_n] = 255; // first pixel + raw += img_n; + cur += out_n; + prior += out_n; + } + else if (depth == 16) { + if (img_n != out_n) { + cur[filter_bytes] = 255; // first pixel top byte + cur[filter_bytes + 1] = 255; // first pixel bottom byte + } + raw += filter_bytes; + cur += output_bytes; + prior += output_bytes; + } + else { + raw += 1; + cur += 1; + prior += 1; + } + + // this is a little gross, so that we don't switch per-pixel or per-component + if (depth < 8 || img_n == out_n) { + int nk = (width - 1) * filter_bytes; +#define PNG__CASE(f) \ + case f: \ + for (k=0; k < nk; ++k) + switch (filter) { + // "none" filter turns into a memcpy here; make that explicit. 
+ case PNG__F_none: memcpy(cur, raw, nk); break; + PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - filter_bytes]); } break; + PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break; + PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1)); } break; + PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes])); } break; + PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); } break; + PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], 0, 0)); } break; + } +#undef PNG__CASE + raw += nk; + } + else { + PNG_ASSERT(img_n + 1 == out_n); +#define PNG__CASE(f) \ + case f: \ + for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \ + for (k=0; k < filter_bytes; ++k) + switch (filter) { + PNG__CASE(PNG__F_none) { cur[k] = raw[k]; } break; + PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - output_bytes]); } break; + PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break; + PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - output_bytes]) >> 1)); } break; + PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes])); } break; + PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - output_bytes] >> 1)); } break; + PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], 0, 0)); } break; + } +#undef PNG__CASE + + // the loop above sets the high byte of the pixels' alpha, but for + // 16 bit png files we also need the low byte set. we'll do that here. 
+ if (depth == 16) { + cur = a->out + stride * j; // start at the beginning of the row again + for (i = 0; i < x; ++i, cur += output_bytes) { + cur[filter_bytes + 1] = 255; + } + } + } + } + + // we make a separate pass to expand bits to pixels; for performance, + // this could run two scanlines behind the above code, so it won't + // intefere with filtering but will still be in the cache. + if (depth < 8) { + for (j = 0; j < y; ++j) { + png_uc* cur = a->out + stride * j; + png_uc* in = a->out + stride * j + x * out_n - img_width_bytes; + // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit + // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop + png_uc scale = (color == 0) ? png__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range + + // note that the final byte might overshoot and write more data than desired. + // we can allocate enough data that this never writes out of memory, but it + // could also overwrite the next scanline. can it overwrite non-empty data + // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel. 
+ // so we need to explicitly clamp the final ones + + if (depth == 4) { + for (k = x * img_n; k >= 2; k -= 2, ++in) { + *cur++ = scale * ((*in >> 4)); + *cur++ = scale * ((*in) & 0x0f); + } + if (k > 0) *cur++ = scale * ((*in >> 4)); + } + else if (depth == 2) { + for (k = x * img_n; k >= 4; k -= 4, ++in) { + *cur++ = scale * ((*in >> 6)); + *cur++ = scale * ((*in >> 4) & 0x03); + *cur++ = scale * ((*in >> 2) & 0x03); + *cur++ = scale * ((*in) & 0x03); + } + if (k > 0) *cur++ = scale * ((*in >> 6)); + if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03); + if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03); + } + else if (depth == 1) { + for (k = x * img_n; k >= 8; k -= 8, ++in) { + *cur++ = scale * ((*in >> 7)); + *cur++ = scale * ((*in >> 6) & 0x01); + *cur++ = scale * ((*in >> 5) & 0x01); + *cur++ = scale * ((*in >> 4) & 0x01); + *cur++ = scale * ((*in >> 3) & 0x01); + *cur++ = scale * ((*in >> 2) & 0x01); + *cur++ = scale * ((*in >> 1) & 0x01); + *cur++ = scale * ((*in) & 0x01); + } + if (k > 0) *cur++ = scale * ((*in >> 7)); + if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01); + if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01); + if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01); + if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01); + if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01); + if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01); + } + if (img_n != out_n) { + int q; + // insert alpha = 255 + cur = a->out + stride * j; + if (img_n == 1) { + for (q = x - 1; q >= 0; --q) { + cur[q * 2 + 1] = 255; + cur[q * 2 + 0] = cur[q]; + } + } + else { + PNG_ASSERT(img_n == 3); + for (q = x - 1; q >= 0; --q) { + cur[q * 4 + 3] = 255; + cur[q * 4 + 2] = cur[q * 3 + 2]; + cur[q * 4 + 1] = cur[q * 3 + 1]; + cur[q * 4 + 0] = cur[q * 3 + 0]; + } + } + } + } + } + else if (depth == 16) { + // force the image data from big-endian to platform-native. 
+ // this is done in a separate pass due to the decoding relying + // on the data being untouched, but could probably be done + // per-line during decode if care is taken. + png_uc* cur = a->out; + png__uint16* cur16 = (png__uint16*)cur; + + for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) { + *cur16 = (cur[0] << 8) | cur[1]; + } + } + + return 1; + } + + static int png__create_png_image(png__png* a, png_uc* image_data, png__uint32 image_data_len, int out_n, int depth, int color, int interlaced) + { + int bytes = (depth == 16 ? 2 : 1); + int out_bytes = out_n * bytes; + png_uc* final; + int p; + if (!interlaced) + return png__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); + + // de-interlacing + final = (png_uc*)png__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + for (p = 0; p < 7; ++p) { + int xorig[] = { 0,4,0,2,0,1,0 }; + int yorig[] = { 0,0,4,0,2,0,1 }; + int xspc[] = { 8,8,4,4,2,2,1 }; + int yspc[] = { 8,8,8,4,4,2,2 }; + int i, j, x, y; + // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 + x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p]; + y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p]; + if (x && y) { + png__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; + if (!png__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) { + PNG_FREE(final); + return 0; + } + for (j = 0; j < y; ++j) { + for (i = 0; i < x; ++i) { + int out_y = j * yspc[p] + yorig[p]; + int out_x = i * xspc[p] + xorig[p]; + memcpy(final + out_y * a->s->img_x * out_bytes + out_x * out_bytes, + a->out + (j * x + i) * out_bytes, out_bytes); + } + } + PNG_FREE(a->out); + image_data += img_len; + image_data_len -= img_len; + } + } + a->out = final; + + return 1; + } + + static int png__compute_transparency(png__png* z, png_uc tc[3], int out_n) + { + png__context* s = z->s; + png__uint32 i, pixel_count = s->img_x * s->img_y; + png_uc* p = z->out; + + // compute color-based 
transparency, assuming we've + // already got 255 as the alpha value in the output + PNG_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i = 0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 0 : 255); + p += 2; + } + } + else { + for (i = 0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; + } + + static int png__compute_transparency16(png__png* z, png__uint16 tc[3], int out_n) + { + png__context* s = z->s; + png__uint32 i, pixel_count = s->img_x * s->img_y; + png__uint16* p = (png__uint16*)z->out; + + // compute color-based transparency, assuming we've + // already got 65535 as the alpha value in the output + PNG_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i = 0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 0 : 65535); + p += 2; + } + } + else { + for (i = 0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; + } + + static int png__expand_png_palette(png__png* a, png_uc* palette, int len, int pal_img_n) + { + png__uint32 i, pixel_count = a->s->img_x * a->s->img_y; + png_uc* p, * temp_out, * orig = a->out; + + p = (png_uc*)png__malloc_mad2(pixel_count, pal_img_n, 0); + if (p == NULL) return png__err("outofmem", "Out of memory"); + + // between here and free(out) below, exitting would leak + temp_out = p; + + if (pal_img_n == 3) { + for (i = 0; i < pixel_count; ++i) { + int n = orig[i] * 4; + p[0] = palette[n]; + p[1] = palette[n + 1]; + p[2] = palette[n + 2]; + p += 3; + } + } + else { + for (i = 0; i < pixel_count; ++i) { + int n = orig[i] * 4; + p[0] = palette[n]; + p[1] = palette[n + 1]; + p[2] = palette[n + 2]; + p[3] = palette[n + 3]; + p += 4; + } + } + PNG_FREE(a->out); + a->out = temp_out; + + PNG_NOTUSED(len); + + return 1; + } + + static int png__unpremultiply_on_load = 0; + static int png__de_iphone_flag = 0; + + STBIDEF void png_set_unpremultiply_on_load(int 
flag_true_if_should_unpremultiply) + { + png__unpremultiply_on_load = flag_true_if_should_unpremultiply; + } + + STBIDEF void png_convert_iphone_png_to_rgb(int flag_true_if_should_convert) + { + png__de_iphone_flag = flag_true_if_should_convert; + } + + static void png__de_iphone(png__png* z) + { + png__context* s = z->s; + png__uint32 i, pixel_count = s->img_x * s->img_y; + png_uc* p = z->out; + + if (s->img_out_n == 3) { // convert bgr to rgb + for (i = 0; i < pixel_count; ++i) { + png_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 3; + } + } + else { + PNG_ASSERT(s->img_out_n == 4); + if (png__unpremultiply_on_load) { + // convert bgr to rgb and unpremultiply + for (i = 0; i < pixel_count; ++i) { + png_uc a = p[3]; + png_uc t = p[0]; + if (a) { + png_uc half = a / 2; + p[0] = (p[2] * 255 + half) / a; + p[1] = (p[1] * 255 + half) / a; + p[2] = (t * 255 + half) / a; + } + else { + p[0] = p[2]; + p[2] = t; + } + p += 4; + } + } + else { + // convert bgr to rgb + for (i = 0; i < pixel_count; ++i) { + png_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 4; + } + } + } + } + +#define PNG__PNG_TYPE(a,b,c,d) (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d)) + + static int png__parse_png_file(png__png* z, int scan, int req_comp) + { + png_uc palette[1024], pal_img_n = 0; + png_uc has_trans = 0, tc[3] = { 0 }; + png__uint16 tc16[3]; + png__uint32 ioff = 0, idata_limit = 0, i, pal_len = 0; + int first = 1, k, interlace = 0, color = 0, is_iphone = 0; + png__context* s = z->s; + + z->expanded = NULL; + z->idata = NULL; + z->out = NULL; + + if (!png__check_png_header(s)) return 0; + + if (scan == PNG__SCAN_type) return 1; + + for (;;) { + png__pngchunk c = png__get_chunk_header(s); + switch (c.type) { + case PNG__PNG_TYPE('C', 'g', 'B', 'I'): + is_iphone = 1; + png__skip(s, c.length); + break; + case PNG__PNG_TYPE('I', 'H', 'D', 'R'): { + int comp, filter; + if (!first) return png__err("multiple IHDR", "Corrupt PNG"); + first = 0; + if 
(c.length != 13) return png__err("bad IHDR len", "Corrupt PNG"); + s->img_x = png__get32be(s); + s->img_y = png__get32be(s); + if (s->img_y > PNG_MAX_DIMENSIONS) return png__err("too large", "Very large image (corrupt?)"); + if (s->img_x > PNG_MAX_DIMENSIONS) return png__err("too large", "Very large image (corrupt?)"); + z->depth = png__get8(s); if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) return png__err("1/2/4/8/16-bit only", "PNG not supported: 1/2/4/8/16-bit only"); + color = png__get8(s); if (color > 6) return png__err("bad ctype", "Corrupt PNG"); + if (color == 3 && z->depth == 16) return png__err("bad ctype", "Corrupt PNG"); + if (color == 3) pal_img_n = 3; else if (color & 1) return png__err("bad ctype", "Corrupt PNG"); + comp = png__get8(s); if (comp) return png__err("bad comp method", "Corrupt PNG"); + filter = png__get8(s); if (filter) return png__err("bad filter method", "Corrupt PNG"); + interlace = png__get8(s); if (interlace > 1) return png__err("bad interlace method", "Corrupt PNG"); + if (!s->img_x || !s->img_y) return png__err("0-pixel image", "Corrupt PNG"); + if (!pal_img_n) { + s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); + if ((1 << 30) / s->img_x / s->img_n < s->img_y) return png__err("too large", "Image too large to decode"); + if (scan == PNG__SCAN_header) return 1; + } + else { + // if paletted, then pal_n is our final components, and + // img_n is # components to decompress/filter. 
+ s->img_n = 1; + if ((1 << 30) / s->img_x / 4 < s->img_y) return png__err("too large", "Corrupt PNG"); + // if SCAN_header, have to scan to see if we have a tRNS + } + break; + } + + case PNG__PNG_TYPE('P', 'L', 'T', 'E'): { + if (first) return png__err("first not IHDR", "Corrupt PNG"); + if (c.length > 256 * 3) return png__err("invalid PLTE", "Corrupt PNG"); + pal_len = c.length / 3; + if (pal_len * 3 != c.length) return png__err("invalid PLTE", "Corrupt PNG"); + for (i = 0; i < pal_len; ++i) { + palette[i * 4 + 0] = png__get8(s); + palette[i * 4 + 1] = png__get8(s); + palette[i * 4 + 2] = png__get8(s); + palette[i * 4 + 3] = 255; + } + break; + } + + case PNG__PNG_TYPE('t', 'R', 'N', 'S'): { + if (first) return png__err("first not IHDR", "Corrupt PNG"); + if (z->idata) return png__err("tRNS after IDAT", "Corrupt PNG"); + if (pal_img_n) { + if (scan == PNG__SCAN_header) { s->img_n = 4; return 1; } + if (pal_len == 0) return png__err("tRNS before PLTE", "Corrupt PNG"); + if (c.length > pal_len) return png__err("bad tRNS len", "Corrupt PNG"); + pal_img_n = 4; + for (i = 0; i < c.length; ++i) + palette[i * 4 + 3] = png__get8(s); + } + else { + if (!(s->img_n & 1)) return png__err("tRNS with alpha", "Corrupt PNG"); + if (c.length != (png__uint32)s->img_n * 2) return png__err("bad tRNS len", "Corrupt PNG"); + has_trans = 1; + if (z->depth == 16) { + for (k = 0; k < s->img_n; ++k) tc16[k] = (png__uint16)png__get16be(s); // copy the values as-is + } + else { + for (k = 0; k < s->img_n; ++k) tc[k] = (png_uc)(png__get16be(s) & 255) * png__depth_scale_table[z->depth]; // non 8-bit images will be larger + } + } + break; + } + + case PNG__PNG_TYPE('I', 'D', 'A', 'T'): { + if (first) return png__err("first not IHDR", "Corrupt PNG"); + if (pal_img_n && !pal_len) return png__err("no PLTE", "Corrupt PNG"); + if (scan == PNG__SCAN_header) { s->img_n = pal_img_n; return 1; } + if ((int)(ioff + c.length) < (int)ioff) return 0; + if (ioff + c.length > idata_limit) { + png__uint32 
idata_limit_old = idata_limit; + png_uc* p; + if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096; + while (ioff + c.length > idata_limit) + idata_limit *= 2; + PNG_NOTUSED(idata_limit_old); + p = (png_uc*)PNG_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return png__err("outofmem", "Out of memory"); + z->idata = p; + } + if (!png__getn(s, z->idata + ioff, c.length)) return png__err("outofdata", "Corrupt PNG"); + ioff += c.length; + break; + } + + case PNG__PNG_TYPE('I', 'E', 'N', 'D'): { + png__uint32 raw_len, bpl; + if (first) return png__err("first not IHDR", "Corrupt PNG"); + if (scan != PNG__SCAN_load) return 1; + if (z->idata == NULL) return png__err("no IDAT", "Corrupt PNG"); + // initial guess for decoded data size to avoid unnecessary reallocs + bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component + raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */; + z->expanded = (png_uc*)png_zlib_decode_malloc_guesssize_headerflag((char*)z->idata, ioff, raw_len, (int*)&raw_len, !is_iphone); + if (z->expanded == NULL) return 0; // zlib should set error + PNG_FREE(z->idata); z->idata = NULL; + if ((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || has_trans) + s->img_out_n = s->img_n + 1; + else + s->img_out_n = s->img_n; + if (!png__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0; + if (has_trans) { + if (z->depth == 16) { + if (!png__compute_transparency16(z, tc16, s->img_out_n)) return 0; + } + else { + if (!png__compute_transparency(z, tc, s->img_out_n)) return 0; + } + } + if (is_iphone && png__de_iphone_flag && s->img_out_n > 2) + png__de_iphone(z); + if (pal_img_n) { + // pal_img_n == 3 or 4 + s->img_n = pal_img_n; // record the actual colors we had + s->img_out_n = pal_img_n; + if (req_comp >= 3) s->img_out_n = req_comp; + if (!png__expand_png_palette(z, palette, pal_len, s->img_out_n)) + return 0; + } + else if 
(has_trans) { + // non-paletted image with tRNS -> source image has (constant) alpha + ++s->img_n; + } + PNG_FREE(z->expanded); z->expanded = NULL; + // end of PNG chunk, read and skip CRC + png__get32be(s); + return 1; + } + + default: + // if critical, fail + if (first) return png__err("first not IHDR", "Corrupt PNG"); + if ((c.type & (1 << 29)) == 0) { +#ifndef PNG_NO_FAILURE_STRINGS + // not threadsafe + static char invalid_chunk[] = "XXXX PNG chunk not known"; + invalid_chunk[0] = PNG__BYTECAST(c.type >> 24); + invalid_chunk[1] = PNG__BYTECAST(c.type >> 16); + invalid_chunk[2] = PNG__BYTECAST(c.type >> 8); + invalid_chunk[3] = PNG__BYTECAST(c.type >> 0); +#endif + return png__err(invalid_chunk, "PNG not supported: unknown PNG chunk type"); + } + png__skip(s, c.length); + break; + } + // end of PNG chunk, read and skip CRC + png__get32be(s); + } + } + + static void* png__do_png(png__png* p, int* x, int* y, int* n, int req_comp, png__result_info* ri) + { + void* result = NULL; + if (req_comp < 0 || req_comp > 4) return png__errpuc("bad req_comp", "Internal error"); + if (png__parse_png_file(p, PNG__SCAN_load, req_comp)) { + if (p->depth <= 8) + ri->bits_per_channel = 8; + else if (p->depth == 16) + ri->bits_per_channel = 16; + else + return png__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth"); + result = p->out; + p->out = NULL; + if (req_comp && req_comp != p->s->img_out_n) { + if (ri->bits_per_channel == 8) + result = png__convert_format((unsigned char*)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + else + result = png__convert_format16((png__uint16*)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + p->s->img_out_n = req_comp; + if (result == NULL) return result; + } + *x = p->s->img_x; + *y = p->s->img_y; + if (n) *n = p->s->img_n; + } + PNG_FREE(p->out); p->out = NULL; + PNG_FREE(p->expanded); p->expanded = NULL; + PNG_FREE(p->idata); p->idata = NULL; + + return result; + } + + static void* 
png__png_load(png__context* s, int* x, int* y, int* comp, int req_comp, png__result_info* ri) + { + png__png p; + p.s = s; + return png__do_png(&p, x, y, comp, req_comp, ri); + } + + static int png__png_test(png__context* s) + { + int r; + r = png__check_png_header(s); + png__rewind(s); + return r; + } + + static int png__png_info_raw(png__png* p, int* x, int* y, int* comp) + { + if (!png__parse_png_file(p, PNG__SCAN_header, 0)) { + png__rewind(p->s); + return 0; + } + if (x) *x = p->s->img_x; + if (y) *y = p->s->img_y; + if (comp) *comp = p->s->img_n; + return 1; + } + + static int png__png_info(png__context* s, int* x, int* y, int* comp) + { + png__png p; + p.s = s; + return png__png_info_raw(&p, x, y, comp); + } + + static int png__png_is16(png__context* s) + { + png__png p; + p.s = s; + if (!png__png_info_raw(&p, NULL, NULL, NULL)) + return 0; + if (p.depth != 16) { + png__rewind(p.s); + return 0; + } + return 1; + } + + static void* png__load_main(png__context* s, int* x, int* y, int* comp, int req_comp, png__result_info* ri, int bpc) + { + memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields + ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed + ri->channel_order = PNG_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order + ri->num_channels = 0; + + if (png__png_test(s)) return png__png_load(s, x, y, comp, req_comp, ri); + + return png__errpuc("unknown image type", "Image not of any known type, or corrupt"); + } + + static png_uc* png__convert_16_to_8(png__uint16* orig, int w, int h, int channels) + { + int i; + int img_len = w * h * channels; + png_uc* reduced; + + reduced = (png_uc*)png__malloc(img_len); + if (reduced == NULL) return png__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + reduced[i] = (png_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling + + PNG_FREE(orig); + return reduced; + } + 
+ static unsigned char* png__load_and_postprocess_8bit(png__context* s, int* x, int* y, int* comp, int req_comp) + { + png__result_info ri; + void* result = png__load_main(s, x, y, comp, req_comp, &ri, 8); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + PNG_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 8) { + result = png__convert_16_to_8((png__uint16*)result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 8; + } + + // @TODO: move png__convert_format to here + + //if (png__vertically_flip_on_load) { + // int channels = req_comp ? req_comp : *comp; + // png__vertical_flip(result, *x, *y, channels * sizeof(png_uc)); + //} + + return (unsigned char*)result; + } + + static void png__start_mem(png__context* s, png_uc const* buffer, int len) + { + s->io.read = NULL; + s->read_from_callbacks = 0; + s->callback_already_read = 0; + s->img_buffer = s->img_buffer_original = (png_uc*)buffer; + s->img_buffer_end = s->img_buffer_original_end = (png_uc*)buffer + len; + } + + STBIDEF png_uc* png_load_from_memory(png_uc const* buffer, int len, int* x, int* y, int* comp, int req_comp) + { + png__context s; + png__start_mem(&s, buffer, len); + return png__load_and_postprocess_8bit(&s, x, y, comp, req_comp); + } + + //------------------------------------------------------------------------ + + static int png__stdio_read(void* user, char* data, int size) + { + InputMemoryStream* stream = (InputMemoryStream*)user; + return (int)stream->Read(size, data); + } + + static void png__stdio_skip(void* user, int n) + { + InputMemoryStream* stream = (InputMemoryStream*)user; + stream->Skip(n); + } + + static int png__stdio_eof(void* user) + { + InputMemoryStream* stream = (InputMemoryStream*)user; + return stream->Pos() == stream->Size() ? 
1 : 0; + } + + + //--------------------------------------------------------------------- + + ImagePngLoader::ImagePngLoader(const ImageLoaderParam& param) + : Base::ImagePngLoader(param) + { + if (_param.format == SimdPixelFormatNone) + _param.format = SimdPixelFormatRgb24; + } + + bool ImagePngLoader::FromStream() + { + const int req_comp = 4; + int x, y, comp; + png__context s; + s.io.eof = png__stdio_eof; + s.io.read = png__stdio_read; + s.io.skip = png__stdio_skip; + s.io_user_data = &_stream; + s.buflen = sizeof(s.buffer_start); + s.read_from_callbacks = 1; + s.callback_already_read = 0; + s.img_buffer = s.img_buffer_original = s.buffer_start; + png__refill_buffer(&s); + s.img_buffer_original_end = s.img_buffer_end; + png__result_info ri; + uint8_t* data = (uint8_t*)png__png_load(&s, &x, &y, &comp, req_comp, &ri); + if (data) + { + if (ri.bits_per_channel == 16) + { + const uint16_t* src = (uint16_t*)data; + size_t size = x * y * req_comp; + uint8_t* dst = (uint8_t*)PNG_MALLOC(size); + for (size_t i = 0; i < size; ++i) + dst[i] = uint8_t(src[i] >> 8); + PNG_FREE(data); + data = dst; + } + size_t stride = 4 * x; + _image.Recreate(x, y, (Image::Format)_param.format); + if (x < A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: + Base::RgbaToGray(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgr24: + Base::BgraToRgb(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgra32: + Base::BgraToRgba(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatRgb24: + Base::BgraToBgr(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatRgba32: + Base::Copy(data, stride, x, y, 4, _image.data, _image.stride); + break; + default: + break; + } + } + else + { + switch (_param.format) + { + case SimdPixelFormatGray8: + Sse2::RgbaToGray(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgr24: + Sse41::BgraToRgb(data, x, y, 
stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgra32: + Sse41::BgraToRgba(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatRgb24: + Sse41::BgraToBgr(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatRgba32: + Base::Copy(data, stride, x, y, 4, _image.data, _image.stride); + break; + default: + break; + } + } + PNG_FREE(data); + return true; + } + return false; + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp new file mode 100644 index 0000000000..da20b395c0 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp @@ -0,0 +1,139 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdSse2.h"
+#include "Simd/SimdSse41.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
+    {
+        // Text PGM saver. For images at least A pixels wide, installs the SSE
+        // routine stored in _convert for the source format; otherwise _convert
+        // is left as the Base constructor set it.
+        ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param)
+            : Base::ImagePgmTxtSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Sse41::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Sse2::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Sse41::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Sse41::RgbaToGray; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        // Binary PGM saver. Same converter selection as ImagePgmTxtSaver.
+        ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param)
+            : Base::ImagePgmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Sse41::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Sse2::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Sse41::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Sse41::RgbaToGray; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        // Text PPM saver. For images at least A pixels wide, installs the SSE4.1
+        // routine stored in _convert; Rgb24 has no entry and keeps the Base setting.
+        ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param)
+            : Base::ImagePpmTxtSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Sse41::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Sse41::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Sse41::BgraToRgb; break;
+                // Presumably BgraToBgr only drops the 4th channel, so it maps RGBA to RGB - confirm.
+                case SimdPixelFormatRgba32: _convert = Sse41::BgraToBgr; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        // Binary PPM saver. Same converter selection as ImagePpmTxtSaver.
+        ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param)
+            : Base::ImagePpmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Sse41::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Sse41::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Sse41::BgraToRgb; break;
+                case SimdPixelFormatRgba32: _convert = Sse41::BgraToBgr; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        // Creates the saver matching param.file.
+        // Returns a new-allocated saver owned by the caller, or NULL for an
+        // unhandled file type.
+        ImageSaver* CreateImageSaver(const ImageSaverParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinSaver(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinSaver(param);
+            case SimdImageFilePng: return new ImagePngSaver(param);
+            case SimdImageFileJpeg: return new ImageJpegSaver(param);
+            default:
+                return NULL;
+            }
+        }
+
+        // Encodes an image into a memory buffer.
+        // Builds an ImageSaverParam from the arguments and validates it, creates
+        // the saver and runs ToStream(src, stride). On success returns the buffer
+        // handed out by saver->Release(size); returns NULL if validation, saver
+        // creation or encoding fails.
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size)
+        {
+            ImageSaverParam param(width, height, format, file, quality);
+            if (param.Validate())
+            {
+                // unique_ptr deletes the saver on every path out of this scope;
+                // the element type must be spelled out (no deduction from a raw pointer).
+                std::unique_ptr<ImageSaver> saver(CreateImageSaver(param));
+                if (saver)
+                {
+                    if (saver->ToStream(src, stride))
+                        return saver->Release(size);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_SSE41_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp new file mode 100644 index 0000000000..3a0a2079c1 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp @@ -0,0 +1,431 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar.
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSaveJpeg.h"
+#include "Simd/SimdSse41.h"
+#include "Simd/SimdBase.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
+    {
+        // Vertical pass of the 8x8 block transform: applies the 8-point butterfly
+        // down each column. Each iteration handles 4 adjacent columns (one __m128
+        // per row); two iterations cover the 8 columns. src and dst may be the same
+        // buffer (JpegProcessDu calls it in place).
+        SIMD_INLINE void JpegDctV(const float* src, size_t srcStride, float *dst, size_t dstStride)
+        {
+            for (int i = 0; i < 2; i++, src += 4, dst += 4)
+            {
+                __m128 d0 = _mm_loadu_ps(src + 0 * srcStride);
+                __m128 d1 = _mm_loadu_ps(src + 1 * srcStride);
+                __m128 d2 = _mm_loadu_ps(src + 2 * srcStride);
+                __m128 d3 = _mm_loadu_ps(src + 3 * srcStride);
+                __m128 d4 = _mm_loadu_ps(src + 4 * srcStride);
+                __m128 d5 = _mm_loadu_ps(src + 5 * srcStride);
+                __m128 d6 = _mm_loadu_ps(src + 6 * srcStride);
+                __m128 d7 = _mm_loadu_ps(src + 7 * srcStride);
+
+                // Sums and differences of mirrored rows (0/7, 1/6, 2/5, 3/4).
+                __m128 tmp0 = _mm_add_ps(d0, d7);
+                __m128 tmp7 = _mm_sub_ps(d0, d7);
+                __m128 tmp1 = _mm_add_ps(d1, d6);
+                __m128 tmp6 = _mm_sub_ps(d1, d6);
+                __m128 tmp2 = _mm_add_ps(d2, d5);
+                __m128 tmp5 = _mm_sub_ps(d2, d5);
+                __m128 tmp3 = _mm_add_ps(d3, d4);
+                __m128 tmp4 = _mm_sub_ps(d3, d4);
+
+                // Even part: produces outputs 0, 2, 4, 6.
+                __m128 tmp10 = _mm_add_ps(tmp0, tmp3);
+                __m128 tmp13 = _mm_sub_ps(tmp0, tmp3);
+                __m128 tmp11 = _mm_add_ps(tmp1, tmp2);
+                __m128 tmp12 = _mm_sub_ps(tmp1, tmp2);
+
+                d0 = _mm_add_ps(tmp10, tmp11);
+                d4 = _mm_sub_ps(tmp10, tmp11);
+
+                __m128 z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), _mm_set1_ps(0.707106781f));
+                d2 = _mm_add_ps(tmp13, z1);
+                d6 = _mm_sub_ps(tmp13, z1);
+
+                // Odd part: produces outputs 1, 3, 5, 7.
+                tmp10 = _mm_add_ps(tmp4, tmp5);
+                tmp11 = _mm_add_ps(tmp5, tmp6);
+                tmp12 = _mm_add_ps(tmp6, tmp7);
+
+                __m128 z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12), _mm_set1_ps(0.382683433f));
+                __m128 z2 = _mm_add_ps(_mm_mul_ps(tmp10, _mm_set1_ps(0.541196100f)), z5);
+                __m128 z4 = _mm_add_ps(_mm_mul_ps(tmp12, _mm_set1_ps(1.306562965f)), z5);
+                __m128 z3 = _mm_mul_ps(tmp11, _mm_set1_ps(0.707106781f));
+
+                __m128 z11 = _mm_add_ps(tmp7, z3);
+                __m128 z13 = _mm_sub_ps(tmp7, z3);
+
+                _mm_storeu_ps(dst + 0 * dstStride, d0);
+                _mm_storeu_ps(dst + 1 * dstStride, _mm_add_ps(z11, z4));
+                _mm_storeu_ps(dst + 2 * dstStride, d2);
+                _mm_storeu_ps(dst + 3 * dstStride, _mm_sub_ps(z13, z2));
+                _mm_storeu_ps(dst + 4 * dstStride, d4);
+                _mm_storeu_ps(dst + 5 * dstStride, _mm_add_ps(z13, z2));
+                _mm_storeu_ps(dst + 6 * dstStride, d6);
+                _mm_storeu_ps(dst + 7 * dstStride, _mm_sub_ps(z11, z4));
+            }
+        }
+
+        // Horizontal pass: transposes 4 rows x 8 columns with unpack operations,
+        // applies the same butterfly, multiplies by the fdt table and converts to
+        // int32 with _mm_cvtps_epi32. Each iteration handles 4 source rows; two
+        // iterations cover the 8 rows. Output k of the butterfly is written to
+        // dst + 8 * k (the __m128i pointer offsets 0x0, 0x2, ... are in units of 4 ints).
+        SIMD_INLINE void JpegDctH(const float* src, size_t srcStride, const float * fdt, int* dst)
+        {
+            for (int i = 0; i < 2; i++, src += 4 * srcStride, fdt += 4, dst += 4)
+            {
+                __m128 tmp0, tmp1, tmp2, tmp3;
+                // Transpose the left 4x4 tile: d0..d3 become columns 0..3.
+                __m128 d0 = _mm_loadu_ps(src + 0 * srcStride);
+                __m128 d1 = _mm_loadu_ps(src + 1 * srcStride);
+                __m128 d2 = _mm_loadu_ps(src + 2 * srcStride);
+                __m128 d3 = _mm_loadu_ps(src + 3 * srcStride);
+                tmp0 = _mm_unpacklo_ps(d0, d2);
+                tmp1 = _mm_unpackhi_ps(d0, d2);
+                tmp2 = _mm_unpacklo_ps(d1, d3);
+                tmp3 = _mm_unpackhi_ps(d1, d3);
+                d0 = _mm_unpacklo_ps(tmp0, tmp2);
+                d1 = _mm_unpackhi_ps(tmp0, tmp2);
+                d2 = _mm_unpacklo_ps(tmp1, tmp3);
+                d3 = _mm_unpackhi_ps(tmp1, tmp3);
+
+                // Transpose the right 4x4 tile: d4..d7 become columns 4..7.
+                __m128 d4 = _mm_loadu_ps(src + 0 * srcStride + 4);
+                __m128 d5 = _mm_loadu_ps(src + 1 * srcStride + 4);
+                __m128 d6 = _mm_loadu_ps(src + 2 * srcStride + 4);
+                __m128 d7 = _mm_loadu_ps(src + 3 * srcStride + 4);
+                tmp0 = _mm_unpacklo_ps(d4, d6);
+                tmp1 = _mm_unpackhi_ps(d4, d6);
+                tmp2 = _mm_unpacklo_ps(d5, d7);
+                tmp3 = _mm_unpackhi_ps(d5, d7);
+                d4 = _mm_unpacklo_ps(tmp0, tmp2);
+                d5 = _mm_unpackhi_ps(tmp0, tmp2);
+                d6 = _mm_unpacklo_ps(tmp1, tmp3);
+                d7 = _mm_unpackhi_ps(tmp1, tmp3);
+
+                // Same butterfly as in JpegDctV, now across columns.
+                tmp0 = _mm_add_ps(d0, d7);
+                tmp1 = _mm_add_ps(d1, d6);
+                tmp2 = _mm_add_ps(d2, d5);
+                tmp3 = _mm_add_ps(d3, d4);
+                __m128 tmp7 = _mm_sub_ps(d0, d7);
+                __m128 tmp6 = _mm_sub_ps(d1, d6);
+                __m128 tmp5 = _mm_sub_ps(d2, d5);
+                __m128 tmp4 = _mm_sub_ps(d3, d4);
+
+                __m128 tmp10 = _mm_add_ps(tmp0, tmp3);
+                __m128 tmp13 = _mm_sub_ps(tmp0, tmp3);
+                __m128 tmp11 = _mm_add_ps(tmp1, tmp2);
+                __m128 tmp12 = _mm_sub_ps(tmp1, tmp2);
+
+                d0 = _mm_add_ps(tmp10, tmp11);
+                d4 = _mm_sub_ps(tmp10, tmp11);
+
+                __m128 z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), _mm_set1_ps(0.707106781f));
+                d2 = _mm_add_ps(tmp13, z1);
+                d6 = _mm_sub_ps(tmp13, z1);
+
+                tmp10 = _mm_add_ps(tmp4, tmp5);
+                tmp11 = _mm_add_ps(tmp5, tmp6);
+                tmp12 = _mm_add_ps(tmp6, tmp7);
+
+                __m128 z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12), _mm_set1_ps(0.382683433f));
+                __m128 z2 = _mm_add_ps(_mm_mul_ps(tmp10, _mm_set1_ps(0.541196100f)), z5);
+                __m128 z4 = _mm_add_ps(_mm_mul_ps(tmp12, _mm_set1_ps(1.306562965f)), z5);
+                __m128 z3 = _mm_mul_ps(tmp11, _mm_set1_ps(0.707106781f));
+
+                __m128 z11 = _mm_add_ps(tmp7, z3);
+                __m128 z13 = _mm_sub_ps(tmp7, z3);
+
+                d1 = _mm_add_ps(z11, z4);
+                d3 = _mm_sub_ps(z13, z2);
+                d5 = _mm_add_ps(z13, z2);
+                d7 = _mm_sub_ps(z11, z4);
+
+                // Scale by the fdt table and convert to integers.
+                _mm_storeu_si128((__m128i*)dst + 0x0, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 0), d0)));
+                _mm_storeu_si128((__m128i*)dst + 0x2, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 1), d1)));
+                _mm_storeu_si128((__m128i*)dst + 0x4, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 2), d2)));
+                _mm_storeu_si128((__m128i*)dst + 0x6, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 3), d3)));
+                _mm_storeu_si128((__m128i*)dst + 0x8, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 4), d4)));
+                _mm_storeu_si128((__m128i*)dst + 0xA, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 5), d5)));
+                _mm_storeu_si128((__m128i*)dst + 0xC, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 6), d6)));
+                _mm_storeu_si128((__m128i*)dst + 0xE, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 7), d7)));
+            }
+        }
+
+        // Transforms one 8x8 block held in CDU (row pitch 'stride' floats, modified
+        // in place), reorders the 64 integer coefficients through Base::JpegZigZagT
+        // and pushes their codes into bitBuf using the HTDC / HTAC tables.
+        // DC is the previous block's first coefficient; returns this block's DU[0].
+        static int JpegProcessDu(Base::BitBuf& bitBuf, float* CDU, int stride, const float* fdtbl, int DC, const uint16_t HTDC[256][2], const uint16_t HTAC[256][2])
+        {
+            JpegDctV(CDU, stride, CDU, stride);
+            SIMD_ALIGNED(16) int DUO[64], DU[64];
+            JpegDctH(CDU, stride, fdtbl, DUO);
+            for (int i = 0; i < 64; ++i)
+                DU[Base::JpegZigZagT[i]] = DUO[i];
+            // First coefficient is coded as the difference to the previous block's value.
+            int diff = DU[0] - DC;
+            if (diff == 0)
+                bitBuf.Push(HTDC[0]);
+            else
+            {
+                uint16_t bits[2];
+                Base::JpegCalcBits(diff, bits);
+                bitBuf.Push(HTDC[bits[1]]);
+                bitBuf.Push(bits);
+            }
+            // Find the last non-zero coefficient: first skip trailing groups of four
+            // with a vector test, then step back one element at a time.
+            int end0pos4 = 60;
+            for (; end0pos4 > 0 && _mm_testz_si128(_mm_loadu_si128((__m128i*)(DU + end0pos4)), Sse2::K_INV_ZERO); end0pos4 -= 4);
+            int end0pos = end0pos4 + 3;
+            for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos);
+            if (end0pos == 0)
+            {
+                // Coefficients 1..63 are all zero: push HTAC[0x00] only.
+                bitBuf.Push(HTAC[0x00]);
+                return DU[0];
+            }
+            for (int i = 1; i <= end0pos; ++i)
+            {
+                int startpos = i;
+                // DU[end0pos] is non-zero here, so this scan stops at end0pos at the latest.
+                for (; DU[i] == 0 && i <= end0pos; ++i);
+                int nrzeroes = i - startpos;
+                if (nrzeroes >= 16)
+                {
+                    // One HTAC[0xF0] code per complete run of 16 zeros.
+                    int lng = nrzeroes >> 4;
+                    int nrmarker;
+                    for (nrmarker = 1; nrmarker <= lng; ++nrmarker)
+                        bitBuf.Push(HTAC[0xF0]);
+                    nrzeroes &= 15;
+                }
+                uint16_t bits[2];
+                Base::JpegCalcBits(DU[i], bits);
+                bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]);
+                bitBuf.Push(bits);
+            }
+            if (end0pos != 63)
+                bitBuf.Push(HTAC[0x00]);
+            return DU[0];
+        }
+
+        // Fills k with the broadcast colour-conversion constants used by RgbToYuv
+        // and GrayToY: k[0..2] weight R, G, B for y and k[3] is the y offset,
+        // k[4..6] weight R, G, B for u, k[7..9] weight R, G, B for v.
+        SIMD_INLINE void RgbToYuvInit(__m128 k[10])
+        {
+            k[0] = _mm_set1_ps(+0.29900f);
+            k[1] = _mm_set1_ps(+0.58700f);
+            k[2] = _mm_set1_ps(+0.11400f);
+            k[3] = _mm_set1_ps(-128.000f);
+            k[4] = _mm_set1_ps(-0.16874f);
+            k[5] = _mm_set1_ps(-0.33126f);
+            k[6] = _mm_set1_ps(+0.50000f);
+            k[7] = _mm_set1_ps(+0.50000f);
+            k[8] = _mm_set1_ps(-0.41869f);
+            k[9] = _mm_set1_ps(-0.08131f);
+        }
+
+        // Converts a size x size tile of planar r, g, b bytes into float y, u, v
+        // tiles (row pitch 'size'). 'height' is the number of source rows available
+        // from the tile's first row: the source pointers stop advancing once it is
+        // reached, so the last available row is repeated for the remaining rows.
+        // Reads 4 bytes per channel at a time, so 'size' columns must be readable.
+        SIMD_INLINE void RgbToYuv(const uint8_t* r, const uint8_t* g, const uint8_t* b, int stride, int height,
+            const __m128 k[10], float* y, float* u, float* v, int size)
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 4)
+                {
+                    __m128 _r = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(r + col))));
+                    __m128 _g = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(g + col))));
+                    __m128 _b = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(b + col))));
+                    _mm_storeu_ps(y + col, _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, k[0]), _mm_mul_ps(_g, k[1])), _mm_mul_ps(_b, k[2])), k[3]));
+                    //_mm_storeu_ps(y + col, _mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, _yr), _mm_mul_ps(_g, _yg)), _mm_add_ps(_mm_mul_ps(_b, _yb), _yt)));
+                    _mm_storeu_ps(u + col, _mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, k[4]), _mm_mul_ps(_g, k[5])), _mm_mul_ps(_b, k[6])));
+                    _mm_storeu_ps(v + col, _mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, k[7]), _mm_mul_ps(_g, k[8])), _mm_mul_ps(_b, k[9])));
+                }
+                if(++row < height)
+                    r += stride, g += stride, b += stride;
+                y += size, u += size, v += size;
+            }
+        }
+
+        // Gray variant of RgbToYuv: y = g + k[3]; only the y tile is written.
+        // Rows beyond 'height' repeat the last available source row.
+        SIMD_INLINE void GrayToY(const uint8_t* g, int stride, int height, const __m128 k[10], float* y, int size)
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 4)
+                {
+                    __m128 _g = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(g + col))));
+                    _mm_storeu_ps(y + col, _mm_add_ps(_g, k[3]));
+                }
+                if (++row < height)
+                    g += stride;
+                y += size;
+            }
+        }
+
+        // Downsamples a 16x16 float tile (row pitch 16) to 8x8 (row pitch 8) by
+        // averaging each 2x2 group: two rows are added, _mm_hadd_ps adds horizontal
+        // neighbours, and the sum is scaled by 0.25.
+        SIMD_INLINE void SubUv(const float * src, float * dst)
+        {
+            __m128 _0_25 = _mm_set1_ps(0.25f), s0, s1;
+            for (int yy = 0; yy < 8; yy += 1)
+            {
+                s0 = _mm_add_ps(_mm_loadu_ps(src + 0), _mm_loadu_ps(src + 16));
+                s1 = _mm_add_ps(_mm_loadu_ps(src + 4), _mm_loadu_ps(src + 20));
+                _mm_storeu_ps(dst + 0, _mm_mul_ps(_mm_hadd_ps(s0, s1), _0_25));
+                s0 = _mm_add_ps(_mm_loadu_ps(src + 8), _mm_loadu_ps(src + 24));
+                s1 = _mm_add_ps(_mm_loadu_ps(src + 12), _mm_loadu_ps(src + 28));
+                _mm_storeu_ps(dst + 4, _mm_mul_ps(_mm_hadd_ps(s0, s1), _0_25));
+                src += 32;
+                dst += 8;
+            }
+        }
+
+        // Encodes planar red/green/blue rows in 16x16 tiles: four 8x8 y blocks per
+        // tile plus one u and one v block produced by SubUv. When all three plane
+        // pointers are equal the input is treated as gray and
+        // Base::JpegProcessDuGrayUv is pushed instead of the u/v blocks.
+        // dc[0..2] carry the running first coefficients of y, u, v and are updated
+        // in place. Bits are collected in a local BitBuf and written to 'stream'.
+        // NOTE(review): the plane pointers are not advanced between iterations of
+        // the y loop; presumably callers pass at most one 16-row band per call - confirm.
+        void JpegWriteBlockSubs(OutputMemoryStream& stream, int width, int height, const uint8_t* red,
+            const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3])
+        {
+            __m128 k[10];
+            RgbToYuvInit(k);
+            int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            int width16 = width& (~15);
+            bool gray = red == green && red == blue;
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 16)
+            {
+                int x = 0;
+                SIMD_ALIGNED(16) float Y[256], U[256], V[256];
+                SIMD_ALIGNED(16) float subU[64], subV[64];
+                // Full-width tiles: SSE colour conversion.
+                for (; x < width16; x += 16)
+                {
+                    if (gray)
+                        GrayToY(red + x, stride, height - y, k, Y, 16);
+                    else
+                        RgbToYuv(red + x, green + x, blue + x, stride, height - y, k, Y, U, V, 16);
+                    DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        SubUv(U, subU);
+                        SubUv(V, subV);
+                        DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+                // Partial tile at the right edge: Base conversion, which also takes
+                // the remaining width (width - x). No Full() check in this loop.
+                for (; x < width; x += 16)
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 16);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 16);
+                    DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        SubUv(U, subU);
+                        SubUv(V, subV);
+                        DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                }
+            }
+            // Write whatever is still buffered.
+            Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+            bitBuf.Clear();
+        }
+
+        // Same as JpegWriteBlockSubs but without chroma subsampling: 8x8 tiles,
+        // one y, one u and one v block per tile.
+        // NOTE(review): as above, the plane pointers are not advanced between
+        // iterations of the y loop - confirm callers pass one 8-row band per call.
+        void JpegWriteBlockFull(OutputMemoryStream& stream, int width, int height, const uint8_t* red,
+            const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3])
+        {
+            __m128 k[10];
+            RgbToYuvInit(k);
+            int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            int width8 = width & (~7);
+            bool gray = red == green && red == blue;
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 8)
+            {
+                int x = 0;
+                SIMD_ALIGNED(16) float Y[64], U[64], V[64];
+                for (; x < width8; x += 8)
+                {
+                    if (gray)
+                        GrayToY(red + x, stride, height - y, k, Y, 8);
+                    else
+                        RgbToYuv(red + x, green + x, blue + x, stride, height - y, k, Y, U, V, 8);
+                    DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+                for (; x < width; x += 8)
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 8);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 8);
+                    DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                }
+            }
+            Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+            bitBuf.Clear();
+        }
+
+        //---------------------------------------------------------------------
+
+        // SSE4.1 JPEG saver; construction is delegated to the Base saver.
+        ImageJpegSaver::ImageJpegSaver(const ImageSaverParam& param)
+            : Base::ImageJpegSaver(param)
+        {
+        }
+
+        // Calls InitParams(true), then selects the deinterleave routine (Base for
+        // images narrower than 16 pixels, SSE4.1 otherwise) and the block writer
+        // according to _subSample.
+        void ImageJpegSaver::Init()
+        {
+            InitParams(true);
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24:
+            case SimdPixelFormatRgb24:
+                _deintBgr = _param.width < 16 ? Base::DeinterleaveBgr : Sse41::DeinterleaveBgr;
+                break;
+            case SimdPixelFormatBgra32:
+            case SimdPixelFormatRgba32:
+                _deintBgra = _param.width < 16 ? Base::DeinterleaveBgra : Sse41::DeinterleaveBgra;
+                break;
+            default:
+                break;
+            }
+            _writeBlock = _subSample ? JpegWriteBlockSubs : JpegWriteBlockFull;
+        }
+    }
+#endif// SIMD_SSE41_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp new file mode 100644 index 0000000000..0e1c76b710 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp @@ -0,0 +1,370 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSavePng.h"
+#include "Simd/SimdBase.h"
+#include "Simd/SimdSse41.h"
+#include "Simd/SimdExtract.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
+    {
+        // Adler-32 style checksum of data[0..size): 'lo' starts at 1 and 'hi' at 0,
+        // both are reduced modulo 65521, and (hi << 16) | lo is returned.
+        // The input is processed in chunks (the first of size % 5552 bytes, the
+        // rest of 5552 bytes) so that the per-chunk sums are reduced once per chunk.
+        // Within a chunk of n bytes, byte i contributes data[i] to 'lo' and
+        // data[i] * (n - i) to 'hi'; four bytes are handled per vector step.
+        uint32_t ZlibAdler32(uint8_t* data, int size)
+        {
+            __m128i _i0 = _mm_setr_epi32(0, -1, -2, -3), _4 = _mm_set1_epi32(4);
+            uint32_t lo = 1, hi = 0;
+            for (int b = 0, n = (int)(size % 5552); b < size;)
+            {
+                int n4 = n & (~3), i = 0;
+                // Per-lane weights n, n - 1, n - 2, n - 3, decreased by 4 every step.
+                __m128i _i = _mm_add_epi32(_i0, _mm_set1_epi32(n));
+                __m128i _l = _mm_setzero_si128(), _h = _mm_setzero_si128();
+                for (; i < n4; i += 4)
+                {
+                    __m128i d = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(data + b + i)));
+                    _l = _mm_add_epi32(_l, d);
+                    _h = _mm_add_epi32(_h, _mm_mullo_epi32(d, _i));
+                    _i = _mm_sub_epi32(_i, _4);
+                }
+                int l = Sse2::ExtractInt32Sum(_l), h = Sse2::ExtractInt32Sum(_h);
+                // Scalar tail of the chunk (up to 3 bytes).
+                for (; i < n; ++i)
+                {
+                    l += data[b + i];
+                    h += data[b + i]*(n - i);
+                }
+                hi = (hi + h + lo*n) % 65521;
+                lo = (lo + l) % 65521;
+                b += n;
+                n = 5552;
+            }
+            return (hi << 16) | lo;
+        }
+
+        // Compresses data[0..size) into 'stream': two header bytes (0x78, 0x5e),
+        // one block whose header bits are 1 (1 bit) and 1 (2 bits), a sequence of
+        // literals and length/distance pairs coded through the Base::Zlib* tables,
+        // the end code 256, and finally the ZlibAdler32 checksum in big-endian order.
+        // Match candidates are kept in a hash table of ZHASH buckets with
+        // 'basket' = 2 * quality slots each; -1 marks an unused slot.
+        // 'quality' values below 5 are raised to 5.
+        void ZlibCompress(uint8_t* data, int size, int quality, OutputMemoryStream& stream)
+        {
+            const int ZHASH = 16384;
+            if (quality < 5)
+                quality = 5;
+            const int basket = quality * 2;
+            Array32i hashTable(ZHASH * basket);
+            memset(hashTable.data, -1, hashTable.RawSize());
+
+            stream.Write(uint8_t(0x78));
+            stream.Write(uint8_t(0x5e));
+            stream.WriteBits(1, 1);
+            stream.WriteBits(1, 2);
+
+            int i = 0, j;
+            while (i < size - 3)
+            {
+                int h = Base::ZlibHash(data + i) & (ZHASH - 1), best = 3;
+                uint8_t* bestLoc = 0;
+                int* hList = hashTable.data + h * basket;
+                // Bounds check comes first: a full bucket has no -1 terminator, so
+                // testing hList[j] before j < basket would read one slot past the
+                // bucket (and past the end of the table for the last bucket).
+                for (j = 0; j < basket && hList[j] != -1; ++j)
+                {
+                    if (hList[j] > i - 32768)
+                    {
+                        int d = ZlibCount(data + hList[j], data + i, size - i);
+                        if (d >= best)
+                        {
+                            best = d;
+                            bestLoc = data + hList[j];
+                        }
+                    }
+                }
+                if (j == basket)
+                {
+                    // Bucket is full: keep the newer half and free the other half.
+                    memcpy(hList, hList + quality, quality * sizeof(int));
+                    memset(hList + quality, -1, quality * sizeof(int));
+                    j = quality;
+                }
+                hList[j] = i;
+
+                if (bestLoc)
+                {
+                    // Lazy matching: drop this match if the next position has a longer one.
+                    h = Base::ZlibHash(data + i + 1) & (ZHASH - 1);
+                    int* nextList = hashTable.data + h * basket;
+                    for (j = 0; j < basket && nextList[j] != -1; ++j)
+                    {
+                        if (nextList[j] > i - 32767)
+                        {
+                            int e = ZlibCount(data + nextList[j], data + i + 1, size - i - 1);
+                            if (e > best)
+                            {
+                                bestLoc = NULL;
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                if (bestLoc)
+                {
+                    // Emit a length/distance pair.
+                    int d = (int)(data + i - bestLoc);
+                    assert(d <= 32767 && best <= 258);
+                    for (j = 0; best > Base::ZlibLenC[j + 1] - 1; ++j);
+                    Base::ZlibHuff(j + 257, stream);
+                    if (Base::ZlibLenEb[j])
+                        stream.WriteBits(best - Base::ZlibLenC[j], Base::ZlibLenEb[j]);
+                    for (j = 0; d > Base::ZlibDistC[j + 1] - 1; ++j);
+                    stream.WriteBits(Base::ZlibBitRev(j, 5), 5);
+                    if (Base::ZlibDistEb[j])
+                        stream.WriteBits(d - Base::ZlibDistC[j], Base::ZlibDistEb[j]);
+                    i += best;
+                }
+                else
+                {
+                    // Emit a literal byte.
+                    Base::ZlibHuffB(data[i], stream);
+                    ++i;
+                }
+            }
+            // The last bytes (fewer than needed for a match) are emitted as literals.
+            for (; i < size; ++i)
+                Base::ZlibHuffB(data[i], stream);
+            Base::ZlibHuff(256, stream);
+            stream.FlushBits();
+            stream.WriteBe32u(ZlibAdler32(data, size));
+        }
+
+        // EncodeLine0..6 share one contract: filter 'size' bytes of the row at
+        // 'src' into 'dst' and return the sum of absolute values of the filtered
+        // bytes interpreted as signed. 'n' is the distance in bytes to the left
+        // neighbour (presumably bytes per pixel - confirm) and 'stride' the distance
+        // to the same byte of the previous row. Each variant has a scalar head of
+        // n bytes, a vector body of A bytes per step and a scalar tail.
+
+        // Variant 0: copies the row unchanged.
+        uint32_t EncodeLine0(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size, A);
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src = _mm_loadu_si128((__m128i*)(src + i));
+                _mm_storeu_si128((__m128i*)(dst + i), _src);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_src)));
+            }
+            uint32_t sum = Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        // Variant 1: subtracts the left neighbour; the first n bytes are copied.
+        uint32_t EncodeLine1(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n));
+                __m128i _dst = _mm_sub_epi8(_src0, _src1);
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        // Variant 2: subtracts the byte of the previous row (src[i - stride]).
+        uint32_t EncodeLine2(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - stride));
+                __m128i _dst = _mm_sub_epi8(_src0, _src1);
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        // Variant 3: subtracts the average of the left neighbour and the byte of
+        // the previous row; the first n bytes use half of the previous-row byte.
+        uint32_t EncodeLine3(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i] - (src[i - stride] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n));
+                __m128i _src2 = _mm_loadu_si128((__m128i*)(src + i - stride));
+                // The average is computed in 16 bits so the sum cannot overflow a byte.
+                __m128i lo = _mm_srli_epi16(_mm_add_epi16(UnpackU8<0>(_src1), UnpackU8<0>(_src2)), 1);
+                __m128i hi = _mm_srli_epi16(_mm_add_epi16(UnpackU8<1>(_src1), UnpackU8<1>(_src2)), 1);
+                __m128i _dst = _mm_sub_epi8(_src0, _mm_packus_epi16(lo, hi));
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - ((src[i - n] + src[i - stride]) >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        // Paeth predictor on 16-bit lanes: with p = a + b - c, returns a when
+        // |p - a| is not greater than |p - b| and |p - c|; otherwise b when
+        // |p - b| is not greater than |p - c|; otherwise c.
+        SIMD_INLINE __m128i Paeth(__m128i a, __m128i b, __m128i c)
+        {
+            __m128i p = _mm_sub_epi16(_mm_add_epi16(a, b), c);
+            __m128i pa = _mm_abs_epi16(_mm_sub_epi16(p, a));
+            __m128i pb = _mm_abs_epi16(_mm_sub_epi16(p, b));
+            __m128i pc = _mm_abs_epi16(_mm_sub_epi16(p, c));
+            __m128i mbc = _mm_or_si128(_mm_cmpgt_epi16(pa, pb), _mm_cmpgt_epi16(pa, pc));
+            __m128i mc = _mm_cmpgt_epi16(pb, pc);
+            return _mm_blendv_epi8(a, _mm_blendv_epi8(b, c, mc), mbc);
+        }
+
+        // Variant 4: subtracts the Paeth prediction from the left, previous-row and
+        // previous-row-left bytes; the first n bytes subtract the previous-row byte.
+        uint32_t EncodeLine4(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = (int8_t)(src[i] - src[i - stride]);
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n));
+                __m128i _src2 = _mm_loadu_si128((__m128i*)(src + i - stride));
+                __m128i _src3 = _mm_loadu_si128((__m128i*)(src + i - stride - n));
+                __m128i lo = Paeth(UnpackU8<0>(_src1), UnpackU8<0>(_src2), UnpackU8<0>(_src3));
+                __m128i hi = Paeth(UnpackU8<1>(_src1), UnpackU8<1>(_src2), UnpackU8<1>(_src3));
+                __m128i _dst = _mm_sub_epi8(_src0, _mm_packus_epi16(lo, hi));
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - Base::Paeth(src[i - n], src[i - stride], src[i - stride - n]);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        // Variant 5: subtracts half of the left neighbour and never reads the
+        // previous row; the first n bytes are copied.
+        uint32_t EncodeLine5(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n));
+                __m128i lo = _mm_srli_epi16(UnpackU8<0>(_src1), 1);
+                __m128i hi = _mm_srli_epi16(UnpackU8<1>(_src1), 1);
+                __m128i _dst = _mm_sub_epi8(_src0, _mm_packus_epi16(lo, hi));
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - (src[i - n] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        // Variant 6: subtracts the left neighbour and never reads the previous
+        // row; the first n bytes are copied (same arithmetic as EncodeLine1).
+        uint32_t EncodeLine6(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n));
+                __m128i _dst = _mm_sub_epi8(_src0, _src1);
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        // SSE4.1 PNG saver: installs the SSE4.1 converter for Bgr24 / Bgra32 input
+        // (other formats keep what the Base constructor set), the seven row
+        // encoders above and the ZlibCompress routine.
+        ImagePngSaver::ImagePngSaver(const ImageSaverParam& param)
+            : Base::ImagePngSaver(param)
+        {
+            if (_param.format == SimdPixelFormatBgr24)
+                _convert = Sse41::BgrToRgb;
+            else if (_param.format == SimdPixelFormatBgra32)
+                _convert = Sse41::BgraToRgba;
+            _encode[0] = Sse41::EncodeLine0;
+            _encode[1] = Sse41::EncodeLine1;
+            _encode[2] = Sse41::EncodeLine2;
+            _encode[3] = Sse41::EncodeLine3;
+            _encode[4] = Sse41::EncodeLine4;
+            _encode[5] = Sse41::EncodeLine5;
+            _encode[6] = Sse41::EncodeLine6;
+            _compress = Sse41::ZlibCompress;
+        }
+    }
+#endif// SIMD_SSE41_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdView.hpp b/3rdparty/simdlib/Simd/SimdView.hpp index 0c61a0e6e8..33629be94f 100755 --- a/3rdparty/simdlib/Simd/SimdView.hpp +++ b/3rdparty/simdlib/Simd/SimdView.hpp @@ -27,7 +27,6 @@ #ifndef __SimdView_hpp__ #define __SimdView_hpp__ -#include "Simd/SimdDefs.h" #include "Simd/SimdRectangle.hpp" #include "Simd/SimdAllocator.hpp" @@ -493,34 +492,57 @@ namespace Simd /*! Loads image from file. - Supported formats: - - PGM(Portable Gray Map) text(P2) or binary(P5) (the file is loaded as 8-bit gray image). - - PPM(Portable Pixel Map) text(P3) or binary(P6) (the file is loaded as 32-bit BGRA image). + Supported formats are described by ::SimdImageFileType enumeration. \note PGM and PPM files with comments are not supported. - \param [in] path - a path to file with PGM or PPM image. + \param [in] path - a path to image file. + \param [in] format - a desired format of loaded image. + Supported values are View::Gray8, View::Bgr24, View::Bgra32, View::Rgb24, View::Rgba32 and View::None. + Default value is View::None (loads image in native pixel format of image file). \return - a result of loading. */ - bool Load(const std::string & path); + bool Load(const std::string & path, Format format = None); + + /*! + Loads image from memory buffer. + + Supported formats are described by ::SimdImageFileType enumeration. + + \note PGM and PPM files with comments are not supported. + + \param [in] src - a pointer to memory buffer. + \param [in] size - a buffer size. + \param [in] format - a desired format of loaded image. + Supported values are View::Gray8, View::Bgr24, View::Bgra32, View::Rgb24, View::Rgba32 and View::None. + Default value is View::None (loads image in native pixel format of image file). + \return - a result of loading. + */ + bool Load(const uint8_t * src, size_t size, Format format = None); /*! Saves image to file.
- Supported formats: - - PGM(Portable Gray Map) binary(P5) (this format is used in order to save 8-bit gray images). - - PPM(Portable Pixel Map) binary(P6) (this format is used in order to save 24-bit BGR and 32-bit BGRA images). - \param [in] path - a path to file. + \param [in] type - an image file format. By default it is equal to ::SimdImageFileUndefined (format auto choice). + \param [in] quality - a parameter of compression quality (if file format supports it). \return - a result of saving. */ - bool Save(const std::string & path) const; + bool Save(const std::string & path, SimdImageFileType type = SimdImageFileUndefined, int quality = 100) const; /*! - Clear View structure (reset all fields) and free memory if it's owner + Clears the View structure (resets all fields) and frees its memory if the View owns it. */ void Clear(); + /*! + Releases pixel data and resets all fields. + + \param [out] size - a pointer to the size of released pixel data. Can be NULL. + \return - a released pointer to pixel data. It must be deleted by function ::SimdFree.
+ */ + uint8_t* Release(size_t* size = NULL); + private: bool _owner; }; @@ -1027,6 +1049,7 @@ namespace Simd case Float: return 4; case Double: return 8; case Rgb24: return 3; + case Rgba32: return 4; default: assert(0); return 0; } } @@ -1050,6 +1073,7 @@ namespace Simd case Float: return 4; case Double: return 8; case Rgb24: return 1; + case Rgba32: return 1; default: assert(0); return 0; } } @@ -1073,6 +1097,7 @@ namespace Simd case Float: return 1; case Double: return 1; case Rgb24: return 3; + case Rgba32: return 4; default: assert(0); return 0; } } @@ -1124,139 +1149,33 @@ namespace Simd std::swap((bool&)_owner, (bool&)other._owner); } - template class A> SIMD_INLINE bool View::Load(const std::string & path) + template class A> SIMD_INLINE bool View::Load(const std::string & path, Format format_) { - std::ifstream ifs(path.c_str(), std::ifstream::binary); - if (ifs.is_open()) - { - std::string type; - ifs >> type; - if (type == "P2" || type == "P5") - { - size_t w, h, d; - ifs >> w >> h >> d; - if (d != 255) - return false; - ifs.get(); - Recreate(w, h, View::Gray8); - if (type == "P2") - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - int gray; - ifs >> gray; - data[row * stride + col] = (uint8_t)gray; - } - } - } - else - { - for (size_t row = 0; row < height; ++row) - ifs.read((char*)(data + row*stride), width); - } - return true; - } - if (type == "P3" || type == "P6") - { - size_t w, h, d; - ifs >> w >> h >> d; - if (d != 255) - return false; - ifs.get(); - Recreate(w, h, View::Bgra32); - if (type == "P3") - { - for (size_t row = 0; row < height; ++row) - { - uint8_t * bgra = data + row * stride; - for (size_t col = 0; col < width; ++col, bgra += 4) - { - int blue, green, red; - ifs >> red >> green >> blue; - bgra[0] = (uint8_t)blue; - bgra[1] = (uint8_t)green; - bgra[2] = (uint8_t)red; - bgra[3] = 0xFF; - } - } - } - else - { - View buffer(width, 1, Bgr24); - for (size_t row = 0; row < height; ++row) - 
{ - ifs.read((char*)buffer.data, width*3); - const uint8_t * rgb = buffer.data; - uint8_t * bgra = data + row*stride; - for (size_t col = 0; col < width; ++col, rgb += 3, bgra += 4) - { - bgra[0] = rgb[2]; - bgra[1] = rgb[1]; - bgra[2] = rgb[0]; - bgra[3] = 0xFF; - } - } - } - return true; - } - } - return false; + Clear(); + (Format&)format = format_; + *(uint8_t**)&data = SimdImageLoadFromFile(path.c_str(), (size_t*)&stride, (size_t*)&width, (size_t*)&height, (SimdPixelFormatType*)&format); + if (data) + _owner = true; + else + (Format&)format = None; + return _owner; } - template class A> SIMD_INLINE bool View::Save(const std::string & path) const + template class A> SIMD_INLINE bool View::Load(const uint8_t * src, size_t size, Format format_) { - if (!(format == View::Gray8 || format == View::Bgr24 || format == View::Bgra32)) - return false; - - std::ofstream ofs(path.c_str(), std::ofstream::binary); - if (ofs.is_open()) - { - if (format == View::Gray8) - { - ofs << "P5\n" << width << " " << height << "\n255\n"; - for (size_t row = 0; row < height; ++row) - ofs.write((const char*)(data + row*stride), width); - } - else if (format == View::Bgr24) - { - ofs << "P6\n" << width << " " << height << "\n255\n"; - View buffer(width, 1, Bgr24); - for (size_t row = 0; row < height; ++row) - { - const uint8_t * bgr = data + row*stride; - uint8_t * rgb = buffer.data; - for (size_t col = 0; col < width; ++col, bgr += 3, rgb += 3) - { - rgb[0] = bgr[2]; - rgb[1] = bgr[1]; - rgb[2] = bgr[0]; - } - ofs.write((const char*)(buffer.data), width*3); - } - } - else if (format == View::Bgra32) - { - ofs << "P6\n" << width << " " << height << "\n255\n"; - View buffer(width, 1, Bgr24); - for (size_t row = 0; row < height; ++row) - { - const uint8_t * bgra = data + row*stride; - uint8_t * rgb = buffer.data; - for (size_t col = 0; col < width; ++col, bgra += 4, rgb += 3) - { - rgb[0] = bgra[2]; - rgb[1] = bgra[1]; - rgb[2] = bgra[0]; - } - ofs.write((const char*)buffer.data, width * 3); 
- } - } - return true; - } + Clear(); + (Format&)format = format_; + *(uint8_t**)&data = SimdImageLoadFromMemory(src, size, (size_t*)&stride, (size_t*)&width, (size_t*)&height, (SimdPixelFormatType*)&format); + if (data) + _owner = true; else - return false; + (Format&)format = None; + return _owner; + } + + template class A> SIMD_INLINE bool View::Save(const std::string & path, SimdImageFileType type, int quality) const + { + return SimdImageSaveToFile(data, stride, width, height, (SimdPixelFormatType)format, type, quality, path.c_str()) == SimdTrue; } template class A> SIMD_INLINE void View::Clear() @@ -1279,6 +1198,16 @@ namespace Simd #endif } + template class A> SIMD_INLINE uint8_t* View::Release(size_t* size) + { + uint8_t* released = data; + if (size) + *size = DataSize(); + _owner = false; + Clear(); + return released; + } + // View utilities implementation: template class A, class T> const T & At(const View & view, size_t x, size_t y) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9adaac4edc..b2aa7a863e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -679,6 +679,8 @@ if(NOT USE_OPENCV AND (NOT USE_PNG OR NOT USE_JPEG)) else() set(WITH_STBIMAGE OFF) endif() +# TODO: +set(WITH_STBIMAGE ON) VP_OPTION(WITH_CATCH2 "" "" "Use catch2" "" ON IF (VISP_CXX_STANDARD GREATER VISP_CXX_STANDARD_98)) diff --git a/modules/io/CMakeLists.txt b/modules/io/CMakeLists.txt index 959ee1c9b6..949ec58aef 100644 --- a/modules/io/CMakeLists.txt +++ b/modules/io/CMakeLists.txt @@ -57,11 +57,21 @@ if(USE_PNG) add_definitions(${PNG_DEFINITIONS}) endif() -if(WITH_STBIMAGE) +# TODO: +#if(WITH_STBIMAGE) # stb_image is private include_directories(${STBIMAGE_INCLUDE_DIRS}) +#endif() + +if(WITH_CATCH2) + # catch2 is private + include_directories(${CATCH2_INCLUDE_DIRS}) endif() +# simdlib is always enabled since it contains fallback code to plain C++ code +# Simd lib is private +include_directories(${SIMDLIB_INCLUDE_DIRS}) + # OpenCV if(USE_OPENCV) # On win32 since OpenCV 2.4.7 and on 
OSX with OpenCV 2.4.10 we cannot use OpenCV_LIBS to set ViSP 3rd party libraries. @@ -178,7 +188,7 @@ endif() vp_glob_module_sources() vp_module_include_directories(${opt_incs}) vp_create_module(${opt_libs}) -vp_add_tests(DEPENDS_ON visp_features) +vp_add_tests() vp_set_source_file_compile_flag(src/tools/vpParseArgv.cpp -Wno-strict-overflow) diff --git a/modules/io/include/visp3/io/vpImageIo.h b/modules/io/include/visp3/io/vpImageIo.h index d37cad48e3..11bd9aa766 100644 --- a/modules/io/include/visp3/io/vpImageIo.h +++ b/modules/io/include/visp3/io/vpImageIo.h @@ -144,6 +144,10 @@ class VISP_EXPORT vpImageIo static void readPNG(vpImage &I, const std::string &filename); static void readPNG(vpImage &I, const std::string &filename); + //TODO: + static void readSimdlib(vpImage &I, const std::string &filename); + static void readStb(vpImage &I, const std::string &filename); + static void writePFM(const vpImage &I, const std::string &filename); static void writePGM(const vpImage &I, const std::string &filename); @@ -158,5 +162,9 @@ class VISP_EXPORT vpImageIo static void writePNG(const vpImage &I, const std::string &filename); static void writePNG(const vpImage &I, const std::string &filename); + + //TODO: + static void writeSimdlib(vpImage &I, const std::string &filename); + static void writeStb(vpImage &I, const std::string &filename); }; #endif diff --git a/modules/io/src/image/vpImageIo.cpp b/modules/io/src/image/vpImageIo.cpp index ab290fa5f7..cc7799d158 100644 --- a/modules/io/src/image/vpImageIo.cpp +++ b/modules/io/src/image/vpImageIo.cpp @@ -62,6 +62,15 @@ #include #endif +//TODO: +#include +//TODO: +#define STB_IMAGE_IMPLEMENTATION +#include + +#define STB_IMAGE_WRITE_IMPLEMENTATION +#include + #if !defined(VISP_HAVE_OPENCV) #if !defined(VISP_HAVE_JPEG) || !defined(VISP_HAVE_PNG) @@ -2059,6 +2068,60 @@ void vpImageIo::readPNG(vpImage &I, const std::string &filename) fclose(file); } +//TODO: +void vpImageIo::readSimdlib(vpImage &I, const std::string &filename) 
+{ + size_t stride = 0, width = 0, height = 0; + SimdPixelFormatType format = SimdPixelFormatRgba32; + uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format); + const bool copyData = false; + I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData); +} + +void vpImageIo::readStb(vpImage &I, const std::string &filename) +{ + int width = 0, height = 0, channels = 0; + unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha); + if (image == NULL) { + throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); + } + I.init(reinterpret_cast(image), static_cast(height), static_cast(width), true); + stbi_image_free(image); +} + +inline bool ends_with(std::string const & value, std::string const & ending) +{ + if (ending.size() > value.size()) return false; + return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); +} + +void vpImageIo::writeSimdlib(vpImage &I, const std::string &filename) +{ + if (ends_with(filename, ".png")) { + SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFilePng, 90, filename.c_str()); + } else { + SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, 90, filename.c_str()); + } +} + +void vpImageIo::writeStb(vpImage &I, const std::string &filename) +{ + if (ends_with(filename, ".png")) { + const int stride_in_bytes = static_cast(4 * I.getWidth()); + int res = stbi_write_png(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, + reinterpret_cast(I.bitmap), stride_in_bytes); + if (res == 0) { + throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str())); + } + } else { + int res = stbi_write_jpg(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, + 
reinterpret_cast(I.bitmap), 90); + if (res == 0) { + throw(vpImageException(vpImageException::ioError, "JEPG write error")); + } + } +} + #elif defined(VISP_HAVE_OPENCV) /*! diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp new file mode 100644 index 0000000000..ce0d416b70 --- /dev/null +++ b/modules/io/test/perfImageLoadSave.cpp @@ -0,0 +1,461 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Benchmark image loading and saving.
+ * + *****************************************************************************/ + +#include + +#ifdef VISP_HAVE_CATCH2 +#define CATCH_CONFIG_ENABLE_BENCHMARKING +#define CATCH_CONFIG_RUNNER +#include + +#include +#include +#include + +static std::string ipath = vpIoTools::getViSPImagesDataPath(); +static std::string imagePathJpeg = vpIoTools::createFilePath(ipath, "Klimt/Klimt.jpeg"); +static std::string imagePathPng = vpIoTools::createFilePath(ipath, "Klimt/Klimt.png"); +static std::string imagePathPngBig = vpIoTools::createFilePath(ipath, "Klimt/test_image_resize.png"); +static int nThreads = 0; + +TEST_CASE("Benchmark Jpeg image loading", "[benchmark]") { + { + vpImage I; + + BENCHMARK("vpImageIo::read()") { + vpImageIo::read(I, imagePathJpeg); + return I; + }; + } + + { + vpImage I; + + BENCHMARK("vpImageIo::readSimdlib()") { + vpImageIo::readSimdlib(I, imagePathJpeg); + return I; + }; + } + + { + vpImage I; + + BENCHMARK("vpImageIo::readStb()") { + vpImageIo::readStb(I, imagePathJpeg); + return I; + }; + } +} + +TEST_CASE("Benchmark Png image loading", "[benchmark]") { + { + vpImage I; + + BENCHMARK("vpImageIo::read()") { + vpImageIo::read(I, imagePathPng); + return I; + }; + } + + { + vpImage I; + + BENCHMARK("vpImageIo::readSimdlib()") { + vpImageIo::readSimdlib(I, imagePathPng); + return I; + }; + } + + { + vpImage I; + + BENCHMARK("vpImageIo::readStb()") { + vpImageIo::readStb(I, imagePathPng); + return I; + }; + } +} + +TEST_CASE("Benchmark big Png image loading", "[benchmark]") { + { + vpImage I; + + BENCHMARK("vpImageIo::read()") { + vpImageIo::read(I, imagePathPngBig); + return I; + }; + } + + { + vpImage I; + + BENCHMARK("vpImageIo::readSimdlib()") { + vpImageIo::readSimdlib(I, imagePathPngBig); + return I; + }; + } + + { + vpImage I; + + BENCHMARK("vpImageIo::readStb()") { + vpImageIo::readStb(I, imagePathPngBig); + return I; + }; + } +} + +TEST_CASE("Benchmark Jpeg image saving", "[benchmark]") { + vpImage I; + vpImageIo::read(I, 
imagePathJpeg); + { + const std::string filename = "/tmp/Klimt_ViSP.jpg"; + + BENCHMARK("vpImageIo::write()") { + vpImageIo::write(I, filename); + return I; + }; + } + + { + const std::string filename = "/tmp/Klimt_Simd.jpg"; + + BENCHMARK("vpImageIo::writeSimdlib()") { + vpImageIo::writeSimdlib(I, filename); + return I; + }; + } + + { + const std::string filename = "/tmp/Klimt_stb.jpg"; + + BENCHMARK("vpImageIo::writeStb()") { + vpImageIo::writeStb(I, filename); + return I; + }; + } +} + +TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") { + vpImage I; + vpImageIo::read(I, imagePathPngBig); + { + const std::string filename = "/tmp/Big_images_ViSP.jpg"; + + BENCHMARK("vpImageIo::write()") { + vpImageIo::write(I, filename); + return I; + }; + } + +// { +// const std::string filename = "/tmp/Big_images_Simd.jpg"; + +// BENCHMARK("vpImageIo::writeSimdlib()") { +// vpImageIo::writeSimdlib(I, filename); +// return I; +// }; +// } + + { + const std::string filename = "/tmp/Big_images_stb.jpg"; + + BENCHMARK("vpImageIo::writeStb()") { + vpImageIo::writeStb(I, filename); + return I; + }; + } +} + +TEST_CASE("Benchmark Png image saving", "[benchmark]") { + vpImage I; + vpImageIo::read(I, imagePathPng); + { + const std::string filename = "/tmp/Klimt_ViSP.png"; + + BENCHMARK("vpImageIo::write()") { + vpImageIo::write(I, filename); + return I; + }; + } + + { + const std::string filename = "/tmp/Klimt_Simd.png"; + + BENCHMARK("vpImageIo::writeSimdlib()") { + vpImageIo::writeSimdlib(I, filename); + return I; + }; + } + + { + const std::string filename = "/tmp/Klimt_stb.png"; + + BENCHMARK("vpImageIo::writeStb()") { + vpImageIo::writeStb(I, filename); + return I; + }; + } +} + +TEST_CASE("Benchmark big Png image saving", "[benchmark]") { + vpImage I; + vpImageIo::read(I, imagePathPngBig); + { + const std::string filename = "/tmp/Big_images_ViSP.png"; + + BENCHMARK("vpImageIo::write()") { + vpImageIo::write(I, filename); + return I; + }; + } + + { + const std::string 
filename = "/tmp/Big_images_Simd.png"; + + BENCHMARK("vpImageIo::writeSimdlib()") { + vpImageIo::writeSimdlib(I, filename); + return I; + }; + } + + { + const std::string filename = "/tmp/Big_images_stb.png"; + + BENCHMARK("vpImageIo::writeStb()") { + vpImageIo::writeStb(I, filename); + return I; + }; + } +} + +//TEST_CASE("Benchmark bgr to grayscale (ViSP)", "[benchmark]") { +// vpImage I; +// vpImageIo::read(I, imagePathColor); + +// std::vector bgr; +// common_tools::RGBaToBGR(I, bgr); + +// vpImage I_gray(I.getHeight(), I.getWidth()); + +// BENCHMARK("Benchmark bgr to grayscale (ViSP)") { +// vpImageConvert::BGRToGrey(bgr.data(), +// I_gray.bitmap, +// I.getWidth(), I.getHeight(), +// false, nThreads); +// return I_gray; +// }; + +//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101) +// SECTION("OpenCV Mat type") +// { +// cv::Mat img; +// vpImageConvert::convert(I, img); + +// BENCHMARK("Benchmark bgr to grayscale (ViSP + OpenCV Mat type)") { +// vpImageConvert::convert(img, I_gray, false, nThreads); +// return I_gray; +// }; +// } +//#endif +//} +//#endif + +//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101) +//TEST_CASE("Benchmark bgr to grayscale (OpenCV)", "[benchmark]") { +// cv::Mat img = cv::imread(imagePathColor); +// cv::Mat img_gray(img.size(), CV_8UC1); + +// BENCHMARK("Benchmark bgr to grayscale (OpenCV)") { +// cv::cvtColor(img, img_gray, cv::COLOR_BGR2GRAY); +// return img_gray; +// }; +//} +//#endif + +//// C++11 to be able to do bgr.data() +//#if VISP_CXX_STANDARD >= VISP_CXX_STANDARD_11 +//TEST_CASE("Benchmark bgr to rgba (naive code)", "[benchmark]") { +// vpImage I; +// vpImageIo::read(I, imagePathColor); + +// std::vector bgr; +// common_tools::RGBaToBGR(I, bgr); + +// vpImage I_bench(I.getHeight(), I.getWidth()); +// BENCHMARK("Benchmark bgr to rgba (naive code)") { +// common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast(I_bench.bitmap), +// I.getWidth(), I.getHeight(), false); +// return I_bench; +// }; +//} + +//TEST_CASE("Benchmark bgr to rgba 
(ViSP)", "[benchmark]") { +// vpImage I; +// vpImageIo::read(I, imagePathColor); + +// std::vector bgr; +// common_tools::RGBaToBGR(I, bgr); + +// SECTION("Check BGR to RGBa conversion") +// { +// vpImage ref(I.getHeight(), I.getWidth()); +// common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast(ref.bitmap), +// I.getWidth(), I.getHeight(), false); +// vpImage rgba(I.getHeight(), I.getWidth()); +// vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast(rgba.bitmap), +// I.getWidth(), I.getHeight(), false); + +// CHECK((rgba == ref)); +// } + +// vpImage I_rgba(I.getHeight(), I.getWidth()); +// BENCHMARK("Benchmark bgr to rgba (ViSP)") { +// vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast(I_rgba.bitmap), +// I.getWidth(), I.getHeight(), false); +// return I_rgba; +// }; + +//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101) +// SECTION("OpenCV Mat type") +// { +// cv::Mat img; +// vpImageConvert::convert(I, img); + +// BENCHMARK("Benchmark bgr to rgba (ViSP + OpenCV Mat type)") { +// vpImageConvert::convert(img, I_rgba); +// return I_rgba; +// }; +// } +//#endif +//} + +//TEST_CASE("Benchmark bgra to rgba (naive code)", "[benchmark]") { +// vpImage I; +// vpImageIo::read(I, imagePathColor); + +// std::vector bgra; +// common_tools::RGBaToBGRa(I, bgra); + +// vpImage I_bench(I.getHeight(), I.getWidth()); +// BENCHMARK("Benchmark bgra to rgba (naive code)") { +// common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast(I_bench.bitmap), +// I.getWidth(), I.getHeight(), false); +// return I_bench; +// }; +//} + +//TEST_CASE("Benchmark bgra to rgba (ViSP)", "[benchmark]") { +// vpImage I; +// vpImageIo::read(I, imagePathColor); + +// std::vector bgra; +// common_tools::RGBaToBGRa(I, bgra); + +// SECTION("Check BGRa to RGBa conversion") +// { +// vpImage ref(I.getHeight(), I.getWidth()); +// common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast(ref.bitmap), +// I.getWidth(), I.getHeight(), false); +// vpImage rgba(I.getHeight(), I.getWidth()); +// 
vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast(rgba.bitmap), +// I.getWidth(), I.getHeight(), false); + +// CHECK((rgba == ref)); +// } +// vpImage I_rgba(I.getHeight(), I.getWidth()); +// BENCHMARK("Benchmark bgra to rgba (ViSP)") { +// vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast(I_rgba.bitmap), +// I.getWidth(), I.getHeight(), false); +// return I_rgba; +// }; +//} +//#endif + +int main(int argc, char *argv[]) +{ + Catch::Session session; // There must be exactly one instance + + bool runBenchmark = false; + // Build a new parser on top of Catch's + using namespace Catch::clara; + auto cli = session.cli() // Get Catch's composite command line parser + | Opt(runBenchmark) // bind variable to a new option, with a hint string + ["--benchmark"] // the option names it will respond to + ("run benchmark?") // description string for the help output + | Opt(imagePathJpeg, "imagePathColor") + ["--imagePathColor"] + ("Path to color image") + | Opt(imagePathPng, "imagePathColor") + ["--imagePathGray"] + ("Path to gray image") + | Opt(nThreads, "nThreads") + ["--nThreads"] + ("Number of threads"); + + // Now pass the new composite back to Catch so it uses that + session.cli(cli); + + // Let Catch (using Clara) parse the command line + session.applyCommandLine(argc, argv); + + if (runBenchmark) { +// vpImage I_color; +// vpImageIo::read(I_color, imagePathColor); +// std::cout << "imagePathColor:\n\t" << imagePathColor << "\n\t" << I_color.getWidth() << "x" << I_color.getHeight() << std::endl; + +// vpImage I_gray; +// vpImageIo::read(I_gray, imagePathGray); +// std::cout << "imagePathGray:\n\t" << imagePathGray << "\n\t" << I_gray.getWidth() << "x" << I_gray.getHeight() << std::endl; + std::cout << "nThreads: " << nThreads << " / available threads: " << std::thread::hardware_concurrency() << std::endl; + + int numFailed = session.run(); + + // numFailed is clamped to 255 as some unices only use the lower 8 bits. 
+ // This clamping has already been applied, so just return it here + // You can also do any post run clean-up here + return numFailed; + } + + return EXIT_SUCCESS; +} +#else +#include + +int main() +{ + return 0; +} +#endif From aad93cd76dbcf3b08d6ffc1a981499758a0235bf Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Thu, 4 Nov 2021 14:06:19 +0100 Subject: [PATCH 05/18] Fix issue when writing big Jpeg images. --- 3rdparty/simdlib/Simd/SimdImageSaveJpeg.h | 5 +++-- modules/io/test/perfImageLoadSave.cpp | 14 +++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h index d54164f7d4..f3d5f4a96c 100644 --- a/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h +++ b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h @@ -35,9 +35,9 @@ namespace Simd { struct BitBuf { - static const uint32_t capacity = 1024; + static const uint32_t capacity = 2048; uint32_t size; - uint16_t data[1024][2]; + uint16_t data[capacity][2]; SIMD_INLINE BitBuf() : size(0) @@ -51,6 +51,7 @@ namespace Simd SIMD_INLINE bool Full(uint32_t tail = capacity / 2) const { + assert(size <= capacity); return size + tail >= capacity; } diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp index ce0d416b70..8efe2c759e 100644 --- a/modules/io/test/perfImageLoadSave.cpp +++ b/modules/io/test/perfImageLoadSave.cpp @@ -180,14 +180,14 @@ TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") { }; } -// { -// const std::string filename = "/tmp/Big_images_Simd.jpg"; + { + const std::string filename = "/tmp/Big_images_Simd.jpg"; -// BENCHMARK("vpImageIo::writeSimdlib()") { -// vpImageIo::writeSimdlib(I, filename); -// return I; -// }; -// } + BENCHMARK("vpImageIo::writeSimdlib()") { + vpImageIo::writeSimdlib(I, filename); + return I; + }; + } { const std::string filename = "/tmp/Big_images_stb.jpg"; From a70090ceda0d28077dde0217e6515a833c6e6b8a Mon Sep 17 00:00:00 2001 From: Souriya 
Trinh Date: Wed, 17 Nov 2021 00:51:26 +0100 Subject: [PATCH 06/18] Experimental: wip code to try adding a backend system for image I/O. --- .../core/include/visp3/core/vpImageTools.h | 8 +- modules/io/include/visp3/io/vpImageIo.h | 34 +- .../io/src/image/private/vpImageIoBackend.h | 104 + .../io/src/image/private/vpImageIoLibjpeg.cpp | 345 +++ .../io/src/image/private/vpImageIoLibpng.cpp | 615 +++++ .../io/src/image/private/vpImageIoOpenCV.cpp | 205 ++ .../src/image/private/vpImageIoPortable.cpp | 569 +++++ .../io/src/image/private/vpImageIoSimd.cpp | 87 + modules/io/src/image/private/vpImageIoStb.cpp | 121 + modules/io/src/image/vpImageIo.cpp | 2112 ++--------------- modules/io/test/perfImageLoadSave.cpp | 171 +- 11 files changed, 2286 insertions(+), 2085 deletions(-) create mode 100644 modules/io/src/image/private/vpImageIoBackend.h create mode 100644 modules/io/src/image/private/vpImageIoLibjpeg.cpp create mode 100644 modules/io/src/image/private/vpImageIoLibpng.cpp create mode 100644 modules/io/src/image/private/vpImageIoOpenCV.cpp create mode 100644 modules/io/src/image/private/vpImageIoPortable.cpp create mode 100644 modules/io/src/image/private/vpImageIoSimd.cpp create mode 100644 modules/io/src/image/private/vpImageIoStb.cpp diff --git a/modules/core/include/visp3/core/vpImageTools.h b/modules/core/include/visp3/core/vpImageTools.h index bf6e4a77f8..f12246e61d 100644 --- a/modules/core/include/visp3/core/vpImageTools.h +++ b/modules/core/include/visp3/core/vpImageTools.h @@ -1496,19 +1496,19 @@ void vpImageTools::warpLinear(const vpImage &src, const vpMatrix &T, vpIma const Type val01 = src[y_][x_ + 1]; const Type val10 = src[y_ + 1][x_]; const Type val11 = src[y_ + 1][x_ + 1]; - const float col0 = lerp(val00, val01, s); - const float col1 = lerp(val10, val11, s); + const float col0 = lerp(static_cast(val00), static_cast(val01), s); + const float col1 = lerp(static_cast(val10), static_cast(val11), s); const float interp = lerp(col0, col1, t); dst[i][j] = 
vpMath::saturate(interp); } else if (y_ < static_cast(src.getHeight()) - 1) { const Type val00 = src[y_][x_]; const Type val10 = src[y_ + 1][x_]; - const float interp = lerp(val00, val10, t); + const float interp = lerp(static_cast(val00), static_cast(val10), t); dst[i][j] = vpMath::saturate(interp); } else if (x_ < static_cast(src.getWidth()) - 1) { const Type val00 = src[y_][x_]; const Type val01 = src[y_][x_ + 1]; - const float interp = lerp(val00, val01, s); + const float interp = lerp(static_cast(val00), static_cast(val01), s); dst[i][j] = vpMath::saturate(interp); } else { dst[i][j] = src[y_][x_]; diff --git a/modules/io/include/visp3/io/vpImageIo.h b/modules/io/include/visp3/io/vpImageIo.h index 11bd9aa766..fa395e3882 100644 --- a/modules/io/include/visp3/io/vpImageIo.h +++ b/modules/io/include/visp3/io/vpImageIo.h @@ -124,6 +124,16 @@ class VISP_EXPORT vpImageIo static std::string getExtension(const std::string &filename); public: + //TODO: + // Image IO backend for only jpeg and png formats + enum vpImageIoBackendType { + IO_DEFAULT_BACKEND, + IO_LIB_BACKEND, + IO_OPENCV_BACKEND, + IO_SIMDLIB_BACKEND, + IO_STB_IMAGE_BACKEND + }; + static void read(vpImage &I, const std::string &filename); static void read(vpImage &I, const std::string &filename); @@ -138,15 +148,11 @@ class VISP_EXPORT vpImageIo static void readPPM(vpImage &I, const std::string &filename); static void readPPM(vpImage &I, const std::string &filename); - static void readJPEG(vpImage &I, const std::string &filename); - static void readJPEG(vpImage &I, const std::string &filename); - - static void readPNG(vpImage &I, const std::string &filename); - static void readPNG(vpImage &I, const std::string &filename); + static void readJPEG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); + static void readJPEG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); - //TODO: - static void readSimdlib(vpImage &I, 
const std::string &filename); - static void readStb(vpImage &I, const std::string &filename); + static void readPNG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); + static void readPNG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); static void writePFM(const vpImage &I, const std::string &filename); @@ -157,14 +163,10 @@ class VISP_EXPORT vpImageIo static void writePPM(const vpImage &I, const std::string &filename); static void writePPM(const vpImage &I, const std::string &filename); - static void writeJPEG(const vpImage &I, const std::string &filename); - static void writeJPEG(const vpImage &I, const std::string &filename); - - static void writePNG(const vpImage &I, const std::string &filename); - static void writePNG(const vpImage &I, const std::string &filename); + static void writeJPEG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); + static void writeJPEG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); - //TODO: - static void writeSimdlib(vpImage &I, const std::string &filename); - static void writeStb(vpImage &I, const std::string &filename); + static void writePNG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); + static void writePNG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); }; #endif diff --git a/modules/io/src/image/private/vpImageIoBackend.h b/modules/io/src/image/private/vpImageIoBackend.h new file mode 100644 index 0000000000..e1b434c030 --- /dev/null +++ b/modules/io/src/image/private/vpImageIoBackend.h @@ -0,0 +1,104 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. 
+ * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Read/write images. + * + * Authors: + * Eric Marchand + * + *****************************************************************************/ + +/*! 
+ \file vpImageIo.h + \brief Read/write images +*/ + +#ifndef vpIMAGEIOBACKEND_H +#define vpIMAGEIOBACKEND_H + +#include + + +// +void vp_writePFM(const vpImage &I, const std::string &filename); +void vp_writePGM(const vpImage &I, const std::string &filename); +void vp_writePGM(const vpImage &I, const std::string &filename); +void vp_writePGM(const vpImage &I, const std::string &filename); +void vp_readPFM(vpImage &I, const std::string &filename); +void vp_readPGM(vpImage &I, const std::string &filename); +void vp_readPGM(vpImage &I, const std::string &filename); +void vp_readPPM(vpImage &I, const std::string &filename); +void vp_readPPM(vpImage &I, const std::string &filename); +void vp_writePPM(const vpImage &I, const std::string &filename); +void vp_writePPM(const vpImage &I, const std::string &filename); + +// +void readJPEGLibjpeg(vpImage &I, const std::string &filename); +void readJPEGLibjpeg(vpImage &I, const std::string &filename); + +void writeJPEGLibjpeg(const vpImage &I, const std::string &filename); +void writeJPEGLibjpeg(const vpImage &I, const std::string &filename); + +// +void readPNGLibpng(vpImage &I, const std::string &filename); +void readPNGLibpng(vpImage &I, const std::string &filename); + +void writePNGLibpng(const vpImage &I, const std::string &filename); +void writePNGLibpng(const vpImage &I, const std::string &filename); + +// +void readOpenCV(vpImage &I, const std::string &filename); +void readOpenCV(vpImage &I, const std::string &filename); + +void writeOpenCV(const vpImage &I, const std::string &filename); +void writeOpenCV(const vpImage &I, const std::string &filename); + +// +void readSimdlib(vpImage &I, const std::string &filename); +void readSimdlib(vpImage &I, const std::string &filename); + +void writeJPEGSimdlib(const vpImage &I, const std::string &filename); +void writeJPEGSimdlib(const vpImage &I, const std::string &filename); + +void writePNGSimdlib(const vpImage &I, const std::string &filename); +void writePNGSimdlib(const 
vpImage &I, const std::string &filename); + +// +void readStb(vpImage &I, const std::string &filename); +void readStb(vpImage &I, const std::string &filename); + +void writeJPEGStb(const vpImage &I, const std::string &filename); +void writeJPEGStb(const vpImage &I, const std::string &filename); + +void writePNGStb(const vpImage &I, const std::string &filename); +void writePNGStb(const vpImage &I, const std::string &filename); + +#endif diff --git a/modules/io/src/image/private/vpImageIoLibjpeg.cpp b/modules/io/src/image/private/vpImageIoLibjpeg.cpp new file mode 100644 index 0000000000..99debb3021 --- /dev/null +++ b/modules/io/src/image/private/vpImageIoLibjpeg.cpp @@ -0,0 +1,345 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Read/write images. 
+ * + * Authors: + * Eric Marchand + * + *****************************************************************************/ + +/*! + \file vpImageIo.cpp + \brief Read/write images +*/ + +#include "vpImageIoBackend.h" +#include + +//TODO: +#if defined(_WIN32) +// Include WinSock2.h before windows.h to ensure that winsock.h is not +// included by windows.h since winsock.h and winsock2.h are incompatible +#include +#include +#endif + +#if defined(VISP_HAVE_JPEG) +#include +#include +#endif + + +//-------------------------------------------------------------------------- +// JPEG +//-------------------------------------------------------------------------- + +#if defined(VISP_HAVE_JPEG) + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a JPEG file. + + \param I : Image to save as a JPEG file. + \param filename : Name of the file containing the image. +*/ +void writeJPEGLibjpeg(const vpImage &I, const std::string &filename) +{ + struct jpeg_compress_struct cinfo; + struct jpeg_error_mgr jerr; + FILE *file; + + cinfo.err = jpeg_std_error(&jerr); + jpeg_create_compress(&cinfo); + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty")); + } + + file = fopen(filename.c_str(), "wb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str())); + } + + unsigned int width = I.getWidth(); + unsigned int height = I.getHeight(); + + jpeg_stdio_dest(&cinfo, file); + + cinfo.image_width = width; + cinfo.image_height = height; + cinfo.input_components = 1; + cinfo.in_color_space = JCS_GRAYSCALE; + jpeg_set_defaults(&cinfo); + + jpeg_start_compress(&cinfo, TRUE); + + unsigned char *line; + line = new unsigned char[width]; + unsigned char *input = (unsigned char *)I.bitmap; + while (cinfo.next_scanline < cinfo.image_height) { + for (unsigned int i = 0; i < width; 
i++) { + line[i] = *(input); + input++; + } + jpeg_write_scanlines(&cinfo, &line, 1); + } + + jpeg_finish_compress(&cinfo); + jpeg_destroy_compress(&cinfo); + delete[] line; + fclose(file); +} + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a JPEG file. + + \param I : Image to save as a JPEG file. + \param filename : Name of the file containing the image. +*/ +void writeJPEGLibjpeg(const vpImage &I, const std::string &filename) +{ + struct jpeg_compress_struct cinfo; + struct jpeg_error_mgr jerr; + FILE *file; + + cinfo.err = jpeg_std_error(&jerr); + jpeg_create_compress(&cinfo); + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty")); + } + + file = fopen(filename.c_str(), "wb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str())); + } + + unsigned int width = I.getWidth(); + unsigned int height = I.getHeight(); + + jpeg_stdio_dest(&cinfo, file); + + cinfo.image_width = width; + cinfo.image_height = height; + cinfo.input_components = 3; + cinfo.in_color_space = JCS_RGB; + jpeg_set_defaults(&cinfo); + + jpeg_start_compress(&cinfo, TRUE); + + unsigned char *line; + line = new unsigned char[3 * width]; + unsigned char *input = (unsigned char *)I.bitmap; + while (cinfo.next_scanline < cinfo.image_height) { + for (unsigned int i = 0; i < width; i++) { + line[i * 3] = *(input); + input++; + line[i * 3 + 1] = *(input); + input++; + line[i * 3 + 2] = *(input); + input++; + input++; + } + jpeg_write_scanlines(&cinfo, &line, 1); + } + + jpeg_finish_compress(&cinfo); + jpeg_destroy_compress(&cinfo); + delete[] line; + fclose(file); +} + +/*! + Read the contents of the JPEG file, allocate memory + for the corresponding gray level image, if necessary convert the data in + gray level, and set the bitmap whith the gray level data. 
That means that + the image \e I is a "black and white" rendering of the original image in \e + filename, as in a black and white photograph. If necessary, the quantization + formula used is \f$0,299 r + 0,587 g + 0,114 b\f$. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + \param I : Image to set with the \e filename content. + \param filename : Name of the file containing the image. +*/ +void readJPEGLibjpeg(vpImage &I, const std::string &filename) +{ + struct jpeg_decompress_struct cinfo; + struct jpeg_error_mgr jerr; + FILE *file; + + cinfo.err = jpeg_std_error(&jerr); + jpeg_create_decompress(&cinfo); + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty")); + } + + file = fopen(filename.c_str(), "rb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str())); + } + + jpeg_stdio_src(&cinfo, file); + jpeg_read_header(&cinfo, TRUE); + + unsigned int width = cinfo.image_width; + unsigned int height = cinfo.image_height; + + if ((width != I.getWidth()) || (height != I.getHeight())) + I.resize(height, width); + + jpeg_start_decompress(&cinfo); + + unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components); + JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1); + + if (cinfo.out_color_space == JCS_RGB) { + vpImage Ic(height, width); + unsigned char *output = (unsigned char *)Ic.bitmap; + while (cinfo.output_scanline < cinfo.output_height) { + jpeg_read_scanlines(&cinfo, buffer, 1); + for (unsigned int i = 0; i < width; i++) { + *(output++) = buffer[0][i * 3]; + *(output++) = buffer[0][i * 3 + 1]; + *(output++) = buffer[0][i * 3 + 2]; + *(output++) = vpRGBa::alpha_default; + } + } + vpImageConvert::convert(Ic, I); + } + + else if 
(cinfo.out_color_space == JCS_GRAYSCALE) { + while (cinfo.output_scanline < cinfo.output_height) { + unsigned int row = cinfo.output_scanline; + jpeg_read_scanlines(&cinfo, buffer, 1); + memcpy(I[row], buffer[0], rowbytes); + } + } + + jpeg_finish_decompress(&cinfo); + jpeg_destroy_decompress(&cinfo); + fclose(file); +} + +/*! + Read a JPEG file and initialize a scalar image. + + Read the contents of the JPEG file, allocate + memory for the corresponding image, and set + the bitmap whith the content of + the file. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + If the file corresponds to a grayscaled image, a conversion is done to deal + with \e I which is a color image. + + \param I : Color image to set with the \e filename content. + \param filename : Name of the file containing the image. +*/ +void readJPEGLibjpeg(vpImage &I, const std::string &filename) +{ + struct jpeg_decompress_struct cinfo; + struct jpeg_error_mgr jerr; + FILE *file; + + cinfo.err = jpeg_std_error(&jerr); + jpeg_create_decompress(&cinfo); + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty")); + } + + file = fopen(filename.c_str(), "rb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str())); + } + + jpeg_stdio_src(&cinfo, file); + + jpeg_read_header(&cinfo, TRUE); + + unsigned int width = cinfo.image_width; + unsigned int height = cinfo.image_height; + + if ((width != I.getWidth()) || (height != I.getHeight())) + I.resize(height, width); + + jpeg_start_decompress(&cinfo); + + unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components); + JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1); + + if (cinfo.out_color_space == JCS_RGB) { + unsigned char 
*output = (unsigned char *)I.bitmap; + while (cinfo.output_scanline < cinfo.output_height) { + jpeg_read_scanlines(&cinfo, buffer, 1); + for (unsigned int i = 0; i < width; i++) { + *(output++) = buffer[0][i * 3]; + *(output++) = buffer[0][i * 3 + 1]; + *(output++) = buffer[0][i * 3 + 2]; + *(output++) = vpRGBa::alpha_default; + } + } + } + + else if (cinfo.out_color_space == JCS_GRAYSCALE) { + vpImage Ig(height, width); + + while (cinfo.output_scanline < cinfo.output_height) { + unsigned int row = cinfo.output_scanline; + jpeg_read_scanlines(&cinfo, buffer, 1); + memcpy(Ig[row], buffer[0], rowbytes); + } + + vpImageConvert::convert(Ig, I); + } + + jpeg_finish_decompress(&cinfo); + jpeg_destroy_decompress(&cinfo); + fclose(file); +} +#endif diff --git a/modules/io/src/image/private/vpImageIoLibpng.cpp b/modules/io/src/image/private/vpImageIoLibpng.cpp new file mode 100644 index 0000000000..e350e4260b --- /dev/null +++ b/modules/io/src/image/private/vpImageIoLibpng.cpp @@ -0,0 +1,615 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information. 
+ * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Read/write images. + * + * Authors: + * Eric Marchand + * + *****************************************************************************/ + +/*! + \file vpImageIo.cpp + \brief Read/write images +*/ + +#include "vpImageIoBackend.h" +#include + +//TODO: +#if defined(_WIN32) +// Include WinSock2.h before windows.h to ensure that winsock.h is not +// included by windows.h since winsock.h and winsock2.h are incompatible +#include +#include +#endif + +#if defined(VISP_HAVE_PNG) +#include +#endif + + +//-------------------------------------------------------------------------- +// PNG +//-------------------------------------------------------------------------- + +#if defined(VISP_HAVE_PNG) + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a PNG file. + + \param I : Image to save as a PNG file. + \param filename : Name of the file containing the image. 
+*/ +void writePNGLibpng(const vpImage &I, const std::string &filename) +{ + FILE *file; + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty")); + } + + file = fopen(filename.c_str(), "wb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str())); + } + + /* create a png info struct */ + png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); + if (!png_ptr) { + fclose(file); + vpERROR_TRACE("Error during png_create_write_struct()\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + png_infop info_ptr = png_create_info_struct(png_ptr); + if (!info_ptr) { + fclose(file); + png_destroy_write_struct(&png_ptr, NULL); + vpERROR_TRACE("Error during png_create_info_struct()\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + /* initialize the setjmp for returning properly after a libpng error occured + */ + if (setjmp(png_jmpbuf(png_ptr))) { + fclose(file); + png_destroy_write_struct(&png_ptr, &info_ptr); + vpERROR_TRACE("Error during init_io\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + /* setup libpng for using standard C fwrite() function with our FILE pointer + */ + png_init_io(png_ptr, file); + + unsigned int width = I.getWidth(); + unsigned int height = I.getHeight(); + int bit_depth = 8; + int color_type = PNG_COLOR_TYPE_GRAY; + /* set some useful information from header */ + + if (setjmp(png_jmpbuf(png_ptr))) { + fclose(file); + png_destroy_write_struct(&png_ptr, &info_ptr); + vpERROR_TRACE("Error during write header\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE, + PNG_FILTER_TYPE_BASE); + + png_write_info(png_ptr, 
info_ptr); + + png_bytep *row_ptrs = new png_bytep[height]; + for (unsigned int i = 0; i < height; i++) + row_ptrs[i] = new png_byte[width]; + + unsigned char *input = (unsigned char *)I.bitmap; + + for (unsigned int i = 0; i < height; i++) { + png_byte *row = row_ptrs[i]; + for (unsigned int j = 0; j < width; j++) { + row[j] = *(input); + input++; + } + } + + png_write_image(png_ptr, row_ptrs); + + png_write_end(png_ptr, NULL); + + for (unsigned int j = 0; j < height; j++) + delete[] row_ptrs[j]; + + delete[] row_ptrs; + + png_destroy_write_struct(&png_ptr, &info_ptr); + + fclose(file); +} + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a PNG file. + + \param I : Image to save as a PNG file. + \param filename : Name of the file containing the image. +*/ +void writePNGLibpng(const vpImage &I, const std::string &filename) +{ + FILE *file; + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty")); + } + + file = fopen(filename.c_str(), "wb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str())); + } + + /* create a png info struct */ + png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); + if (!png_ptr) { + fclose(file); + vpERROR_TRACE("Error during png_create_write_struct()\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + png_infop info_ptr = png_create_info_struct(png_ptr); + if (!info_ptr) { + fclose(file); + png_destroy_write_struct(&png_ptr, NULL); + vpERROR_TRACE("Error during png_create_info_struct()\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + /* initialize the setjmp for returning properly after a libpng error occured + */ + if (setjmp(png_jmpbuf(png_ptr))) { + fclose(file); + png_destroy_write_struct(&png_ptr, 
&info_ptr); + vpERROR_TRACE("Error during init_io\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + /* setup libpng for using standard C fwrite() function with our FILE pointer + */ + png_init_io(png_ptr, file); + + unsigned int width = I.getWidth(); + unsigned int height = I.getHeight(); + int bit_depth = 8; + int color_type = PNG_COLOR_TYPE_RGB; + /* set some useful information from header */ + + if (setjmp(png_jmpbuf(png_ptr))) { + fclose(file); + png_destroy_write_struct(&png_ptr, &info_ptr); + vpERROR_TRACE("Error during write header\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE, + PNG_FILTER_TYPE_BASE); + + png_write_info(png_ptr, info_ptr); + + png_bytep *row_ptrs = new png_bytep[height]; + for (unsigned int i = 0; i < height; i++) + row_ptrs[i] = new png_byte[3 * width]; + + unsigned char *input = (unsigned char *)I.bitmap; + + for (unsigned int i = 0; i < height; i++) { + png_byte *row = row_ptrs[i]; + for (unsigned int j = 0; j < width; j++) { + row[3 * j] = *(input); + input++; + row[3 * j + 1] = *(input); + input++; + row[3 * j + 2] = *(input); + input++; + input++; + } + } + + png_write_image(png_ptr, row_ptrs); + + png_write_end(png_ptr, NULL); + + for (unsigned int j = 0; j < height; j++) + delete[] row_ptrs[j]; + + delete[] row_ptrs; + + png_destroy_write_struct(&png_ptr, &info_ptr); + + fclose(file); +} + +/*! + Read the contents of the PNG file, allocate memory + for the corresponding gray level image, if necessary convert the data in + gray level, and set the bitmap whith the gray level data. That means that + the image \e I is a "black and white" rendering of the original image in \e + filename, as in a black and white photograph. If necessary, the quantization + formula used is \f$0,299 r + 0,587 g + 0,114 b\f$. 
+ + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + \param I : Image to set with the \e filename content. + \param filename : Name of the file containing the image. +*/ +void readPNGLibpng(vpImage &I, const std::string &filename) +{ + FILE *file; + png_byte magic[8]; + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty")); + } + + file = fopen(filename.c_str(), "rb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str())); + } + + /* read magic number */ + if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) { + fclose(file); + throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str())); + } + + /* check for valid magic number */ + if (png_sig_cmp(magic, 0, sizeof(magic))) { + fclose(file); + throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image", + filename.c_str())); + } + + /* create a png read struct */ + // printf("version %s\n", PNG_LIBPNG_VER_STRING); + png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); + if (png_ptr == NULL) { + fprintf(stderr, "error: can't create a png read structure!\n"); + fclose(file); + throw(vpImageException(vpImageException::ioError, "error reading png file")); + } + + /* create a png info struct */ + png_infop info_ptr = png_create_info_struct(png_ptr); + if (info_ptr == NULL) { + fprintf(stderr, "error: can't create a png info structure!\n"); + fclose(file); + png_destroy_read_struct(&png_ptr, NULL, NULL); + throw(vpImageException(vpImageException::ioError, "error reading png file")); + } + + /* initialize the setjmp for returning properly after a libpng error occured + */ + if (setjmp(png_jmpbuf(png_ptr))) { + fclose(file); + 
png_destroy_read_struct(&png_ptr, &info_ptr, NULL); + vpERROR_TRACE("Error during init io\n"); + throw(vpImageException(vpImageException::ioError, "PNG read error")); + } + + /* setup libpng for using standard C fread() function with our FILE pointer + */ + png_init_io(png_ptr, file); + + /* tell libpng that we have already read the magic number */ + png_set_sig_bytes(png_ptr, sizeof(magic)); + + /* read png info */ + png_read_info(png_ptr, info_ptr); + + unsigned int width = png_get_image_width(png_ptr, info_ptr); + unsigned int height = png_get_image_height(png_ptr, info_ptr); + + unsigned int bit_depth, channels, color_type; + /* get some useful information from header */ + bit_depth = png_get_bit_depth(png_ptr, info_ptr); + channels = png_get_channels(png_ptr, info_ptr); + color_type = png_get_color_type(png_ptr, info_ptr); + + /* convert index color images to RGB images */ + if (color_type == PNG_COLOR_TYPE_PALETTE) + png_set_palette_to_rgb(png_ptr); + + /* convert 1-2-4 bits grayscale images to 8 bits grayscale. 
*/ + if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) + png_set_expand(png_ptr); + + // if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS)) + // png_set_tRNS_to_alpha (png_ptr); + + if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA) + png_set_strip_alpha(png_ptr); + + if (bit_depth == 16) + png_set_strip_16(png_ptr); + else if (bit_depth < 8) + png_set_packing(png_ptr); + + /* update info structure to apply transformations */ + png_read_update_info(png_ptr, info_ptr); + + channels = png_get_channels(png_ptr, info_ptr); + + if ((width != I.getWidth()) || (height != I.getHeight())) + I.resize(height, width); + + png_bytep *rowPtrs = new png_bytep[height]; + + unsigned int stride = png_get_rowbytes(png_ptr, info_ptr); + unsigned char *data = new unsigned char[stride * height]; + + for (unsigned int i = 0; i < height; i++) + rowPtrs[i] = (png_bytep)data + (i * stride); + + png_read_image(png_ptr, rowPtrs); + + vpImage Ic(height, width); + unsigned char *output; + + switch (channels) { + case 1: + output = (unsigned char *)I.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i]; + } + break; + + case 2: + output = (unsigned char *)I.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i * 2]; + } + break; + + case 3: + output = (unsigned char *)Ic.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i * 3]; + *(output++) = data[i * 3 + 1]; + *(output++) = data[i * 3 + 2]; + *(output++) = vpRGBa::alpha_default; + } + vpImageConvert::convert(Ic, I); + break; + + case 4: + output = (unsigned char *)Ic.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i * 4]; + *(output++) = data[i * 4 + 1]; + *(output++) = data[i * 4 + 2]; + *(output++) = data[i * 4 + 3]; + } + vpImageConvert::convert(Ic, I); + break; + } + + delete[](png_bytep) rowPtrs; + delete[] data; + png_read_end(png_ptr, NULL); + png_destroy_read_struct(&png_ptr, &info_ptr, NULL); + 
fclose(file); +} + +/*! + Read a PNG file and initialize a scalar image. + + Read the contents of the PNG file, allocate + memory for the corresponding image, and set + the bitmap whith the content of + the file. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + If the file corresponds to a grayscaled image, a conversion is done to deal + with \e I which is a color image. + + \param I : Color image to set with the \e filename content. + \param filename : Name of the file containing the image. +*/ +void readPNGLibpng(vpImage &I, const std::string &filename) +{ + FILE *file; + png_byte magic[8]; + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty")); + } + + file = fopen(filename.c_str(), "rb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str())); + } + + /* read magic number */ + if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) { + fclose(file); + throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str())); + } + + /* check for valid magic number */ + if (png_sig_cmp(magic, 0, sizeof(magic))) { + fclose(file); + throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image", + filename.c_str())); + } + + /* create a png read struct */ + png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); + if (!png_ptr) { + fclose(file); + vpERROR_TRACE("Error during png_create_read_struct()\n"); + throw(vpImageException(vpImageException::ioError, "PNG read error")); + } + + /* create a png info struct */ + png_infop info_ptr = png_create_info_struct(png_ptr); + if (!info_ptr) { + fclose(file); + png_destroy_read_struct(&png_ptr, NULL, NULL); + vpERROR_TRACE("Error during 
png_create_info_struct()\n"); + throw(vpImageException(vpImageException::ioError, "PNG read error")); + } + + /* initialize the setjmp for returning properly after a libpng error occured + */ + if (setjmp(png_jmpbuf(png_ptr))) { + fclose(file); + png_destroy_read_struct(&png_ptr, &info_ptr, NULL); + vpERROR_TRACE("Error during init io\n"); + throw(vpImageException(vpImageException::ioError, "PNG read error")); + } + + /* setup libpng for using standard C fread() function with our FILE pointer + */ + png_init_io(png_ptr, file); + + /* tell libpng that we have already read the magic number */ + png_set_sig_bytes(png_ptr, sizeof(magic)); + + /* read png info */ + png_read_info(png_ptr, info_ptr); + + unsigned int width = png_get_image_width(png_ptr, info_ptr); + unsigned int height = png_get_image_height(png_ptr, info_ptr); + + unsigned int bit_depth, channels, color_type; + /* get some useful information from header */ + bit_depth = png_get_bit_depth(png_ptr, info_ptr); + channels = png_get_channels(png_ptr, info_ptr); + color_type = png_get_color_type(png_ptr, info_ptr); + + /* convert index color images to RGB images */ + if (color_type == PNG_COLOR_TYPE_PALETTE) + png_set_palette_to_rgb(png_ptr); + + /* convert 1-2-4 bits grayscale images to 8 bits grayscale. 
*/ + if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) + png_set_expand(png_ptr); + + // if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS)) + // png_set_tRNS_to_alpha (png_ptr); + + if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA) + png_set_strip_alpha(png_ptr); + + if (bit_depth == 16) + png_set_strip_16(png_ptr); + else if (bit_depth < 8) + png_set_packing(png_ptr); + + /* update info structure to apply transformations */ + png_read_update_info(png_ptr, info_ptr); + + channels = png_get_channels(png_ptr, info_ptr); + + if ((width != I.getWidth()) || (height != I.getHeight())) + I.resize(height, width); + + png_bytep *rowPtrs = new png_bytep[height]; + + unsigned int stride = png_get_rowbytes(png_ptr, info_ptr); + unsigned char *data = new unsigned char[stride * height]; + + for (unsigned int i = 0; i < height; i++) + rowPtrs[i] = (png_bytep)data + (i * stride); + + png_read_image(png_ptr, rowPtrs); + + vpImage Ig(height, width); + unsigned char *output; + + switch (channels) { + case 1: + output = (unsigned char *)Ig.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i]; + } + vpImageConvert::convert(Ig, I); + break; + + case 2: + output = (unsigned char *)Ig.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i * 2]; + } + vpImageConvert::convert(Ig, I); + break; + + case 3: + output = (unsigned char *)I.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i * 3]; + *(output++) = data[i * 3 + 1]; + *(output++) = data[i * 3 + 2]; + *(output++) = vpRGBa::alpha_default; + } + break; + + case 4: + output = (unsigned char *)I.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i * 4]; + *(output++) = data[i * 4 + 1]; + *(output++) = data[i * 4 + 2]; + *(output++) = data[i * 4 + 3]; + } + break; + } + + delete[](png_bytep) rowPtrs; + delete[] data; + png_read_end(png_ptr, NULL); + png_destroy_read_struct(&png_ptr, &info_ptr, NULL); + 
fclose(file); +} +#endif diff --git a/modules/io/src/image/private/vpImageIoOpenCV.cpp b/modules/io/src/image/private/vpImageIoOpenCV.cpp new file mode 100644 index 0000000000..93b6a1ca1d --- /dev/null +++ b/modules/io/src/image/private/vpImageIoOpenCV.cpp @@ -0,0 +1,205 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Read/write images. + * + * Authors: + * Eric Marchand + * + *****************************************************************************/ + +/*! 
+ \file vpImageIo.cpp + \brief Read/write images +*/ + +#include "vpImageIoBackend.h" + +//TODO: +#ifdef VISP_HAVE_OPENCV +#if (VISP_HAVE_OPENCV_VERSION >= 0x030000) // Require opencv >= 3.0.0 +# include +#elif (VISP_HAVE_OPENCV_VERSION >= 0x020408) // Require opencv >= 2.4.8 +# include +# include +# include +#elif (VISP_HAVE_OPENCV_VERSION >= 0x020101) // Require opencv >= 2.1.1 +# include +# include +# include +# include +#else +# include +#endif +#endif + +#include + + +#if defined(VISP_HAVE_OPENCV) + +/*! + Read the contents of the JPEG file, allocate memory + for the corresponding gray level image, if necessary convert the data in + gray level, and set the bitmap whith the gray level data. That means that + the image \e I is a "black and white" rendering of the original image in \e + filename, as in a black and white photograph. If necessary, the quantization + formula used is \f$0,299 r + 0,587 g + 0,114 b\f$. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + If EXIF information is embedded in the image file, the EXIF orientation is ignored. + + \param I : Image to set with the \e filename content. + \param filename : Name of the file containing the image. 
+ +*/ +void readOpenCV(vpImage &I, const std::string &filename) +{ +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 +#if VISP_HAVE_OPENCV_VERSION >= 0x030200 + int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION; +#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 + int flags = cv::IMREAD_GRAYSCALE; +#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 + int flags = CV_LOAD_IMAGE_GRAYSCALE; +#endif + cv::Mat Ip = cv::imread(filename.c_str(), flags); + if (!Ip.empty()) + vpImageConvert::convert(Ip, I); + else + throw(vpImageException(vpImageException::ioError, "Can't read the image")); +#else + IplImage *Ip = NULL; + Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE); + if (Ip != NULL) + vpImageConvert::convert(Ip, I); + else + throw(vpImageException(vpImageException::ioError, "Can't read the image")); + cvReleaseImage(&Ip); +#endif +} + +/*! + Read a JPEG file and initialize a scalar image. + + Read the contents of the JPEG file, allocate + memory for the corresponding image, and set + the bitmap whith the content of + the file. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + If the file corresponds to a grayscaled image, a conversion is done to deal + with \e I which is a color image. + + If EXIF information is embedded in the image file, the EXIF orientation is ignored. + + \param I : Color image to set with the \e filename content. + \param filename : Name of the file containing the image. 
+*/ +void readOpenCV(vpImage &I, const std::string &filename) +{ +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 +#if VISP_HAVE_OPENCV_VERSION >= 0x030200 + int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION; +#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 + int flags = cv::IMREAD_GRAYSCALE; +#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 + int flags = CV_LOAD_IMAGE_GRAYSCALE; +#endif + cv::Mat Ip = cv::imread(filename.c_str(), flags); + if (!Ip.empty()) + vpImageConvert::convert(Ip, I); + else + throw(vpImageException(vpImageException::ioError, "Can't read the image")); +#else + IplImage *Ip = NULL; + Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_COLOR); + if (Ip != NULL) + vpImageConvert::convert(Ip, I); + else + throw(vpImageException(vpImageException::ioError, "Can't read the image")); + cvReleaseImage(&Ip); +#endif +} + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a JPEG file. + + \param I : Image to save as a JPEG file. + \param filename : Name of the file containing the image. +*/ +void writeOpenCV(const vpImage &I, const std::string &filename) +{ +#if (VISP_HAVE_OPENCV_VERSION >= 0x020408) + cv::Mat Ip; + vpImageConvert::convert(I, Ip); + cv::imwrite(filename.c_str(), Ip); +#else + IplImage *Ip = NULL; + vpImageConvert::convert(I, Ip); + + cvSaveImage(filename.c_str(), Ip); + + cvReleaseImage(&Ip); +#endif +} + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a JPEG file. + + \param I : Image to save as a JPEG file. + \param filename : Name of the file containing the image. 
+*/ +void writeOpenCV(const vpImage &I, const std::string &filename) +{ +#if (VISP_HAVE_OPENCV_VERSION >= 0x020408) + cv::Mat Ip; + vpImageConvert::convert(I, Ip); + cv::imwrite(filename.c_str(), Ip); +#else + IplImage *Ip = NULL; + vpImageConvert::convert(I, Ip); + + cvSaveImage(filename.c_str(), Ip); + + cvReleaseImage(&Ip); +#endif +} + +#endif diff --git a/modules/io/src/image/private/vpImageIoPortable.cpp b/modules/io/src/image/private/vpImageIoPortable.cpp new file mode 100644 index 0000000000..0031e4c96a --- /dev/null +++ b/modules/io/src/image/private/vpImageIoPortable.cpp @@ -0,0 +1,569 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Read/write images. + * + * Authors: + * Eric Marchand + * + *****************************************************************************/ + +/*! 
+ \file vpImageIo.cpp + \brief Read/write images +*/ + +#include "vpImageIoBackend.h" +#include +#include + +//TODO: +#if defined(_WIN32) +// Include WinSock2.h before windows.h to ensure that winsock.h is not +// included by windows.h since winsock.h and winsock2.h are incompatible +#include +#include +#endif + + +void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w, + unsigned int &h, unsigned int &maxval); + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +/*! + * Decode the PNM image header. + * \param filename[in] : File name. + * \param fd[in] : File desdcriptor. + * \param magic[in] : Magic number for identifying the file type. + * \param w[out] : Image width. + * \param h[out] : Image height. + * \param maxval[out] : Maximum pixel value. + */ +void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w, + unsigned int &h, unsigned int &maxval) +{ + std::string line; + unsigned int nb_elt = 4, cpt_elt = 0; + while (cpt_elt != nb_elt) { + // Skip empty lines or lines starting with # (comment) + while (std::getline(fd, line) && (line.compare(0, 1, "#") == 0 || line.size() == 0)) { + } + + if (fd.eof()) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str())); + } + + std::vector header = vpIoTools::splitChain(line, std::string(" ")); + + if (header.size() == 0) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str())); + } + + if (cpt_elt == 0) { // decode magic + if (header[0].compare(0, magic.size(), magic) != 0) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "\"%s\" is not a PNM file with magic number %s", + filename.c_str(), magic.c_str())); + } + cpt_elt++; + header.erase(header.begin(), + header.begin() + 1); // erase first element that is processed + } + while (header.size()) { + if (cpt_elt == 1) { // 
decode width + std::istringstream ss(header[0]); + ss >> w; + cpt_elt++; + header.erase(header.begin(), + header.begin() + 1); // erase first element that is processed + } else if (cpt_elt == 2) { // decode height + std::istringstream ss(header[0]); + ss >> h; + cpt_elt++; + header.erase(header.begin(), + header.begin() + 1); // erase first element that is processed + } else if (cpt_elt == 3) { // decode maxval + std::istringstream ss(header[0]); + ss >> maxval; + cpt_elt++; + header.erase(header.begin(), + header.begin() + 1); // erase first element that is processed + } + } + } +} +#endif + +//-------------------------------------------------------------------------- +// PFM +//-------------------------------------------------------------------------- + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function is built like portable gray pixmap (eg PGM P5) file. + but considers float image data. + + \param I : Image to save as a (PFM P8) file. + \param filename : Name of the file containing the image. 
+*/ +void vp_writePFM(const vpImage &I, const std::string &filename) +{ + FILE *fd; + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot write PFM image: filename empty")); + } + + fd = fopen(filename.c_str(), "wb"); + + if (fd == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create PFM file \"%s\"", filename.c_str())); + } + + // Write the head + fprintf(fd, "P8\n"); // Magic number + fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size + fprintf(fd, "255\n"); // Max level + + // Write the bitmap + size_t ierr; + size_t nbyte = I.getWidth() * I.getHeight(); + + ierr = fwrite(I.bitmap, sizeof(float), nbyte, fd); + if (ierr != nbyte) { + fclose(fd); + throw(vpImageException(vpImageException::ioError, "Cannot save PFM file \"%s\": only %d bytes over %d saved ", + filename.c_str(), ierr, nbyte)); + } + + fflush(fd); + fclose(fd); +} + +//-------------------------------------------------------------------------- +// PGM +//-------------------------------------------------------------------------- + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a portable gray pixmap (PGM P5) file. + + \param I : Image to save as a (PGM P5) file. + \param filename : Name of the file containing the image. 
+*/ +void vp_writePGM(const vpImage &I, const std::string &filename) +{ + FILE *fd; + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty")); + } + + fd = fopen(filename.c_str(), "wb"); + + if (fd == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str())); + } + + // Write the head + fprintf(fd, "P5\n"); // Magic number + fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size + fprintf(fd, "255\n"); // Max level + + // Write the bitmap + size_t ierr; + size_t nbyte = I.getWidth() * I.getHeight(); + + ierr = fwrite(I.bitmap, sizeof(unsigned char), nbyte, fd); + if (ierr != nbyte) { + fclose(fd); + throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %d over %d bytes saved", + filename.c_str(), ierr, nbyte)); + } + + fflush(fd); + fclose(fd); +} + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a portable gray pixmap (PGM P5) file. + + \param I : Image to save as a (PGM P5) file. + \param filename : Name of the file containing the image. +*/ +void vp_writePGM(const vpImage &I, const std::string &filename) +{ + vpImage Iuc; + unsigned int nrows = I.getHeight(); + unsigned int ncols = I.getWidth(); + + Iuc.resize(nrows, ncols); + + for (unsigned int i = 0; i < nrows * ncols; i++) + Iuc.bitmap[i] = (unsigned char)I.bitmap[i]; + + vp_writePGM(Iuc, filename); +} + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a portable gray pixmap (PGM P5) file. + Color image is converted into a grayscale image. + + \param I : Image to save as a (PGM P5) file. + \param filename : Name of the file containing the image. 
+*/ +void vp_writePGM(const vpImage &I, const std::string &filename) +{ + + FILE *fd; + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty")); + } + + fd = fopen(filename.c_str(), "wb"); + + if (fd == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str())); + } + + // Write the head + fprintf(fd, "P5\n"); // Magic number + fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size + fprintf(fd, "255\n"); // Max level + + // Write the bitmap + size_t ierr; + size_t nbyte = I.getWidth() * I.getHeight(); + + vpImage Itmp; + vpImageConvert::convert(I, Itmp); + + ierr = fwrite(Itmp.bitmap, sizeof(unsigned char), nbyte, fd); + if (ierr != nbyte) { + fclose(fd); + throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %d over %d bytes saved", + filename.c_str(), ierr, nbyte)); + } + + fflush(fd); + fclose(fd); +} + +/*! + Read a PFM P8 file and initialize a float image. + + Read the contents of the portable gray pixmap (PFM P8) filename, allocate + memory for the corresponding image, and set the bitmap whith the content of + the file. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + \param I : Image to set with the \e filename content. + \param filename : Name of the file containing the image. 
+*/ +void vp_readPFM(vpImage &I, const std::string &filename) +{ + unsigned int w = 0, h = 0, maxval = 0; + unsigned int w_max = 100000, h_max = 100000, maxval_max = 255; + std::string magic("P8"); + + std::ifstream fd(filename.c_str(), std::ios::binary); + + // Open the filename + if (!fd.is_open()) { + throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str())); + } + + vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval); + + if (w > w_max || h > h_max) { + fd.close(); + throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str())); + } + if (maxval > maxval_max) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str())); + } + + if ((h != I.getHeight()) || (w != I.getWidth())) { + I.resize(h, w); + } + + unsigned int nbyte = I.getHeight() * I.getWidth(); + fd.read((char *)I.bitmap, sizeof(float) * nbyte); + if (!fd) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", fd.gcount(), nbyte, + filename.c_str())); + } + + fd.close(); +} + +/*! + Read a PGM P5 file and initialize a scalar image. + + Read the contents of the portable gray pixmap (PGM P5) filename, allocate + memory for the corresponding image, and set the bitmap whith the content of + the file. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + \param I : Image to set with the \e filename content. + \param filename : Name of the file containing the image. 
+*/ +void vp_readPGM(vpImage &I, const std::string &filename) +{ + unsigned int w = 0, h = 0, maxval = 0; + unsigned int w_max = 100000, h_max = 100000, maxval_max = 255; + std::string magic("P5"); + + std::ifstream fd(filename.c_str(), std::ios::binary); + + // Open the filename + if (!fd.is_open()) { + throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str())); + } + + vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval); + + if (w > w_max || h > h_max) { + fd.close(); + throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str())); + } + if (maxval > maxval_max) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str())); + } + + if ((h != I.getHeight()) || (w != I.getWidth())) { + I.resize(h, w); + } + + unsigned int nbyte = I.getHeight() * I.getWidth(); + fd.read((char *)I.bitmap, nbyte); + if (!fd) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", fd.gcount(), nbyte, + filename.c_str())); + } + + fd.close(); +} + +/*! + Read a PGM P5 file and initialize a scalar image. + + Read the contents of the portable gray pixmap (PGM P5) filename, allocate + memory for the corresponding image, and set the bitmap whith the content of + the file. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + The gray level image contained in the \e filename is converted in a + color image in \e I. + + \param I : Color image to set with the \e filename content. + \param filename : Name of the file containing the image. 
+*/ +void vp_readPGM(vpImage &I, const std::string &filename) +{ + vpImage Itmp; + + vp_readPGM(Itmp, filename); + + vpImageConvert::convert(Itmp, I); +} + +//-------------------------------------------------------------------------- +// PPM +//-------------------------------------------------------------------------- + +/*! + Read the contents of the portable pixmap (PPM P6) filename, allocate memory + for the corresponding gray level image, convert the data in gray level, and + set the bitmap whith the gray level data. That means that the image \e I is + a "black and white" rendering of the original image in \e filename, as in a + black and white photograph. The quantization formula used is \f$0,299 r + + 0,587 g + 0,114 b\f$. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + \param I : Image to set with the \e filename content. + \param filename : Name of the file containing the image. +*/ +void vp_readPPM(vpImage &I, const std::string &filename) +{ + vpImage Itmp; + + vp_readPPM(Itmp, filename); + + vpImageConvert::convert(Itmp, I); +} + +/*! + Read the contents of the portable pixmap (PPM P6) filename, + allocate memory for the corresponding vpRGBa image. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + \param I : Image to set with the \e filename content. + \param filename : Name of the file containing the image. 
+*/ +void vp_readPPM(vpImage &I, const std::string &filename) +{ + unsigned int w = 0, h = 0, maxval = 0; + unsigned int w_max = 100000, h_max = 100000, maxval_max = 255; + std::string magic("P6"); + + std::ifstream fd(filename.c_str(), std::ios::binary); + + // Open the filename + if (!fd.is_open()) { + throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str())); + } + + vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval); + + if (w > w_max || h > h_max) { + fd.close(); + throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str())); + } + if (maxval > maxval_max) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str())); + } + + if ((h != I.getHeight()) || (w != I.getWidth())) { + I.resize(h, w); + } + + for (unsigned int i = 0; i < I.getHeight(); i++) { + for (unsigned int j = 0; j < I.getWidth(); j++) { + unsigned char rgb[3]; + fd.read((char *)&rgb, 3); + + if (!fd) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", + (i * I.getWidth() + j) * 3 + fd.gcount(), I.getSize() * 3, filename.c_str())); + } + + I[i][j].R = rgb[0]; + I[i][j].G = rgb[1]; + I[i][j].B = rgb[2]; + I[i][j].A = vpRGBa::alpha_default; + } + } + + fd.close(); +} + +/*! + Write the content of the bitmap in the file which name is given by \e + filename. This function writes a portable gray pixmap (PPM P6) file. + grayscale image is converted into a color image vpRGBa. + + \param I : Image to save as a (PPM P6) file. + \param filename : Name of the file containing the image. +*/ +void vp_writePPM(const vpImage &I, const std::string &filename) +{ + vpImage Itmp; + + vpImageConvert::convert(I, Itmp); + + vp_writePPM(Itmp, filename); +} + +/*! + Write the content of the bitmap in the file which name is given by \e + filename. This function writes a portable gray pixmap (PPM P6) file. 
+ + \param I : Image to save as a (PPM P6) file. + \param filename : Name of the file containing the image. +*/ +void vp_writePPM(const vpImage &I, const std::string &filename) +{ + FILE *f; + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot create PPM file: filename empty")); + } + + f = fopen(filename.c_str(), "wb"); + + if (f == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create PPM file \"%s\"", filename.c_str())); + } + + fprintf(f, "P6\n"); // Magic number + fprintf(f, "%u %u\n", I.getWidth(), I.getHeight()); // Image size + fprintf(f, "%d\n", 255); // Max level + + for (unsigned int i = 0; i < I.getHeight(); i++) { + for (unsigned int j = 0; j < I.getWidth(); j++) { + vpRGBa v = I[i][j]; + unsigned char rgb[3]; + rgb[0] = v.R; + rgb[1] = v.G; + rgb[2] = v.B; + + size_t res = fwrite(&rgb, 1, 3, f); + if (res != 3) { + fclose(f); + throw(vpImageException(vpImageException::ioError, "cannot write file \"%s\"", filename.c_str())); + } + } + } + + fflush(f); + fclose(f); +} diff --git a/modules/io/src/image/private/vpImageIoSimd.cpp b/modules/io/src/image/private/vpImageIoSimd.cpp new file mode 100644 index 0000000000..40986bf743 --- /dev/null +++ b/modules/io/src/image/private/vpImageIoSimd.cpp @@ -0,0 +1,87 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. 
+ * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Read/write images. + * + * Authors: + * Eric Marchand + * + *****************************************************************************/ + +/*! + \file vpImageIo.cpp + \brief Read/write images +*/ + +#include "vpImageIoBackend.h" + +//TODO: +#include + + +//TODO: +void readSimdlib(vpImage &I, const std::string &filename) +{ + size_t stride = 0, width = 0, height = 0; + SimdPixelFormatType format = SimdPixelFormatGray8; + uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format); + const bool copyData = false; + I.init(data, (unsigned int)height, (unsigned int)width, copyData); +} + +void readSimdlib(vpImage &I, const std::string &filename) +{ + size_t stride = 0, width = 0, height = 0; + SimdPixelFormatType format = SimdPixelFormatRgba32; + uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format); + const bool copyData = false; + I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData); +} + +void writeJPEGSimdlib(const vpImage &I, const std::string &filename) +{ + SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFileJpeg, 90, filename.c_str()); +} + +void writeJPEGSimdlib(const vpImage &I, const std::string &filename) +{ + SimdImageSaveToFile((const uint8_t 
*)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, 90, filename.c_str()); +} + +void writePNGSimdlib(const vpImage &I, const std::string &filename) +{ + SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFileJpeg, 90, filename.c_str()); +} + +void writePNGSimdlib(const vpImage &I, const std::string &filename) +{ + SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFilePng, 90, filename.c_str()); +} diff --git a/modules/io/src/image/private/vpImageIoStb.cpp b/modules/io/src/image/private/vpImageIoStb.cpp new file mode 100644 index 0000000000..97b453d841 --- /dev/null +++ b/modules/io/src/image/private/vpImageIoStb.cpp @@ -0,0 +1,121 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information. 
+ * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Read/write images. + * + * Authors: + * Eric Marchand + * + *****************************************************************************/ + +/*! + \file vpImageIo.cpp + \brief Read/write images +*/ + +#include "vpImageIoBackend.h" + +//TODO: +#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) +# define VISP_HAVE_SSE2 1 +#endif + +#ifndef VISP_HAVE_SSE2 +# define STBI_NO_SIMD +#endif + +#define STB_IMAGE_IMPLEMENTATION +#include + +#define STB_IMAGE_WRITE_IMPLEMENTATION +#include + + +//TODO: +void readStb(vpImage &I, const std::string &filename) +{ + int width = 0, height = 0, channels = 0; + unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_grey); + if (image == NULL) { + throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); + } + I.init(image, static_cast(height), static_cast(width), true); + stbi_image_free(image); +} + +void readStb(vpImage &I, const std::string &filename) +{ + int width = 0, height = 0, channels = 0; + unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha); + if (image == NULL) { + throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); + } + I.init(reinterpret_cast(image), static_cast(height), static_cast(width), true); + stbi_image_free(image); +} + +void writeJPEGStb(const vpImage &I, const std::string &filename) +{ + int res = stbi_write_jpg(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), 
STBI_grey, + reinterpret_cast(I.bitmap), 90); + if (res == 0) { + throw(vpImageException(vpImageException::ioError, "JEPG write error")); + } +} + +void writeJPEGStb(const vpImage &I, const std::string &filename) +{ + int res = stbi_write_jpg(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, + reinterpret_cast(I.bitmap), 90); + if (res == 0) { + throw(vpImageException(vpImageException::ioError, "JEPG write error")); + } +} + +void writePNGStb(const vpImage &I, const std::string &filename) +{ + const int stride_in_bytes = static_cast(I.getWidth()); + int res = stbi_write_png(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_grey, + reinterpret_cast(I.bitmap), stride_in_bytes); + if (res == 0) { + throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str())); + } +} + +void writePNGStb(const vpImage &I, const std::string &filename) +{ + const int stride_in_bytes = static_cast(4 * I.getWidth()); + int res = stbi_write_png(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, + reinterpret_cast(I.bitmap), stride_in_bytes); + if (res == 0) { + throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str())); + } +} diff --git a/modules/io/src/image/vpImageIo.cpp b/modules/io/src/image/vpImageIo.cpp index cc7799d158..e8b221049e 100644 --- a/modules/io/src/image/vpImageIo.cpp +++ b/modules/io/src/image/vpImageIo.cpp @@ -46,119 +46,9 @@ #include #include -#if defined(_WIN32) -// Include WinSock2.h before windows.h to ensure that winsock.h is not -// included by windows.h since winsock.h and winsock2.h are incompatible -#include -#include -#endif - -#if defined(VISP_HAVE_JPEG) -#include -#include -#endif - -#if defined(VISP_HAVE_PNG) -#include -#endif - //TODO: -#include -//TODO: -#define STB_IMAGE_IMPLEMENTATION -#include - -#define STB_IMAGE_WRITE_IMPLEMENTATION -#include - -#if !defined(VISP_HAVE_OPENCV) -#if 
!defined(VISP_HAVE_JPEG) || !defined(VISP_HAVE_PNG) - -#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) -# define VISP_HAVE_SSE2 1 -#endif - -#ifndef VISP_HAVE_SSE2 -# define STBI_NO_SIMD -#endif - -#define STB_IMAGE_IMPLEMENTATION -#include +#include "private/vpImageIoBackend.h" -#define STB_IMAGE_WRITE_IMPLEMENTATION -#include -#endif -#endif - -void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w, - unsigned int &h, unsigned int &maxval); - -#ifndef DOXYGEN_SHOULD_SKIP_THIS -/*! - * Decode the PNM image header. - * \param filename[in] : File name. - * \param fd[in] : File desdcriptor. - * \param magic[in] : Magic number for identifying the file type. - * \param w[out] : Image width. - * \param h[out] : Image height. - * \param maxval[out] : Maximum pixel value. - */ -void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w, - unsigned int &h, unsigned int &maxval) -{ - std::string line; - unsigned int nb_elt = 4, cpt_elt = 0; - while (cpt_elt != nb_elt) { - // Skip empty lines or lines starting with # (comment) - while (std::getline(fd, line) && (line.compare(0, 1, "#") == 0 || line.size() == 0)) { - } - - if (fd.eof()) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str())); - } - - std::vector header = vpIoTools::splitChain(line, std::string(" ")); - - if (header.size() == 0) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str())); - } - - if (cpt_elt == 0) { // decode magic - if (header[0].compare(0, magic.size(), magic) != 0) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "\"%s\" is not a PNM file with magic number %s", - filename.c_str(), magic.c_str())); - } - cpt_elt++; - header.erase(header.begin(), - header.begin() + 1); // erase first element that is 
processed - } - while (header.size()) { - if (cpt_elt == 1) { // decode width - std::istringstream ss(header[0]); - ss >> w; - cpt_elt++; - header.erase(header.begin(), - header.begin() + 1); // erase first element that is processed - } else if (cpt_elt == 2) { // decode height - std::istringstream ss(header[0]); - ss >> h; - cpt_elt++; - header.erase(header.begin(), - header.begin() + 1); // erase first element that is processed - } else if (cpt_elt == 3) { // decode maxval - std::istringstream ss(header[0]); - ss >> maxval; - cpt_elt++; - header.erase(header.begin(), - header.begin() + 1); // erase first element that is processed - } - } - } -} -#endif vpImageIo::vpImageFormatType vpImageIo::getFormat(const std::string &filename) { @@ -271,18 +161,10 @@ void vpImageIo::read(vpImage &I, const std::string &filename) readPPM(I, final_filename); break; case FORMAT_JPEG: -#ifdef VISP_HAVE_JPEG readJPEG(I, final_filename); -#else - try_opencv_reader = true; -#endif break; case FORMAT_PNG: -#if defined(VISP_HAVE_PNG) readPNG(I, final_filename); -#else - try_opencv_reader = true; -#endif break; case FORMAT_TIFF: case FORMAT_BMP: @@ -297,39 +179,10 @@ void vpImageIo::read(vpImage &I, const std::string &filename) if (try_opencv_reader) { #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 -#if VISP_HAVE_OPENCV_VERSION >= 0x030200 - int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION; -#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 - int flags = cv::IMREAD_GRAYSCALE; -#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 - int flags = CV_LOAD_IMAGE_GRAYSCALE; -#endif - // std::cout << "Use opencv to read the image" << std::endl; - cv::Mat cvI = cv::imread(final_filename, flags); - if (cvI.cols == 0 && cvI.rows == 0) { - std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported"; - throw(vpImageException(vpImageException::ioError, message)); - } - vpImageConvert::convert(cvI, I); + readOpenCV(I, filename); 
#else - switch (getFormat(final_filename)) { - case FORMAT_JPEG: - readJPEG(I, final_filename); - break; - case FORMAT_PNG: - readPNG(I, final_filename); - break; - case FORMAT_BMP: - case FORMAT_TIFF: - case FORMAT_DIB: - case FORMAT_PBM: - case FORMAT_RASTER: - case FORMAT_JPEG2000: - case FORMAT_UNKNOWN: - default: - std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported"; - throw(vpImageException(vpImageException::ioError, message)); - } + std::string message = "Cannot read file \"" + filename + "\": No backend able to support this image format"; + throw(vpImageException(vpImageException::ioError, message)); #endif } } @@ -374,18 +227,10 @@ void vpImageIo::read(vpImage &I, const std::string &filename) readPPM(I, final_filename); break; case FORMAT_JPEG: -#ifdef VISP_HAVE_JPEG readJPEG(I, final_filename); -#else - try_opencv_reader = true; -#endif break; case FORMAT_PNG: -#if defined(VISP_HAVE_PNG) readPNG(I, final_filename); -#else - try_opencv_reader = true; -#endif break; case FORMAT_TIFF: case FORMAT_BMP: @@ -400,39 +245,10 @@ void vpImageIo::read(vpImage &I, const std::string &filename) if (try_opencv_reader) { #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 -#if VISP_HAVE_OPENCV_VERSION >= 0x030200 - int flags = cv::IMREAD_COLOR | cv::IMREAD_IGNORE_ORIENTATION; -#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 - int flags = cv::IMREAD_COLOR; -#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 - int flags = CV_LOAD_IMAGE_COLOR; -#endif - // std::cout << "Use opencv to read the image" << std::endl; - cv::Mat cvI = cv::imread(final_filename, flags); - if (cvI.cols == 0 && cvI.rows == 0) { - std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported"; - throw(vpImageException(vpImageException::ioError, message)); - } - vpImageConvert::convert(cvI, I); + readOpenCV(I, filename); #else - switch (getFormat(final_filename)) { - case FORMAT_JPEG: - 
readJPEG(I, final_filename); - break; - case FORMAT_PNG: - readPNG(I, final_filename); - break; - case FORMAT_BMP: - case FORMAT_TIFF: - case FORMAT_DIB: - case FORMAT_PBM: - case FORMAT_RASTER: - case FORMAT_JPEG2000: - case FORMAT_UNKNOWN: - default: - std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported"; - throw(vpImageException(vpImageException::ioError, message)); - } + std::string message = "Cannot read file \"" + filename + "\": No backend able to support this image format"; + throw(vpImageException(vpImageException::ioError, message)); #endif } } @@ -463,18 +279,10 @@ void vpImageIo::write(const vpImage &I, const std::string &filena writePPM(I, filename); break; case FORMAT_JPEG: -#ifdef VISP_HAVE_JPEG writeJPEG(I, filename); -#else - try_opencv_writer = true; -#endif break; case FORMAT_PNG: -#ifdef VISP_HAVE_PNG writePNG(I, filename); -#else - try_opencv_writer = true; -#endif break; case FORMAT_TIFF: case FORMAT_BMP: @@ -488,30 +296,11 @@ void vpImageIo::write(const vpImage &I, const std::string &filena } if (try_opencv_writer) { -#if VISP_HAVE_OPENCV_VERSION >= 0x020100 - // std::cout << "Use opencv to write the image" << std::endl; - cv::Mat cvI; - vpImageConvert::convert(I, cvI); - cv::imwrite(filename, cvI); +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + writeOpenCV(I, filename); #else - switch (getFormat(filename)) { - case FORMAT_JPEG: - writeJPEG(I, filename); - break; - case FORMAT_PNG: - writePNG(I, filename); - break; - case FORMAT_BMP: - case FORMAT_TIFF: - case FORMAT_DIB: - case FORMAT_PBM: - case FORMAT_RASTER: - case FORMAT_JPEG2000: - case FORMAT_UNKNOWN: - default: - vpCERROR << "Cannot write file: Image format not supported..." 
<< std::endl; - throw(vpImageException(vpImageException::ioError, "Cannot write file: Image format not supported")); - } + std::string message = "Cannot write file \"" + filename + "\": No backend able to support this image format"; + throw(vpImageException(vpImageException::ioError, message)); #endif } } @@ -542,18 +331,10 @@ void vpImageIo::write(const vpImage &I, const std::string &filename) writePPM(I, filename); break; case FORMAT_JPEG: -#ifdef VISP_HAVE_JPEG writeJPEG(I, filename); -#else - try_opencv_writer = true; -#endif break; case FORMAT_PNG: -#ifdef VISP_HAVE_PNG writePNG(I, filename); -#else - try_opencv_writer = true; -#endif break; case FORMAT_TIFF: case FORMAT_BMP: @@ -567,1735 +348,250 @@ void vpImageIo::write(const vpImage &I, const std::string &filename) } if (try_opencv_writer) { -#if VISP_HAVE_OPENCV_VERSION >= 0x020100 - // std::cout << "Use opencv to write the image" << std::endl; - cv::Mat cvI; - vpImageConvert::convert(I, cvI); - cv::imwrite(filename, cvI); +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + writeOpenCV(I, filename); #else - switch (getFormat(filename)) { - case FORMAT_JPEG: - writeJPEG(I, filename); - break; - case FORMAT_PNG: - writePNG(I, filename); - break; - case FORMAT_BMP: - case FORMAT_TIFF: - case FORMAT_DIB: - case FORMAT_PBM: - case FORMAT_RASTER: - case FORMAT_JPEG2000: - case FORMAT_UNKNOWN: - default: - vpCERROR << "Cannot write file: Image format not supported..." << std::endl; - throw(vpImageException(vpImageException::ioError, "Cannot write file: Image format not supported")); - } + std::string message = "Cannot write file \"" + filename + "\": No backend able to support this image format"; + throw(vpImageException(vpImageException::ioError, message)); #endif } } -//-------------------------------------------------------------------------- -// PFM -//-------------------------------------------------------------------------- - -/*! 
- Write the content of the image bitmap in the file which name is given by \e - filename. This function is built like portable gray pixmap (eg PGM P5) file. - but considers float image data. - - \param I : Image to save as a (PFM P8) file. - \param filename : Name of the file containing the image. -*/ - -void vpImageIo::writePFM(const vpImage &I, const std::string &filename) +void vpImageIo::readJPEG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) { - FILE *fd; - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot write PFM image: filename empty")); + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_JPEG) + readJPEGLibjpeg(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": Libjpeg backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + readOpenCV(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + readSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + readStb(I, filename); } +} - fd = fopen(filename.c_str(), "wb"); - - if (fd == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create PFM file \"%s\"", filename.c_str())); +void vpImageIo::readJPEG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) +{ + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_JPEG) + readJPEGLibjpeg(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": Libjpeg backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if 
defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + readOpenCV(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + readSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + readStb(I, filename); } +} - // Write the head - fprintf(fd, "P8\n"); // Magic number - fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size - fprintf(fd, "255\n"); // Max level - - // Write the bitmap - size_t ierr; - size_t nbyte = I.getWidth() * I.getHeight(); - - ierr = fwrite(I.bitmap, sizeof(float), nbyte, fd); - if (ierr != nbyte) { - fclose(fd); - throw(vpImageException(vpImageException::ioError, "Cannot save PFM file \"%s\": only %d bytes over %d saved ", - filename.c_str(), ierr, nbyte)); +void vpImageIo::readPNG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) +{ + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_PNG) + readPNGLibpng(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": Libpng backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + readOpenCV(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + readSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + readStb(I, filename); } - - fflush(fd); - fclose(fd); } -//-------------------------------------------------------------------------- -// PGM -//-------------------------------------------------------------------------- - -/*! 
- Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a portable gray pixmap (PGM P5) file. - - \param I : Image to save as a (PGM P5) file. - \param filename : Name of the file containing the image. -*/ -void vpImageIo::writePGM(const vpImage &I, const std::string &filename) +void vpImageIo::readPNG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) { - - FILE *fd; - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty")); + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_PNG) + readPNGLibpng(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": Libpng backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + readOpenCV(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + readSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + readStb(I, filename); } +} - fd = fopen(filename.c_str(), "wb"); - - if (fd == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str())); +void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) +{ + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_JPEG) + writeJPEGLibjpeg(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": Libjpeg backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 
0x020100 + writeOpenCV(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + writeJPEGSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + writeJPEGStb(I, filename); } +} - // Write the head - fprintf(fd, "P5\n"); // Magic number - fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size - fprintf(fd, "255\n"); // Max level +void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) +{ + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_JPEG) + writeJPEGLibjpeg(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": Libjpeg backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + writeOpenCV(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + writeJPEGSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + writeJPEGStb(I, filename); + } +} - // Write the bitmap - size_t ierr; - size_t nbyte = I.getWidth() * I.getHeight(); +void vpImageIo::writePNG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) +{ + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_PNG) + writePNGLibpng(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": Libpng backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + 
writeOpenCV(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + writePNGSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + writePNGStb(I, filename); + } +} - ierr = fwrite(I.bitmap, sizeof(unsigned char), nbyte, fd); - if (ierr != nbyte) { - fclose(fd); - throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %d over %d bytes saved", - filename.c_str(), ierr, nbyte)); +void vpImageIo::writePNG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) +{ + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_PNG) + writePNGLibpng(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": Libpng backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + writeOpenCV(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + writePNGSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + writePNGStb(I, filename); } +} - fflush(fd); - fclose(fd); +void vpImageIo::writePFM(const vpImage &I, const std::string &filename) +{ + vp_writePFM(I, filename); } -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a portable gray pixmap (PGM P5) file. +void vpImageIo::writePGM(const vpImage &I, const std::string &filename) +{ + vp_writePGM(I, filename); +} - \param I : Image to save as a (PGM P5) file. - \param filename : Name of the file containing the image. 
-*/ void vpImageIo::writePGM(const vpImage &I, const std::string &filename) { - vpImage Iuc; - unsigned int nrows = I.getHeight(); - unsigned int ncols = I.getWidth(); - - Iuc.resize(nrows, ncols); - - for (unsigned int i = 0; i < nrows * ncols; i++) - Iuc.bitmap[i] = (unsigned char)I.bitmap[i]; - - vpImageIo::writePGM(Iuc, filename); + vp_writePGM(I, filename); } -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a portable gray pixmap (PGM P5) file. - Color image is converted into a grayscale image. - - \param I : Image to save as a (PGM P5) file. - \param filename : Name of the file containing the image. -*/ void vpImageIo::writePGM(const vpImage &I, const std::string &filename) { - - FILE *fd; - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty")); - } - - fd = fopen(filename.c_str(), "wb"); - - if (fd == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str())); - } - - // Write the head - fprintf(fd, "P5\n"); // Magic number - fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size - fprintf(fd, "255\n"); // Max level - - // Write the bitmap - size_t ierr; - size_t nbyte = I.getWidth() * I.getHeight(); - - vpImage Itmp; - vpImageConvert::convert(I, Itmp); - - ierr = fwrite(Itmp.bitmap, sizeof(unsigned char), nbyte, fd); - if (ierr != nbyte) { - fclose(fd); - throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %d over %d bytes saved", - filename.c_str(), ierr, nbyte)); - } - - fflush(fd); - fclose(fd); + vp_writePGM(I, filename); } -/*! - Read a PFM P8 file and initialize a float image. - - Read the contents of the portable gray pixmap (PFM P8) filename, allocate - memory for the corresponding image, and set the bitmap whith the content of - the file. 
- - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. - -*/ - void vpImageIo::readPFM(vpImage &I, const std::string &filename) { - unsigned int w = 0, h = 0, maxval = 0; - unsigned int w_max = 100000, h_max = 100000, maxval_max = 255; - std::string magic("P8"); - - std::ifstream fd(filename.c_str(), std::ios::binary); - - // Open the filename - if (!fd.is_open()) { - throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str())); - } - - vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval); - - if (w > w_max || h > h_max) { - fd.close(); - throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str())); - } - if (maxval > maxval_max) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str())); - } - - if ((h != I.getHeight()) || (w != I.getWidth())) { - I.resize(h, w); - } - - unsigned int nbyte = I.getHeight() * I.getWidth(); - fd.read((char *)I.bitmap, sizeof(float) * nbyte); - if (!fd) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", fd.gcount(), nbyte, - filename.c_str())); - } - - fd.close(); + vp_readPFM(I, filename); } -/*! - Read a PGM P5 file and initialize a scalar image. - - Read the contents of the portable gray pixmap (PGM P5) filename, allocate - memory for the corresponding image, and set the bitmap whith the content of - the file. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. 
-*/ - void vpImageIo::readPGM(vpImage &I, const std::string &filename) { - unsigned int w = 0, h = 0, maxval = 0; - unsigned int w_max = 100000, h_max = 100000, maxval_max = 255; - std::string magic("P5"); - - std::ifstream fd(filename.c_str(), std::ios::binary); - - // Open the filename - if (!fd.is_open()) { - throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str())); - } - - vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval); - - if (w > w_max || h > h_max) { - fd.close(); - throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str())); - } - if (maxval > maxval_max) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str())); - } - - if ((h != I.getHeight()) || (w != I.getWidth())) { - I.resize(h, w); - } - - unsigned int nbyte = I.getHeight() * I.getWidth(); - fd.read((char *)I.bitmap, nbyte); - if (!fd) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", fd.gcount(), nbyte, - filename.c_str())); - } - - fd.close(); + vp_readPGM(I, filename); } -/*! - Read a PGM P5 file and initialize a scalar image. - - Read the contents of the portable gray pixmap (PGM P5) filename, allocate - memory for the corresponding image, and set the bitmap whith the content of - the file. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - The gray level image contained in the \e filename is converted in a - color image in \e I. - - \param I : Color image to set with the \e filename content. - \param filename : Name of the file containing the image. 
-*/ - void vpImageIo::readPGM(vpImage &I, const std::string &filename) { - vpImage Itmp; + vp_readPGM(I, filename); +} - vpImageIo::readPGM(Itmp, filename); +void vpImageIo::readPPM(vpImage &I, const std::string &filename) +{ + vp_readPPM(I, filename); +} - vpImageConvert::convert(Itmp, I); +void vpImageIo::readPPM(vpImage &I, const std::string &filename) +{ + vp_readPPM(I, filename); } -//-------------------------------------------------------------------------- -// PPM -//-------------------------------------------------------------------------- - -/*! - Read the contents of the portable pixmap (PPM P6) filename, allocate memory - for the corresponding gray level image, convert the data in gray level, and - set the bitmap whith the gray level data. That means that the image \e I is - a "black and white" rendering of the original image in \e filename, as in a - black and white photograph. The quantization formula used is \f$0,299 r + - 0,587 g + 0,114 b\f$. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. - -*/ -void vpImageIo::readPPM(vpImage &I, const std::string &filename) -{ - vpImage Itmp; - - vpImageIo::readPPM(Itmp, filename); - - vpImageConvert::convert(Itmp, I); -} - -/*! - Read the contents of the portable pixmap (PPM P6) filename, - allocate memory for the corresponding vpRGBa image. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::readPPM(vpImage &I, const std::string &filename) -{ - unsigned int w = 0, h = 0, maxval = 0; - unsigned int w_max = 100000, h_max = 100000, maxval_max = 255; - std::string magic("P6"); - - std::ifstream fd(filename.c_str(), std::ios::binary); - - // Open the filename - if (!fd.is_open()) { - throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str())); - } - - vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval); - - if (w > w_max || h > h_max) { - fd.close(); - throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str())); - } - if (maxval > maxval_max) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str())); - } - - if ((h != I.getHeight()) || (w != I.getWidth())) { - I.resize(h, w); - } - - for (unsigned int i = 0; i < I.getHeight(); i++) { - for (unsigned int j = 0; j < I.getWidth(); j++) { - unsigned char rgb[3]; - fd.read((char *)&rgb, 3); - - if (!fd) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", - (i * I.getWidth() + j) * 3 + fd.gcount(), I.getSize() * 3, filename.c_str())); - } - - I[i][j].R = rgb[0]; - I[i][j].G = rgb[1]; - I[i][j].B = rgb[2]; - I[i][j].A = vpRGBa::alpha_default; - } - } - - fd.close(); -} - -/*! - Write the content of the bitmap in the file which name is given by \e - filename. This function writes a portable gray pixmap (PPM P6) file. - grayscale image is converted into a color image vpRGBa. - - \param I : Image to save as a (PPM P6) file. - \param filename : Name of the file containing the image. - -*/ - void vpImageIo::writePPM(const vpImage &I, const std::string &filename) { - vpImage Itmp; - - vpImageConvert::convert(I, Itmp); - - vpImageIo::writePPM(Itmp, filename); + vp_writePPM(I, filename); } -/*! - Write the content of the bitmap in the file which name is given by \e - filename. 
This function writes a portable gray pixmap (PPM P6) file. - - \param I : Image to save as a (PPM P6) file. - \param filename : Name of the file containing the image. -*/ void vpImageIo::writePPM(const vpImage &I, const std::string &filename) { - FILE *f; - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot create PPM file: filename empty")); - } - - f = fopen(filename.c_str(), "wb"); - - if (f == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create PPM file \"%s\"", filename.c_str())); - } - - fprintf(f, "P6\n"); // Magic number - fprintf(f, "%u %u\n", I.getWidth(), I.getHeight()); // Image size - fprintf(f, "%d\n", 255); // Max level - - for (unsigned int i = 0; i < I.getHeight(); i++) { - for (unsigned int j = 0; j < I.getWidth(); j++) { - vpRGBa v = I[i][j]; - unsigned char rgb[3]; - rgb[0] = v.R; - rgb[1] = v.G; - rgb[2] = v.B; - - size_t res = fwrite(&rgb, 1, 3, f); - if (res != 3) { - fclose(f); - throw(vpImageException(vpImageException::ioError, "cannot write file \"%s\"", filename.c_str())); - } - } - } - - fflush(f); - fclose(f); -} - -//-------------------------------------------------------------------------- -// JPEG -//-------------------------------------------------------------------------- - -#if defined(VISP_HAVE_JPEG) - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a JPEG file. - - \param I : Image to save as a JPEG file. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename) -{ - struct jpeg_compress_struct cinfo; - struct jpeg_error_mgr jerr; - FILE *file; - - cinfo.err = jpeg_std_error(&jerr); - jpeg_create_compress(&cinfo); - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty")); - } - - file = fopen(filename.c_str(), "wb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str())); - } - - unsigned int width = I.getWidth(); - unsigned int height = I.getHeight(); - - jpeg_stdio_dest(&cinfo, file); - - cinfo.image_width = width; - cinfo.image_height = height; - cinfo.input_components = 1; - cinfo.in_color_space = JCS_GRAYSCALE; - jpeg_set_defaults(&cinfo); - - jpeg_start_compress(&cinfo, TRUE); - - unsigned char *line; - line = new unsigned char[width]; - unsigned char *input = (unsigned char *)I.bitmap; - while (cinfo.next_scanline < cinfo.image_height) { - for (unsigned int i = 0; i < width; i++) { - line[i] = *(input); - input++; - } - jpeg_write_scanlines(&cinfo, &line, 1); - } - - jpeg_finish_compress(&cinfo); - jpeg_destroy_compress(&cinfo); - delete[] line; - fclose(file); -} - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a JPEG file. - - \param I : Image to save as a JPEG file. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename) -{ - struct jpeg_compress_struct cinfo; - struct jpeg_error_mgr jerr; - FILE *file; - - cinfo.err = jpeg_std_error(&jerr); - jpeg_create_compress(&cinfo); - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty")); - } - - file = fopen(filename.c_str(), "wb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str())); - } - - unsigned int width = I.getWidth(); - unsigned int height = I.getHeight(); - - jpeg_stdio_dest(&cinfo, file); - - cinfo.image_width = width; - cinfo.image_height = height; - cinfo.input_components = 3; - cinfo.in_color_space = JCS_RGB; - jpeg_set_defaults(&cinfo); - - jpeg_start_compress(&cinfo, TRUE); - - unsigned char *line; - line = new unsigned char[3 * width]; - unsigned char *input = (unsigned char *)I.bitmap; - while (cinfo.next_scanline < cinfo.image_height) { - for (unsigned int i = 0; i < width; i++) { - line[i * 3] = *(input); - input++; - line[i * 3 + 1] = *(input); - input++; - line[i * 3 + 2] = *(input); - input++; - input++; - } - jpeg_write_scanlines(&cinfo, &line, 1); - } - - jpeg_finish_compress(&cinfo); - jpeg_destroy_compress(&cinfo); - delete[] line; - fclose(file); -} - -/*! - Read the contents of the JPEG file, allocate memory - for the corresponding gray level image, if necessary convert the data in - gray level, and set the bitmap whith the gray level data. That means that - the image \e I is a "black and white" rendering of the original image in \e - filename, as in a black and white photograph. If necessary, the quantization - formula used is \f$0,299 r + 0,587 g + 0,114 b\f$. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. 
- - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. - -*/ -void vpImageIo::readJPEG(vpImage &I, const std::string &filename) -{ - struct jpeg_decompress_struct cinfo; - struct jpeg_error_mgr jerr; - FILE *file; - - cinfo.err = jpeg_std_error(&jerr); - jpeg_create_decompress(&cinfo); - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty")); - } - - file = fopen(filename.c_str(), "rb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str())); - } - - jpeg_stdio_src(&cinfo, file); - jpeg_read_header(&cinfo, TRUE); - - unsigned int width = cinfo.image_width; - unsigned int height = cinfo.image_height; - - if ((width != I.getWidth()) || (height != I.getHeight())) - I.resize(height, width); - - jpeg_start_decompress(&cinfo); - - unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components); - JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1); - - if (cinfo.out_color_space == JCS_RGB) { - vpImage Ic(height, width); - unsigned char *output = (unsigned char *)Ic.bitmap; - while (cinfo.output_scanline < cinfo.output_height) { - jpeg_read_scanlines(&cinfo, buffer, 1); - for (unsigned int i = 0; i < width; i++) { - *(output++) = buffer[0][i * 3]; - *(output++) = buffer[0][i * 3 + 1]; - *(output++) = buffer[0][i * 3 + 2]; - *(output++) = vpRGBa::alpha_default; - } - } - vpImageConvert::convert(Ic, I); - } - - else if (cinfo.out_color_space == JCS_GRAYSCALE) { - while (cinfo.output_scanline < cinfo.output_height) { - unsigned int row = cinfo.output_scanline; - jpeg_read_scanlines(&cinfo, buffer, 1); - memcpy(I[row], buffer[0], rowbytes); - } - } - - jpeg_finish_decompress(&cinfo); - jpeg_destroy_decompress(&cinfo); - fclose(file); -} - -/*! - Read a JPEG file and initialize a scalar image. 
- - Read the contents of the JPEG file, allocate - memory for the corresponding image, and set - the bitmap whith the content of - the file. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - If the file corresponds to a grayscaled image, a conversion is done to deal - with \e I which is a color image. - - \param I : Color image to set with the \e filename content. - \param filename : Name of the file containing the image. -*/ -void vpImageIo::readJPEG(vpImage &I, const std::string &filename) -{ - struct jpeg_decompress_struct cinfo; - struct jpeg_error_mgr jerr; - FILE *file; - - cinfo.err = jpeg_std_error(&jerr); - jpeg_create_decompress(&cinfo); - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty")); - } - - file = fopen(filename.c_str(), "rb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str())); - } - - jpeg_stdio_src(&cinfo, file); - - jpeg_read_header(&cinfo, TRUE); - - unsigned int width = cinfo.image_width; - unsigned int height = cinfo.image_height; - - if ((width != I.getWidth()) || (height != I.getHeight())) - I.resize(height, width); - - jpeg_start_decompress(&cinfo); - - unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components); - JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1); - - if (cinfo.out_color_space == JCS_RGB) { - unsigned char *output = (unsigned char *)I.bitmap; - while (cinfo.output_scanline < cinfo.output_height) { - jpeg_read_scanlines(&cinfo, buffer, 1); - for (unsigned int i = 0; i < width; i++) { - *(output++) = buffer[0][i * 3]; - *(output++) = buffer[0][i * 3 + 1]; - *(output++) = buffer[0][i * 3 + 2]; - *(output++) = vpRGBa::alpha_default; - } - } - } - - else if (cinfo.out_color_space == 
JCS_GRAYSCALE) { - vpImage Ig(height, width); - - while (cinfo.output_scanline < cinfo.output_height) { - unsigned int row = cinfo.output_scanline; - jpeg_read_scanlines(&cinfo, buffer, 1); - memcpy(Ig[row], buffer[0], rowbytes); - } - - vpImageConvert::convert(Ig, I); - } - - jpeg_finish_decompress(&cinfo); - jpeg_destroy_decompress(&cinfo); - fclose(file); -} - -#elif defined(VISP_HAVE_OPENCV) - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a JPEG file. - - \param I : Image to save as a JPEG file. - \param filename : Name of the file containing the image. -*/ -void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename) -{ -#if (VISP_HAVE_OPENCV_VERSION >= 0x020408) - cv::Mat Ip; - vpImageConvert::convert(I, Ip); - cv::imwrite(filename.c_str(), Ip); -#else - IplImage *Ip = NULL; - vpImageConvert::convert(I, Ip); - - cvSaveImage(filename.c_str(), Ip); - - cvReleaseImage(&Ip); -#endif -} - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a JPEG file. - - \param I : Image to save as a JPEG file. - \param filename : Name of the file containing the image. -*/ -void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename) -{ -#if (VISP_HAVE_OPENCV_VERSION >= 0x020408) - cv::Mat Ip; - vpImageConvert::convert(I, Ip); - cv::imwrite(filename.c_str(), Ip); -#else - IplImage *Ip = NULL; - vpImageConvert::convert(I, Ip); - - cvSaveImage(filename.c_str(), Ip); - - cvReleaseImage(&Ip); -#endif -} - -/*! - Read the contents of the JPEG file, allocate memory - for the corresponding gray level image, if necessary convert the data in - gray level, and set the bitmap whith the gray level data. That means that - the image \e I is a "black and white" rendering of the original image in \e - filename, as in a black and white photograph. If necessary, the quantization - formula used is \f$0,299 r + 0,587 g + 0,114 b\f$. 
- - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - If EXIF information is embedded in the image file, the EXIF orientation is ignored. - - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. - -*/ -void vpImageIo::readJPEG(vpImage &I, const std::string &filename) -{ -#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 -#if VISP_HAVE_OPENCV_VERSION >= 0x030200 - int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION; -#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 - int flags = cv::IMREAD_GRAYSCALE; -#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 - int flags = CV_LOAD_IMAGE_GRAYSCALE; -#endif - cv::Mat Ip = cv::imread(filename.c_str(), flags); - if (!Ip.empty()) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); -#else - IplImage *Ip = NULL; - Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE); - if (Ip != NULL) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); - cvReleaseImage(&Ip); -#endif -} - -/*! - Read a JPEG file and initialize a scalar image. - - Read the contents of the JPEG file, allocate - memory for the corresponding image, and set - the bitmap whith the content of - the file. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - If the file corresponds to a grayscaled image, a conversion is done to deal - with \e I which is a color image. - - If EXIF information is embedded in the image file, the EXIF orientation is ignored. - - \param I : Color image to set with the \e filename content. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::readJPEG(vpImage &I, const std::string &filename) -{ -#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 -#if VISP_HAVE_OPENCV_VERSION >= 0x030200 - int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION; -#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 - int flags = cv::IMREAD_GRAYSCALE; -#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 - int flags = CV_LOAD_IMAGE_GRAYSCALE; -#endif - cv::Mat Ip = cv::imread(filename.c_str(), flags); - if (!Ip.empty()) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); -#else - IplImage *Ip = NULL; - Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_COLOR); - if (Ip != NULL) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); - cvReleaseImage(&Ip); -#endif -} -#else -void vpImageIo::readJPEG(vpImage &I, const std::string &filename) -{ - int width = 0, height = 0, channels = 0; - unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_grey); - if (image == NULL) { - throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); - } - I.init(image, static_cast(height), static_cast(width), true); - stbi_image_free(image); -} -void vpImageIo::readJPEG(vpImage &I, const std::string &filename) -{ - int width = 0, height = 0, channels = 0; - unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha); - if (image == NULL) { - throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); - } - I.init(reinterpret_cast(image), static_cast(height), static_cast(width), true); - stbi_image_free(image); -} -void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename) -{ - int res = stbi_write_jpg(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_grey, - reinterpret_cast(I.bitmap), 90); - if (res == 0) { 
- throw(vpImageException(vpImageException::ioError, "JPEG write error")); - } -} -void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename) -{ - int res = stbi_write_jpg(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, - reinterpret_cast(I.bitmap), 90); - if (res == 0) { - throw(vpImageException(vpImageException::ioError, "JEPG write error")); - } + vp_writePPM(I, filename); } -#endif - -//-------------------------------------------------------------------------- -// PNG -//-------------------------------------------------------------------------- - -#if defined(VISP_HAVE_PNG) - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a PNG file. - - \param I : Image to save as a PNG file. - \param filename : Name of the file containing the image. -*/ -void vpImageIo::writePNG(const vpImage &I, const std::string &filename) -{ - FILE *file; - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty")); - } - - file = fopen(filename.c_str(), "wb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str())); - } - - /* create a png info struct */ - png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); - if (!png_ptr) { - fclose(file); - vpERROR_TRACE("Error during png_create_write_struct()\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - png_infop info_ptr = png_create_info_struct(png_ptr); - if (!info_ptr) { - fclose(file); - png_destroy_write_struct(&png_ptr, NULL); - vpERROR_TRACE("Error during png_create_info_struct()\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - /* initialize the setjmp for returning properly after a libpng error occured - */ - if (setjmp(png_jmpbuf(png_ptr))) { - 
fclose(file); - png_destroy_write_struct(&png_ptr, &info_ptr); - vpERROR_TRACE("Error during init_io\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - /* setup libpng for using standard C fwrite() function with our FILE pointer - */ - png_init_io(png_ptr, file); - - unsigned int width = I.getWidth(); - unsigned int height = I.getHeight(); - int bit_depth = 8; - int color_type = PNG_COLOR_TYPE_GRAY; - /* set some useful information from header */ - - if (setjmp(png_jmpbuf(png_ptr))) { - fclose(file); - png_destroy_write_struct(&png_ptr, &info_ptr); - vpERROR_TRACE("Error during write header\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE, - PNG_FILTER_TYPE_BASE); - - png_write_info(png_ptr, info_ptr); - - png_bytep *row_ptrs = new png_bytep[height]; - for (unsigned int i = 0; i < height; i++) - row_ptrs[i] = new png_byte[width]; - - unsigned char *input = (unsigned char *)I.bitmap; - - for (unsigned int i = 0; i < height; i++) { - png_byte *row = row_ptrs[i]; - for (unsigned int j = 0; j < width; j++) { - row[j] = *(input); - input++; - } - } - - png_write_image(png_ptr, row_ptrs); - - png_write_end(png_ptr, NULL); - - for (unsigned int j = 0; j < height; j++) - delete[] row_ptrs[j]; - - delete[] row_ptrs; - - png_destroy_write_struct(&png_ptr, &info_ptr); - - fclose(file); -} - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a PNG file. - - \param I : Image to save as a PNG file. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::writePNG(const vpImage &I, const std::string &filename) -{ - FILE *file; - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty")); - } - - file = fopen(filename.c_str(), "wb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str())); - } - - /* create a png info struct */ - png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); - if (!png_ptr) { - fclose(file); - vpERROR_TRACE("Error during png_create_write_struct()\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - png_infop info_ptr = png_create_info_struct(png_ptr); - if (!info_ptr) { - fclose(file); - png_destroy_write_struct(&png_ptr, NULL); - vpERROR_TRACE("Error during png_create_info_struct()\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - /* initialize the setjmp for returning properly after a libpng error occured - */ - if (setjmp(png_jmpbuf(png_ptr))) { - fclose(file); - png_destroy_write_struct(&png_ptr, &info_ptr); - vpERROR_TRACE("Error during init_io\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - /* setup libpng for using standard C fwrite() function with our FILE pointer - */ - png_init_io(png_ptr, file); - - unsigned int width = I.getWidth(); - unsigned int height = I.getHeight(); - int bit_depth = 8; - int color_type = PNG_COLOR_TYPE_RGB; - /* set some useful information from header */ - - if (setjmp(png_jmpbuf(png_ptr))) { - fclose(file); - png_destroy_write_struct(&png_ptr, &info_ptr); - vpERROR_TRACE("Error during write header\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE, - PNG_FILTER_TYPE_BASE); - - 
png_write_info(png_ptr, info_ptr); - - png_bytep *row_ptrs = new png_bytep[height]; - for (unsigned int i = 0; i < height; i++) - row_ptrs[i] = new png_byte[3 * width]; - - unsigned char *input = (unsigned char *)I.bitmap; - ; - - for (unsigned int i = 0; i < height; i++) { - png_byte *row = row_ptrs[i]; - for (unsigned int j = 0; j < width; j++) { - row[3 * j] = *(input); - input++; - row[3 * j + 1] = *(input); - input++; - row[3 * j + 2] = *(input); - input++; - input++; - } - } - - png_write_image(png_ptr, row_ptrs); - - png_write_end(png_ptr, NULL); - - for (unsigned int j = 0; j < height; j++) - delete[] row_ptrs[j]; - - delete[] row_ptrs; - - png_destroy_write_struct(&png_ptr, &info_ptr); - - fclose(file); -} - -/*! - Read the contents of the PNG file, allocate memory - for the corresponding gray level image, if necessary convert the data in - gray level, and set the bitmap whith the gray level data. That means that - the image \e I is a "black and white" rendering of the original image in \e - filename, as in a black and white photograph. If necessary, the quantization - formula used is \f$0,299 r + 0,587 g + 0,114 b\f$. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. 
- -*/ -void vpImageIo::readPNG(vpImage &I, const std::string &filename) -{ - FILE *file; - png_byte magic[8]; - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty")); - } - - file = fopen(filename.c_str(), "rb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str())); - } - - /* read magic number */ - if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) { - fclose(file); - throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str())); - } - - /* check for valid magic number */ - if (png_sig_cmp(magic, 0, sizeof(magic))) { - fclose(file); - throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image", - filename.c_str())); - } - - /* create a png read struct */ - // printf("version %s\n", PNG_LIBPNG_VER_STRING); - png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); - if (png_ptr == NULL) { - fprintf(stderr, "error: can't create a png read structure!\n"); - fclose(file); - throw(vpImageException(vpImageException::ioError, "error reading png file")); - } - - /* create a png info struct */ - png_infop info_ptr = png_create_info_struct(png_ptr); - if (info_ptr == NULL) { - fprintf(stderr, "error: can't create a png info structure!\n"); - fclose(file); - png_destroy_read_struct(&png_ptr, NULL, NULL); - throw(vpImageException(vpImageException::ioError, "error reading png file")); - } - - /* initialize the setjmp for returning properly after a libpng error occured - */ - if (setjmp(png_jmpbuf(png_ptr))) { - fclose(file); - png_destroy_read_struct(&png_ptr, &info_ptr, NULL); - vpERROR_TRACE("Error during init io\n"); - throw(vpImageException(vpImageException::ioError, "PNG read error")); - } - - /* setup libpng for using standard C fread() function with our FILE pointer - */ - 
png_init_io(png_ptr, file); - - /* tell libpng that we have already read the magic number */ - png_set_sig_bytes(png_ptr, sizeof(magic)); - - /* read png info */ - png_read_info(png_ptr, info_ptr); - - unsigned int width = png_get_image_width(png_ptr, info_ptr); - unsigned int height = png_get_image_height(png_ptr, info_ptr); - - unsigned int bit_depth, channels, color_type; - /* get some useful information from header */ - bit_depth = png_get_bit_depth(png_ptr, info_ptr); - channels = png_get_channels(png_ptr, info_ptr); - color_type = png_get_color_type(png_ptr, info_ptr); - - /* convert index color images to RGB images */ - if (color_type == PNG_COLOR_TYPE_PALETTE) - png_set_palette_to_rgb(png_ptr); - - /* convert 1-2-4 bits grayscale images to 8 bits grayscale. */ - if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) - png_set_expand(png_ptr); - - // if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS)) - // png_set_tRNS_to_alpha (png_ptr); - - if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA) - png_set_strip_alpha(png_ptr); - - if (bit_depth == 16) - png_set_strip_16(png_ptr); - else if (bit_depth < 8) - png_set_packing(png_ptr); - - /* update info structure to apply transformations */ - png_read_update_info(png_ptr, info_ptr); - - channels = png_get_channels(png_ptr, info_ptr); - - if ((width != I.getWidth()) || (height != I.getHeight())) - I.resize(height, width); - - png_bytep *rowPtrs = new png_bytep[height]; - - unsigned int stride = png_get_rowbytes(png_ptr, info_ptr); - unsigned char *data = new unsigned char[stride * height]; - - for (unsigned int i = 0; i < height; i++) - rowPtrs[i] = (png_bytep)data + (i * stride); - - png_read_image(png_ptr, rowPtrs); - - vpImage Ic(height, width); - unsigned char *output; - - switch (channels) { - case 1: - output = (unsigned char *)I.bitmap; - for (unsigned int i = 0; i < width * height; i++) { - *(output++) = data[i]; - } - break; - - case 2: - output = (unsigned char *)I.bitmap; - for (unsigned int i = 0; i < 
width * height; i++) { - *(output++) = data[i * 2]; - } - break; - - case 3: - output = (unsigned char *)Ic.bitmap; - for (unsigned int i = 0; i < width * height; i++) { - *(output++) = data[i * 3]; - *(output++) = data[i * 3 + 1]; - *(output++) = data[i * 3 + 2]; - *(output++) = vpRGBa::alpha_default; - } - vpImageConvert::convert(Ic, I); - break; - - case 4: - output = (unsigned char *)Ic.bitmap; - for (unsigned int i = 0; i < width * height; i++) { - *(output++) = data[i * 4]; - *(output++) = data[i * 4 + 1]; - *(output++) = data[i * 4 + 2]; - *(output++) = data[i * 4 + 3]; - } - vpImageConvert::convert(Ic, I); - break; - } - - delete[](png_bytep) rowPtrs; - delete[] data; - png_read_end(png_ptr, NULL); - png_destroy_read_struct(&png_ptr, &info_ptr, NULL); - fclose(file); -} - -/*! - Read a PNG file and initialize a scalar image. - - Read the contents of the PNG file, allocate - memory for the corresponding image, and set - the bitmap whith the content of - the file. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - If the file corresponds to a grayscaled image, a conversion is done to deal - with \e I which is a color image. - - \param I : Color image to set with the \e filename content. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::readPNG(vpImage &I, const std::string &filename) -{ - FILE *file; - png_byte magic[8]; - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty")); - } - - file = fopen(filename.c_str(), "rb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str())); - } - - /* read magic number */ - if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) { - fclose(file); - throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str())); - } - - /* check for valid magic number */ - if (png_sig_cmp(magic, 0, sizeof(magic))) { - fclose(file); - throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image", - filename.c_str())); - } - - /* create a png read struct */ - png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); - if (!png_ptr) { - fclose(file); - vpERROR_TRACE("Error during png_create_read_struct()\n"); - throw(vpImageException(vpImageException::ioError, "PNG read error")); - } - - /* create a png info struct */ - png_infop info_ptr = png_create_info_struct(png_ptr); - if (!info_ptr) { - fclose(file); - png_destroy_read_struct(&png_ptr, NULL, NULL); - vpERROR_TRACE("Error during png_create_info_struct()\n"); - throw(vpImageException(vpImageException::ioError, "PNG read error")); - } - - /* initialize the setjmp for returning properly after a libpng error occured - */ - if (setjmp(png_jmpbuf(png_ptr))) { - fclose(file); - png_destroy_read_struct(&png_ptr, &info_ptr, NULL); - vpERROR_TRACE("Error during init io\n"); - throw(vpImageException(vpImageException::ioError, "PNG read error")); - } - - /* setup libpng for using standard C fread() function with our FILE pointer - */ - png_init_io(png_ptr, file); - - /* tell libpng that we have already read the magic number */ - 
png_set_sig_bytes(png_ptr, sizeof(magic)); - - /* read png info */ - png_read_info(png_ptr, info_ptr); - - unsigned int width = png_get_image_width(png_ptr, info_ptr); - unsigned int height = png_get_image_height(png_ptr, info_ptr); - - unsigned int bit_depth, channels, color_type; - /* get some useful information from header */ - bit_depth = png_get_bit_depth(png_ptr, info_ptr); - channels = png_get_channels(png_ptr, info_ptr); - color_type = png_get_color_type(png_ptr, info_ptr); - - /* convert index color images to RGB images */ - if (color_type == PNG_COLOR_TYPE_PALETTE) - png_set_palette_to_rgb(png_ptr); - - /* convert 1-2-4 bits grayscale images to 8 bits grayscale. */ - if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) - png_set_expand(png_ptr); - - // if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS)) - // png_set_tRNS_to_alpha (png_ptr); - - if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA) - png_set_strip_alpha(png_ptr); - - if (bit_depth == 16) - png_set_strip_16(png_ptr); - else if (bit_depth < 8) - png_set_packing(png_ptr); - - /* update info structure to apply transformations */ - png_read_update_info(png_ptr, info_ptr); - - channels = png_get_channels(png_ptr, info_ptr); - - if ((width != I.getWidth()) || (height != I.getHeight())) - I.resize(height, width); - - png_bytep *rowPtrs = new png_bytep[height]; - - unsigned int stride = png_get_rowbytes(png_ptr, info_ptr); - unsigned char *data = new unsigned char[stride * height]; - - for (unsigned int i = 0; i < height; i++) - rowPtrs[i] = (png_bytep)data + (i * stride); - - png_read_image(png_ptr, rowPtrs); - - vpImage Ig(height, width); - unsigned char *output; - - switch (channels) { - case 1: - output = (unsigned char *)Ig.bitmap; - for (unsigned int i = 0; i < width * height; i++) { - *(output++) = data[i]; - } - vpImageConvert::convert(Ig, I); - break; - - case 2: - output = (unsigned char *)Ig.bitmap; - for (unsigned int i = 0; i < width * height; i++) { - *(output++) = data[i * 2]; - } - 
vpImageConvert::convert(Ig, I); - break; - - case 3: - output = (unsigned char *)I.bitmap; - for (unsigned int i = 0; i < width * height; i++) { - *(output++) = data[i * 3]; - *(output++) = data[i * 3 + 1]; - *(output++) = data[i * 3 + 2]; - *(output++) = vpRGBa::alpha_default; - } - break; - - case 4: - output = (unsigned char *)I.bitmap; - for (unsigned int i = 0; i < width * height; i++) { - *(output++) = data[i * 4]; - *(output++) = data[i * 4 + 1]; - *(output++) = data[i * 4 + 2]; - *(output++) = data[i * 4 + 3]; - } - break; - } - - delete[](png_bytep) rowPtrs; - delete[] data; - png_read_end(png_ptr, NULL); - png_destroy_read_struct(&png_ptr, &info_ptr, NULL); - fclose(file); -} - -//TODO: -void vpImageIo::readSimdlib(vpImage &I, const std::string &filename) -{ - size_t stride = 0, width = 0, height = 0; - SimdPixelFormatType format = SimdPixelFormatRgba32; - uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format); - const bool copyData = false; - I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData); -} - -void vpImageIo::readStb(vpImage &I, const std::string &filename) -{ - int width = 0, height = 0, channels = 0; - unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha); - if (image == NULL) { - throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); - } - I.init(reinterpret_cast(image), static_cast(height), static_cast(width), true); - stbi_image_free(image); -} - -inline bool ends_with(std::string const & value, std::string const & ending) -{ - if (ending.size() > value.size()) return false; - return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); -} - -void vpImageIo::writeSimdlib(vpImage &I, const std::string &filename) -{ - if (ends_with(filename, ".png")) { - SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFilePng, 90, 
filename.c_str()); - } else { - SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, 90, filename.c_str()); - } -} - -void vpImageIo::writeStb(vpImage &I, const std::string &filename) -{ - if (ends_with(filename, ".png")) { - const int stride_in_bytes = static_cast(4 * I.getWidth()); - int res = stbi_write_png(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, - reinterpret_cast(I.bitmap), stride_in_bytes); - if (res == 0) { - throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str())); - } - } else { - int res = stbi_write_jpg(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, - reinterpret_cast(I.bitmap), 90); - if (res == 0) { - throw(vpImageException(vpImageException::ioError, "JEPG write error")); - } - } -} - -#elif defined(VISP_HAVE_OPENCV) - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a PNG file. - - \param I : Image to save as a PNG file. - \param filename : Name of the file containing the image. -*/ -void vpImageIo::writePNG(const vpImage &I, const std::string &filename) -{ -#if (VISP_HAVE_OPENCV_VERSION >= 0x020408) - cv::Mat Ip; - vpImageConvert::convert(I, Ip); - cv::imwrite(filename.c_str(), Ip); -#else - IplImage *Ip = NULL; - vpImageConvert::convert(I, Ip); - - cvSaveImage(filename.c_str(), Ip); - - cvReleaseImage(&Ip); -#endif -} - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a PNG file. - - \param I : Image to save as a PNG file. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::writePNG(const vpImage &I, const std::string &filename) -{ -#if (VISP_HAVE_OPENCV_VERSION >= 0x020408) - cv::Mat Ip; - vpImageConvert::convert(I, Ip); - cv::imwrite(filename.c_str(), Ip); -#else - IplImage *Ip = NULL; - vpImageConvert::convert(I, Ip); - - cvSaveImage(filename.c_str(), Ip); - - cvReleaseImage(&Ip); -#endif -} - -/*! - Read the contents of the PNG file, allocate memory - for the corresponding gray level image, if necessary convert the data in - gray level, and set the bitmap whith the gray level data. That means that - the image \e I is a "black and white" rendering of the original image in \e - filename, as in a black and white photograph. If necessary, the quantization - formula used is \f$0,299 r + 0,587 g + 0,114 b\f$. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - If EXIF information is embedded in the image file, the EXIF orientation is ignored. - - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. 
- -*/ -void vpImageIo::readPNG(vpImage &I, const std::string &filename) -{ -#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 -#if VISP_HAVE_OPENCV_VERSION >= 0x030200 - int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION; -#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 - int flags = cv::IMREAD_GRAYSCALE; -#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 - int flags = CV_LOAD_IMAGE_GRAYSCALE; -#endif - cv::Mat Ip = cv::imread(filename.c_str(), flags); - if (!Ip.empty()) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); -#else - IplImage *Ip = NULL; - Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE); - if (Ip != NULL) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); - cvReleaseImage(&Ip); -#endif -} - -/*! - Read a PNG file and initialize a scalar image. - - Read the contents of the PNG file, allocate - memory for the corresponding image, and set - the bitmap whith the content of - the file. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - If the file corresponds to a grayscaled image, a conversion is done to deal - with \e I which is a color image. - - If EXIF information is embedded in the image file, the EXIF orientation is ignored. - - \param I : Color image to set with the \e filename content. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::readPNG(vpImage &I, const std::string &filename) -{ -#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 -#if VISP_HAVE_OPENCV_VERSION >= 0x030200 - int flags = cv::IMREAD_COLOR | cv::IMREAD_IGNORE_ORIENTATION; -#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 - int flags = cv::IMREAD_COLOR; -#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 - int flags = CV_LOAD_IMAGE_COLOR; -#endif - cv::Mat Ip = cv::imread(filename.c_str(), flags); - if (!Ip.empty()) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); -#else - IplImage *Ip = NULL; - Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_COLOR); - if (Ip != NULL) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); - cvReleaseImage(&Ip); -#endif -} -#else -void vpImageIo::readPNG(vpImage &I, const std::string &filename) -{ - int width = 0, height = 0, channels = 0; - unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_grey); - if (image == NULL) { - throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); - } - I.init(image, static_cast(height), static_cast(width), true); - stbi_image_free(image); -} -void vpImageIo::readPNG(vpImage &I, const std::string &filename) -{ - int width = 0, height = 0, channels = 0; - unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha); - if (image == NULL) { - throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); - } - I.init(reinterpret_cast(image), static_cast(height), static_cast(width), true); - stbi_image_free(image); -} -void vpImageIo::writePNG(const vpImage &I, const std::string &filename) -{ - const int stride_in_bytes = static_cast(I.getWidth()); - int res = stbi_write_png(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_grey, - 
reinterpret_cast(I.bitmap), stride_in_bytes); - if (res == 0) { - throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str())); - } -} -void vpImageIo::writePNG(const vpImage &I, const std::string &filename) -{ - const int stride_in_bytes = static_cast(4 * I.getWidth()); - int res = stbi_write_png(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, - reinterpret_cast(I.bitmap), stride_in_bytes); - if (res == 0) { - throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str())); - } -} -#endif diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp index 8efe2c759e..3bf19a465e 100644 --- a/modules/io/test/perfImageLoadSave.cpp +++ b/modules/io/test/perfImageLoadSave.cpp @@ -64,7 +64,7 @@ TEST_CASE("Benchmark Jpeg image loading", "[benchmark]") { vpImage I; BENCHMARK("vpImageIo::readSimdlib()") { - vpImageIo::readSimdlib(I, imagePathJpeg); + vpImageIo::readJPEG(I, imagePathJpeg, vpImageIo::IO_SIMDLIB_BACKEND); return I; }; } @@ -73,7 +73,7 @@ TEST_CASE("Benchmark Jpeg image loading", "[benchmark]") { vpImage I; BENCHMARK("vpImageIo::readStb()") { - vpImageIo::readStb(I, imagePathJpeg); + vpImageIo::readJPEG(I, imagePathJpeg, vpImageIo::IO_STB_IMAGE_BACKEND); return I; }; } @@ -93,7 +93,7 @@ TEST_CASE("Benchmark Png image loading", "[benchmark]") { vpImage I; BENCHMARK("vpImageIo::readSimdlib()") { - vpImageIo::readSimdlib(I, imagePathPng); + vpImageIo::readPNG(I, imagePathPng, vpImageIo::IO_SIMDLIB_BACKEND); return I; }; } @@ -102,7 +102,7 @@ TEST_CASE("Benchmark Png image loading", "[benchmark]") { vpImage I; BENCHMARK("vpImageIo::readStb()") { - vpImageIo::readStb(I, imagePathPng); + vpImageIo::readPNG(I, imagePathPng, vpImageIo::IO_STB_IMAGE_BACKEND); return I; }; } @@ -122,7 +122,7 @@ TEST_CASE("Benchmark big Png image loading", "[benchmark]") { vpImage I; BENCHMARK("vpImageIo::readSimdlib()") { - vpImageIo::readSimdlib(I, 
imagePathPngBig); + vpImageIo::readPNG(I, imagePathPngBig, vpImageIo::IO_SIMDLIB_BACKEND); return I; }; } @@ -131,7 +131,7 @@ TEST_CASE("Benchmark big Png image loading", "[benchmark]") { vpImage I; BENCHMARK("vpImageIo::readStb()") { - vpImageIo::readStb(I, imagePathPngBig); + vpImageIo::readPNG(I, imagePathPngBig, vpImageIo::IO_STB_IMAGE_BACKEND); return I; }; } @@ -153,7 +153,7 @@ TEST_CASE("Benchmark Jpeg image saving", "[benchmark]") { const std::string filename = "/tmp/Klimt_Simd.jpg"; BENCHMARK("vpImageIo::writeSimdlib()") { - vpImageIo::writeSimdlib(I, filename); + vpImageIo::writeJPEG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND); return I; }; } @@ -162,7 +162,7 @@ TEST_CASE("Benchmark Jpeg image saving", "[benchmark]") { const std::string filename = "/tmp/Klimt_stb.jpg"; BENCHMARK("vpImageIo::writeStb()") { - vpImageIo::writeStb(I, filename); + vpImageIo::writeJPEG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND); return I; }; } @@ -184,7 +184,7 @@ TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") { const std::string filename = "/tmp/Big_images_Simd.jpg"; BENCHMARK("vpImageIo::writeSimdlib()") { - vpImageIo::writeSimdlib(I, filename); + vpImageIo::writeJPEG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND); return I; }; } @@ -193,7 +193,7 @@ TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") { const std::string filename = "/tmp/Big_images_stb.jpg"; BENCHMARK("vpImageIo::writeStb()") { - vpImageIo::writeStb(I, filename); + vpImageIo::writeJPEG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND); return I; }; } @@ -215,7 +215,7 @@ TEST_CASE("Benchmark Png image saving", "[benchmark]") { const std::string filename = "/tmp/Klimt_Simd.png"; BENCHMARK("vpImageIo::writeSimdlib()") { - vpImageIo::writeSimdlib(I, filename); + vpImageIo::writePNG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND); return I; }; } @@ -224,7 +224,7 @@ TEST_CASE("Benchmark Png image saving", "[benchmark]") { const std::string filename = "/tmp/Klimt_stb.png"; 
BENCHMARK("vpImageIo::writeStb()") { - vpImageIo::writeStb(I, filename); + vpImageIo::writePNG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND); return I; }; } @@ -246,7 +246,7 @@ TEST_CASE("Benchmark big Png image saving", "[benchmark]") { const std::string filename = "/tmp/Big_images_Simd.png"; BENCHMARK("vpImageIo::writeSimdlib()") { - vpImageIo::writeSimdlib(I, filename); + vpImageIo::writePNG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND); return I; }; } @@ -255,155 +255,12 @@ TEST_CASE("Benchmark big Png image saving", "[benchmark]") { const std::string filename = "/tmp/Big_images_stb.png"; BENCHMARK("vpImageIo::writeStb()") { - vpImageIo::writeStb(I, filename); + vpImageIo::writePNG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND); return I; }; } } -//TEST_CASE("Benchmark bgr to grayscale (ViSP)", "[benchmark]") { -// vpImage I; -// vpImageIo::read(I, imagePathColor); - -// std::vector bgr; -// common_tools::RGBaToBGR(I, bgr); - -// vpImage I_gray(I.getHeight(), I.getWidth()); - -// BENCHMARK("Benchmark bgr to grayscale (ViSP)") { -// vpImageConvert::BGRToGrey(bgr.data(), -// I_gray.bitmap, -// I.getWidth(), I.getHeight(), -// false, nThreads); -// return I_gray; -// }; - -//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101) -// SECTION("OpenCV Mat type") -// { -// cv::Mat img; -// vpImageConvert::convert(I, img); - -// BENCHMARK("Benchmark bgr to grayscale (ViSP + OpenCV Mat type)") { -// vpImageConvert::convert(img, I_gray, false, nThreads); -// return I_gray; -// }; -// } -//#endif -//} -//#endif - -//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101) -//TEST_CASE("Benchmark bgr to grayscale (OpenCV)", "[benchmark]") { -// cv::Mat img = cv::imread(imagePathColor); -// cv::Mat img_gray(img.size(), CV_8UC1); - -// BENCHMARK("Benchmark bgr to grayscale (OpenCV)") { -// cv::cvtColor(img, img_gray, cv::COLOR_BGR2GRAY); -// return img_gray; -// }; -//} -//#endif - -//// C++11 to be able to do bgr.data() -//#if VISP_CXX_STANDARD >= VISP_CXX_STANDARD_11 -//TEST_CASE("Benchmark bgr to 
rgba (naive code)", "[benchmark]") { -// vpImage I; -// vpImageIo::read(I, imagePathColor); - -// std::vector bgr; -// common_tools::RGBaToBGR(I, bgr); - -// vpImage I_bench(I.getHeight(), I.getWidth()); -// BENCHMARK("Benchmark bgr to rgba (naive code)") { -// common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast(I_bench.bitmap), -// I.getWidth(), I.getHeight(), false); -// return I_bench; -// }; -//} - -//TEST_CASE("Benchmark bgr to rgba (ViSP)", "[benchmark]") { -// vpImage I; -// vpImageIo::read(I, imagePathColor); - -// std::vector bgr; -// common_tools::RGBaToBGR(I, bgr); - -// SECTION("Check BGR to RGBa conversion") -// { -// vpImage ref(I.getHeight(), I.getWidth()); -// common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast(ref.bitmap), -// I.getWidth(), I.getHeight(), false); -// vpImage rgba(I.getHeight(), I.getWidth()); -// vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast(rgba.bitmap), -// I.getWidth(), I.getHeight(), false); - -// CHECK((rgba == ref)); -// } - -// vpImage I_rgba(I.getHeight(), I.getWidth()); -// BENCHMARK("Benchmark bgr to rgba (ViSP)") { -// vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast(I_rgba.bitmap), -// I.getWidth(), I.getHeight(), false); -// return I_rgba; -// }; - -//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101) -// SECTION("OpenCV Mat type") -// { -// cv::Mat img; -// vpImageConvert::convert(I, img); - -// BENCHMARK("Benchmark bgr to rgba (ViSP + OpenCV Mat type)") { -// vpImageConvert::convert(img, I_rgba); -// return I_rgba; -// }; -// } -//#endif -//} - -//TEST_CASE("Benchmark bgra to rgba (naive code)", "[benchmark]") { -// vpImage I; -// vpImageIo::read(I, imagePathColor); - -// std::vector bgra; -// common_tools::RGBaToBGRa(I, bgra); - -// vpImage I_bench(I.getHeight(), I.getWidth()); -// BENCHMARK("Benchmark bgra to rgba (naive code)") { -// common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast(I_bench.bitmap), -// I.getWidth(), I.getHeight(), false); -// return I_bench; -// }; -//} - 
-//TEST_CASE("Benchmark bgra to rgba (ViSP)", "[benchmark]") { -// vpImage I; -// vpImageIo::read(I, imagePathColor); - -// std::vector bgra; -// common_tools::RGBaToBGRa(I, bgra); - -// SECTION("Check BGRa to RGBa conversion") -// { -// vpImage ref(I.getHeight(), I.getWidth()); -// common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast(ref.bitmap), -// I.getWidth(), I.getHeight(), false); -// vpImage rgba(I.getHeight(), I.getWidth()); -// vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast(rgba.bitmap), -// I.getWidth(), I.getHeight(), false); - -// CHECK((rgba == ref)); -// } -// vpImage I_rgba(I.getHeight(), I.getWidth()); -// BENCHMARK("Benchmark bgra to rgba (ViSP)") { -// vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast(I_rgba.bitmap), -// I.getWidth(), I.getHeight(), false); -// return I_rgba; -// }; -//} -//#endif - int main(int argc, char *argv[]) { Catch::Session session; // There must be exactly one instance From 122e936a257d8caf21a75e23a651d1122d93e77d Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Wed, 3 Nov 2021 01:06:57 +0100 Subject: [PATCH 07/18] Update Simd lib to 4.9.107 version. 
--- 3rdparty/simdlib/CMakeLists.txt | 16 +- .../Simd/{SimdSse1.h => SimdAlignment.h} | 113 +++-- 3rdparty/simdlib/Simd/SimdAllocator.hpp | 6 +- 3rdparty/simdlib/Simd/SimdArray.h | 30 +- 3rdparty/simdlib/Simd/SimdAvx1.h | 9 +- ...SimdBaseRgbaToGray.cpp => SimdAvx1Cpu.cpp} | 45 +- 3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp | 14 +- 3rdparty/simdlib/Simd/SimdAvx2.h | 22 +- 3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp | 43 +- 3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp | 61 ++- 3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp | 10 +- 3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp | 74 --- 3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp | 149 ++++++ 3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp | 56 ++- 3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp | 72 --- 3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp | 68 +++ .../simdlib/Simd/SimdAvx2Deinterleave.cpp | 59 ++- .../simdlib/Simd/SimdAvx2GaussianBlur.cpp | 3 +- 3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp | 4 +- .../simdlib/Simd/SimdAvx2ReduceGray2x2.cpp | 6 +- .../simdlib/Simd/SimdAvx2ReduceGray3x3.cpp | 4 +- .../simdlib/Simd/SimdAvx2ReduceGray4x4.cpp | 4 +- .../simdlib/Simd/SimdAvx2ReduceGray5x5.cpp | 6 +- .../simdlib/Simd/SimdAvx2ResizeBilinear.cpp | 4 +- 3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp | 23 +- 3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp | 92 ---- 3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp | 97 ---- 3rdparty/simdlib/Simd/SimdBase.h | 18 +- 3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp | 20 +- 3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp | 15 +- 3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp | 4 +- 3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp | 80 --- 3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp | 37 +- 3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp | 15 +- 3rdparty/simdlib/Simd/SimdBaseCpu.cpp | 234 +++++++++ .../simdlib/Simd/SimdBaseDeinterleave.cpp | 43 +- .../simdlib/Simd/SimdBaseGaussianBlur.cpp | 2 +- 3rdparty/simdlib/Simd/SimdBaseResizer.cpp | 243 ++++++++- 3rdparty/simdlib/Simd/SimdConfig.h | 10 +- 3rdparty/simdlib/Simd/SimdConst.h | 70 +-- 
3rdparty/simdlib/Simd/SimdConversion.h | 55 +-- 3rdparty/simdlib/Simd/SimdCopyPixel.h | 17 + 3rdparty/simdlib/Simd/SimdCpu.h | 101 +++- 3rdparty/simdlib/Simd/SimdDefs.h | 80 +-- 3rdparty/simdlib/Simd/SimdEnable.h | 415 +--------------- 3rdparty/simdlib/Simd/SimdExp.h | 176 ++++++- 3rdparty/simdlib/Simd/SimdExtract.h | 22 +- 3rdparty/simdlib/Simd/SimdFrame.hpp | 98 +++- 3rdparty/simdlib/Simd/SimdInit.h | 35 +- 3rdparty/simdlib/Simd/SimdLib.cpp | 279 ++++++----- 3rdparty/simdlib/Simd/SimdLib.h | 239 +++++---- 3rdparty/simdlib/Simd/SimdLib.hpp | 465 +++++++++++++++++- 3rdparty/simdlib/Simd/SimdLoad.h | 277 +---------- 3rdparty/simdlib/Simd/SimdLoadBlock.h | 251 ++++++++++ 3rdparty/simdlib/Simd/SimdLog.h | 28 +- 3rdparty/simdlib/Simd/SimdMath.h | 47 +- 3rdparty/simdlib/Simd/SimdMemory.h | 104 ++-- 3rdparty/simdlib/Simd/SimdNeon.h | 20 +- 3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp | 45 +- 3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp | 63 ++- 3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp | 10 +- 3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp | 81 --- 3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp | 83 +++- 3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp | 41 +- 3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp | 78 --- .../simdlib/Simd/SimdNeonDeinterleave.cpp | 79 ++- .../simdlib/Simd/SimdNeonGaussianBlur.cpp | 1 + 3rdparty/simdlib/Simd/SimdNeonResizer.cpp | 8 +- 3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp | 71 --- 3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp | 71 --- 3rdparty/simdlib/Simd/SimdPixel.hpp | 200 +++++++- 3rdparty/simdlib/Simd/SimdPow.h | 2 +- 3rdparty/simdlib/Simd/SimdResizer.h | 148 ++++-- 3rdparty/simdlib/Simd/SimdResizerCommon.h | 97 ++++ 3rdparty/simdlib/Simd/SimdRuntime.h | 34 +- 3rdparty/simdlib/Simd/SimdSet.h | 8 +- 3rdparty/simdlib/Simd/SimdSse1Resizer.cpp | 129 ----- 3rdparty/simdlib/Simd/SimdSse2.h | 8 +- 3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp | 54 +- ...SimdBaseBgraToRgba.cpp => SimdSse2Cpu.cpp} | 44 +- .../simdlib/Simd/SimdSse2GaussianBlur3x3.cpp | 3 +- 
3rdparty/simdlib/Simd/SimdSse2Resizer.cpp | 8 +- 3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp | 75 --- 3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp | 96 ---- 3rdparty/simdlib/Simd/SimdSse41.h | 76 +++ ...e3BgrToBgra.cpp => SimdSse41BgrToBgra.cpp} | 185 ++++--- ...e3BgrToGray.cpp => SimdSse41BgrToGray.cpp} | 241 +++++---- ...sse3BgrToRgb.cpp => SimdSse41BgrToRgb.cpp} | 163 +++--- ...e3BgraToBgr.cpp => SimdSse41BgraToBgr.cpp} | 257 ++++++---- ...SimdBaseRgbToGray.cpp => SimdSse41Cpu.cpp} | 46 +- ...terleave.cpp => SimdSse41Deinterleave.cpp} | 60 ++- .../simdlib/Simd/SimdSse41GaussianBlur.cpp | 3 +- ...ur3x3.cpp => SimdSse41GaussianBlur3x3.cpp} | 12 +- ...e3GrayToBgr.cpp => SimdSse41GrayToBgr.cpp} | 147 +++--- ...Interleave.cpp => SimdSse41Interleave.cpp} | 11 +- ...imdSsse3Reduce.cpp => SimdSse41Reduce.cpp} | 401 ++++++++------- ...Gray2x2.cpp => SimdSse41ReduceGray2x2.cpp} | 189 ++++--- ...Gray4x4.cpp => SimdSse41ReduceGray4x4.cpp} | 11 +- ...linear.cpp => SimdSse41ResizeBilinear.cpp} | 9 +- 3rdparty/simdlib/Simd/SimdSse41Resizer.cpp | 311 +++++++++++- 3rdparty/simdlib/Simd/SimdSsse3.h | 77 --- 3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp | 74 --- 3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp | 73 --- .../simdlib/Simd/SimdSsse3CustomFunctions.cpp | 69 --- 3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp | 350 ------------- 3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp | 93 ---- 3rdparty/simdlib/Simd/SimdStore.h | 45 +- 3rdparty/simdlib/Simd/SimdStream.h | 21 +- 3rdparty/simdlib/Simd/SimdUpdate.h | 17 +- 3rdparty/simdlib/Simd/SimdVersion.h | 2 +- 3rdparty/simdlib/Simd/SimdView.hpp | 6 +- modules/core/src/image/vpImageConvert.cpp | 4 +- 112 files changed, 5013 insertions(+), 4067 deletions(-) rename 3rdparty/simdlib/Simd/{SimdSse1.h => SimdAlignment.h} (53%) mode change 100644 => 100755 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAllocator.hpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdArray.h mode change 100644 => 100755 
3rdparty/simdlib/Simd/SimdAvx1.h rename 3rdparty/simdlib/Simd/{SimdBaseRgbaToGray.cpp => SimdAvx1Cpu.cpp} (57%) mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp create mode 100755 3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp create mode 100644 3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBase.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp mode change 100644 => 100755 
3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseCpu.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseResizer.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdConfig.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdConst.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdConversion.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdCopyPixel.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdCpu.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdDefs.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdEnable.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdExp.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdExtract.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdFrame.hpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdInit.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLib.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLib.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLib.hpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLoad.h create mode 100755 3rdparty/simdlib/Simd/SimdLoadBlock.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLog.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdMath.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdMemory.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeon.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp mode change 100644 => 100755 
3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonResizer.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdPixel.hpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdPow.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdResizer.h create mode 100755 3rdparty/simdlib/Simd/SimdResizerCommon.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdRuntime.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSet.h delete mode 100644 3rdparty/simdlib/Simd/SimdSse1Resizer.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp rename 3rdparty/simdlib/Simd/{SimdBaseBgraToRgba.cpp => SimdSse2Cpu.cpp} (62%) mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2Resizer.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp create mode 100755 3rdparty/simdlib/Simd/SimdSse41.h rename 3rdparty/simdlib/Simd/{SimdSsse3BgrToBgra.cpp => SimdSse41BgrToBgra.cpp} (57%) mode change 100644 => 100755 rename 3rdparty/simdlib/Simd/{SimdSsse3BgrToGray.cpp => SimdSse41BgrToGray.cpp} (56%) mode change 100644 => 100755 rename 3rdparty/simdlib/Simd/{SimdSsse3BgrToRgb.cpp => SimdSse41BgrToRgb.cpp} (84%) mode change 100644 => 100755 rename 3rdparty/simdlib/Simd/{SimdSsse3BgraToBgr.cpp => SimdSse41BgraToBgr.cpp} (53%) mode change 100644 => 100755 rename 
3rdparty/simdlib/Simd/{SimdBaseRgbToGray.cpp => SimdSse41Cpu.cpp} (54%) rename 3rdparty/simdlib/Simd/{SimdSsse3Deinterleave.cpp => SimdSse41Deinterleave.cpp} (74%) mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp rename 3rdparty/simdlib/Simd/{SimdSsse3GaussianBlur3x3.cpp => SimdSse41GaussianBlur3x3.cpp} (95%) rename 3rdparty/simdlib/Simd/{SimdSsse3GrayToBgr.cpp => SimdSse41GrayToBgr.cpp} (92%) mode change 100644 => 100755 rename 3rdparty/simdlib/Simd/{SimdSsse3Interleave.cpp => SimdSse41Interleave.cpp} (96%) rename 3rdparty/simdlib/Simd/{SimdSsse3Reduce.cpp => SimdSse41Reduce.cpp} (96%) mode change 100644 => 100755 rename 3rdparty/simdlib/Simd/{SimdSsse3ReduceGray2x2.cpp => SimdSse41ReduceGray2x2.cpp} (94%) mode change 100644 => 100755 rename 3rdparty/simdlib/Simd/{SimdSsse3ReduceGray4x4.cpp => SimdSse41ReduceGray4x4.cpp} (96%) mode change 100644 => 100755 rename 3rdparty/simdlib/Simd/{SimdSsse3ResizeBilinear.cpp => SimdSse41ResizeBilinear.cpp} (98%) mode change 100644 => 100755 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse41Resizer.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3.h delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdStore.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdStream.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdUpdate.h mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdView.hpp diff --git a/3rdparty/simdlib/CMakeLists.txt b/3rdparty/simdlib/CMakeLists.txt index e6880b3800..dc6d111aae 100644 --- a/3rdparty/simdlib/CMakeLists.txt +++ b/3rdparty/simdlib/CMakeLists.txt @@ -109,23 +109,11 @@ if(X86 OR X86_64) file(GLOB_RECURSE SIMD_BASE_SRC 
${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdBase*.cpp) set_source_files_properties(${SIMD_BASE_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS}") - file(GLOB_RECURSE SIMD_SSE1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse1*.cpp) - set_source_files_properties(${SIMD_SSE1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG}") - file(GLOB_RECURSE SIMD_SSE2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse2*.cpp) - set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE2_FLAG}") - - file(GLOB_RECURSE SIMD_SSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse3*.cpp) - set_source_files_properties(${SIMD_SSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG}") - - file(GLOB_RECURSE SIMD_SSSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSsse3*.cpp) - set_source_files_properties(${SIMD_SSSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSSE3_FLAG}") + set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG} ${SSE2_FLAG}") file(GLOB_RECURSE SIMD_SSE41_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse41*.cpp) - set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_1_FLAG}") - - file(GLOB_RECURSE SIMD_SSE42_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse42*.cpp) - set_source_files_properties(${SIMD_SSE42_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}") + set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG} ${SSSE3_FLAG} ${SSE4_1_FLAG} ${SSE4_2_FLAG}") file(GLOB_RECURSE SIMD_AVX1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx1*.cpp) set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}") diff --git a/3rdparty/simdlib/Simd/SimdSse1.h b/3rdparty/simdlib/Simd/SimdAlignment.h old mode 100644 new mode 100755 similarity index 53% rename from 3rdparty/simdlib/Simd/SimdSse1.h rename to 3rdparty/simdlib/Simd/SimdAlignment.h index 
e258d50ab3..9789cbb9e7 --- a/3rdparty/simdlib/Simd/SimdSse1.h +++ b/3rdparty/simdlib/Simd/SimdAlignment.h @@ -1,40 +1,73 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#ifndef __SimdSse_h__ -#define __SimdSse_h__ - -#include "Simd/SimdDefs.h" - -namespace Simd -{ -#ifdef SIMD_SSE_ENABLE - namespace Sse - { - void SquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum); - - void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum); - } -#endif// SIMD_SSE_ENABLE -} -#endif//__SimdSse_h__ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdAlignment_h__ +#define __SimdAlignment_h__ + +#include "Simd/SimdEnable.h" + +namespace Simd +{ + SIMD_INLINE size_t GetAlignment() + { +#ifdef SIMD_AVX2_ENABLE + if (Avx2::Enable) + return sizeof(__m256i); + else +#endif +#ifdef SIMD_AVX_ENABLE + if (Avx::Enable) + return sizeof(__m256); + else +#endif +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable) + return sizeof(__m128i); + else +#endif +#ifdef SIMD_SSE2_ENABLE + if (Sse2::Enable) + return sizeof(__m128i); + else +#endif +#ifdef SIMD_NEON_ENABLE + if (Neon::Enable) + return sizeof(uint8x16_t); + else +#endif + return sizeof(void *); + } + + extern const size_t ALIGNMENT; + + SIMD_INLINE size_t Alignment() + { +#if defined(WIN32) + return GetAlignment(); +#else + return ALIGNMENT; +#endif + } +} + +#endif//__SimdAlignment_h__ diff --git a/3rdparty/simdlib/Simd/SimdAllocator.hpp b/3rdparty/simdlib/Simd/SimdAllocator.hpp old mode 100644 new mode 100755 index cd65f196f4..8ee548e5ae --- a/3rdparty/simdlib/Simd/SimdAllocator.hpp +++ b/3rdparty/simdlib/Simd/SimdAllocator.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -125,8 +125,8 @@ namespace Simd */ static SIMD_INLINE size_t Alignment() { -#if defined(__SimdEnable_h__) && defined(WIN32) - return Simd::ALIGNMENT; +#if defined(__SimdAlignment_h__) && defined(WIN32) + return Simd::Alignment(); #else return SimdAlignment(); #endif diff --git a/3rdparty/simdlib/Simd/SimdArray.h b/3rdparty/simdlib/Simd/SimdArray.h old mode 100644 new mode 100755 index 30e793080f..2f7f1bbbe0 --- a/3rdparty/simdlib/Simd/SimdArray.h +++ b/3rdparty/simdlib/Simd/SimdArray.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
+* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -57,15 +57,28 @@ namespace Simd } *(size_t*)&size = size_; if (size_) - *(T**)&data = (T*)Simd::Allocate(size * sizeof(T), align); + *(T**)&data = (T*)Simd::Allocate(RawSize(), align); } if (clear) Clear(); } + SIMD_INLINE void Assign(const T * src, size_t size_) + { + Resize(size_, src == NULL); + if(src) + memcpy(data, src, RawSize()); + } + SIMD_INLINE void Clear() { - ::memset(data, 0, size * sizeof(T)); + memset(data, 0, RawSize()); + } + + SIMD_INLINE void Swap(const Array & array) + { + Simd::Swap((T*&)data, (T*&)(array.data)); + Simd::Swap((size_t&)size, (size_t&)(array.size)); } SIMD_INLINE T & operator[] (size_t i) @@ -77,12 +90,19 @@ namespace Simd { return data[i]; } + + SIMD_INLINE size_t RawSize() const + { + return size * sizeof(T); + } }; + typedef Array Array8i; typedef Array Array8u; typedef Array Array16i; typedef Array Array16u; typedef Array Array32i; + typedef Array Array32u; typedef Array Array32f; #if defined(__GNUC__) && __GNUC__ >= 6 @@ -90,8 +110,8 @@ namespace Simd #pragma GCC diagnostic ignored "-Wignored-attributes" #endif -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { typedef Array<__m128> Array128f; } diff --git a/3rdparty/simdlib/Simd/SimdAvx1.h b/3rdparty/simdlib/Simd/SimdAvx1.h old mode 100644 new mode 100755 index 25c070c459..48df913c02 --- a/3rdparty/simdlib/Simd/SimdAvx1.h +++ b/3rdparty/simdlib/Simd/SimdAvx1.h @@ -1,8 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, -* 2019-2019 Facundo Galan. +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -22,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef __SimdAvx1_h__ -#define __SimdAvx1_h__ +#ifndef __SimdAvx_h__ +#define __SimdAvx_h__ #include "Simd/SimdDefs.h" @@ -36,4 +35,4 @@ namespace Simd } #endif// SIMD_AVX_ENABLE } -#endif//__SimdAvx1_h__ +#endif//__SimdAvx_h__ diff --git a/3rdparty/simdlib/Simd/SimdBaseRgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx1Cpu.cpp similarity index 57% rename from 3rdparty/simdlib/Simd/SimdBaseRgbaToGray.cpp rename to 3rdparty/simdlib/Simd/SimdAvx1Cpu.cpp index 22d37b17ee..9d9cbb29d3 100644 --- a/3rdparty/simdlib/Simd/SimdBaseRgbaToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx1Cpu.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -21,23 +21,46 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "Simd/SimdConversion.h" +#include "Simd/SimdEnable.h" +#include "Simd/SimdCpu.h" + +#if defined(_MSC_VER) +#include +#endif namespace Simd { - namespace Base +#ifdef SIMD_AVX_ENABLE + namespace Avx { - void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride) + SIMD_INLINE bool SupportedByCPU() { - for (size_t row = 0; row < height; ++row) + return + Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) && + Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::AVX); + } + + SIMD_INLINE bool SupportedByOS() + { +#if defined(_MSC_VER) + __try { - const uint8_t * pRgba = rgba + row*rgbaStride; - uint8_t * pGray = gray + row*grayStride; - for (const uint8_t *pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgba += 4) - { - *pGray = RgbToGray(pRgba[0], pRgba[1], pRgba[2]); - } + __m256d value = _mm256_set1_pd(1.0);// try to execute of AVX instructions; + return true; } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } +#else + return true; +#endif + } + + bool GetEnable() + { + return SupportedByCPU() && SupportedByOS(); } } +#endif } diff --git a/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp b/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp old mode 100644 new mode 100755 index e409c17ff1..319c609408 --- a/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -42,7 +42,7 @@ namespace Simd float * pbx[2] = { _bx[0].data, _bx[1].data }; int32_t prev = -2; size_t rsa = AlignLo(rs, Avx::F); - size_t rsh = AlignLo(rs, Sse::F); + size_t rsh = AlignLo(rs, Sse2::F); for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) { float fy1 = _ay[dy]; @@ -78,10 +78,10 @@ namespace Simd __m256 m1 = _mm256_mul_ps(fx1, _mm256_shuffle_ps(s0145, s2367, 0xDD)); _mm256_store_ps(pb + dx, _mm256_add_ps(m0, m1)); } - for (; dx < rsh; dx += Sse::F) + for (; dx < rsh; dx += Sse2::F) { - __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); - __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); + __m128 s01 = Sse2::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); + __m128 s23 = Sse2::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); __m128 fx1 = _mm_load_ps(_ax.data + dx); __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1); __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88)); @@ -128,7 +128,7 @@ namespace Simd __m256 m1 = _mm256_mul_ps(_mm256_load_ps(pbx[1] + dx), _fy1); _mm256_storeu_ps(dst + dx, _mm256_add_ps(m0, m1)); } - for (; dx < rsh; dx += Sse::F) + for (; dx < rsh; dx += Sse2::F) { __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0)); __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1)); @@ -144,7 +144,7 @@ namespace Simd void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) { ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m256)); - if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp)) + if (param.IsFloatBilinear()) return new ResizerFloatBilinear(param); else return Sse41::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); 
diff --git a/3rdparty/simdlib/Simd/SimdAvx2.h b/3rdparty/simdlib/Simd/SimdAvx2.h old mode 100644 new mode 100755 index 46d3b2d547..f5957b26c1 --- a/3rdparty/simdlib/Simd/SimdAvx2.h +++ b/3rdparty/simdlib/Simd/SimdAvx2.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, +* Copyright (c) 2011-2021 Yermalayeu Ihar, * 2019-2019 Facundo Galan. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -32,24 +32,22 @@ namespace Simd #ifdef SIMD_AVX2_ENABLE namespace Avx2 { + void BgraToBgr(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* bgr, size_t bgrStride); + void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride); - void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride); + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride); + + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride); void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha); - - void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride); - void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride); - void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, 
uint8_t * gray, size_t grayStride); - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); + void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride); void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride); @@ -87,6 +85,12 @@ namespace Simd void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); + void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); + + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride); + void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); } diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp old mode 100644 new mode 100755 index b1f9ef8417..ffb4828e98 --- a/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -67,6 +67,8 @@ namespace Simd BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); } + //--------------------------------------------------------------------- + template SIMD_INLINE void Bgr48pToBgra32(uint8_t * bgra, const uint8_t * blue, const uint8_t * green, const uint8_t * red, size_t offset, __m256i alpha) { @@ -117,6 +119,45 @@ namespace Simd else Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); } + + //--------------------------------------------------------------------- + + template SIMD_INLINE void RgbToBgra(const uint8_t* rgb, uint8_t* bgra, __m256i alpha) + { + Store((__m256i*)bgra + 0, RgbToBgra(Load((__m256i*)(rgb + 0)), alpha)); + Store((__m256i*)bgra + 1, RgbToBgra(Load((__m256i*)(rgb + 24)), alpha)); + Store((__m256i*)bgra + 2, RgbToBgra(Load((__m256i*)(rgb + 48)), alpha)); + Store((__m256i*)bgra + 3, RgbToBgra(Load((__m256i*)(rgb + 64)), alpha)); + } + + template void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t alignedWidth = AlignLo(width, A); + + __m256i _alpha = _mm256_slli_si256(_mm256_set1_epi32(alpha), 3); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + RgbToBgra(rgb + 3 * col, bgra + 4 * col, _alpha); + if (width != alignedWidth) + RgbToBgra(rgb + 3 * (width - A), bgra + 4 * (width - A), _alpha); + rgb += rgbStride; + bgra += bgraStride; + } + } + + void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) + { + if (Aligned(bgra) && Aligned(bgraStride) && 
Aligned(rgb) && Aligned(rgbStride)) + RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + else + RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + } } #else // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToBgra.cpp.o) has no symbols diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp old mode 100644 new mode 100755 index d40b0f0cc6..7b922e7025 --- a/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -45,7 +45,7 @@ namespace Simd { const __m256i lo = PackI32ToI16(BgraToGray32(bgra[0]), BgraToGray32(bgra[1])); const __m256i hi = PackI32ToI16(BgraToGray32(bgra[2]), BgraToGray32(bgra[3])); - return PackU16ToU8(lo, hi); + return PackI16ToU8(lo, hi); } template SIMD_INLINE __m256i BgrToGray(const uint8_t * bgr) @@ -84,6 +84,63 @@ namespace Simd else BgrToGray(bgr, width, height, bgrStride, gray, grayStride); } + + + //--------------------------------------------------------------------- + + const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); + + SIMD_INLINE __m256i RgbaToGray32(__m256i rgba) + { + const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF); + const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF); + const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_ROUND), _mm256_madd_epi16(r0b0, K16_RED_BLUE)); + return _mm256_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT); + } + + SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4]) + { + const __m256i lo = PackI32ToI16(RgbaToGray32(rgba[0]), 
RgbaToGray32(rgba[1])); + const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); + return PackI16ToU8(lo, hi); + } + + template SIMD_INLINE __m256i RgbToGray(const uint8_t* rgb) + { + __m256i rgba[4]; + rgba[0] = BgrToBgra(Load((__m256i*)(rgb + 0)), K32_01000000); + rgba[1] = BgrToBgra(Load((__m256i*)(rgb + 24)), K32_01000000); + rgba[2] = BgrToBgra(Load((__m256i*)(rgb + 48)), K32_01000000); + rgba[3] = BgrToBgra(Load((__m256i*)(rgb + 64)), K32_01000000); + return RgbaToGray(rgba); + } + + template void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) + { + assert(width >= A); + if (align) + assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t alignedWidth = AlignLo(width, A); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + Store((__m256i*)(gray + col), RgbToGray(rgb + 3 * col)); + if (width != alignedWidth) + Store((__m256i*)(gray + width - A), RgbToGray(rgb + 3 * (width - A))); + rgb += rgbStride; + gray += grayStride; + } + } + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) + { + if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)) + RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + else + RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + } } #else // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToGray.cpp.o) has no symbols diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp old mode 100644 new mode 100755 index 2daae1e7df..a64ed8035e --- a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -64,7 +64,7 @@ namespace Simd _mm256_shuffle_epi8(p1, K8_SHFL_2P1)), _mm256_shuffle_epi8(p2, K8_SHFL_2P2))); } - template void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) + template void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride) { assert(width >= A); if (align) @@ -85,12 +85,12 @@ namespace Simd } } - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) + void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride) { if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)) - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); + BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); else - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); + BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); } } #else diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp deleted file mode 100644 index a4f9efdb2f..0000000000 --- a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void BgrToRgba(const uint8_t * bgr, uint8_t * rgba, __m256i alpha) - { - Store((__m256i*)rgba + 0, BgrToRgba(Load((__m256i*)(bgr + 0)), alpha)); - Store((__m256i*)rgba + 1, BgrToRgba(Load((__m256i*)(bgr + 24)), alpha)); - Store((__m256i*)rgba + 2, BgrToRgba(Load((__m256i*)(bgr + 48)), alpha)); - Store((__m256i*)rgba + 3, BgrToRgba(Load((__m256i*)(bgr + 64)), alpha)); - } - - template void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha) - { - assert(width >= A); - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgba) && Aligned(rgbaStride)); - - size_t alignedWidth = AlignLo(width, A); - - __m256i _alpha = _mm256_slli_si256(_mm256_set1_epi32(alpha), 3); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BgrToRgba(bgr + 3 * col, rgba + 4 * col, _alpha); - if (width != alignedWidth) - BgrToRgba(bgr + 3 * (width - A), rgba + 4 * (width - A), _alpha); - bgr += bgrStride; - rgba += rgbaStride; - } - } - - void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha) - { - if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - else - BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToRgba.cpp.o) has no symbols - void dummy_SimdAvx2BgrToRgba(){}; -#endif//SIMD_AVX2_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp new file mode 100755 index 0000000000..aac574d71c --- /dev/null +++ 
b/3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp @@ -0,0 +1,149 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdMemory.h" +#include "Simd/SimdConst.h" + +namespace Simd +{ +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + template SIMD_INLINE __m256i BgraToBgr(const uint8_t* bgra) + { + __m256i _bgra = Load((__m256i*)bgra); + return _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(_bgra, K8_SHUFFLE_BGRA_TO_BGR), K32_PERMUTE_BGRA_TO_BGR); + } + + template void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) + { + assert(width >= F); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); + + size_t widthF = AlignLo(width, F); + if (width == widthF) + widthF -= F; + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < widthF; col += F) + Store((__m256i*)(bgr + 3 * col), BgraToBgr(bgra + 4 * col)); + if (width != widthF) + Store24(bgr + 3 * (width - F), BgraToBgr(bgra + 4 * (width - F))); + bgra += bgraStride; + bgr += bgrStride; + } + } + + void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) + BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); + else + BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); + } + + //--------------------------------------------------------------------- + + const __m256i K8_SHUFFLE_BGRA_TO_RGB = SIMD_MM256_SETR_EPI8( + 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1, + 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1); + + template SIMD_INLINE __m256i BgraToRgb(const uint8_t* bgra) + { + __m256i _bgra = Load((__m256i*)bgra); + return _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(_bgra, K8_SHUFFLE_BGRA_TO_RGB), K32_PERMUTE_BGRA_TO_BGR); + } + + template void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, 
size_t rgbStride) + { + assert(width >= F); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t widthF = AlignLo(width, F); + if (width == widthF) + widthF -= F; + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < widthF; col += F) + Store((__m256i*)(rgb + 3 * col), BgraToRgb(bgra + 4 * col)); + if (width != widthF) + Store24(rgb + 3 * (width - F), BgraToRgb(bgra + 4 * (width - F))); + bgra += bgraStride; + rgb += rgbStride; + } + } + + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)) + BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); + else + BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); + } + + //--------------------------------------------------------------------- + + const __m256i K8_BGRA_TO_RGBA = SIMD_MM256_SETR_EPI8( + 0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF, + 0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF); + + template SIMD_INLINE void BgraToRgba(const uint8_t* bgra, uint8_t* rgba) + { + Store((__m256i*)rgba, _mm256_shuffle_epi8(Load((__m256i*)bgra), K8_BGRA_TO_RGBA)); + } + + template void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride)); + + size_t size = width * 4; + size_t sizeA = AlignLo(size, A); + + for (size_t row = 0; row < height; ++row) + { + for (size_t i = 0; i < size; i += A) + BgraToRgba(bgra + i, rgba + i); + if (size != sizeA) + BgraToRgba(bgra + size - sizeA, rgba + size - sizeA); + bgra += bgraStride; + rgba += rgbaStride; + } + } + + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, 
uint8_t* rgba, size_t rgbaStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride)) + BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + else + BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + } + } +#endif// SIMD_AVX2_ENABLE +} diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp old mode 100644 new mode 100755 index f203fcae78..7082801956 --- a/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -46,7 +46,7 @@ namespace Simd { const __m256i lo = PackI32ToI16(BgraToGray32(bgra[0]), BgraToGray32(bgra[1])); const __m256i hi = PackI32ToI16(BgraToGray32(bgra[2]), BgraToGray32(bgra[3])); - return PackU16ToU8(lo, hi); + return PackI16ToU8(lo, hi); } template SIMD_INLINE void Load(const uint8_t* p, __m256i a[4]) @@ -89,6 +89,58 @@ namespace Simd else BgraToGray(bgra, width, height, bgraStride, gray, grayStride); } + + //--------------------------------------------------------------------- + + const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); + + SIMD_INLINE __m256i RgbaToGray32(__m256i rgba) + { + const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF); + const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF); + const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_0000), _mm256_madd_epi16(r0b0, K16_RED_BLUE)); + return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT); + } + + SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4]) + { + const __m256i lo 
= PackI32ToI16(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1])); + const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); + return PackI16ToU8(lo, hi); + } + + template void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) + { + assert(width >= A); + if (align) + assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride)); + + size_t alignedWidth = AlignLo(width, A); + __m256i a[4]; + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + { + Load(rgba + 4 * col, a); + Store((__m256i*)(gray + col), RgbaToGray(a)); + } + if (alignedWidth != width) + { + Load(rgba + 4 * (width - A), a); + Store((__m256i*)(gray + width - A), RgbaToGray(a)); + } + rgba += rgbaStride; + gray += grayStride; + } + } + + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) + { + if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride)) + RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + else + RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + } } #else // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgraToGray.cpp.o) has no symbols diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp deleted file mode 100644 index d64f184cbf..0000000000 --- a/3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - template SIMD_INLINE void BgraToRgba(const uint8_t * bgra, uint8_t * rgba) - { - Store((__m256i*)rgba + 0, BgraToRgba(Load((__m256i*)(bgra + 0)))); - Store((__m256i*)rgba + 1, BgraToRgba(Load((__m256i*)(bgra + 32)))); - Store((__m256i*)rgba + 2, BgraToRgba(Load((__m256i*)(bgra + 64)))); - Store((__m256i*)rgba + 3, BgraToRgba(Load((__m256i*)(bgra + 96)))); - } - - template void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride)); - - size_t alignedWidth = AlignLo(width, A); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BgraToRgba(bgra + 4 * col, rgba + 4 * col); - if (width != alignedWidth) - BgraToRgba(bgra + 4 * (width - A), rgba + 4 * (width - A)); - bgra += bgraStride; - rgba += rgbaStride; - } - } - - void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride) - { - if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride)) - BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); - else - BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToRgba.cpp.o) has no symbols - void dummy_SimdAvx2BgraToRgba(){}; -#endif//SIMD_AVX2_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp b/3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp new file mode 100644 index 0000000000..778b11803a --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp @@ -0,0 +1,68 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdEnable.h" +#include "Simd/SimdCpu.h" + +#if defined(_MSC_VER) +#include +#endif + +namespace Simd +{ +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + SIMD_INLINE bool SupportedByCPU() + { + return + Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) && + Base::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX2) && + Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::FMA) && + Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::F16C); + } + + SIMD_INLINE bool SupportedByOS() + { +#if defined(_MSC_VER) + __try + { + __m256i value = _mm256_abs_epi8(_mm256_set1_epi8(1));// try to execute of AVX2 instructions; + return true; + } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } +#else + return true; +#endif + } + + bool GetEnable() + { + return SupportedByCPU() && SupportedByOS(); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp b/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp old mode 100644 new mode 100755 index 762d0f37ba..2bf5741a35 --- a/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -69,13 +69,15 @@ namespace Simd DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); } + //--------------------------------------------------------------------- + const __m256i K8_SHUFFLE_BGRA = SIMD_MM256_SETR_EPI8( 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF, 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF); const __m256i K32_PERMUTE_BGRA = SIMD_MM256_SETR_EPI32(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7); - template SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset) + template SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset) { __m256i _bgra[4]; _bgra[0] = _mm256_shuffle_epi8(Load((__m256i*)bgra + 0), K8_SHUFFLE_BGRA); @@ -93,39 +95,58 @@ namespace Simd __m256i rraa1 = _mm256_unpackhi_epi32(_bgra[2], _bgra[3]); Store((__m256i*)(r + offset), _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(rraa0, rraa1), K32_PERMUTE_BGRA)); - Store((__m256i*)(a + offset), _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(rraa0, rraa1), K32_PERMUTE_BGRA)); + if(alpha) + Store((__m256i*)(a + offset), _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(rraa0, rraa1), K32_PERMUTE_BGRA)); } - template void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) + template void DeinterleaveBgra(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height, + uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride) { assert(width >= A); if (align) { assert(Aligned(bgra) && 
Aligned(bgraStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)); + assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL)); } size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) + if (a) { - for (size_t col = 0; col < alignedWidth; col += A) - DeinterleaveBgra(bgra + col * 4, b, g, r, a, col); - if (width != alignedWidth) - DeinterleaveBgra(bgra + 4 * (width - A), b, g, r, a, width - A); - bgra += bgraStride; - b += bStride; - g += gStride; - r += rStride; - a += aStride; + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + DeinterleaveBgra(bgra + col * 4, b, g, r, a, col); + if (width != alignedWidth) + DeinterleaveBgra(bgra + 4 * (width - A), b, g, r, a, width - A); + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; + a += aStride; + } + } + else + { + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + DeinterleaveBgra(bgra + col * 4, b, g, r, NULL, col); + if (width != alignedWidth) + DeinterleaveBgra(bgra + 4 * (width - A), b, g, r, NULL, width - A); + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; + } } } - void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, - uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) + void DeinterleaveBgra(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height, + uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride) { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)) + if (Aligned(bgra) && 
Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && + Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL)) DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); else DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); diff --git a/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp b/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp old mode 100644 new mode 100755 index 243663a169..beefb55410 --- a/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2020 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "Simd/SimdMemory.h" +#include "Simd/SimdLoadBlock.h" #include "Simd/SimdStore.h" #include "Simd/SimdGaussianBlur.h" #include "Simd/SimdExtract.h" diff --git a/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp b/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp old mode 100644 new mode 100755 index ca40f5a347..5a85a27334 --- a/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp @@ -42,7 +42,7 @@ namespace Simd _mm256_and_si256(_mm256_srli_si256(s01, 1), K16_00FF), _mm256_and_si256(s11, K16_00FF), _mm256_and_si256(_mm256_srli_si256(s11, 1), K16_00FF)); - return PackU16ToU8(lo, hi); + return PackI16ToU8(lo, hi); } #else SIMD_INLINE __m256i Average16(const __m256i & s0, const __m256i & s1) @@ -52,7 +52,7 @@ namespace Simd SIMD_INLINE __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11) { - return PackU16ToU8(Average16(s00, s10), Average16(s01, s11)); + return PackI16ToU8(Average16(s00, s10), Average16(s01, s11)); } #endif diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp old mode 100644 new mode 100755 index c4ee30e989..d7caad1571 --- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2018 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -42,7 +42,7 @@ namespace Simd _mm256_and_si256(_mm256_srli_si256(s01, 1), K16_00FF), _mm256_and_si256(s11, K16_00FF), _mm256_and_si256(_mm256_srli_si256(s11, 1), K16_00FF)); - return PackU16ToU8(lo, hi); + return PackI16ToU8(lo, hi); } #else SIMD_INLINE __m256i Average16(const __m256i & s0, const __m256i & s1) @@ -52,7 +52,7 @@ namespace Simd SIMD_INLINE __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11) { - return PackU16ToU8(Average16(s00, s10), Average16(s01, s11)); + return PackI16ToU8(Average16(s00, s10), Average16(s01, s11)); } #endif diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp old mode 100644 new mode 100755 index 34b4a91ecb..71f36b978f --- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -78,7 +78,7 @@ namespace Simd template SIMD_INLINE __m256i ReduceRow(const __m256i lo[3], const __m256i hi[3]) { - return PackU16ToU8( + return PackI16ToU8( DivideBy16(BinomialSum16(lo[0], lo[1], lo[2])), DivideBy16(BinomialSum16(hi[0], hi[1], hi[2]))); } diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp old mode 100644 new mode 100755 index bf732178ed..cea41815d3 --- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). 
* -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -119,7 +119,7 @@ namespace Simd { __m256i lo = ReduceRow16(buffer, offset); __m256i hi = ReduceRow16(buffer, offset + HA); - return PackU16ToU8(lo, hi); + return PackI16ToU8(lo, hi); } template void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp old mode 100644 new mode 100755 index 96771d8aee..fe2ebbd3cf --- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -120,14 +120,14 @@ namespace Simd { const __m256i lo = MainRowX5x5(buffer.dst + offset); const __m256i hi = MainRowX5x5(buffer.dst + offset + HA); - return _mm256_and_si256(PackU16ToU8(lo, hi), K16_00FF); + return _mm256_and_si256(PackI16ToU8(lo, hi), K16_00FF); } template SIMD_INLINE void MainRowX5x5(Buffer & buffer, size_t offset, uint8_t * dst) { __m256i lo = MainRowX5x5(buffer, offset); __m256i hi = MainRowX5x5(buffer, offset + A); - Store((__m256i*)dst, PackU16ToU8(lo, hi)); + Store((__m256i*)dst, PackI16ToU8(lo, hi)); } template void ReduceGray5x5( diff --git a/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp b/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp old mode 100644 new mode 100755 index f00b174cb2..53c9cdc9f8 --- a/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp @@ -1,7 
+1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -273,7 +273,7 @@ namespace Simd { __m256i lo = InterpolateY((__m256i*)bx0 + 0, (__m256i*)bx1 + 0, alpha); __m256i hi = InterpolateY((__m256i*)bx0 + 1, (__m256i*)bx1 + 1, alpha); - Store((__m256i*)dst, PackU16ToU8(lo, hi)); + Store((__m256i*)dst, PackI16ToU8(lo, hi)); } template void ResizeBilinear( diff --git a/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp b/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp old mode 100644 new mode 100755 index ab739b7aa9..d75c24989d --- a/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp +++ b/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -23,6 +23,7 @@ */ #include "Simd/SimdMemory.h" #include "Simd/SimdResizer.h" +#include "Simd/SimdResizerCommon.h" #include "Simd/SimdStore.h" #include "Simd/SimdSet.h" #include "Simd/SimdUpdate.h" @@ -33,7 +34,7 @@ namespace Simd namespace Avx2 { ResizerByteBilinear::ResizerByteBilinear(const ResParam & param) - : Ssse3::ResizerByteBilinear(param) + : Sse41::ResizerByteBilinear(param) { } @@ -223,7 +224,7 @@ namespace Simd { __m256i lo = ResizerByteBilinearInterpolateY((__m256i*)bx0 + 0, (__m256i*)bx1 + 0, alpha); __m256i hi = ResizerByteBilinearInterpolateY((__m256i*)bx0 + 1, (__m256i*)bx1 + 1, alpha); - Store((__m256i*)dst, PackU16ToU8(lo, hi)); + Store((__m256i*)dst, PackI16ToU8(lo, hi)); } template void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) @@ -523,7 +524,7 @@ namespace Simd float * pbx[2] = { _bx[0].data, _bx[1].data }; int32_t prev = -2; size_t rsa = AlignLo(rs, Avx::F); - size_t rsh = AlignLo(rs, Sse::F); + size_t rsh = AlignLo(rs, Sse2::F); for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) { float fy1 = _ay[dy]; @@ -560,10 +561,10 @@ namespace Simd __m256 s1 = _mm256_shuffle_ps(s0145, s2367, 0xDD); _mm256_store_ps(pb + dx, _mm256_fmadd_ps(s0, fx0, _mm256_mul_ps(s1, fx1))); } - for (; dx < rsh; dx += Sse::F) + for (; dx < rsh; dx += Sse2::F) { - __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); - __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); + __m128 s01 = Sse2::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); + __m128 s23 = Sse2::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); __m128 fx1 = _mm_load_ps(_ax.data + dx); __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1); __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88)); @@ -625,7 +626,7 @@ namespace Simd __m256 b1 = _mm256_load_ps(pbx[1] + dx); 
_mm256_storeu_ps(dst + dx, _mm256_fmadd_ps(b0, _fy0, _mm256_mul_ps(b1, _fy1))); } - for (; dx < rsh; dx += Sse::F) + for (; dx < rsh; dx += Sse2::F) { __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0)); __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1)); @@ -641,11 +642,11 @@ namespace Simd void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) { ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m256i)); - if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && dstX >= A) + if (param.IsByteBilinear() && dstX >= A) return new ResizerByteBilinear(param); - else if (type == SimdResizeChannelByte && method == SimdResizeMethodArea) + else if (param.IsByteArea()) return new ResizerByteArea(param); - else if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp)) + else if (param.IsFloatBilinear()) return new ResizerFloatBilinear(param); else return Avx::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); diff --git a/3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp deleted file mode 100644 index 1533d99dfb..0000000000 --- a/3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); - const __m256i K16_GREEN_ROUND = SIMD_MM256_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m256i RgbaToGray32(__m256i rgba) - { - const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF); - const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF); - const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_ROUND), _mm256_madd_epi16(r0b0, K16_RED_BLUE)); - return _mm256_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4]) - { - const __m256i lo = PackI32ToI16(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1])); - const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); - return PackU16ToU8(lo, hi); - } - - template SIMD_INLINE __m256i RgbToGray(const uint8_t * rgb) - { - __m256i rgba[4]; - rgba[0] = BgrToBgra(Load((__m256i*)(rgb + 0)), K32_01000000); - rgba[1] = BgrToBgra(Load((__m256i*)(rgb + 24)), K32_01000000); - rgba[2] = BgrToBgra(Load((__m256i*)(rgb + 48)), K32_01000000); - rgba[3] = BgrToBgra(Load((__m256i*)(rgb + 64)), K32_01000000); - return RgbaToGray(rgba); - } - - template void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride) - { - assert(width >= A); - if (align) - assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)); - - size_t alignedWidth = AlignLo(width, A); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - Store((__m256i*)(gray + col), RgbToGray(rgb + 3 * col)); - if (width != alignedWidth) - Store((__m256i*)(gray + width - A), RgbToGray(rgb + 3 * (width - A))); - rgb += rgbStride; 
- gray += grayStride; - } - } - - void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride) - { - if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)) - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - else - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2RgbToGray.cpp.o) has no symbols - void dummy_SimdAvx2RgbToGray(){}; -#endif//SIMD_AVX2_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp deleted file mode 100644 index d28cb39832..0000000000 --- a/3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_AVX2_ENABLE - namespace Avx2 - { - const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); - const __m256i K16_GREEN_0000 = SIMD_MM256_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, 0x0000); - const __m256i K32_ROUND_TERM = SIMD_MM256_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m256i RgbaToGray32(__m256i rgba) - { - const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF); - const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF); - const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_0000), _mm256_madd_epi16(r0b0, K16_RED_BLUE)); - return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4]) - { - const __m256i lo = PackI32ToI16(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1])); - const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); - return PackU16ToU8(lo, hi); - } - - template SIMD_INLINE void Load(const uint8_t* p, __m256i a[4]) - { - a[0] = Load((__m256i*)p + 0); - a[1] = Load((__m256i*)p + 1); - a[2] = Load((__m256i*)p + 2); - a[3] = Load((__m256i*)p + 3); - } - - template void RgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride) - { - assert(width >= A); - if (align) - assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, A); - __m256i a[4]; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - Load(rgba + 4 * col, a); - Store((__m256i*)(gray + col), RgbaToGray(a)); - } - if (alignedWidth != width) - { - Load(rgba + 4 * (width - A), a); - Store((__m256i*)(gray + width - A), RgbaToGray(a)); - } - rgba += rgbaStride; - gray += 
grayStride; - } - } - - void RgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride) - { - if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride)) - RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); - else - RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2RgbaToGray.cpp.o) has no symbols - void dummy_SimdAvx2RgbaToGray(){}; -#endif// SIMD_AVX2_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdBase.h b/3rdparty/simdlib/Simd/SimdBase.h old mode 100644 new mode 100755 index 57d654751e..998a7b7cbe --- a/3rdparty/simdlib/Simd/SimdBase.h +++ b/3rdparty/simdlib/Simd/SimdBase.h @@ -38,7 +38,9 @@ namespace Simd void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride); - void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride); + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride); + + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride); void BgrToBgra(const uint8_t * bgr, size_t size, uint8_t * bgra, bool fillAlpha, bool lastRow, uint8_t alpha); @@ -47,15 +49,9 @@ namespace Simd void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha); - - void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride); - void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t 
bgrStride, uint8_t * gray, size_t grayStride); - void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride); - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); + void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride); void Copy(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, uint8_t * dst, size_t dstStride); @@ -104,6 +100,12 @@ namespace Simd void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); + void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); + + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride); + void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp old mode 100644 new mode 100755 index b909ee9d20..b5b8140dbe --- a/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -106,5 +106,23 @@ namespace Simd bgra += bgraStride; } } + + void RgbToBgra(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) + { + size_t rgbGap = rgbStride - width * 3; + size_t bgraGap = bgraStride - width * 4; + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < width; ++col, rgb += 3, bgra += 4) + { + bgra[0] = rgb[2]; + bgra[1] = rgb[1]; + bgra[2] = rgb[0]; + bgra[3] = alpha; + } + rgb += rgbGap; + bgra += bgraGap; + } + } } } diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp old mode 100644 new mode 100755 index e6fa81ddb1..26f7bf171b --- a/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -39,5 +39,18 @@ namespace Simd } } } + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) + { + for (size_t row = 0; row < height; ++row) + { + const uint8_t* pRgb = rgb + row * rgbStride; + uint8_t* pGray = gray + row * grayStride; + for (const uint8_t* pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgb += 3) + { + *pGray = BgrToGray(pRgb[2], pRgb[1], pRgb[0]); + } + } + } } } diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp old mode 100644 new mode 100755 index d508115a64..ece4ffc97f --- a/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,7 +27,7 @@ namespace Simd { namespace Base { - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) + void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride) { size_t size = width * 3; for (size_t row = 0; row < height; ++row) diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp deleted file mode 100644 index b7003c067b..0000000000 --- a/3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdDefs.h" -#include - -namespace Simd -{ - namespace Base - { - void BgrToRgba(const uint8_t *bgr, size_t size, uint8_t *rgba, bool fillAlpha, bool lastRow, uint8_t alpha) - { - if (fillAlpha) - { -#ifdef SIMD_BIG_ENDIAN - const int32_t alphaMask = alpha; -#else - const int32_t alphaMask = alpha << 24; -#endif - for (size_t i = (lastRow ? 1 : 0); i < size; ++i, bgr += 3, rgba += 4) - { - *(int32_t*)rgba = (*(int32_t*)bgr) | alphaMask; - std::swap(rgba[0], rgba[2]); - } - if (lastRow) - { - rgba[0] = bgr[2]; - rgba[1] = bgr[1]; - rgba[2] = bgr[0]; - rgba[3] = alpha; - } - } - else - { - for (size_t i = (lastRow ? 
1 : 0); i < size; ++i, bgr += 3, rgba += 4) - { - *(int32_t*)rgba = (*(int32_t*)bgr); - std::swap(rgba[0], rgba[2]); - } - if (lastRow) - { - rgba[0] = bgr[2]; - rgba[1] = bgr[1]; - rgba[2] = bgr[0]; - } - } - } - - void BgrToRgba(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *rgba, size_t bgraStride, uint8_t alpha) - { - for (size_t row = 1; row < height; ++row) - { - BgrToRgba(bgr, width, rgba, true, false, alpha); - bgr += bgrStride; - rgba += bgraStride; - } - BgrToRgba(bgr, width, rgba, true, true, alpha); - } - } -} diff --git a/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp old mode 100644 new mode 100755 index 8d3b1bbc6c..6ee5d55355 --- a/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -51,5 +51,40 @@ namespace Simd } BgraToBgr(bgra, width, bgr, true); } + + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) + { + size_t bgraGap = bgraStride - width * 4; + size_t rgbGap = rgbStride - width * 3; + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < width; ++col, bgra += 4, rgb += 3) + { + rgb[2] = bgra[0]; + rgb[1] = bgra[1]; + rgb[0] = bgra[2]; + } + bgra += bgraGap; + rgb += rgbGap; + } + } + + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride) + { + size_t bgraGap = bgraStride - width * 4; + size_t rgbaGap = rgbaStride - width * 4; + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < width; ++col, bgra += 4, rgba += 4) + { + rgba[2] = bgra[0]; + 
rgba[1] = bgra[1]; + rgba[0] = bgra[2]; + rgba[3] = bgra[3]; + } + bgra += bgraGap; + rgba += rgbaGap; + } + } } } diff --git a/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp b/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp old mode 100644 new mode 100755 index 3d855e749e..16fba3e7ce --- a/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -39,5 +39,18 @@ namespace Simd } } } + + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) + { + for (size_t row = 0; row < height; ++row) + { + const uint8_t* pRgba = rgba + row * rgbaStride; + uint8_t* pGray = gray + row * grayStride; + for (const uint8_t* pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgba += 4) + { + *pGray = BgrToGray(pRgba[2], pRgba[1], pRgba[0]); + } + } + } } } diff --git a/3rdparty/simdlib/Simd/SimdBaseCpu.cpp b/3rdparty/simdlib/Simd/SimdBaseCpu.cpp new file mode 100644 index 0000000000..77fc5718df --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdBaseCpu.cpp @@ -0,0 +1,234 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdCpu.h" + +#include +#include +#include +#include + +#if defined(_MSC_VER) + +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include + +#elif defined(__GNUC__) +#include +#include +#include + +#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) +#include +#endif + +#if defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE) +#include +#include +#if defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE) +#include +#endif +#endif + +#else +# error Do not know how to detect CPU info +#endif + +namespace Simd +{ + namespace Base + { +#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) + bool CheckBit(Cpuid::Level level, Cpuid::Register index, Cpuid::Bit bit) + { + unsigned int registers[4] = { 0, 0, 0, 0 }; +#if defined(_MSC_VER) + __cpuid((int*)registers, level); +#elif (defined __GNUC__) + if (__get_cpuid_max(0, NULL) < level) + return false; + __cpuid_count(level, 0, + registers[Cpuid::Eax], + registers[Cpuid::Ebx], + registers[Cpuid::Ecx], + registers[Cpuid::Edx]); +#else +#error Do not know how to detect CPU info! 
+#endif + return (registers[index] & bit) == bit; + } +#endif//defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) + +#if defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)) + bool CheckBit(int at, int bit) + { + bool result = false; + int file = ::open("/proc/self/auxv", O_RDONLY); + if (file < 0) + return false; + const ssize_t size = 64; + unsigned long buffer[size]; + for (ssize_t count = size; count == size;) + { + count = ::read(file, buffer, sizeof(buffer)) / sizeof(unsigned long); + for (int i = 0; i < count; i += 2) + { + if (buffer[i] == (unsigned)at) + { + result = !!(buffer[i + 1] & bit); + count = 0; + } + if (buffer[i] == AT_NULL) + count = 0; + } + } + ::close(file); + return result; + } +#endif//defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)) + + size_t CpuThreadNumber() + { + return std::thread::hardware_concurrency(); + } + +#if defined(_MSC_VER) + typedef SYSTEM_LOGICAL_PROCESSOR_INFORMATION Info; + + void GetLogicalProcessorInformation(std::vector & info) + { + DWORD size = 0; + ::GetLogicalProcessorInformation(0, &size); + info.resize(size / sizeof(Info)); + ::GetLogicalProcessorInformation(info.data(), &size); + } + + size_t CpuSocketNumber() + { + std::vector info; + GetLogicalProcessorInformation(info); + size_t number = 0; + for (size_t i = 0; i < info.size(); ++i) + if (info[i].Relationship == ::RelationNumaNode) + number++; + return number; + } + + size_t CpuCoreNumber() + { + std::vector info; + GetLogicalProcessorInformation(info); + size_t number = 0; + for (size_t i = 0; i < info.size(); ++i) + if (info[i].Relationship == ::RelationProcessorCore) + number++; + return number; + } + + size_t CpuCacheSize(size_t level) + { + std::vector info; + GetLogicalProcessorInformation(info); + for (size_t i = 0; i < info.size(); ++i) + if (info[i].Relationship == 
::RelationCache && info[i].Cache.Level == level && (info[i].Cache.Type == ::CacheData || info[i].Cache.Type == CacheUnified)) + return info[i].Cache.Size; + return 0; + } +#elif defined(__GNUC__) + size_t CpuSocketNumber() + { + uint32_t number = 0; + ::FILE * p = ::popen("lscpu -b -p=Socket | grep -v '^#' | sort -u | wc -l", "r"); + if (p) + { + char buffer[PATH_MAX]; + while (::fgets(buffer, PATH_MAX, p)); + number = ::atoi(buffer); + ::pclose(p); + } + return number; + } + + size_t CpuCoreNumber() + { + uint32_t number = 0; + ::FILE * p = ::popen("lscpu -b -p=Core | grep -v '^#' | sort -u | wc -l", "r"); + if (p) + { + char buffer[PATH_MAX]; + while (::fgets(buffer, PATH_MAX, p)); + number = ::atoi(buffer); + ::pclose(p); + } + return number; + } + + SIMD_INLINE size_t CorrectIfZero(size_t value, size_t otherwise) + { + return value ? value : otherwise; + } + +#if defined(_SC_LEVEL1_DCACHE_SIZE) && defined(_SC_LEVEL2_CACHE_SIZE) && defined(_SC_LEVEL3_CACHE_SIZE) + size_t CpuCacheSize(size_t level) + { + switch (level) + { + case 1: return CorrectIfZero(::sysconf(_SC_LEVEL1_DCACHE_SIZE), 32 * 1024); + case 2: return CorrectIfZero(::sysconf(_SC_LEVEL2_CACHE_SIZE), 256 * 1024); + case 3: return CorrectIfZero(::sysconf(_SC_LEVEL3_CACHE_SIZE), 2048 * 1024); + default: + return 0; + } + } +#else + size_t CpuCacheSize(size_t level) + { + switch (level) + { + case 1: return 32 * 1024; + case 2: return 256 * 1024; + case 3: return 2048 * 1024; + default: + return 0; + } + } +#endif + +#else +#error This platform is unsupported! 
+#endif + } + + namespace Cpu + { + const size_t SOCKET_NUMBER = Base::CpuSocketNumber(); + const size_t CORE_NUMBER = Base::CpuCoreNumber(); + const size_t THREAD_NUMBER = Base::CpuThreadNumber(); + const size_t L1_CACHE_SIZE = Base::CpuCacheSize(1); + const size_t L2_CACHE_SIZE = Base::CpuCacheSize(2); + const size_t L3_CACHE_SIZE = Base::CpuCacheSize(3); + } +} diff --git a/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp b/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp old mode 100644 new mode 100755 index ecb22ed4b0..366ce1bc0e --- a/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -48,20 +48,39 @@ namespace Simd void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) { - for (size_t row = 0; row < height; ++row) + if (a) { - for (size_t col = 0, offset = 0; col < width; ++col, offset += 4) + for (size_t row = 0; row < height; ++row) { - b[col] = bgra[offset + 0]; - g[col] = bgra[offset + 1]; - r[col] = bgra[offset + 2]; - a[col] = bgra[offset + 3]; + for (size_t col = 0, offset = 0; col < width; ++col, offset += 4) + { + b[col] = bgra[offset + 0]; + g[col] = bgra[offset + 1]; + r[col] = bgra[offset + 2]; + a[col] = bgra[offset + 3]; + } + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; + a += aStride; + } + } + else + { + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0, offset = 0; col < width; ++col, offset += 4) + { + b[col] = bgra[offset + 0]; + g[col] = bgra[offset + 1]; + r[col] = bgra[offset + 2]; + 
} + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; } - bgra += bgraStride; - b += bStride; - g += gStride; - r += rStride; - a += aStride; } } } diff --git a/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp b/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp old mode 100644 new mode 100755 index 560b9d3cb9..1394d919e1 --- a/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2020 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/3rdparty/simdlib/Simd/SimdBaseResizer.cpp b/3rdparty/simdlib/Simd/SimdBaseResizer.cpp old mode 100644 new mode 100755 index 9585a4f1ac..b8c08d2b92 --- a/3rdparty/simdlib/Simd/SimdBaseResizer.cpp +++ b/3rdparty/simdlib/Simd/SimdBaseResizer.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -23,6 +23,7 @@ */ #include "Simd/SimdMemory.h" #include "Simd/SimdResizer.h" +#include "Simd/SimdCopyPixel.h" namespace Simd { @@ -132,8 +133,6 @@ namespace Simd ResizerByteArea::ResizerByteArea(const ResParam & param) : Resizer(param) { - double scale = Simd::Max(float(_param.srcW) / _param.dstW, float(_param.srcH) / _param.dstH); - _ay.Resize(_param.dstH + 1); _iy.Resize(_param.dstH + 1); EstimateParams(_param.srcH, _param.dstH, Base::AREA_RANGE, _ay.data, _iy.data); @@ -234,28 +233,173 @@ namespace Simd //--------------------------------------------------------------------- + ResizerShortBilinear::ResizerShortBilinear(const ResParam& param) + : Resizer(param) + { + _ay.Resize(_param.dstH, false, _param.align); + _iy.Resize(_param.dstH, false, _param.align); + EstimateIndexAlpha(_param.srcH, _param.dstH, 1, _iy.data, _ay.data); + size_t rs = _param.dstW * _param.channels; + _ax.Resize(rs, false, _param.align); + _ix.Resize(rs, false, _param.align); + EstimateIndexAlpha(_param.srcW, _param.dstW, _param.channels, _ix.data, _ax.data); + _bx[0].Resize(rs, false, _param.align); + _bx[1].Resize(rs, false, _param.align); + } + + void ResizerShortBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t* indices, float* alphas) + { + float scale = (float)srcSize / dstSize; + for (size_t i = 0; i < dstSize; ++i) + { + float alpha = (float)((i + 0.5f) * scale - 0.5f); + ptrdiff_t index = (ptrdiff_t)::floor(alpha); + alpha -= index; + if (index < 0) + { + index = 0; + alpha = 0; + } + if (index > (ptrdiff_t)srcSize - 2) + { + index = srcSize - 2; + alpha = 1; + } + for (size_t c = 0; c < channels; c++) + { + size_t offset = i * channels + c; + indices[offset] = (int32_t)(channels * index + c); + alphas[offset] = alpha; + } + } + } + + void ResizerShortBilinear::Run(const uint8_t* src, 
size_t srcStride, uint8_t* dst, size_t dstStride) + { + Run((const uint16_t*)src, srcStride / sizeof(uint16_t), (uint16_t*)dst, dstStride / sizeof(uint16_t)); + } + + template void ResizerShortBilinear::RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride) + { + size_t rs = _param.dstW * N; + float* pbx[2] = { _bx[0].data, _bx[1].data }; + int32_t prev = -2; + for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) + { + float fy1 = _ay[dy]; + float fy0 = 1.0f - fy1; + int32_t sy = _iy[dy]; + int32_t k = 0; + if (sy == prev) + k = 2; + else if (sy == prev + 1) + { + Swap(pbx[0], pbx[1]); + k = 1; + } + prev = sy; + for (; k < 2; k++) + { + float* pb = pbx[k]; + const uint16_t* ps = src + (sy + k) * srcStride; + for (size_t dx = 0; dx < rs; dx++) + { + int32_t sx = _ix[dx]; + float fx = _ax[dx]; + pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + N] * fx; + } + } + for (size_t dx = 0; dx < rs; dx++) + dst[dx] = Round(pbx[0][dx] * fy0 + pbx[1][dx] * fy1); + } + } + + template void ResizerShortBilinear::RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride) + { + size_t rs = _param.dstW * N; + for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) + { + float fy1 = _ay[dy]; + float fy0 = 1.0f - fy1; + int32_t sy = _iy[dy]; + const uint16_t* ps0 = src + (sy + 0) * srcStride; + const uint16_t* ps1 = src + (sy + 1) * srcStride; + for (size_t dx = 0; dx < rs; dx++) + { + int32_t sx = _ix[dx]; + float fx1 = _ax[dx]; + float fx0 = 1.0f - fx1; + float r0 = ps0[sx] * fx0 + ps0[sx + N] * fx1; + float r1 = ps1[sx] * fx0 + ps1[sx + N] * fx1; + dst[dx] = Round(r0 * fy0 + r1 * fy1); + } + } + } + + void ResizerShortBilinear::Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride) + { + bool sparse = _param.dstH * 2.0 <= _param.srcH; + switch (_param.channels) + { + case 1: sparse ? RunS<1>(src, srcStride, dst, dstStride) : RunB<1>(src, srcStride, dst, dstStride); return; + case 2: sparse ? 
RunS<2>(src, srcStride, dst, dstStride) : RunB<2>(src, srcStride, dst, dstStride); return; + case 3: sparse ? RunS<3>(src, srcStride, dst, dstStride) : RunB<3>(src, srcStride, dst, dstStride); return; + case 4: sparse ? RunS<4>(src, srcStride, dst, dstStride) : RunB<4>(src, srcStride, dst, dstStride); return; + default: + assert(0); + } + } + + //--------------------------------------------------------------------- + ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param) : Resizer(param) { _ay.Resize(_param.dstH, false, _param.align); _iy.Resize(_param.dstH, false, _param.align); - EstimateIndexAlpha(_param.srcH, _param.dstH, 1, _param.method == SimdResizeMethodCaffeInterp, _iy.data, _ay.data); + EstimateIndexAlpha(_param.srcH, _param.dstH, 1, _iy.data, _ay.data); size_t rs = _param.dstW * _param.channels; _ax.Resize(rs, false, _param.align); _ix.Resize(rs, false, _param.align); - EstimateIndexAlpha(_param.srcW, _param.dstW, _param.channels, _param.method == SimdResizeMethodCaffeInterp, _ix.data, _ax.data); + EstimateIndexAlpha(_param.srcW, _param.dstW, _param.channels, _ix.data, _ax.data); _bx[0].Resize(rs, false, _param.align); _bx[1].Resize(rs, false, _param.align); } - void ResizerFloatBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, bool caffeInterp, int32_t * indices, float * alphas) + void ResizerFloatBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t * indices, float * alphas) { - if (caffeInterp) + if (_param.method == SimdResizeMethodBilinear) + { + float scale = (float)srcSize / dstSize; + for (size_t i = 0; i < dstSize; ++i) + { + float alpha = (float)((i + 0.5f) * scale - 0.5f); + ptrdiff_t index = (ptrdiff_t)::floor(alpha); + alpha -= index; + if (index < 0) + { + index = 0; + alpha = 0; + } + if (index > (ptrdiff_t)srcSize - 2) + { + index = srcSize - 2; + alpha = 1; + } + for (size_t c = 0; c < channels; c++) + { + size_t offset = i * channels + c; + indices[offset] = 
(int32_t)(channels * index + c); + alphas[offset] = alpha; + } + } + } + else if (_param.method == SimdResizeMethodCaffeInterp) { float scale = dstSize > 1 ? float(srcSize - 1) / float(dstSize - 1) : 0.0f; for (size_t i = 0; i < dstSize; ++i) { - float alpha = float(i)*scale; + float alpha = float(i) * scale; ptrdiff_t index = (ptrdiff_t)::floor(alpha); alpha -= index; if (index > (ptrdiff_t)srcSize - 2) @@ -266,17 +410,17 @@ namespace Simd for (size_t c = 0; c < channels; c++) { size_t offset = i * channels + c; - indices[offset] = (int32_t)(channels*index + c); + indices[offset] = (int32_t)(channels * index + c); alphas[offset] = alpha; } } } - else + else if (_param.method == SimdResizeMethodInferenceEngineInterp) { float scale = (float)srcSize / dstSize; for (size_t i = 0; i < dstSize; ++i) { - float alpha = (float)((i + 0.5f)*scale - 0.5f); + float alpha = float(i) * scale; ptrdiff_t index = (ptrdiff_t)::floor(alpha); alpha -= index; if (index < 0) @@ -284,7 +428,7 @@ namespace Simd index = 0; alpha = 0; } - if (index >(ptrdiff_t)srcSize - 2) + if (index > (ptrdiff_t)srcSize - 2) { index = srcSize - 2; alpha = 1; @@ -292,11 +436,13 @@ namespace Simd for (size_t c = 0; c < channels; c++) { size_t offset = i * channels + c; - indices[offset] = (int32_t)(channels*index + c); + indices[offset] = (int32_t)(channels * index + c); alphas[offset] = alpha; } } } + else + assert(0); } void ResizerFloatBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) @@ -346,15 +492,80 @@ namespace Simd //--------------------------------------------------------------------- + ResizerNearest::ResizerNearest(const ResParam& param) + : Resizer(param) + { + _pixelSize = _param.PixelSize(); + _iy.Resize(_param.dstH, false, _param.align); + EstimateIndex(_param.srcH, _param.dstH, 1, _iy.data); + _ix.Resize(_param.dstW, false, _param.align); + EstimateIndex(_param.srcW, _param.dstW, _pixelSize, _ix.data); + } + + void ResizerNearest::EstimateIndex(size_t 
srcSize, size_t dstSize, size_t pixelSize, int32_t* indices) + { + float scale = (float)srcSize / dstSize; + for (size_t i = 0; i < dstSize; ++i) + { + float alpha = (i + 0.5f) * scale; + int index = RestrictRange((int)::floor(alpha), 0, (int)srcSize - 1); + indices[i] = (int)(index * pixelSize); + } + } + + void ResizerNearest::Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride) + { + for (size_t dy = 0; dy < _param.dstH; dy++) + { + const uint8_t* srcRow = src + _iy[dy] * srcStride; + for (size_t dx = 0, offset = 0; dx < _param.dstW; dx++, offset += _pixelSize) + memcpy(dst + offset, srcRow + _ix[dx], _pixelSize); + dst += dstStride; + } + } + + template void ResizerNearest::Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride) + { + for (size_t dy = 0; dy < _param.dstH; dy++) + { + const uint8_t * srcRow = src + _iy[dy] * srcStride; + for (size_t dx = 0, offset = 0; dx < _param.dstW; dx++, offset += N) + CopyPixel(srcRow + _ix[dx], dst + offset); + dst += dstStride; + } + } + + void ResizerNearest::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride) + { + switch (_pixelSize) + { + case 1: Resize<1>(src, srcStride, dst, dstStride); break; + case 2: Resize<2>(src, srcStride, dst, dstStride); break; + case 3: Resize<3>(src, srcStride, dst, dstStride); break; + case 4: Resize<4>(src, srcStride, dst, dstStride); break; + case 6: Resize<6>(src, srcStride, dst, dstStride); break; + case 8: Resize<8>(src, srcStride, dst, dstStride); break; + case 12: Resize<12>(src, srcStride, dst, dstStride); break; + default: + Resize(src, srcStride, dst, dstStride); + } + } + + //--------------------------------------------------------------------- + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) { ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(void*)); - if (type == SimdResizeChannelByte && 
method == SimdResizeMethodBilinear) + if (param.IsByteBilinear()) return new ResizerByteBilinear(param); - else if (type == SimdResizeChannelByte && method == SimdResizeMethodArea) + else if (param.IsByteArea()) return new ResizerByteArea(param); - else if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp)) + else if (param.IsShortBilinear()) + return new ResizerShortBilinear(param); + else if (param.IsFloatBilinear()) return new ResizerFloatBilinear(param); + else if (param.IsNearest()) + return new ResizerNearest(param); else return NULL; } diff --git a/3rdparty/simdlib/Simd/SimdConfig.h b/3rdparty/simdlib/Simd/SimdConfig.h old mode 100644 new mode 100755 index 8e328e2495..22c7fdd8e6 --- a/3rdparty/simdlib/Simd/SimdConfig.h +++ b/3rdparty/simdlib/Simd/SimdConfig.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -24,18 +24,10 @@ #ifndef __SimdConfig_h__ #define __SimdConfig_h__ -//#define SIMD_SSE_DISABLE - //#define SIMD_SSE2_DISABLE -//#define SIMD_SSE3_DISABLE - -//#define SIMD_SSSE3_DISABLE - //#define SIMD_SSE41_DISABLE -//#define SIMD_SSE42_DISABLE - //#define SIMD_AVX_DISABLE //#define SIMD_AVX2_DISABLE diff --git a/3rdparty/simdlib/Simd/SimdConst.h b/3rdparty/simdlib/Simd/SimdConst.h old mode 100644 new mode 100755 index 38e217d6ca..e18c1b90d0 --- a/3rdparty/simdlib/Simd/SimdConst.h +++ b/3rdparty/simdlib/Simd/SimdConst.h @@ -76,25 +76,13 @@ namespace Simd const int DIVISION_BY_9_FACTOR = (1 << DIVISION_BY_9_SHIFT) / 9; } -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { const size_t F = sizeof(__m128) / sizeof(float); const size_t DF = 2 * F; const size_t QF = 4 * F; const size_t HF = F / 2; 
- } -#endif// SIMD_SSE_ENABLE - -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - using namespace Sse; -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::F; - using Sse::DF; - using Sse::QF; -#endif const size_t A = sizeof(__m128i); const size_t DA = 2 * A; @@ -128,6 +116,7 @@ namespace Simd const __m128i K16_0020 = SIMD_MM_SET1_EPI16(0x0020); const __m128i K16_0080 = SIMD_MM_SET1_EPI16(0x0080); const __m128i K16_00FF = SIMD_MM_SET1_EPI16(0x00FF); + const __m128i K16_0101 = SIMD_MM_SET1_EPI16(0x0101); const __m128i K16_FF00 = SIMD_MM_SET1_EPI16(0xFF00); const __m128i K32_00000001 = SIMD_MM_SET1_EPI32(0x00000001); @@ -138,6 +127,7 @@ namespace Simd const __m128i K32_0000FFFF = SIMD_MM_SET1_EPI32(0x0000FFFF); const __m128i K32_00010000 = SIMD_MM_SET1_EPI32(0x00010000); const __m128i K32_01000000 = SIMD_MM_SET1_EPI32(0x01000000); + const __m128i K32_00FF0000 = SIMD_MM_SET1_EPI32(0x00FF0000); const __m128i K32_00FFFFFF = SIMD_MM_SET1_EPI32(0x00FFFFFF); const __m128i K32_FFFFFF00 = SIMD_MM_SET1_EPI32(0xFFFFFF00); @@ -162,22 +152,15 @@ namespace Simd } #endif// SIMD_SSE2_ENABLE -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { using namespace Sse2; #if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::F; - using Sse::DF; - using Sse::QF; + using Sse2::F; + using Sse2::DF; + using Sse2::QF; #endif - } -#endif// SIMD_SSE3_ENABLE - -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - using namespace Sse3; const __m128i K8_SHUFFLE_GRAY_TO_BGR0 = SIMD_MM_SETR_EPI8(0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5); const __m128i K8_SHUFFLE_GRAY_TO_BGR1 = SIMD_MM_SETR_EPI8(0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xA, 0xA); @@ -207,27 +190,8 @@ namespace Simd const __m128i K8_SHUFFLE_BGR1_TO_RED = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, -1, 
-1, -1, -1, -1, -1); const __m128i K8_SHUFFLE_BGR2_TO_RED = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF); } -#endif// SIMD_SSSE3_ENABLE - -#ifdef SIMD_SSE41_ENABLE - namespace Sse41 - { - using namespace Ssse3; -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::F; - using Sse::DF; - using Sse::QF; -#endif - } #endif// SIMD_SSE41_ENABLE -#ifdef SIMD_SSE42_ENABLE - namespace Sse42 - { - using namespace Sse41; - } -#endif// SIMD_SSE42_ENABLE - #ifdef SIMD_AVX_ENABLE namespace Avx { @@ -282,6 +246,7 @@ namespace Simd const __m256i K16_0020 = SIMD_MM256_SET1_EPI16(0x0020); const __m256i K16_0080 = SIMD_MM256_SET1_EPI16(0x0080); const __m256i K16_00FF = SIMD_MM256_SET1_EPI16(0x00FF); + const __m256i K16_0101 = SIMD_MM256_SET1_EPI16(0x0101); const __m256i K16_FF00 = SIMD_MM256_SET1_EPI16(0xFF00); const __m256i K32_00000001 = SIMD_MM256_SET1_EPI32(0x00000001); @@ -292,6 +257,7 @@ namespace Simd const __m256i K32_0000FFFF = SIMD_MM256_SET1_EPI32(0x0000FFFF); const __m256i K32_00010000 = SIMD_MM256_SET1_EPI32(0x00010000); const __m256i K32_01000000 = SIMD_MM256_SET1_EPI32(0x01000000); + const __m256i K32_00FF0000 = SIMD_MM256_SET1_EPI32(0x00FF0000); const __m256i K32_FFFFFF00 = SIMD_MM256_SET1_EPI32(0xFFFFFF00); const __m256i K16_Y_ADJUST = SIMD_MM256_SET1_EPI16(Base::Y_ADJUST); @@ -311,6 +277,8 @@ namespace Simd const __m256i K16_DIVISION_BY_9_FACTOR = SIMD_MM256_SET1_EPI16(Base::DIVISION_BY_9_FACTOR); + const __m256i K64_00000000FFFFFFFF = SIMD_MM256_SET2_EPI32(0xFFFFFFFF, 0); + const __m256i K8_SHUFFLE_0 = SIMD_MM256_SETR_EPI8( 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0); @@ -389,11 +357,11 @@ namespace Simd -1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 
0x3, 0x6, 0x9, 0xC, 0xF); - const __m256i K8_BGRA_TO_BGR_SHUFFLE = SIMD_MM256_SETR_EPI8( + const __m256i K8_BGR_TO_BGRA_SHUFFLE = SIMD_MM256_SETR_EPI8( 0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1, 0x4, 0x5, 0x6, -1, 0x7, 0x8, 0x9, -1, 0xA, 0xB, 0xC, -1, 0xD, 0xE, 0xF, -1); - const __m256i K8_BGRA_TO_RGB_SHUFFLE = SIMD_MM256_SETR_EPI8( + const __m256i K8_RGB_TO_BGRA_SHUFFLE = SIMD_MM256_SETR_EPI8( 0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1, 0x6, 0x5, 0x4, -1, 0x9, 0x8, 0x7, -1, 0xC, 0xB, 0xA, -1, 0xF, 0xE, 0xD, -1); @@ -402,6 +370,12 @@ namespace Simd 0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF); const __m256i K32_TWO_UNPACK_PERMUTE = SIMD_MM256_SETR_EPI32(0, 2, 4, 6, 1, 3, 5, 7); + + const __m256i K8_SHUFFLE_BGRA_TO_BGR = SIMD_MM256_SETR_EPI8( + 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, + 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1); + + const __m256i K32_PERMUTE_BGRA_TO_BGR = SIMD_MM256_SETR_EPI32(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, -1, -1); } #endif// SIMD_AVX2_ENABLE @@ -459,8 +433,10 @@ namespace Simd const uint32x4_t K32_000000FF = SIMD_VEC_SET1_EPI32(0x000000FF); const uint32x4_t K32_0000FFFF = SIMD_VEC_SET1_EPI32(0x0000FFFF); const uint32x4_t K32_00010000 = SIMD_VEC_SET1_EPI32(0x00010000); + const uint32x4_t K32_00FF0000 = SIMD_VEC_SET1_EPI32(0x00FF0000); const uint32x4_t K32_01000000 = SIMD_VEC_SET1_EPI32(0x01000000); const uint32x4_t K32_08080800 = SIMD_VEC_SET1_EPI32(0x08080800); + const uint32x4_t K32_FF000000 = SIMD_VEC_SET1_EPI32(0xFF000000); const uint32x4_t K32_FFFFFF00 = SIMD_VEC_SET1_EPI32(0xFFFFFF00); const uint32x4_t K32_FFFFFFFF = SIMD_VEC_SET1_EPI32(0xFFFFFFFF); const uint32x4_t K32_0123 = SIMD_VEC_SETR_EPI32(0, 1, 2, 3); diff --git a/3rdparty/simdlib/Simd/SimdConversion.h b/3rdparty/simdlib/Simd/SimdConversion.h old mode 100644 new mode 100755 index e0601a9f61..5f8f0a0b9b --- 
a/3rdparty/simdlib/Simd/SimdConversion.h +++ b/3rdparty/simdlib/Simd/SimdConversion.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, +* Copyright (c) 2011-2021 Yermalayeu Ihar, * 2014-2015 Antonenka Mikhail. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -38,16 +38,10 @@ namespace Simd return (BLUE_TO_GRAY_WEIGHT*blue + GREEN_TO_GRAY_WEIGHT * green + RED_TO_GRAY_WEIGHT * red + BGR_TO_GRAY_ROUND_TERM) >> BGR_TO_GRAY_AVERAGING_SHIFT; } - - SIMD_INLINE int RgbToGray(int red, int green, int blue) - { - return (BLUE_TO_GRAY_WEIGHT*blue + GREEN_TO_GRAY_WEIGHT * green + - RED_TO_GRAY_WEIGHT * red + BGR_TO_GRAY_ROUND_TERM) >> BGR_TO_GRAY_AVERAGING_SHIFT; - } } -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { template __m128i InterleaveBgr(__m128i blue, __m128i green, __m128i red); @@ -99,7 +93,7 @@ namespace Simd _mm_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_RED))); } } -#endif//SIMD_SSSE3_ENABLE +#endif #ifdef SIMD_AVX2_ENABLE namespace Avx2 @@ -181,41 +175,24 @@ namespace Simd template<> SIMD_INLINE __m256i BgrToBgra(const __m256i & bgr, const __m256i & alpha) { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0x94), K8_BGRA_TO_BGR_SHUFFLE), alpha); + return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0x94), K8_BGR_TO_BGRA_SHUFFLE), alpha); } template<> SIMD_INLINE __m256i BgrToBgra(const __m256i & bgr, const __m256i & alpha) { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0xE9), K8_BGRA_TO_BGR_SHUFFLE), alpha); - } - - template __m256i BgrToRgba(const __m256i & bgr, const __m256i & alpha); - - template<> SIMD_INLINE __m256i BgrToRgba(const __m256i & bgr, const __m256i & alpha) - { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0x94), K8_BGRA_TO_RGB_SHUFFLE), alpha); - } - - template<> SIMD_INLINE __m256i BgrToRgba(const 
__m256i & bgr, const __m256i & alpha) - { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0xE9), K8_BGRA_TO_RGB_SHUFFLE), alpha); - } - - SIMD_INLINE __m256i BgraToRgba(const __m256i & bgra) - { - return _mm256_shuffle_epi8(bgra, K8_BGRA_TO_RGBA_SHUFFLE); + return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0xE9), K8_BGR_TO_BGRA_SHUFFLE), alpha); } template __m256i RgbToBgra(const __m256i & rgb, const __m256i & alpha); template<> SIMD_INLINE __m256i RgbToBgra(const __m256i & rgb, const __m256i & alpha) { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0x94), K8_BGRA_TO_RGB_SHUFFLE), alpha); + return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0x94), K8_RGB_TO_BGRA_SHUFFLE), alpha); } template<> SIMD_INLINE __m256i RgbToBgra(const __m256i & rgb, const __m256i & alpha) { - return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0xE9), K8_BGRA_TO_RGB_SHUFFLE), alpha); + return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0xE9), K8_RGB_TO_BGRA_SHUFFLE), alpha); } } #endif// SIMD_AVX2_ENABLE @@ -236,8 +213,20 @@ namespace Simd template SIMD_INLINE int32x4_t BgrToU(uint16x8_t blue, uint16x8_t green, uint16x8_t red) { - return vshrq_n_s32(vmlal_s16(vmlal_s16(vmlal_s16(K32_BGR_TO_YUV_ROUND_TERM, (int16x4_t)Half(blue), K16_BLUE_TO_U_WEIGHT), - (int16x4_t)Half(green), K16_GREEN_TO_U_WEIGHT), (int16x4_t)Half(red), K16_RED_TO_U_WEIGHT), Base::BGR_TO_YUV_AVERAGING_SHIFT); + return vshrq_n_s32(vmlal_s16(vmlal_s16(vmlal_s16(K32_BGR_TO_YUV_ROUND_TERM, vreinterpret_s16_u16(Half(blue)), K16_BLUE_TO_U_WEIGHT), + vreinterpret_s16_u16(Half(green)), K16_GREEN_TO_U_WEIGHT), vreinterpret_s16_u16(Half(red)), K16_RED_TO_U_WEIGHT), Base::BGR_TO_YUV_AVERAGING_SHIFT); + } + + SIMD_INLINE int16x8_t BgrToU(uint16x8_t blue, uint16x8_t green, uint16x8_t red) + { + return vaddq_s16(K16_UV_ADJUST, PackI32(BgrToU<0>(blue, green, red), BgrToU<1>(blue, green, red))); + 
} + + SIMD_INLINE uint8x16_t BgrToU(uint8x16_t blue, uint8x16_t green, uint8x16_t red) + { + return PackSaturatedI16( + BgrToU(UnpackU8<0>(blue), UnpackU8<0>(green), UnpackU8<0>(red)), + BgrToU(UnpackU8<1>(blue), UnpackU8<1>(green), UnpackU8<1>(red))); } } #endif// SIMD_NEON_ENABLE diff --git a/3rdparty/simdlib/Simd/SimdCopyPixel.h b/3rdparty/simdlib/Simd/SimdCopyPixel.h old mode 100644 new mode 100755 index 6f113e4c39..a5539eba35 --- a/3rdparty/simdlib/Simd/SimdCopyPixel.h +++ b/3rdparty/simdlib/Simd/SimdCopyPixel.h @@ -56,6 +56,23 @@ namespace Simd { ((uint32_t*)dst)[0] = ((uint32_t*)src)[0]; } + + template<> SIMD_INLINE void CopyPixel<6>(const uint8_t* src, uint8_t* dst) + { + ((uint32_t*)dst)[0] = ((uint32_t*)src)[0]; + ((uint16_t*)dst)[2] = ((uint16_t*)src)[2]; + } + + template<> SIMD_INLINE void CopyPixel<8>(const uint8_t* src, uint8_t* dst) + { + ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; + } + + template<> SIMD_INLINE void CopyPixel<12>(const uint8_t* src, uint8_t* dst) + { + ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; + ((uint32_t*)dst)[2] = ((uint32_t*)src)[2]; + } } } diff --git a/3rdparty/simdlib/Simd/SimdCpu.h b/3rdparty/simdlib/Simd/SimdCpu.h old mode 100644 new mode 100755 index adaf916462..b10d9fa98f --- a/3rdparty/simdlib/Simd/SimdCpu.h +++ b/3rdparty/simdlib/Simd/SimdCpu.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,8 +28,103 @@ namespace Simd { -#ifdef SIMD_SSE_ENABLE - namespace Sse +#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) + namespace Cpuid + { + // See http://www.sandpile.org/x86/cpuid.htm for additional information. 
+ enum Level + { + Ordinary = 1, + Extended = 7, + }; + + enum Register + { + Eax = 0, + Ebx = 1, + Ecx = 2, + Edx = 3, + }; + + enum Bit + { + // Ordinary: + // Edx: + SSE = 1 << 25, + SSE2 = 1 << 26, + + // Ecx: + SSE3 = 1 << 0, + SSSE3 = 1 << 9, + FMA = 1 << 12, + SSE41 = 1 << 19, + SSE42 = 1 << 20, + OSXSAVE = 1 << 27, + AVX = 1 << 28, + F16C = 1 << 29, + + // Extended: + // Ebx: + AVX2 = 1 << 5, + AVX512F = 1 << 16, + AVX512DQ = 1 << 17, + AVX512CD = 1 << 28, + AVX512BW = 1 << 30, + AVX512VL = 1 << 31, + + // Ecx: + AVX512VBMI = 1 << 1, + AVX512VNNI = 1 << 11, + }; + } +#endif//defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) + + namespace Cpu + { + extern const size_t SOCKET_NUMBER; + extern const size_t CORE_NUMBER; + extern const size_t THREAD_NUMBER; + extern const size_t L1_CACHE_SIZE; + extern const size_t L2_CACHE_SIZE; + extern const size_t L3_CACHE_SIZE; + } + + namespace Base + { +#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) + bool CheckBit(Cpuid::Level level, Cpuid::Register index, Cpuid::Bit bit); +#endif + +#if defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)) + bool CheckBit(int at, int bit); +#endif + + size_t CpuSocketNumber(); + + size_t CpuCoreNumber(); + + size_t CpuThreadNumber(); + + size_t CpuCacheSize(size_t level); + + SIMD_INLINE size_t AlgCacheL1() + { + return Cpu::L1_CACHE_SIZE; + } + + SIMD_INLINE size_t AlgCacheL2() + { + return Cpu::L3_CACHE_SIZE ? Cpu::L2_CACHE_SIZE : Cpu::L2_CACHE_SIZE * Cpu::SOCKET_NUMBER / Cpu::CORE_NUMBER; + } + + SIMD_INLINE size_t AlgCacheL3() + { + return Cpu::L3_CACHE_SIZE ? 
Cpu::L3_CACHE_SIZE * Cpu::SOCKET_NUMBER / Cpu::CORE_NUMBER : Cpu::L2_CACHE_SIZE; + } + } + +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { const unsigned int SCR_FTZ = 1 << 15; const unsigned int SCR_DAZ = 1 << 6; diff --git a/3rdparty/simdlib/Simd/SimdDefs.h b/3rdparty/simdlib/Simd/SimdDefs.h old mode 100644 new mode 100755 index c2b9274ed4..97d8f06ad6 --- a/3rdparty/simdlib/Simd/SimdDefs.h +++ b/3rdparty/simdlib/Simd/SimdDefs.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -37,10 +37,24 @@ #include #include +#if defined(SIMD_SSE2_DISABLE) && !defined(SIMD_SSE41_DISABLE) +#define SIMD_SSE41_DISABLE +#endif + +#if defined(SIMD_SSE41_DISABLE) && !defined(SIMD_AVX_DISABLE) +#define SIMD_AVX_DISABLE +#endif + +#if defined(SIMD_AVX_DISABLE) && !defined(SIMD_AVX2_DISABLE) +#define SIMD_AVX2_DISABLE +#endif + #if defined(_MSC_VER) && defined(_MSC_FULL_VER) #define SIMD_ALIGNED(x) __declspec(align(x)) +#define SIMD_NOINLINE __declspec(noinline) + #ifdef _M_IX86 #define SIMD_X86_ENABLE #endif @@ -55,30 +69,14 @@ #if defined(SIMD_X64_ENABLE) || defined(SIMD_X86_ENABLE) -#if !defined(SIMD_SSE_DISABLE) && _MSC_VER >= 1200 -#define SIMD_SSE_ENABLE -#endif - #if !defined(SIMD_SSE2_DISABLE) && _MSC_VER >= 1300 #define SIMD_SSE2_ENABLE #endif -#if !defined(SIMD_SSE3_DISABLE) && _MSC_VER >= 1500 -#define SIMD_SSE3_ENABLE -#endif - -#if !defined(SIMD_SSSE3_DISABLE) && _MSC_VER >= 1500 -#define SIMD_SSSE3_ENABLE -#endif - #if !defined(SIMD_SSE41_DISABLE) && _MSC_VER >= 1500 #define SIMD_SSE41_ENABLE #endif -#if !defined(SIMD_SSE42_DISABLE) && _MSC_VER >= 1500 -#define SIMD_SSE42_ENABLE -#endif - #if !defined(SIMD_AVX_DISABLE) && _MSC_FULL_VER >= 160040219 #define SIMD_AVX_ENABLE #endif @@ -88,7 +86,7 @@ #endif #if 
defined(NDEBUG) && _MSC_VER >= 1700 && _MSC_VER < 1900 -#define SIMD_MADDUBS_ERROR // Visual Studio 2012/2013 release mode compiler bug in function _mm256_maddubs_epi16: +#define SIMD_MADDUBS_ERROR // Visual Studio 2012/2013 release mode compiler bug in function _mm256_maddubs_epi16. #endif #if defined(NDEBUG) && _MSC_VER == 1914 @@ -123,6 +121,8 @@ #define SIMD_ALIGNED(x) __attribute__ ((aligned(x))) +#define SIMD_NOINLINE __attribute__ ((noinline)) + #ifdef __i386__ #define SIMD_X86_ENABLE #endif @@ -159,36 +159,16 @@ #define SIMD_ARM64_ENABLE #endif -#if defined __mips__ -#define SIMD_MIPS_ENABLE -#endif - #if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) -#if !defined(SIMD_SSE_DISABLE) && defined(__SSE__) -#define SIMD_SSE_ENABLE -#endif - -#if !defined(SIMD_SSE2_DISABLE) && defined(__SSE2__) +#if !defined(SIMD_SSE2_DISABLE) && defined(__SSE__) && defined(__SSE2__) #define SIMD_SSE2_ENABLE #endif -#if !defined(SIMD_SSE3_DISABLE) && defined(__SSE3__) -#define SIMD_SSE3_ENABLE -#endif - -#if !defined(SIMD_SSSE3_DISABLE) && defined(__SSSE3__) -#define SIMD_SSSE3_ENABLE -#endif - -#if !defined(SIMD_SSE41_DISABLE) && defined(__SSE4_1__) +#if !defined(SIMD_SSE41_DISABLE) && defined(__SSE3__) && defined(__SSSE3__) && defined(__SSE4_1__) && defined(__SSE4_2__) #define SIMD_SSE41_ENABLE #endif -#if !defined(SIMD_SSE42_DISABLE) && defined(__SSE4_2__) -#define SIMD_SSE42_ENABLE -#endif - #if !defined(SIMD_AVX_DISABLE) && defined(__AVX__) #define SIMD_AVX_ENABLE #endif @@ -239,27 +219,11 @@ #endif -#ifdef SIMD_SSE_ENABLE -#include -#endif - #ifdef SIMD_SSE2_ENABLE #include #endif -#ifdef SIMD_SSE3_ENABLE -# include -#endif - -#ifdef SIMD_SSSE3_ENABLE -#include -#endif - #ifdef SIMD_SSE41_ENABLE -#include -#endif - -#ifdef SIMD_SSE42_ENABLE #include #endif @@ -273,10 +237,10 @@ #if defined(SIMD_AVX_ENABLE) || defined(SIMD_AVX2_ENABLE) #define SIMD_ALIGN 32 -#elif defined(SIMD_SSE_ENABLE) || defined(SIMD_SSE2_ENABLE) || defined(SIMD_SSE3_ENABLE) || 
defined(SIMD_SSSE3_ENABLE) || defined(SIMD_SSE41_ENABLE) || defined(SIMD_SSE42_ENABLE) \ +#elif defined(SIMD_SSE2_ENABLE) || defined(SIMD_SSE41_ENABLE) \ || defined(SIMD_NEON_ENABLE) #define SIMD_ALIGN 16 -#elif defined (SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) +#elif defined (SIMD_X64_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM64_ENABLE) #define SIMD_ALIGN 8 #else #define SIMD_ALIGN 4 diff --git a/3rdparty/simdlib/Simd/SimdEnable.h b/3rdparty/simdlib/Simd/SimdEnable.h old mode 100644 new mode 100755 index 6c79eb0d94..a501daf8ad --- a/3rdparty/simdlib/Simd/SimdEnable.h +++ b/3rdparty/simdlib/Simd/SimdEnable.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -62,455 +62,74 @@ namespace Simd { -#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) - namespace Cpuid - { - // See http://www.sandpile.org/x86/cpuid.htm for additional information. 
- enum Level - { - Ordinary = 1, - Extended = 7, - }; - - enum Register - { - Eax = 0, - Ebx = 1, - Ecx = 2, - Edx = 3, - }; - - enum Bit - { - // Ordinary: - // Edx: - SSE = 1 << 25, - SSE2 = 1 << 26, - - // Ecx: - SSE3 = 1 << 0, - SSSE3 = 1 << 9, - FMA = 1 << 12, - SSE41 = 1 << 19, - SSE42 = 1 << 20, - OSXSAVE = 1 << 27, - AVX = 1 << 28, - F16C = 1 << 29, - - // Extended: - // Ebx: - AVX2 = 1 << 5, - AVX512F = 1 << 16, - AVX512BW = 1 << 30, - - // Ecx: - AVX512VBMI = 1 << 1, - }; - - SIMD_INLINE bool CheckBit(Level level, Register index, Bit bit) - { - unsigned int registers[4] = { 0, 0, 0, 0 }; -#if defined(_MSC_VER) - __cpuid((int*)registers, level); -#elif (defined __GNUC__) - if (__get_cpuid_max(0, NULL) < level) - return false; - __cpuid_count(level, 0, registers[Eax], registers[Ebx], registers[Ecx], registers[Edx]); -#else -#error Do not know how to detect CPU info! -#endif - return (registers[index] & bit) == bit; - } - } -#endif//defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE) - -#if !defined(__APPLE__) // not macOS, iOS -#if defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)) - namespace CpuInfo - { - SIMD_INLINE bool CheckBit(int at, int bit) - { - bool result = false; - int file = ::open("/proc/self/auxv", O_RDONLY); - if (file < 0) - return false; - const ssize_t size = 64; - unsigned long buffer[size]; - for (ssize_t count = size; count == size;) - { - count = ::read(file, buffer, sizeof(buffer)) / sizeof(unsigned long); - for (int i = 0; i < count; i += 2) - { - if (buffer[i] == (unsigned)at) - { - result = !!(buffer[i + 1] & bit); - count = 0; - } - if (buffer[i] == AT_NULL) - count = 0; - } - } - ::close(file); - return result; - } - } -#endif//defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)) -#endif//(TARGET_OS_IOS == 0) not iOS - -#ifdef SIMD_SSE_ENABLE - namespace Sse - { 
- SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Edx, Cpuid::SSE); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m128 value = _mm_set1_ps(1.0f);// try to execute of SSE instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - #ifdef SIMD_SSE2_ENABLE namespace Sse2 { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Edx, Cpuid::SSE2); - } + bool GetEnable(); - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m128d value = _mm_set1_pd(1.0);// try to execute of SSE2 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 - { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE3); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m128 value = _mm_hadd_ps(_mm_set1_ps(1.0f), _mm_set1_ps(2.0f)); //try to execute of SSE3 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSSE3); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m128i value = _mm_abs_epi8(_mm_set1_epi8(-1)); //try to execute of SSSE3 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && 
SupportedByOS(); + const bool Enable = GetEnable(); } #endif #ifdef SIMD_SSE41_ENABLE namespace Sse41 { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE41); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - int value = _mm_testz_si128(_mm_set1_epi8(0), _mm_set1_epi8(-1)); // try to execute of SSE41 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); - } -#endif - -#ifdef SIMD_SSE42_ENABLE - namespace Sse42 - { - SIMD_INLINE bool SupportedByCPU() - { - return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE42); - } + bool GetEnable(); - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - uint32_t value = _mm_crc32_u8(0, 1); // try to execute of SSE42 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); + const bool Enable = GetEnable(); } #endif #ifdef SIMD_AVX_ENABLE namespace Avx { - SIMD_INLINE bool SupportedByCPU() - { - return - Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) && - Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::AVX); - } - - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m256d value = _mm256_set1_pd(1.0);// try to execute of AVX instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } + bool GetEnable(); - const bool Enable = SupportedByCPU() && SupportedByOS(); + const bool Enable = GetEnable(); } #endif #ifdef SIMD_AVX2_ENABLE namespace Avx2 { - SIMD_INLINE bool SupportedByCPU() - { - return - Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) && - Cpuid::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX2) && - 
Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::FMA) && - Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::F16C); - } + bool GetEnable(); - SIMD_INLINE bool SupportedByOS() - { -#if defined(_MSC_VER) - __try - { - __m256i value = _mm256_abs_epi8(_mm256_set1_epi8(1));// try to execute of AVX2 instructions; - return true; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } -#else - return true; -#endif - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); + const bool Enable = GetEnable(); } #endif #ifdef SIMD_NEON_ENABLE namespace Neon { - SIMD_INLINE bool SupportedByCPU() - { -#if defined(_MSC_VER) - return true; -#elif defined(__GNUC__) -#if defined(SIMD_ARM64_ENABLE) || (TARGET_OS_IOS != 0) // iOS - return true; -#else - return CpuInfo::CheckBit(AT_HWCAP, HWCAP_NEON); -#endif -#else -#error Do not know how to detect NEON support! -#endif - } + bool GetEnable(); - SIMD_INLINE bool SupportedByOS() - { - return true; - } - - const bool Enable = SupportedByCPU() && SupportedByOS(); + const bool Enable = GetEnable(); } #endif - - SIMD_INLINE size_t Alignment() - { -#ifdef SIMD_AVX2_ENABLE - if (Avx2::Enable) - return sizeof(__m256i); - else -#endif -#ifdef SIMD_AVX_ENABLE - if (Avx::Enable) - return sizeof(__m256); - else -#endif -#ifdef SIMD_SSE41_ENABLE - if (Sse41::Enable) - return sizeof(__m128i); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable) - return sizeof(__m128i); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable) - return sizeof(__m128i); - else -#endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable) - return sizeof(__m128); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable) - return sizeof(uint8x16_t); - else -#endif - return sizeof(void *); - } - - const size_t ALIGNMENT = Alignment(); } #define SIMD_BASE_FUNC(func) Simd::Base::func -#ifdef SIMD_SSE_ENABLE -#define SIMD_SSE_FUNC(func) Simd::Sse::Enable ? 
Simd::Sse::func : -#else -#define SIMD_SSE_FUNC(func) -#endif - #ifdef SIMD_SSE2_ENABLE -#define SIMD_SSE2_FUNC(func) Simd::Sse2::Enable ? Simd::Sse2::func : -#else -#define SIMD_SSE2_FUNC(func) -#endif - -#ifdef SIMD_SSE3_ENABLE -#define SIMD_SSE3_FUNC(func) Simd::Sse3::Enable ? Simd::Sse3::func : +#define SIMD_SSE2_FUNC(func) Simd::Sse2::Enable ? Simd::Sse2::func : #else -#define SIMD_SSE3_FUNC(func) -#endif - -#ifdef SIMD_SSSE3_ENABLE -#define SIMD_SSSE3_FUNC(func) Simd::Ssse3::Enable ? Simd::Ssse3::func : -#else -#define SIMD_SSSE3_FUNC(func) +#define SIMD_SSE2_FUNC(func) #endif #ifdef SIMD_SSE41_ENABLE -#define SIMD_SSE41_FUNC(func) Simd::Sse41::Enable ? Simd::Sse41::func : -#else -#define SIMD_SSE41_FUNC(func) -#endif - -#ifdef SIMD_SSE42_ENABLE -#define SIMD_SSE42_FUNC(func) Simd::Sse42::Enable ? Simd::Sse42::func : +#define SIMD_SSE41_FUNC(func) Simd::Sse41::Enable ? Simd::Sse41::func : #else -#define SIMD_SSE42_FUNC(func) +#define SIMD_SSE41_FUNC(func) #endif #ifdef SIMD_AVX_ENABLE -#define SIMD_AVX_FUNC(func) Simd::Avx::Enable ? Simd::Avx::func : +#define SIMD_AVX_FUNC(func) Simd::Avx::Enable ? Simd::Avx::func : #else #define SIMD_AVX_FUNC(func) #endif #ifdef SIMD_AVX2_ENABLE -#define SIMD_AVX2_FUNC(func) Simd::Avx2::Enable ? Simd::Avx2::func : +#define SIMD_AVX2_FUNC(func) Simd::Avx2::Enable ? Simd::Avx2::func : #else #define SIMD_AVX2_FUNC(func) #endif diff --git a/3rdparty/simdlib/Simd/SimdExp.h b/3rdparty/simdlib/Simd/SimdExp.h old mode 100644 new mode 100755 index 3bfbc3f8f5..1600275b23 --- a/3rdparty/simdlib/Simd/SimdExp.h +++ b/3rdparty/simdlib/Simd/SimdExp.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -34,6 +34,11 @@ namespace Simd { return ::expf(value); } + + SIMD_INLINE float Log(float value) + { + return ::logf(value); + } } #ifdef SIMD_SSE2_ENABLE @@ -107,20 +112,20 @@ namespace Simd __m128 exp = Exp2(_mm_mul_ps(_k, value)); __m128 neg = _mm_mul_ps(alpha, _mm_sub_ps(exp, _1_0)); __m128 mask = _mm_cmpgt_ps(_mm_setzero_ps(), value); - return Sse::Combine(mask, neg, value); + return Combine(mask, neg, value); } }; namespace Detail { - SIMD_INLINE __m128 Poly5(__m128 x) + SIMD_INLINE __m128 Poly5(__m128 x, float a, float b, float c, float d, float e, float f) { - __m128 p = _mm_set1_ps(1.8775767e-3f); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(8.9893397e-3f)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(5.5826318e-2f)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(2.4015361e-1f)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(6.9315308e-1f)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(9.9999994e-1f)); + __m128 p = _mm_set1_ps(f); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(e)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(d)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(c)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(b)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(a)); return p; } @@ -130,9 +135,19 @@ namespace Simd __m128i ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f))); __m128 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart)); __m128 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23)); - __m128 expfpart = Poly5(fpart); + __m128 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); return _mm_mul_ps(expipart, expfpart); } + + SIMD_INLINE __m128 Log2(__m128 x) + { + __m128 _1 = _mm_set1_ps(1.0f); + __m128i i = _mm_castps_si128(x); + __m128 e = 
_mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, _mm_set1_epi32(0x7F800000)), 23), _mm_set1_epi32(127))); + __m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, _mm_set1_epi32(0x007FFFFF))), _1); + __m128 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + return _mm_add_ps(_mm_mul_ps(p, _mm_sub_ps(m, _1)), e); + } } SIMD_INLINE __m128 Exponent(__m128 value) @@ -145,7 +160,36 @@ namespace Simd __m128 exp = Exponent(value); __m128 neg = _mm_mul_ps(alpha, _mm_sub_ps(exp, _mm_set1_ps(1.0f))); __m128 mask = _mm_cmpgt_ps(_mm_setzero_ps(), value); - return Sse::Combine(mask, neg, value); + return Combine(mask, neg, value); + } + + SIMD_INLINE __m128 Logarithm(__m128 value) + { + return _mm_mul_ps(_mm_set1_ps(0.693147181f), Detail::Log2(value)); + } + + SIMD_INLINE __m128 Mish(__m128 value, __m128 threshold) + { + __m128 _1 = _mm_set1_ps(1.0f); + __m128 mish = _mm_add_ps(Exponent(value), _1); + mish = _mm_add_ps(_mm_mul_ps(mish, mish), _1); + mish = _mm_mul_ps(value, _mm_sub_ps(_1, _mm_div_ps(_mm_set1_ps(2.0f), mish))); + return Combine(_mm_cmpgt_ps(threshold, value), mish, value); + } + + SIMD_INLINE __m128 Softplus(__m128 value, __m128 beta, __m128 threshold) + { + __m128 exp = Exponent(_mm_mul_ps(value, beta)); + __m128 log = Logarithm(_mm_add_ps(_mm_set1_ps(1.0f), exp)); + __m128 mask = _mm_cmpgt_ps(threshold, value); + return Combine(mask, _mm_div_ps(log, beta), value); + } + + SIMD_INLINE __m128 Tanh(__m128 value) + { + __m128 _1 = _mm_set1_ps(1.0f); + __m128 exp = Detail::Exp2(_mm_mul_ps(_mm_set1_ps(2.88539008f), value)); + return _mm_div_ps(_mm_sub_ps(exp, _1), _mm_add_ps(_1, exp)); } } #endif //SIMD_SSE2_ENABLE @@ -227,14 +271,14 @@ namespace Simd namespace Detail { - SIMD_INLINE __m256 Poly5(__m256 x) + SIMD_INLINE __m256 Poly5(__m256 x, float a, float b, float c, float d, float e, float f) { - __m256 p = _mm256_set1_ps(1.8775767e-3f); - p = _mm256_add_ps(_mm256_mul_ps(x, p), 
_mm256_set1_ps(8.9893397e-3f)); - p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(5.5826318e-2f)); - p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(2.4015361e-1f)); - p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(6.9315308e-1f)); - p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(9.9999994e-1f)); + __m256 p = _mm256_set1_ps(f); + p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(e)); + p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(d)); + p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(c)); + p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(b)); + p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(a)); return p; } @@ -244,9 +288,19 @@ namespace Simd __m256i ipart = _mm256_cvtps_epi32(_mm256_sub_ps(x, _mm256_set1_ps(0.5f))); __m256 fpart = _mm256_sub_ps(x, _mm256_cvtepi32_ps(ipart)); __m256 expipart = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(ipart, _mm256_set1_epi32(127)), 23)); - __m256 expfpart = Poly5(fpart); + __m256 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); return _mm256_mul_ps(expipart, expfpart); } + + SIMD_INLINE __m256 Log2(__m256 x) + { + __m256 _1 = _mm256_set1_ps(1.0f); + __m256i i = _mm256_castps_si256(x); + __m256 e = _mm256_cvtepi32_ps(_mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(i, _mm256_set1_epi32(0x7F800000)), 23), _mm256_set1_epi32(127))); + __m256 m = _mm256_or_ps(_mm256_castsi256_ps(_mm256_and_si256(i, _mm256_set1_epi32(0x007FFFFF))), _1); + __m256 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + return _mm256_add_ps(_mm256_mul_ps(p, _mm256_sub_ps(m, _1)), e); + } } SIMD_INLINE __m256 Exponent(__m256 value) @@ -261,6 +315,35 @@ namespace Simd __m256 mask = _mm256_cmp_ps(_mm256_setzero_ps(), value, _CMP_GT_OS); return _mm256_blendv_ps(value, neg, mask); } + + SIMD_INLINE __m256 Logarithm(__m256 value) + { + return 
_mm256_mul_ps(_mm256_set1_ps(0.693147181f), Detail::Log2(value)); + } + + SIMD_INLINE __m256 Mish(__m256 value, __m256 threshold) + { + __m256 _1 = _mm256_set1_ps(1.0f); + __m256 mish = _mm256_add_ps(Exponent(value), _1); + mish = Fmadd(mish, mish, _1); + mish = _mm256_mul_ps(value, _mm256_sub_ps(_1, _mm256_div_ps(_mm256_set1_ps(2.0f), mish))); + return _mm256_blendv_ps(value, mish, _mm256_cmp_ps(threshold, value, _CMP_GT_OS)); + } + + SIMD_INLINE __m256 Softplus(__m256 value, __m256 beta, __m256 threshold) + { + __m256 exp = Exponent(_mm256_mul_ps(value, beta)); + __m256 log = Logarithm(_mm256_add_ps(_mm256_set1_ps(1.0f), exp)); + __m256 mask = _mm256_cmp_ps(threshold, value, _CMP_GT_OS); + return _mm256_blendv_ps(value, _mm256_div_ps(log, beta), mask); + } + + SIMD_INLINE __m256 Tanh(__m256 value) + { + __m256 _1 = _mm256_set1_ps(1.0f); + __m256 exp = Detail::Exp2(_mm256_mul_ps(_mm256_set1_ps(2.88539008f), value)); + return _mm256_div_ps(_mm256_sub_ps(exp, _1), _mm256_add_ps(_1, exp)); + } } #endif //SIMD_AVX2_ENABLE @@ -341,14 +424,14 @@ namespace Simd namespace Detail { - SIMD_INLINE float32x4_t Poly5(float32x4_t x) + SIMD_INLINE float32x4_t Poly5(float32x4_t x, float a, float b, float c, float d, float e, float f) { - float32x4_t p = vdupq_n_f32(1.8775767e-3f); - p = vmlaq_f32(vdupq_n_f32(8.9893397e-3f), x, p); - p = vmlaq_f32(vdupq_n_f32(5.5826318e-2f), x, p); - p = vmlaq_f32(vdupq_n_f32(2.4015361e-1f), x, p); - p = vmlaq_f32(vdupq_n_f32(6.9315308e-1f), x, p); - p = vmlaq_f32(vdupq_n_f32(9.9999994e-1f), x, p); + float32x4_t p = vdupq_n_f32(f); + p = vmlaq_f32(vdupq_n_f32(e), x, p); + p = vmlaq_f32(vdupq_n_f32(d), x, p); + p = vmlaq_f32(vdupq_n_f32(c), x, p); + p = vmlaq_f32(vdupq_n_f32(b), x, p); + p = vmlaq_f32(vdupq_n_f32(a), x, p); return p; } @@ -358,9 +441,19 @@ namespace Simd int32x4_t ipart = vcvtq_s32_f32(vsubq_f32(x, vdupq_n_f32(0.5f))); float32x4_t fpart = vsubq_f32(x, vcvtq_f32_s32(ipart)); float32x4_t expipart = 
vreinterpretq_f32_s32(vshlq_n_s32(vaddq_s32(ipart, vdupq_n_s32(127)), 23)); - float32x4_t expfpart = Poly5(fpart); + float32x4_t expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); return vmulq_f32(expipart, expfpart); } + + SIMD_INLINE float32x4_t Log2(float32x4_t x) + { + float32x4_t _1 = vdupq_n_f32(1.0f); + int32x4_t i = vreinterpretq_s32_f32(x); + float32x4_t e = vcvtq_f32_s32(vsubq_s32(vshrq_n_s32(vandq_s32(i, vdupq_n_s32(0x7F800000)), 23), vdupq_n_s32(127))); + float32x4_t m = Or(vreinterpretq_f32_s32(vandq_s32(i, vdupq_n_s32(0x007FFFFF))), _1); + float32x4_t p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + return vaddq_f32(vmulq_f32(p, vsubq_f32(m, _1)), e); + } } SIMD_INLINE float32x4_t Exponent(float32x4_t value) @@ -375,6 +468,35 @@ namespace Simd uint32x4_t mask = vcgtq_f32(vdupq_n_f32(0.0f), value); return vbslq_f32(mask, neg, value); } + + SIMD_INLINE float32x4_t Logarithm(float32x4_t value) + { + return vmulq_f32(vdupq_n_f32(0.693147181f), Detail::Log2(value)); + } + + template SIMD_INLINE float32x4_t Mish(float32x4_t value, float32x4_t threshold) + { + float32x4_t _1 = vdupq_n_f32(1.0f); + float32x4_t mish = vaddq_f32(Exponent(value), _1); + mish = Fmadd(mish, mish, _1); + mish = vmulq_f32(value, vsubq_f32(_1, Div(vdupq_n_f32(2.0f), mish))); + return vbslq_f32(vcgtq_f32(threshold, value), mish, value); + } + + template SIMD_INLINE float32x4_t Softplus(float32x4_t value, float32x4_t beta, float32x4_t threshold) + { + float32x4_t exp = Exponent(vmulq_f32(value, beta)); + float32x4_t log = Logarithm(vaddq_f32(vdupq_n_f32(1.0f), exp)); + uint32x4_t mask = vcgtq_f32(threshold, value); + return vbslq_f32(mask, Div(log, beta), value); + } + + template SIMD_INLINE float32x4_t Tanh(float32x4_t value) + { + float32x4_t _1 = vdupq_n_f32(1.0f); + float32x4_t exp = Detail::Exp2(vmulq_f32(vdupq_n_f32(2.88539008f), value)); + return Div(vsubq_f32(exp, 
_1), vaddq_f32(_1, exp)); + } } #endif //SIMD_NEON_ENABLE } diff --git a/3rdparty/simdlib/Simd/SimdExtract.h b/3rdparty/simdlib/Simd/SimdExtract.h old mode 100644 new mode 100755 index d0d8184d7c..e30a0c85e5 --- a/3rdparty/simdlib/Simd/SimdExtract.h +++ b/3rdparty/simdlib/Simd/SimdExtract.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,8 +28,8 @@ namespace Simd { -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { SIMD_INLINE float ExtractValue(__m128 a, int i) { @@ -44,12 +44,7 @@ namespace Simd _mm_store_ps(_a, a); return _a[0] + _a[1] + _a[2] + _a[3]; } - } -#endif//SIMD_SSE_ENABLE -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { template SIMD_INLINE int ExtractInt8(__m128i a) { return _mm_extract_epi16(_mm_srli_si128(a, index & 0x1), index >> 1) & 0xFF; @@ -90,8 +85,8 @@ namespace Simd } #endif// SIMD_SSE2_ENABLE -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { SIMD_INLINE float ExtractSum(__m128 a) { @@ -103,7 +98,7 @@ namespace Simd return _mm_hadd_ps(_mm_hadd_ps(a[0], a[1]), _mm_hadd_ps(a[2], a[3])); } } -#endif//SIMD_SSE3_ENABLE +#endif//SIMD_SSE41_ENABLE #ifdef SIMD_AVX_ENABLE namespace Avx @@ -199,6 +194,11 @@ namespace Simd return vgetq_lane_u32(a, 0) + vgetq_lane_u32(a, 1) + vgetq_lane_u32(a, 2) + vgetq_lane_u32(a, 3); } + SIMD_INLINE int32_t ExtractSum32s(const int32x4_t& a) + { + return vgetq_lane_s32(a, 0) + vgetq_lane_s32(a, 1) + vgetq_lane_s32(a, 2) + vgetq_lane_s32(a, 3); + } + SIMD_INLINE uint64_t ExtractSum64u(const uint64x2_t & a) { return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); diff --git a/3rdparty/simdlib/Simd/SimdFrame.hpp b/3rdparty/simdlib/Simd/SimdFrame.hpp old mode 100644 new mode 100755 
index 53cc33879d..45b0b6022a --- a/3rdparty/simdlib/Simd/SimdFrame.hpp +++ b/3rdparty/simdlib/Simd/SimdFrame.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, +* Copyright (c) 2011-2021 Yermalayeu Ihar, * 2014-2019 Antonenka Mikhail, * 2019-2019 Artur Voronkov. * @@ -58,6 +58,10 @@ namespace Simd Bgr24, /*! One plane 8-bit gray pixel format. */ Gray8, + /*! One plane 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */ + Rgb24, + /*! One plane 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. */ + Rgba32, }; const size_t width; /*!< \brief A width of the frame. */ @@ -373,6 +377,8 @@ namespace Simd case View::Gray8: (Format&)format = Gray8; break; case View::Bgr24: (Format&)format = Bgr24; break; case View::Bgra32: (Format&)format = Bgra32; break; + case View::Rgb24: (Format&)format = Rgb24; break; + case View::Rgba32: (Format&)format = Rgba32; break; default: assert(0); } @@ -420,6 +426,14 @@ namespace Simd case Gray8: planes[0] = View(width, height, stride0, View::Gray8, data0); break; + case Rgb24: + planes[0] = View(width, height, stride0, View::Rgb24, data0); + break; + case Rgba32: + planes[0] = View(width, height, stride0, View::Rgba32, data0); + break; + default: + assert(0); } } @@ -494,6 +508,14 @@ namespace Simd case Gray8: planes[0].Recreate(width, height, View::Gray8); break; + case Rgb24: + planes[0].Recreate(width, height, View::Rgb24); + break; + case Rgba32: + planes[0].Recreate(width, height, View::Rgba32); + break; + default: + assert(0); } } @@ -591,6 +613,8 @@ namespace Simd case Bgra32: return 1; case Bgr24: return 1; case Gray8: return 1; + case Rgb24: return 1; + case Rgba32: return 1; default: assert(0); return 0; } } @@ -648,6 +672,12 @@ namespace Simd case Frame::Gray8: BgraToGray(src.planes[0], dst.planes[0]); break; + case Frame::Rgb24: + BgraToRgb(src.planes[0], dst.planes[0]); + break; + case Frame::Rgba32: + 
BgraToRgba(src.planes[0], dst.planes[0]); + break; default: assert(0); } @@ -662,6 +692,12 @@ namespace Simd case Frame::Gray8: BgrToGray(src.planes[0], dst.planes[0]); break; + case Frame::Rgb24: + BgrToRgb(src.planes[0], dst.planes[0]); + break; + case Frame::Rgba32: + BgrToRgba(src.planes[0], dst.planes[0]); + break; default: assert(0); } @@ -676,11 +712,73 @@ namespace Simd case Frame::Bgr24: GrayToBgr(src.planes[0], dst.planes[0]); break; + case Frame::Rgb24: + GrayToRgb(src.planes[0], dst.planes[0]); + break; + case Frame::Rgba32: + GrayToRgba(src.planes[0], dst.planes[0]); + break; default: assert(0); } break; + case Frame::Rgb24: + switch (dst.format) + { + case Frame::Bgra32: + RgbToBgra(src.planes[0], dst.planes[0]); + break; + case Frame::Gray8: + RgbToGray(src.planes[0], dst.planes[0]); + break; + case Frame::Bgr24: + RgbToBgr(src.planes[0], dst.planes[0]); + break; + case Frame::Rgba32: + RgbToRgba(src.planes[0], dst.planes[0]); + break; + default: + assert(0); + } + break; /* Rgb24 source is done: do not fall through into the Rgba32 conversions */ + + case Frame::Rgba32: + switch (dst.format) + { + case Frame::Nv12: + { + View bgr(src.Size(), View::Bgr24); + RgbaToBgr(src.planes[0], bgr); + View u(src.Size(), View::Gray8), v(src.Size(), View::Gray8); + BgrToYuv420p(bgr, dst.planes[0], u, v); + InterleaveUv(u, v, dst.planes[1]); + break; + } + case Frame::Yuv420p: + { + View bgr(src.Size(), View::Bgr24); + RgbaToBgr(src.planes[0], bgr); + BgrToYuv420p(bgr, dst.planes[0], dst.planes[1], dst.planes[2]); + break; + } + case Frame::Bgra32: + RgbaToBgra(src.planes[0], dst.planes[0]); + break; + case Frame::Gray8: + RgbaToGray(src.planes[0], dst.planes[0]); + break; + case Frame::Bgr24: + RgbaToBgr(src.planes[0], dst.planes[0]); + break; + case Frame::Rgb24: + RgbaToRgb(src.planes[0], dst.planes[0]); + break; + default: + assert(0); + } + break; /* Rgba32 source is done: do not fall through into the outer default assert */ + default: assert(0); } diff --git a/3rdparty/simdlib/Simd/SimdInit.h b/3rdparty/simdlib/Simd/SimdInit.h old mode 100644 new mode 100755 index 179e61bdb4..707ea4c8bc --- a/3rdparty/simdlib/Simd/SimdInit.h +++ 
b/3rdparty/simdlib/Simd/SimdInit.h @@ -28,7 +28,22 @@ namespace Simd { -#if defined(_MSC_VER) && (defined(SIMD_X64_ENABLE) || defined(SIMD_X86_ENABLE)) + +#if defined(_MSC_VER) && !defined(__clang__) && (defined(SIMD_X64_ENABLE) || defined(SIMD_X86_ENABLE)) + +#define SIMD_INIT_AS_CHAR + +#elif defined(__GNUC__) || defined(__clang__) || (defined(_MSC_VER) && defined(SIMD_NEON_ENABLE)) + +#define SIMD_INIT_AS_LONGLONG + +#else + +#error This platform is unsupported! + +#endif + +#if defined(SIMD_INIT_AS_CHAR) template SIMD_INLINE char GetChar(T value, size_t index) { @@ -50,7 +65,7 @@ namespace Simd Simd::GetChar(int64_t(a), 4), Simd::GetChar(int64_t(a), 5), \ Simd::GetChar(int64_t(a), 6), Simd::GetChar(int64_t(a), 7) -#elif defined(__GNUC__) || (defined(_MSC_VER) && defined(SIMD_NEON_ENABLE)) +#elif defined(SIMD_INIT_AS_LONGLONG) #define SIMD_CHAR_AS_LONGLONG(a) (((long long)a) & 0xFF) @@ -94,11 +109,15 @@ namespace Simd #define SIMD_LL_SET2_EPI32(a, b) \ SIMD_INT_AS_LONGLONG(a) | (SIMD_INT_AS_LONGLONG(b) << 32) -#endif//defined(__GNUC__) || (defined(_MSC_VER) && defined(SIMD_NEON_ENABLE)) +#else + +#error This platform is unsupported! 
+ +#endif #if defined(SIMD_SSE2_ENABLE) -#if defined(_MSC_VER) +#if defined(SIMD_INIT_AS_CHAR) #define SIMD_MM_SET1_EPI8(a) \ {SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ @@ -148,7 +167,7 @@ namespace Simd #define SIMD_MM_SETR_EPI64(a0, a1) \ {SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1)} -#elif defined(__GNUC__) +#elif defined(SIMD_INIT_AS_LONGLONG) #define SIMD_MM_SET1_EPI8(a) \ {SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a)} @@ -192,7 +211,7 @@ namespace Simd #if defined(SIMD_AVX2_ENABLE) -#if defined(_MSC_VER) +#if defined(SIMD_INIT_AS_CHAR) #define SIMD_MM256_SET1_EPI8(a) \ {SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \ @@ -263,7 +282,7 @@ namespace Simd #define SIMD_MM256_SETR_EPI64(a0, a1, a2, a3) \ {SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1), SIMD_AS_8CHARS(a2), SIMD_AS_8CHARS(a3)} -#elif defined(__GNUC__) +#elif defined(SIMD_INIT_AS_LONGLONG) #define SIMD_MM256_SET1_EPI8(a) \ {SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a), \ @@ -310,7 +329,7 @@ namespace Simd #define SIMD_MM256_SETR_EPI64(a0, a1, a2, a3) \ {a0, a1, a2, a3} -#endif// defined(_MSC_VER) || defined(__GNUC__) +#endif #endif// SIMD_AVX2_ENABLE diff --git a/3rdparty/simdlib/Simd/SimdLib.cpp b/3rdparty/simdlib/Simd/SimdLib.cpp old mode 100644 new mode 100755 index eb181ec376..b1cac8b1ba --- a/3rdparty/simdlib/Simd/SimdLib.cpp +++ b/3rdparty/simdlib/Simd/SimdLib.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, +* Copyright (c) 2011-2021 Yermalayeu Ihar, * 2014-2018 Antonenka Mikhail, * 2018-2018 Radchenko Andrey, * 2019-2019 Facundo Galan. 
@@ -55,18 +55,18 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved) #include "Simd/SimdLib.h" #include "Simd/SimdMemory.h" +#include "Simd/SimdCpu.h" #include "Simd/SimdEnable.h" +#include "Simd/SimdAlignment.h" #include "Simd/SimdConst.h" -#include "Simd/SimdCpu.h" #include "Simd/SimdLog.h" #include "Simd/SimdResizer.h" #include "Simd/SimdGaussianBlur.h" #include "Simd/SimdBase.h" -#include "Simd/SimdSse1.h" #include "Simd/SimdSse2.h" -#include "Simd/SimdSsse3.h" +#include "Simd/SimdSse41.h" #include "Simd/SimdAvx1.h" #include "Simd/SimdAvx2.h" #include "Simd/SimdNeon.h" @@ -75,6 +75,11 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved) #include "Simd/SimdVersion.h" #endif +namespace Simd +{ + const size_t ALIGNMENT = GetAlignment(); +} + SIMD_API const char * SimdVersion() { return SIMD_VERSION; @@ -118,9 +123,9 @@ SIMD_API void SimdRelease(void * context) SIMD_API SimdBool SimdGetFastMode() { -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable) - return Sse::GetFastMode(); +#ifdef SIMD_SSE2_ENABLE + if (Sse2::Enable) + return Sse2::GetFastMode(); else #endif #ifdef SIMD_NEON_ENABLE @@ -133,9 +138,9 @@ SIMD_API SimdBool SimdGetFastMode() SIMD_API void SimdSetFastMode(SimdBool value) { -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable) - Sse::SetFastMode(value); +#ifdef SIMD_SSE2_ENABLE + if (Sse2::Enable) + Sse2::SetFastMode(value); #endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable) @@ -145,9 +150,9 @@ SIMD_API void SimdSetFastMode(SimdBool value) SIMD_API void SimdBgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) { -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && width >= Sse41::A) + Sse41::BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); else #endif #ifdef SIMD_NEON_ENABLE @@ -178,84 +183,69 @@ SIMD_API void 
SimdBgraToGray(const uint8_t *bgra, size_t width, size_t height, s Base::BgraToGray(bgra, width, height, bgraStride, gray, grayStride); } -SIMD_API void SimdRgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride) +SIMD_API void SimdBgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) { #ifdef SIMD_AVX2_ENABLE - if(Avx2::Enable && width >= Avx2::A) - Avx2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + if (Avx2::Enable && width >= Avx2::A) + Avx2::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); else #endif -#ifdef SIMD_SSE2_ENABLE - if(Sse2::Enable && width >= Sse2::A) - Sse2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); else #endif #ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::HA) - Neon::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + if (Neon::Enable && width >= Neon::A) + Neon::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); else #endif - Base::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + Base::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); } -SIMD_API void SimdBgrToBgra(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha) +SIMD_API void SimdBgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride) { -#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR) - if(Avx2::Enable && width >= Avx2::A) - Avx2::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); +#ifdef SIMD_AVX2_ENABLE + if (Avx2::Enable && width >= Avx2::A) + Avx2::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - 
if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); else #endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable && width >= Neon::A) - Neon::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); + Neon::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); else #endif - Base::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); + Base::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); } -SIMD_API void SimdBgrToRgba(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *rgba, size_t rgbaStride, uint8_t alpha) +SIMD_API void SimdBgrToBgra(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha) { #if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR) if(Avx2::Enable && width >= Avx2::A) - Avx2::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - else -#endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); + Avx2::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); else #endif - Base::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); -} - -SIMD_API void SimdBgraToRgba(const uint8_t *bgra, size_t width, size_t height, size_t bgraStride, uint8_t *rgba, size_t rgbaStride) -{ -#if defined(SIMD_AVX2_ENABLE) - if(Avx2::Enable && width >= Avx2::A) - Avx2::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && width >= Sse41::A) + Sse41::BgrToBgra(bgr, width, height, bgrStride, bgra, 
bgraStride, alpha); else #endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); +#ifdef SIMD_VMX_ENABLE + if(Vmx::Enable && width >= Vmx::A) + Vmx::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); else #endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable && width >= Neon::A) - Neon::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + Neon::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); else #endif - Base::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + Base::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); } SIMD_API void SimdBgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, @@ -286,9 +276,9 @@ SIMD_API void SimdBgrToGray(const uint8_t *bgr, size_t width, size_t height, siz Avx2::BgrToGray(bgr, width, height, bgrStride, gray, grayStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgrToGray(bgr, width, height, bgrStride, gray, grayStride); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && width >= Sse41::A) + Sse41::BgrToGray(bgr, width, height, bgrStride, gray, grayStride); else #endif #ifdef SIMD_SSE2_ENABLE @@ -304,49 +294,29 @@ SIMD_API void SimdBgrToGray(const uint8_t *bgr, size_t width, size_t height, siz Base::BgrToGray(bgr, width, height, bgrStride, gray, grayStride); } -SIMD_API void SimdRgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride) +SIMD_API void SimdBgrToRgb(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride) { -#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR) - if (Avx2::Enable && width >= Avx2::A) - Avx2::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - else -#endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::RgbToGray(rgb, 
width, height, rgbStride, gray, grayStride); - else -#endif -#ifdef SIMD_SSE2_ENABLE - if (Sse2::Enable && width >= Sse2::A) - Sse2::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); +#ifdef SIMD_AVX512BW_ENABLE + if (Avx512bw::Enable) + Avx512bw::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); else #endif -#ifdef SIMD_NEON_ENABLE - if (Neon::Enable && width >= Neon::A) - Neon::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - else -#endif - Base::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); -} - -SIMD_API void SimdBgrToRgb(const uint8_t *bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) -{ #ifdef SIMD_AVX2_ENABLE if (Avx2::Enable && width >= Avx2::A) - Avx2::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); + Avx2::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); else #endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable && width >= Neon::A) - Neon::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); + Neon::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); else #endif - Base::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); + Base::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); } SIMD_API void SimdCopy(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, uint8_t * dst, size_t dstStride) @@ -368,9 +338,9 @@ SIMD_API void SimdDeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t Avx2::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, 
rStride); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); else #endif #ifdef SIMD_NEON_ENABLE @@ -389,9 +359,9 @@ SIMD_API void SimdDeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size Avx2::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); else #endif #ifdef SIMD_NEON_ENABLE @@ -410,9 +380,9 @@ SIMD_API void SimdGaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t Avx2::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && (width - 1)*channelCount >= Ssse3::A) - Ssse3::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && (width - 1)*channelCount >= Sse41::A) + Sse41::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); else #endif #ifdef SIMD_SSE2_ENABLE @@ -448,9 +418,9 @@ SIMD_API void SimdGrayToBgr(const uint8_t * gray, size_t width, size_t height, s Avx2::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && width >= Ssse3::A) - Ssse3::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && width >= Sse41::A) + Sse41::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); else #endif #ifdef SIMD_NEON_ENABLE @@ -489,9 +459,9 @@ SIMD_API void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t 
Avx2::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride); else #endif #ifdef SIMD_NEON_ENABLE @@ -510,9 +480,9 @@ SIMD_API void SimdInterleaveBgra(const uint8_t * b, size_t bStride, const uint8_ Avx2::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && width >= Ssse3::A) - Ssse3::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); else #endif #ifdef SIMD_NEON_ENABLE @@ -552,9 +522,9 @@ SIMD_API void SimdReduceColor2x2(const uint8_t *src, size_t srcWidth, size_t src Avx2::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); else #endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && srcWidth >= Ssse3::DA) - Ssse3::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && srcWidth >= Sse41::DA) + Sse41::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); else #endif #ifdef SIMD_SSE2_ENABLE @@ -578,9 +548,9 @@ SIMD_API void SimdReduceGray2x2(const uint8_t *src, size_t srcWidth, size_t srcH Avx2::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && srcWidth >= Ssse3::DA) - Ssse3::ReduceGray2x2(src, srcWidth, 
srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && srcWidth >= Sse41::DA) + Sse41::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); else #endif #ifdef SIMD_SSE2_ENABLE @@ -625,9 +595,9 @@ SIMD_API void SimdReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcH Avx2::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); else #endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && srcWidth > Ssse3::A) - Ssse3::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && srcWidth > Sse41::A) + Sse41::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); else #endif #ifdef SIMD_SSE2_ENABLE @@ -672,9 +642,9 @@ SIMD_API void SimdResizeBilinear(const uint8_t *src, size_t srcWidth, size_t src Avx2::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); else #endif -#ifdef SIMD_SSSE3_ENABLE - if(Ssse3::Enable && dstWidth >= Ssse3::A) - Ssse3::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); +#ifdef SIMD_SSE41_ENABLE + if(Sse41::Enable && dstWidth >= Sse41::A) + Sse41::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); else #endif #ifdef SIMD_SSE2_ENABLE @@ -707,21 +677,11 @@ SIMD_API void * SimdResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t ds return Sse41::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); else #endif -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable) - return Ssse3::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - else -#endif #ifdef SIMD_SSE2_ENABLE if (Sse2::Enable) return Sse2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); else #endif -#ifdef SIMD_SSE_ENABLE - if (Sse::Enable) - return Sse::ResizerInit(srcX, 
srcY, dstX, dstY, channels, type, method); - else -#endif #ifdef SIMD_NEON_ENABLE if (Neon::Enable) return Neon::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); @@ -735,6 +695,66 @@ SIMD_API void SimdResizerRun(const void * resizer, const uint8_t * src, size_t s ((Resizer*)resizer)->Run(src, srcStride, dst, dstStride); } +SIMD_API void SimdRgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) +{ +#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR) + if (Avx2::Enable && width >= Avx2::A) + Avx2::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + else +#endif +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + else +#endif +#ifdef SIMD_NEON_ENABLE + if (Neon::Enable && width >= Neon::A) + Neon::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + else +#endif + Base::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); +} + +SIMD_API void SimdRgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) +{ +#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR) + if (Avx2::Enable && width >= Avx2::A) + Avx2::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + else +#endif +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && width >= Sse41::A) + Sse41::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + else +#endif +#ifdef SIMD_NEON_ENABLE + if (Neon::Enable && width >= Neon::A) + Neon::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + else +#endif + Base::RgbToGray(rgb, width, height, rgbStride, gray, grayStride); +} + +SIMD_API void SimdRgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) +{ +#if defined(SIMD_AVX2_ENABLE) + if (Avx2::Enable && width >= Avx2::A) 
+ Avx2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + else +#endif +#ifdef SIMD_SSE2_ENABLE + if (Sse2::Enable && width >= Sse2::A) + Sse2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + else +#endif +#ifdef SIMD_NEON_ENABLE + if (Neon::Enable && width >= Neon::A) + Neon::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + else +#endif + Base::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); +} + SIMD_API void SimdStretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) { @@ -842,6 +862,7 @@ SIMD_API void SimdMatTranspose(const double * mat, size_t rows, size_t cols, dou SIMD_API void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff) { + //TODO: #ifdef SIMD_SSSE3_ENABLE if (Ssse3::Enable && size >= Ssse3::A) Ssse3::SimdImageDifference(img1,img2, size, imgDiff); diff --git a/3rdparty/simdlib/Simd/SimdLib.h b/3rdparty/simdlib/Simd/SimdLib.h old mode 100644 new mode 100755 index c3862f19f1..4838b82261 --- a/3rdparty/simdlib/Simd/SimdLib.h +++ b/3rdparty/simdlib/Simd/SimdLib.h @@ -1,8 +1,8 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, -* 2014-2016 Antonenka Mikhail, +* Copyright (c) 2011-2021 Yermalayeu Ihar, +* 2014-2019 Antonenka Mikhail, * 2019-2019 Facundo Galan. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -27,8 +27,6 @@ #ifndef __SimdLib_h__ #define __SimdLib_h__ -#include "Simd/SimdConfig.h" - #include #if defined(_MSC_VER) || defined(__CODEGEARC__) @@ -107,12 +105,8 @@ typedef enum SimdCpuInfoCacheL1, /*!< A size of level 1 data cache. */ SimdCpuInfoCacheL2, /*!< A size of level 2 cache. */ SimdCpuInfoCacheL3, /*!< A size of level 3 cache. */ - SimdCpuInfoSse, /*!< Availability of SSE (x86). */ SimdCpuInfoSse2, /*!< Availability of SSE2 (x86). 
*/ - SimdCpuInfoSse3, /*!< Availability of SSE3 (x86). */ - SimdCpuInfoSsse3, /*!< Availability of SSSE3 (x86). */ SimdCpuInfoSse41, /*!< Availability of SSE4.1 (x86). */ - SimdCpuInfoSse42, /*!< Availability of SSE4.2 (x86). */ SimdCpuInfoAvx, /*!< Availability of AVX (x86). */ SimdCpuInfoAvx2, /*!< Availability of AVX2 (x86). */ SimdCpuInfoAvx512f, /*!< Availability of AVX-512F (x86). */ @@ -120,7 +114,6 @@ typedef enum SimdCpuInfoVmx, /*!< Availability of VMX or Altivec (PowerPC). */ SimdCpuInfoVsx, /*!< Availability of VSX (PowerPC). */ SimdCpuInfoNeon, /*!< Availability of NEON (ARM). */ - SimdCpuInfoMsa, /*!< Availability of MSA (MIPS). */ } SimdCpuInfoType; /*! @ingroup c_types @@ -188,6 +181,8 @@ typedef enum SimdPixelFormatHsl24, /*! A 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */ SimdPixelFormatRgb24, + /*! A 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. */ + SimdPixelFormatRgba32, } SimdPixelFormatType; /*! @ingroup c_types @@ -208,12 +203,14 @@ typedef enum { /*! 8-bit integer channel type. */ SimdResizeChannelByte, + /*! 16-bit integer channel type. */ + SimdResizeChannelShort, /*! 32-bit float channel type. */ SimdResizeChannelFloat, } SimdResizeChannelType; /*! @ingroup resizing - Describes methods used in oreder to resize image. + Describes methods used in order to resize image. */ typedef enum { @@ -223,6 +220,10 @@ typedef enum SimdResizeMethodCaffeInterp, /*! Area method. */ SimdResizeMethodArea, + /*! InferenceEngine::Extension::Cpu::Interp compatible method. */ + SimdResizeMethodInferenceEngineInterp, + /*! Nearest pixel method. */ + SimdResizeMethodNearest, } SimdResizeMethodType; // ViSP custom SIMD code @@ -317,7 +318,7 @@ extern "C" \fn size_t SimdAlignment(); - \short Gets alignment required for the most productive work of the Simd Library. + \short Gets alignment required for the most productive work of Simd Library. \return a required alignment. 
*/ @@ -359,17 +360,18 @@ extern "C" \fn void SimdBgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride); - \short Converts 32-bit BGRA image to 24-bit BGR image. + \short Converts 32-bit BGRA image to 24-bit BGR image. Also it can be used for 32-bit RGBA to 24-bit RGB conversion. All images must have the same width and height. - \note This function has a C++ wrapper Simd::BgraToBgr(const View& bgra, View& bgr). + \note This function has C++ wrappers: Simd::BgraToBgr(const View& bgra, View& bgr) + and Simd::RgbaToRgb(const View& rgba, View& rgb). - \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image. + \param [in] bgra - a pointer to pixels data of input 32-bit BGRA (or 32-bit RGBA) image. \param [in] width - an image width. \param [in] height - an image height. \param [in] bgraStride - a row size of the bgra image. - \param [out] bgr - a pointer to pixels data of output 24-bit BGR image. + \param [out] bgr - a pointer to pixels data of output 24-bit BGR (or 24-bit RGB) image. \param [in] bgrStride - a row size of the bgr image. */ SIMD_API void SimdBgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride); @@ -395,76 +397,63 @@ extern "C" /*! @ingroup bgra_conversion - \fn void SimdRgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride); + \fn void SimdBgraToRgb(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgb, size_t rgbStride); - \short Converts 32-bit RGBA image to 8-bit gray image. + \short Converts 32-bit BGRA image to 24-bit RGB image. Also it can be used for 32-bit RGBA to 24-bit BGR conversion. All images must have the same width and height. - \param [in] rgba - a pointer to pixels data of input 32-bit RGBA image. 
+ \note This function has C++ wrappers: Simd::BgraToRgb(const View& bgra, View& rgb) + and Simd::RgbaToBgr(const View& rgba, View& bgr). + + \param [in] bgra - a pointer to pixels data of input 32-bit BGRA (or 32-bit RGBA) image. \param [in] width - an image width. \param [in] height - an image height. - \param [in] rgbaStride - a row size of the rgba image. - \param [out] gray - a pointer to pixels data of output 8-bit gray image. - \param [in] grayStride - a row size of the gray image. + \param [in] bgraStride - a row size of the bgra image. + \param [out] rgb - a pointer to pixels data of output 24-bit RGB (or 24-bit BGR) image. + \param [in] rgbStride - a row size of the rgb image. */ - SIMD_API void SimdRgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride); + SIMD_API void SimdBgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride); - /*! @ingroup bgr_conversion + /*! @ingroup bgra_conversion - \fn void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); + \fn void SimdBgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride); - \short Converts 24-bit BGR image to 32-bit BGRA image. + \short Converts 32-bit BGRA image to 32-bit RGBA image. Also it can be used for 32-bit RGBA to 32-bit BGRA conversion. All images must have the same width and height. - \note This function has a C++ wrapper Simd::BgrToBgra(const View& bgr, View& bgra, uint8_t alpha). + \note This function has C++ wrappers: Simd::BgraToRgba(const View& bgra, View& rgba) + and Simd::RgbaToBgra(const View& rgba, View& bgra). - \param [in] bgr - a pointer to pixels data of input 24-bit BGR image. + \param [in] bgra - a pointer to pixels data of input 32-bit BGRA (or 32-bit RGBA) image. \param [in] width - an image width. 
\param [in] height - an image height. - \param [in] bgrStride - a row size of the bgr image. - \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image. \param [in] bgraStride - a row size of the bgra image. - \param [in] alpha - a value of alpha channel. + \param [out] rgba - a pointer to pixels data of output 32-bit RGBA (or 32-bit BGRA) image. + \param [in] rgbaStride - a row size of the rgba image. */ - SIMD_API void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); + SIMD_API void SimdBgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride); /*! @ingroup bgr_conversion - \fn void SimdBgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha); + \fn void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - \short Converts 24-bit BGR image to 32-bit RGBA image. + \short Converts 24-bit BGR image to 32-bit BGRA image. All images must have the same width and height. + \note This function has a C++ wrapper Simd::BgrToBgra(const View& bgr, View& bgra, uint8_t alpha). + \param [in] bgr - a pointer to pixels data of input 24-bit BGR image. \param [in] width - an image width. \param [in] height - an image height. \param [in] bgrStride - a row size of the bgr image. - \param [out] rgba - a pointer to pixels data of output 32-bit BGRA image. - \param [in] rgbaStride - a row size of the bgra image. - \param [in] alpha - a value of alpha channel. - */ - SIMD_API void SimdBgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha); - - /*!
@ingroup bgr_conversion - - \fn void SimdBgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride); - - \short Converts 32-bit BGRA image to 32-bit RGBA image. - - All images must have the same width and height. - - \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image. - \param [in] width - an image width. - \param [in] height - an image height. + \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image. \param [in] bgraStride - a row size of the bgra image. - \param [out] rgba - a pointer to pixels data of output 32-bit RGBA image. - \param [in] rgbaStride - a row size of the rgba image. \param [in] alpha - a value of alpha channel. */ - SIMD_API void SimdBgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride); + SIMD_API void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); /*! @ingroup other_conversion @@ -512,39 +501,23 @@ extern "C" /*! @ingroup bgr_conversion - \fn void SimdRgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride); + \fn void SimdBgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride); - \short Converts 24-bit RGB image to 8-bit gray image. + \short Converts 24-bit BGR image to 24-bit RGB image. Also it can be used for 24-bit RGB to 24-bit BGR conversion. All images must have the same width and height. - \param [in] rgb - a pointer to pixels data of input 24-bit BGR image. - \param [in] width - an image width. - \param [in] height - an image height. - \param [in] rgbStride - a row size of the bgr image. - \param [out] gray - a pointer to pixels data of output 8-bit gray image. - \param [in] grayStride - a row size of the gray image. 
- */ - SIMD_API void SimdRgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride); - - /*! @ingroup bgr_conversion - - \fn void SimdBgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); - - \short Converts 24-bit BGR image to 24-bit RGB image (also it performs backward conversion). - - All images must have the same width and height. + \note This function has C++ wrappers: Simd::BgrToRgb(const View & bgr, View & rgb) + and Simd::RgbToBgr(const View& rgb, View& bgr). - \note This function has a C++ wrapper Simd::BgrToRgb(const View & bgr, View & rgb). - - \param [in] bgr - a pointer to pixels data of input 24-bit BGR image. - \param [in] bgrStride - a row size of the bgr image. + \param [in] bgr - a pointer to pixels data of input 24-bit BGR image (or 24-bit RGB image). \param [in] width - an image width. \param [in] height - an image height. - \param [out] rgb - a pointer to pixels data of output 24-bit RGB image. + \param [in] bgrStride - a row size of the bgr image. + \param [out] rgb - a pointer to pixels data of output 24-bit RGB image (or 24-bit BGR image). \param [in] rgbStride - a row size of the rgb image. */ - SIMD_API void SimdBgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); + SIMD_API void SimdBgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride); /*! @ingroup copying @@ -591,7 +564,7 @@ extern "C" SIMD_API void SimdCopyFrame(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, size_t frameLeft, size_t frameTop, size_t frameRight, size_t frameBottom, uint8_t * dst, size_t dstStride); - /*! @ingroup other_conversion + /*! 
@ingroup deinterleave_conversion \fn void SimdDeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride); @@ -599,7 +572,9 @@ extern "C" All images must have the same width and height. - \note This function has a C++ wrapper Simd::DeinterleaveBgr(const View& bgr, View& b, View& g, View& r). + \note This function has C++ wrappers: + Simd::DeinterleaveBgr(const View& bgr, View& b, View& g, View& r), + Simd::DeinterleaveRgb(const View& rgb, View& r, View& g, View& b). \param [in] bgr - a pointer to pixels data of input 24-bit BGR interleaved image. \param [in] bgrStride - a row size of the bgr image. @@ -615,7 +590,7 @@ extern "C" SIMD_API void SimdDeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride); - /*! @ingroup other_conversion + /*! @ingroup deinterleave_conversion \fn void SimdDeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride); @@ -623,7 +598,11 @@ extern "C" All images must have the same width and height. - \note This function has a C++ wrapper Simd::DeinterleaveBgra(const View& bgra, View& b, View& g, View& r, View& a). + \note This function has C++ wrappers: + Simd::DeinterleaveBgra(const View& bgra, View& b, View& g, View& r, View& a), + Simd::DeinterleaveBgra(const View& bgra, View& b, View& g, View& r), + Simd::DeinterleaveRgba(const View& rgba, View& r, View& g, View& b, View& a), + Simd::DeinterleaveRgba(const View& rgba, View& r, View& g, View& b). \param [in] bgra - a pointer to pixels data of input 32-bit BGRA interleaved image. \param [in] bgraStride - a row size of the bgra image. @@ -635,7 +614,7 @@ extern "C" \param [in] gStride - a row size of the g image. 
\param [out] r - a pointer to pixels data of 8-bit Red planar image. \param [in] rStride - a row size of the r image. - \param [out] a - a pointer to pixels data of 8-bit Alpha planar image. + \param [out] a - a pointer to pixels data of 8-bit Alpha planar image. It can be NULL. \param [in] aStride - a row size of the a image. */ SIMD_API void SimdDeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, @@ -670,20 +649,27 @@ extern "C" size_t channelCount, uint8_t * dst, size_t dstStride); /*! @ingroup gaussian_filter + \fn void * SimdGaussianBlurInit(size_t width, size_t height, size_t channels, const float * sigma, const float* epsilon); + \short Creates Gaussian blur filter context. + In particular calculates Gaussian blur coefficients: \verbatim half = floor(sqrt(log(1/epsilon)) * sigma); weight[2*half + 1]; + for(x = -half; x <= half; ++x) weight[x + half] = exp(-sqr(x / sigma) / 2); + sum = 0; for (x = -half; x <= half; ++x) sum += weight[x + half]; + for (x = -half; x <= half; ++x) weight[x + half] /= sum; \endverbatim + \param [in] width - a width of input and output image. \param [in] height - a height of input and output image. \param [in] channels - a channel number of input and output image. Its value must be in range [1..4]. @@ -697,8 +683,11 @@ extern "C" SIMD_API void* SimdGaussianBlurInit(size_t width, size_t height, size_t channels, const float * sigma, const float* epsilon); /*! @ingroup gaussian_filter + \fn void SimdGaussianBlurRun(const void* filter, const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride); + \short Performs image Gaussian bluring. + Bluring algorithm for every point: \verbatim sum = 0; @@ -713,6 +702,7 @@ extern "C" } dst[dx, dy] = sum; \endverbatim + \param [in] filter - a filter context. It must be created by function ::SimdGaussianBlurInit and released by function ::SimdRelease. \param [in] src - a pointer to pixels data of the original input image. 
\param [in] srcStride - a row size (in bytes) of the input image. @@ -725,17 +715,18 @@ extern "C" \fn void SimdGrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride); - \short Converts 8-bit gray image to 24-bit BGR image. + \short Converts 8-bit gray image to 24-bit BGR image. Also it can be used for 8-bit gray to 24-bit RGB conversion. All images must have the same width and height. - \note This function has a C++ wrapper Simd::GrayToBgr(const View& gray, View& bgr). + \note This function has C++ wrappers: Simd::GrayToBgr(const View& gray, View& bgr) + and Simd::GrayToRgb(const View& gray, View& rgb). \param [in] gray - a pointer to pixels data of input 8-bit gray image. \param [in] width - an image width. \param [in] height - an image height. \param [in] grayStride - a row size of the gray image. - \param [out] bgr - a pointer to pixels data of output 24-bit BGR image. + \param [out] bgr - a pointer to pixels data of output 24-bit BGR (or 24-bit RGB) image. \param [in] bgrStride - a row size of the bgr image. */ SIMD_API void SimdGrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride); @@ -744,17 +735,18 @@ extern "C" \fn void SimdGrayToBgra(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - \short Converts 8-bit gray image to 32-bit BGRA image. + \short Converts 8-bit gray image to 32-bit BGRA image. Also it can be used for 8-bit gray to 32-bit RGBA conversion. All images must have the same width and height. - \note This function has a C++ wrapper Simd::GrayToBgra(const View& gray, View& bgra, uint8_t alpha). + \note This function has C++ wrappers: Simd::GrayToBgra(const View& gray, View& bgra, uint8_t alpha) + and Simd::GrayToRgba(const View& gray, View& rgba, uint8_t alpha). \param [in] gray - a pointer to pixels data of input 8-bit gray image. 
\param [in] width - an image width. \param [in] height - an image height. \param [in] grayStride - a row size of the gray image. - \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image. + \param [out] bgra - a pointer to pixels data of output 32-bit BGRA (or 32-bit RGBA) image. \param [in] bgraStride - a row size of the bgra image. \param [in] alpha - a value of alpha channel. */ @@ -785,7 +777,7 @@ extern "C" SIMD_API void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - /*! @ingroup other_conversion + /*! @ingroup interleave_conversion \fn void SimdInterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride); @@ -1125,6 +1117,16 @@ extern "C" \short Creates resize context. + A usage example (resize of RGBA64 image): + \verbatim + void * resizer = SimdResizerInit(srcX, srcY, dstX, dstY, 4, SimdResizeChannelShort, SimdResizeMethodBilinear); + if (resizer) + { + SimdResizerRun(resizer, (uint8_t*)src, srcStride, (uint8_t*)dst, dstStride); + SimdRelease(resizer); + } + \endverbatim + \param [in] srcX - a width of the input image. \param [in] srcY - a height of the input image. \param [in] dstX - a width of the output image. @@ -1152,6 +1154,65 @@ extern "C" */ SIMD_API void SimdResizerRun(const void * resizer, const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); + /*! @ingroup rgb_conversion + + \fn void SimdRgbToBgra(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); + + \short Converts 24-bit RGB image to 32-bit BGRA image. Also it can be used for 24-bit BGR to 32-bit RGBA conversion. + + All images must have the same width and height.
+ + \note This function has C++ wrappers: Simd::RgbToBgra(const View& rgb, View& bgra, uint8_t alpha) + and Simd::BgrToRgba(const View& bgr, View& rgba, uint8_t alpha). + + \param [in] rgb - a pointer to pixels data of input 24-bit RGB (or 24-bit BGR) image. + \param [in] width - an image width. + \param [in] height - an image height. + \param [in] rgbStride - a row size of the rgb image. + \param [out] bgra - a pointer to pixels data of output 32-bit BGRA (or 32-bit RGBA) image. + \param [in] bgraStride - a row size of the bgra image. + \param [in] alpha - a value of alpha channel. + */ + SIMD_API void SimdRgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + + /*! @ingroup rgb_conversion + + \fn void SimdRgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride); + + \short Converts 24-bit RGB image to 8-bit gray image. + + All images must have the same width and height. + + \note This function has a C++ wrapper Simd::RgbToGray(const View& rgb, View& gray). + + \param [in] rgb - a pointer to pixels data of input 24-bit RGB image. + \param [in] width - an image width. + \param [in] height - an image height. + \param [in] rgbStride - a row size of the rgb image. + \param [out] gray - a pointer to pixels data of output 8-bit gray image. + \param [in] grayStride - a row size of the gray image. + */ + SIMD_API void SimdRgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); + + /*! @ingroup rgba_conversion + + \fn void SimdRgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride); + + \short Converts 32-bit RGBA image to 8-bit gray image. + + All images must have the same width and height. + + \note This function has a C++ wrapper Simd::RgbaToGray(const View& rgba, View& gray). 
+ + \param [in] rgba - a pointer to pixels data of input 32-bit RGBA image. + \param [in] width - an image width. + \param [in] height - an image height. + \param [in] rgbaStride - a row size of the rgba image. + \param [out] gray - a pointer to pixels data of output 8-bit gray image. + \param [in] grayStride - a row size of the gray image. + */ + SIMD_API void SimdRgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride); + /*! @ingroup resizing \fn void SimdStretchGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride); diff --git a/3rdparty/simdlib/Simd/SimdLib.hpp b/3rdparty/simdlib/Simd/SimdLib.hpp old mode 100644 new mode 100755 index 7f7e6745d5..aaedc571e2 --- a/3rdparty/simdlib/Simd/SimdLib.hpp +++ b/3rdparty/simdlib/Simd/SimdLib.hpp @@ -1,8 +1,8 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, -* 2014-2016 Antonenka Mikhail, +* Copyright (c) 2011-2021 Yermalayeu Ihar, +* 2014-2019 Antonenka Mikhail, * 2019-2019 Facundo Galan. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -31,7 +31,9 @@ #ifndef __SimdLib_hpp__ #define __SimdLib_hpp__ -/*! \namespace Simd */ +/*! @ingroup functions + Simd API C++ wrappers. +*/ namespace Simd { /*! @ingroup bgra_conversion @@ -74,6 +76,46 @@ namespace Simd SimdBgraToGray(bgra.data, bgra.width, bgra.height, bgra.stride, gray.data, gray.stride); } + /*! @ingroup bgra_conversion + + \fn void BgraToRgb(const View& bgra, View& rgb) + + \short Converts 32-bit BGRA image to 24-bit RGB image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdBgraToRgb. + + \param [in] bgra - an input 32-bit BGRA image. + \param [out] rgb - an output 24-bit RGB image. 
+ */ + template class A> SIMD_INLINE void BgraToRgb(const View& bgra, View& rgb) + { + assert(EqualSize(bgra, rgb) && bgra.format == View::Bgra32 && rgb.format == View::Rgb24); + + SimdBgraToRgb(bgra.data, bgra.width, bgra.height, bgra.stride, rgb.data, rgb.stride); + } + + /*! @ingroup bgra_conversion + + \fn void BgraToRgba(const View& bgra, View& rgba) + + \short Converts 32-bit BGRA image to 32-bit RGBA image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdBgraToRgba. + + \param [in] bgra - an input 32-bit BGRA image. + \param [out] rgba - an output 32-bit RGBA image. + */ + template class A> SIMD_INLINE void BgraToRgba(const View& bgra, View& rgba) + { + assert(EqualSize(bgra, rgba) && bgra.format == View::Bgra32 && rgba.format == View::Rgba32); + + SimdBgraToRgba(bgra.data, bgra.width, bgra.height, bgra.stride, rgba.data, rgba.stride); + } + /*! @ingroup bgr_conversion \fn void BgrToBgra(const View& bgr, View& bgra, uint8_t alpha = 0xFF) @@ -142,7 +184,7 @@ namespace Simd \fn void BgrToRgb(const View & bgr, View & rgb) - \short Converts 24-bit BGR image to 24-bit RGB image (also it performs backward conversion). + \short Converts 24-bit BGR image to 24-bit RGB image. All images must have the same width and height. @@ -153,9 +195,30 @@ namespace Simd */ template class A> SIMD_INLINE void BgrToRgb(const View & bgr, View & rgb) { - assert(EqualSize(bgr, rgb) && bgr.PixelSize() == 3 && rgb.PixelSize() == 3); + assert(EqualSize(bgr, rgb) && bgr.format == View::Bgr24 && rgb.format == View::Rgb24); - SimdBgrToRgb(bgr.data, bgr.stride, bgr.width, bgr.height, rgb.data, rgb.stride); + SimdBgrToRgb(bgr.data, bgr.width, bgr.height, bgr.stride, rgb.data, rgb.stride); + } + + /*! @ingroup bgr_conversion + + \fn void BgrToRgba(const View& bgr, View& rgba, uint8_t alpha = 0xFF) + + \short Converts 24-bit BGR image to 32-bit RGBA image. + + All images must have the same width and height. 
+ + \note This function is a C++ wrapper for function ::SimdRgbToBgra. + + \param [in] bgr - an input 24-bit BGR image. + \param [out] rgba - an output 32-bit RGBA image. + \param [in] alpha - a value of alpha channel. It is equal to 255 by default. + */ + template class A> SIMD_INLINE void BgrToRgba(const View& bgr, View& rgba, uint8_t alpha = 0xFF) + { + assert(EqualSize(bgr, rgba) && rgba.format == View::Rgba32 && bgr.format == View::Bgr24); + + SimdRgbToBgra(bgr.data, bgr.width, bgr.height, bgr.stride, rgba.data, rgba.stride, alpha); } /*! @ingroup copying @@ -204,7 +267,7 @@ namespace Simd frame.left, frame.top, frame.right, frame.bottom, dst.data, dst.stride); } - /*! @ingroup other_conversion + /*! @ingroup deinterleave_conversion \fn void DeinterleaveBgr(const View& bgr, View& b, View& g, View& r) @@ -226,7 +289,7 @@ namespace Simd SimdDeinterleaveBgr(bgr.data, bgr.stride, bgr.width, bgr.height, b.data, b.stride, g.data, g.stride, r.data, r.stride); } - /*! @ingroup other_conversion + /*! @ingroup deinterleave_conversion \fn void DeinterleaveBgra(const View& bgra, View& b, View& g, View& r, View& a) @@ -249,6 +312,95 @@ namespace Simd SimdDeinterleaveBgra(bgra.data, bgra.stride, bgra.width, bgra.height, b.data, b.stride, g.data, g.stride, r.data, r.stride, a.data, a.stride); } + /*! @ingroup deinterleave_conversion + + \fn void DeinterleaveBgra(const View& bgra, View& b, View& g, View& r) + + \short Deinterleaves 32-bit BGRA interleaved image into separated 8-bit Blue, Green and Red planar images (Alpha channel is ignored). + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdDeinterleaveBgra. + + \param [in] bgra - an input 32-bit BGRA interleaved image. + \param [out] b - an output 8-bit Blue planar image. + \param [out] g - an output 8-bit Green planar image. + \param [out] r - an output 8-bit Red planar image.
+ */ + template class A> SIMD_INLINE void DeinterleaveBgra(const View& bgra, View& b, View& g, View& r) + { + assert(EqualSize(bgra, b) && Compatible(b, g, r) && bgra.format == View::Bgra32 && b.format == View::Gray8); + + SimdDeinterleaveBgra(bgra.data, bgra.stride, bgra.width, bgra.height, b.data, b.stride, g.data, g.stride, r.data, r.stride, NULL, 0); + } + + /*! @ingroup deinterleave_conversion + + \fn void DeinterleaveRgb(const View& rgb, View& r, View& g, View& b) + + \short Deinterleaves 24-bit RGB interleaved image into separated 8-bit Red, Green and Blue planar images. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdDeinterleaveBgr. + + \param [in] rgb - an input 24-bit RGB interleaved image. + \param [out] r - an output 8-bit Red planar image. + \param [out] g - an output 8-bit Green planar image. + \param [out] b - an output 8-bit Blue planar image. + */ + template class A> SIMD_INLINE void DeinterleaveRgb(const View& rgb, View& r, View& g, View& b) + { + assert(EqualSize(rgb, b) && Compatible(b, g, r) && rgb.format == View::Rgb24 && b.format == View::Gray8); + + SimdDeinterleaveBgr(rgb.data, rgb.stride, rgb.width, rgb.height, r.data, r.stride, g.data, g.stride, b.data, b.stride); + } + + /*! @ingroup deinterleave_conversion + + \fn void DeinterleaveRgba(const View& rgba, View& r, View& g, View& b, View& a) + + \short Deinterleaves 32-bit RGBA interleaved image into separated 8-bit Red, Green, Blue and Alpha planar images. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdDeinterleaveBgra. + + \param [in] rgba - an input 32-bit RGBA interleaved image. + \param [out] r - an output 8-bit Red planar image. + \param [out] g - an output 8-bit Green planar image. + \param [out] b - an output 8-bit Blue planar image. + \param [out] a - an output 8-bit Alpha planar image. 
+ */ + template class A> SIMD_INLINE void DeinterleaveRgba(const View& rgba, View& r, View& g, View& b, View& a) + { + assert(EqualSize(rgba, b) && Compatible(b, g, r, a) && rgba.format == View::Rgba32 && b.format == View::Gray8); + + SimdDeinterleaveBgra(rgba.data, rgba.stride, rgba.width, rgba.height, r.data, r.stride, g.data, g.stride, b.data, b.stride, a.data, a.stride); + } + + /*! @ingroup deinterleave_conversion + + \fn void DeinterleaveRgba(const View& rgba, View& r, View& g, View& b) + + \short Deinterleaves 32-bit RGBA interleaved image into separated 8-bit Red, Green and Blue planar images (Alpha channel is ignored). + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdDeinterleaveBgra. + + \param [in] rgba - an input 32-bit RGBA interleaved image. + \param [out] r - an output 8-bit Red planar image. + \param [out] g - an output 8-bit Green planar image. + \param [out] b - an output 8-bit Blue planar image. + */ + template class A> SIMD_INLINE void DeinterleaveRgba(const View& rgba, View& r, View& g, View& b) + { + assert(EqualSize(rgba, b) && Compatible(b, g, r) && rgba.format == View::Rgba32 && b.format == View::Gray8); + + SimdDeinterleaveBgra(rgba.data, rgba.stride, rgba.width, rgba.height, r.data, r.stride, g.data, g.stride, b.data, b.stride, NULL, 0); + } + /*! @ingroup other_filter \fn void GaussianBlur3x3(const View& src, View& dst) @@ -295,6 +447,26 @@ namespace Simd SimdGrayToBgr(gray.data, gray.width, gray.height, gray.stride, bgr.data, bgr.stride); } + /*! @ingroup gray_conversion + + \fn void GrayToRgb(const View& gray, View& rgb) + + \short Converts 8-bit gray image to 24-bit RGB image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdGrayToBgr. + + \param [in] gray - an input 8-bit gray image. + \param [out] rgb - an output 24-bit RGB image. 
+ */ + template class A> SIMD_INLINE void GrayToRgb(const View& gray, View& rgb) + { + assert(EqualSize(gray, rgb) && rgb.format == View::Rgb24 && gray.format == View::Gray8); + + SimdGrayToBgr(gray.data, gray.width, gray.height, gray.stride, rgb.data, rgb.stride); + } + /*! @ingroup gray_conversion \fn void GrayToBgra(const View& gray, View& bgra, uint8_t alpha = 0xFF) @@ -316,6 +488,27 @@ namespace Simd SimdGrayToBgra(gray.data, gray.width, gray.height, gray.stride, bgra.data, bgra.stride, alpha); } + /*! @ingroup gray_conversion + + \fn void GrayToRgba(const View& gray, View& rgba, uint8_t alpha = 0xFF) + + \short Converts 8-bit gray image to 32-bit RGBA image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdGrayToBgra. + + \param [in] gray - an input 8-bit gray image. + \param [out] rgba - an output 32-bit RGBA image. + \param [in] alpha - a value of alpha channel. It is equal to 255 by default. + */ + template class A> SIMD_INLINE void GrayToRgba(const View& gray, View& rgba, uint8_t alpha = 0xFF) + { + assert(EqualSize(gray, rgba) && rgba.format == View::Rgba32 && gray.format == View::Gray8); + + SimdGrayToBgra(gray.data, gray.width, gray.height, gray.stride, rgba.data, rgba.stride, alpha); + } + /*! @ingroup other_conversion \fn void InterleaveBgr(const View & b, const View & g, const View & r, View & bgr) @@ -338,7 +531,7 @@ namespace Simd SimdInterleaveBgr(b.data, b.stride, g.data, g.stride, r.data, r.stride, bgr.width, bgr.height, bgr.data, bgr.stride); } - /*! @ingroup other_conversion + /*! @ingroup interleave_conversion \fn void InterleaveBgra(const View& b, const View& g, const View& r, const View& a, View& bgra) @@ -798,6 +991,200 @@ namespace Simd } } + /*! @ingroup resizing + + \fn void Resize(const View & src, View & dst, const Point & size, ::SimdResizeMethodType method = ::SimdResizeMethodBilinear) + + \short Performs resizing of image. + + \param [in] src - an original input image. 
+ \param [out] dst - a resized output image. The input image can be the output. + \param [in] size - a size of output image. + \param [in] method - a resizing method. By default it is equal to ::SimdResizeMethodBilinear. + */ + template class A> SIMD_INLINE void Resize(const View& src, View& dst, const Point & size, ::SimdResizeMethodType method = ::SimdResizeMethodBilinear) + { + assert(src.format == View::Float || src.ChannelSize() == 1); + + if (&src == &dst) + { + if (src.Size() != size) + { + View tmp(size, src.format); + Resize(src, tmp, method); + dst.Swap(tmp); + } + } + else + { + if (dst.Size() != size) + dst.Recreate(size, src.format); + Resize(src, dst, method); + } + } + + /*! @ingroup rgb_conversion + + \fn void RgbToBgr(const View & rgb, View & bgr) + + \short Converts 24-bit RGB image to 24-bit BGR image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdBgrToRgb. + + \param [in] rgb - an input 24-bit RGB image. + \param [out] bgr - an output 24-bit BGR image. + */ + template class A> SIMD_INLINE void RgbToBgr(const View& rgb, View& bgr) + { + assert(EqualSize(bgr, rgb) && rgb.format == View::Rgb24 && bgr.format == View::Bgr24); + + SimdBgrToRgb(rgb.data, rgb.width, rgb.height, rgb.stride, bgr.data, bgr.stride); + } + + /*! @ingroup rgb_conversion + + \fn void RgbToBgra(const View& rgb, View& bgra, uint8_t alpha = 0xFF) + + \short Converts 24-bit RGB image to 32-bit BGRA image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdRgbToBgra. + + \param [in] rgb - an input 24-bit RGB image. + \param [out] bgra - an output 32-bit BGRA image. + \param [in] alpha - a value of alpha channel. It is equal to 255 by default.
+ */ + template class A> SIMD_INLINE void RgbToBgra(const View& rgb, View& bgra, uint8_t alpha = 0xFF) + { + assert(EqualSize(rgb, bgra) && bgra.format == View::Bgra32 && rgb.format == View::Rgb24); + + SimdRgbToBgra(rgb.data, rgb.width, rgb.height, rgb.stride, bgra.data, bgra.stride, alpha); + } + + /*! @ingroup rgb_conversion + + \fn void RgbToGray(const View& rgb, View& gray) + + \short Converts 24-bit RGB image to 8-bit gray image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdRgbToGray. + + \param [in] rgb - an input 24-bit RGB image. + \param [out] gray - an output 8-bit gray image. + */ + template class A> SIMD_INLINE void RgbToGray(const View& rgb, View& gray) + { + assert(EqualSize(rgb, gray) && rgb.format == View::Rgb24 && gray.format == View::Gray8); + + SimdRgbToGray(rgb.data, rgb.width, rgb.height, rgb.stride, gray.data, gray.stride); + } + + /*! @ingroup rgb_conversion + + \fn void RgbToRgba(const View& rgb, View& rgba, uint8_t alpha = 0xFF) + + \short Converts 24-bit RGB image to 32-bit RGBA image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdBgrToBgra. + + \param [in] rgb - an input 24-bit RGB image. + \param [out] rgba - an output 32-bit RGBA image. + \param [in] alpha - a value of alpha channel. It is equal to 255 by default. + */ + template class A> SIMD_INLINE void RgbToRgba(const View& rgb, View& rgba, uint8_t alpha = 0xFF) + { + assert(EqualSize(rgb, rgba) && rgba.format == View::Rgba32 && rgb.format == View::Rgb24); + + SimdBgrToBgra(rgb.data, rgb.width, rgb.height, rgb.stride, rgba.data, rgba.stride, alpha); + } + + /*! @ingroup rgba_conversion + + \fn void RgbaToBgr(const View& rgba, View& bgr) + + \short Converts 32-bit RGBA image to 24-bit BGR image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdBgraToRgb.
+ + \param [in] rgba - an input 32-bit RGBA image. + \param [out] bgr - an output 24-bit RGB image. + */ + template class A> SIMD_INLINE void RgbaToBgr(const View& rgba, View& bgr) + { + assert(EqualSize(rgba, bgr) && rgba.format == View::Rgba32 && bgr.format == View::Bgr24); + + SimdBgraToRgb(rgba.data, rgba.width, rgba.height, rgba.stride, bgr.data, bgr.stride); + } + + /*! @ingroup rgba_conversion + + \fn void RgbaToBgra(const View& rgba, View& bgra) + + \short Converts 32-bit RGBA image to 32-bit BGRA image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdBgraToRgba. + + \param [in] rgba - an input 32-bit RGBA image. + \param [out] bgra - an output 32-bit BGRA image. + */ + template class A> SIMD_INLINE void RgbaToBgra(const View& rgba, View& bgra) + { + assert(EqualSize(bgra, rgba) && bgra.format == View::Bgra32 && rgba.format == View::Rgba32); + + SimdBgraToRgba(rgba.data, rgba.width, rgba.height, rgba.stride, bgra.data, bgra.stride); + } + + /*! @ingroup rgba_conversion + + \fn void RgbaToGray(const View& rgba, View& gray) + + \short Converts 32-bit RGBA image to 8-bit gray image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdRgbaToGray. + + \param [in] rgba - an input 32-bit RGBA image. + \param [out] gray - an output 8-bit gray image. + */ + template class A> SIMD_INLINE void RgbaToGray(const View& rgba, View& gray) + { + assert(EqualSize(rgba, gray) && rgba.format == View::Rgba32 && gray.format == View::Gray8); + + SimdRgbaToGray(rgba.data, rgba.width, rgba.height, rgba.stride, gray.data, gray.stride); + } + + /*! @ingroup rgba_conversion + + \fn void RgbaToRgb(const View& rgba, View& rgb) + + \short Converts 32-bit RGBA image to 24-bit RGB image. + + All images must have the same width and height. + + \note This function is a C++ wrapper for function ::SimdBgraToBgr. + + \param [in] rgba - an input 32-bit RGBA image. 
+ \param [out] rgb - an output 24-bit RGB image. + */ + template class A> SIMD_INLINE void RgbaToRgb(const View& rgba, View& rgb) + { + assert(EqualSize(rgba, rgb) && rgba.format == View::Rgba32 && rgb.format == View::Rgb24); + + SimdBgraToBgr(rgba.data, rgba.width, rgba.height, rgba.stride, rgb.data, rgb.stride); + } + /*! @ingroup resizing \fn void StretchGray2x2(const View& src, View& dst) @@ -825,7 +1212,7 @@ namespace Simd The input and output images must have the same width and height. - \note This function supports conversion between Gray8, Bgr24 and Bgra32 image formats. + \note This function supports conversion between View::Gray8, View::Bgr24, View::Bgra32, View::Rgb24 and View::Rgba32 image formats. \param [in] src - an input image. \param [out] dst - an output image. @@ -848,9 +1235,15 @@ namespace Simd case View::Bgra32: GrayToBgra(src, dst); break; + case View::Rgba32: + GrayToRgba(src, dst); + break; case View::Bgr24: GrayToBgr(src, dst); break; + case View::Rgb24: + GrayToRgb(src, dst); + break; default: assert(0); } @@ -865,6 +1258,32 @@ namespace Simd case View::Gray8: BgrToGray(src, dst); break; + case View::Rgb24: + BgrToRgb(src, dst); + break; + case View::Rgba32: + BgrToRgba(src, dst); + break; + default: + assert(0); + } + break; + + case View::Rgb24: + switch (dst.format) + { + case View::Bgra32: + RgbToBgra(src, dst); + break; + case View::Bgr24: + RgbToBgr(src, dst); + break; + case View::Gray8: + RgbToGray(src, dst); + break; + case View::Rgba32: + RgbToRgba(src, dst); + break; default: assert(0); } @@ -879,6 +1298,32 @@ namespace Simd case View::Gray8: BgraToGray(src, dst); break; + case View::Rgb24: + BgraToRgb(src, dst); + break; + case View::Rgba32: + BgraToRgba(src, dst); + break; + default: + assert(0); + } + break; + + case View::Rgba32: + switch (dst.format) + { + case View::Bgra32: + RgbaToBgra(src, dst); + break; + case View::Bgr24: + RgbaToBgr(src, dst); + break; + case View::Gray8: + RgbaToGray(src, dst); + break; + case 
View::Rgb24: + RgbaToRgb(src, dst); + break; default: assert(0); } diff --git a/3rdparty/simdlib/Simd/SimdLoad.h b/3rdparty/simdlib/Simd/SimdLoad.h old mode 100644 new mode 100755 index 97d7af7098..243858ca1b --- a/3rdparty/simdlib/Simd/SimdLoad.h +++ b/3rdparty/simdlib/Simd/SimdLoad.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,16 +28,8 @@ namespace Simd { - enum PadType - { - PadNose1, - PadNone, - PadTail1, - PadTail2, - }; - -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { template SIMD_INLINE __m128 Load(const float * p); @@ -56,7 +48,7 @@ namespace Simd return _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)p0), (__m64*)p1); } - SIMD_INLINE __m128 LoadPadZeroNose1(const float * p) + SIMD_INLINE __m128 LoadPadZeroNose1(const float* p) { SIMD_ALIGNED(16) const int32_t m[F] = { 0, -1, -1, -1 }; __m128 a = _mm_loadu_ps(p + 1); @@ -64,7 +56,7 @@ namespace Simd return _mm_and_ps(b, _mm_load_ps((float*)m)); } - SIMD_INLINE __m128 LoadPadZeroTail1(const float * p) + SIMD_INLINE __m128 LoadPadZeroTail1(const float* p) { SIMD_ALIGNED(16) const int32_t m[F] = { -1, -1, -1, 0 }; __m128 a = _mm_loadu_ps(p - 1); @@ -72,20 +64,15 @@ namespace Simd return _mm_and_ps(b, _mm_load_ps((float*)m)); } - SIMD_INLINE __m128 LoadPadZeroTail2(const float * p) + SIMD_INLINE __m128 LoadPadZeroTail2(const float* p) { SIMD_ALIGNED(16) const int32_t m[F] = { -1, -1, 0, 0 }; __m128 a = _mm_loadu_ps(p - 2); __m128 b = _mm_shuffle_ps(a, a, 0xFE); return _mm_and_ps(b, _mm_load_ps((float*)m)); } - } -#endif//SIMD_SSE_ENABLE -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - using namespace Sse; + //--------------------------------------------------------------------- template 
SIMD_INLINE __m128i Load(const __m128i * p); @@ -99,6 +86,11 @@ namespace Simd return _mm_load_si128(p); } + SIMD_INLINE __m128i Load(const __m128i* p0, const __m128i* p1) + { + return _mm_castps_si128(_mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)p0), (__m64*)p1)); + } + template SIMD_INLINE __m128i LoadMaskI8(const __m128i * p, __m128i index) { return _mm_cmpeq_epi8(Load(p), index); @@ -113,90 +105,13 @@ namespace Simd { return _mm_or_si128(_mm_srli_si128(last, count), _mm_and_si128(last, _mm_slli_si128(K_INV_ZERO, A - count))); } - - template SIMD_INLINE void LoadNose3(const uint8_t * p, __m128i a[3]) - { - a[1] = Load((__m128i*)p); - a[0] = LoadBeforeFirst(a[1]); - a[2] = _mm_loadu_si128((__m128i*)(p + step)); - } - - template SIMD_INLINE void LoadBody3(const uint8_t * p, __m128i a[3]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - step)); - a[1] = Load((__m128i*)p); - a[2] = _mm_loadu_si128((__m128i*)(p + step)); - } - - template SIMD_INLINE void LoadTail3(const uint8_t * p, __m128i a[3]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - step)); - a[1] = Load((__m128i*)p); - a[2] = LoadAfterLast(a[1]); - } - - template SIMD_INLINE void LoadNose5(const uint8_t * p, __m128i a[5]) - { - a[2] = Load((__m128i*)p); - a[1] = LoadBeforeFirst(a[2]); - a[0] = LoadBeforeFirst(a[1]); - a[3] = _mm_loadu_si128((__m128i*)(p + step)); - a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step)); - } - - template SIMD_INLINE void LoadBody5(const uint8_t * p, __m128i a[5]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step)); - a[1] = _mm_loadu_si128((__m128i*)(p - step)); - a[2] = Load((__m128i*)p); - a[3] = _mm_loadu_si128((__m128i*)(p + step)); - a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step)); - } - - template SIMD_INLINE void LoadTail5(const uint8_t * p, __m128i a[5]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step)); - a[1] = _mm_loadu_si128((__m128i*)(p - step)); - a[2] = Load((__m128i*)p); - a[3] = LoadAfterLast(a[2]); - a[4] = LoadAfterLast(a[3]); - } - - SIMD_INLINE void 
LoadNoseDx(const uint8_t * p, __m128i a[3]) - { - a[0] = LoadBeforeFirst<1>(_mm_loadu_si128((__m128i*)p)); - a[2] = _mm_loadu_si128((__m128i*)(p + 1)); - } - - SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m128i a[3]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - 1)); - a[2] = _mm_loadu_si128((__m128i*)(p + 1)); - } - - SIMD_INLINE void LoadTailDx(const uint8_t * p, __m128i a[3]) - { - a[0] = _mm_loadu_si128((__m128i*)(p - 1)); - a[2] = LoadAfterLast<1>(_mm_loadu_si128((__m128i*)p)); - } } #endif//SIMD_SSE2_ENABLE -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 - { -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::Load; - using Sse2::Load; -#endif - } -#endif - #ifdef SIMD_SSE41_ENABLE namespace Sse41 { #if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::Load; using Sse2::Load; #endif } @@ -219,12 +134,17 @@ namespace Simd template SIMD_INLINE __m256 Load(const float * p0, const float * p1) { - return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse::Load(p0)), Sse::Load(p1), 1); + return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse2::Load(p0)), Sse2::Load(p1), 1); } SIMD_INLINE __m256 Load(const float * p0, const float * p1, const float * p2, const float * p3) { - return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse::Load(p0, p1)), Sse::Load(p2, p3), 1); + return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse2::Load(p0, p1)), Sse2::Load(p2, p3), 1); + } + + SIMD_INLINE __m256 Load(const float * ptr, __m256i mask) + { + return _mm256_maskload_ps(ptr, mask); } } #endif//SIMD_AVX_ENABLE @@ -333,86 +253,6 @@ namespace Simd __m128i secondHi = LoadHalfAfterLast(firstHi); second = _mm256_inserti128_si256(_mm256_castsi128_si256(secondLo), secondHi, 0x1); } - - template SIMD_INLINE void LoadNose3(const uint8_t * p, __m256i a[3]) - { - a[0] = LoadBeforeFirst(p); - a[1] = Load((__m256i*)p); - a[2] = _mm256_loadu_si256((__m256i*)(p + step)); - } - - 
template SIMD_INLINE void LoadBody3(const uint8_t * p, __m256i a[3]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - step)); - a[1] = Load((__m256i*)p); - a[2] = _mm256_loadu_si256((__m256i*)(p + step)); - } - - template SIMD_INLINE void LoadTail3(const uint8_t * p, __m256i a[3]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - step)); - a[1] = Load((__m256i*)p); - a[2] = LoadAfterLast(p); - } - - template SIMD_INLINE void LoadNose5(const uint8_t * p, __m256i a[5]) - { - LoadBeforeFirst(p, a[1], a[0]); - a[2] = Load((__m256i*)p); - a[3] = _mm256_loadu_si256((__m256i*)(p + step)); - a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step)); - } - - template SIMD_INLINE void LoadBody5(const uint8_t * p, __m256i a[5]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step)); - a[1] = _mm256_loadu_si256((__m256i*)(p - step)); - a[2] = Load((__m256i*)p); - a[3] = _mm256_loadu_si256((__m256i*)(p + step)); - a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step)); - } - - template SIMD_INLINE void LoadTail5(const uint8_t * p, __m256i a[5]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step)); - a[1] = _mm256_loadu_si256((__m256i*)(p - step)); - a[2] = Load((__m256i*)p); - LoadAfterLast(p, a[3], a[4]); - } - - SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m256i a[3]) - { - a[0] = LoadBeforeFirst(p); - a[2] = _mm256_loadu_si256((__m256i*)(p + 1)); - } - - SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m256i a[3]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - 1)); - a[2] = _mm256_loadu_si256((__m256i*)(p + 1)); - } - - SIMD_INLINE void LoadTailDx(const uint8_t * p, __m256i a[3]) - { - a[0] = _mm256_loadu_si256((__m256i*)(p - 1)); - a[2] = LoadAfterLast(p); - } - - template SIMD_INLINE __m256 Load(const float * p); - - template <> SIMD_INLINE __m256 Load(const float * p) - { - return _mm256_loadu_ps(p); - } - - template <> SIMD_INLINE __m256 Load(const float * p) - { -#ifdef _MSC_VER - return _mm256_castsi256_ps(_mm256_load_si256((__m256i*)p)); -#else - return 
_mm256_load_ps(p); -#endif - } } #endif//SIMD_AVX2_ENABLE @@ -456,12 +296,12 @@ namespace Simd template SIMD_INLINE int32x4_t Load(const int32_t * p) { - return (int32x4_t)Load((const uint8_t*)p); + return vreinterpretq_s32_u8(Load((const uint8_t*)p)); } template SIMD_INLINE uint32x4_t Load(const uint32_t * p) { - return (uint32x4_t)Load((const uint8_t*)p); + return vreinterpretq_u32_u8(Load((const uint8_t*)p)); } template SIMD_INLINE float32x4_t Load(const float * p); @@ -829,81 +669,6 @@ namespace Simd return vextq_u8(last, vextq_u8(last, last, 16 - count), count); } - template SIMD_INLINE void LoadNose3(const uint8_t * p, uint8x16_t a[3]) - { - a[1] = Load(p); - a[0] = LoadBeforeFirst(a[1]); - a[2] = vld1q_u8(p + step); - } - - template SIMD_INLINE void LoadBody3(const uint8_t * p, uint8x16_t a[3]) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - a[0] = vld1q_u8(p - step); - a[1] = Load(p); - a[2] = vld1q_u8(p + step); - } - - template SIMD_INLINE void LoadTail3(const uint8_t * p, uint8x16_t a[3]) - { - a[0] = vld1q_u8(p - step); - a[1] = Load(p); - a[2] = LoadAfterLast(a[1]); - } - - template SIMD_INLINE void LoadNose5(const uint8_t * p, uint8x16_t a[5]) - { - a[2] = Load(p); - a[1] = LoadBeforeFirst(a[2]); - a[0] = LoadBeforeFirst(a[1]); - a[3] = vld1q_u8(p + step); - a[4] = vld1q_u8(p + 2 * step); - } - - template SIMD_INLINE void LoadBody5(const uint8_t * p, uint8x16_t a[5]) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - a[0] = vld1q_u8(p - 2 * step); - a[1] = vld1q_u8(p - step); - a[2] = Load(p); - a[3] = vld1q_u8(p + step); - a[4] = vld1q_u8(p + 2 * step); - } - - template SIMD_INLINE void LoadTail5(const uint8_t * p, uint8x16_t a[5]) - { - a[0] = vld1q_u8(p - 2 * step); - a[1] = vld1q_u8(p - step); - a[2] = Load(p); - a[3] = LoadAfterLast(a[2]); - a[4] = LoadAfterLast(a[3]); - } - - SIMD_INLINE void LoadNoseDx(const uint8_t * 
p, uint8x16_t a[3]) - { - a[0] = LoadBeforeFirst<1>(vld1q_u8(p)); - a[2] = vld1q_u8(p + 1); - } - - SIMD_INLINE void LoadBodyDx(const uint8_t * p, uint8x16_t a[3]) - { -#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE - __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); -#endif - a[0] = vld1q_u8(p - 1); - a[2] = vld1q_u8(p + 1); - } - - SIMD_INLINE void LoadTailDx(const uint8_t * p, uint8x16_t a[3]) - { - a[0] = vld1q_u8(p - 1); - a[2] = LoadAfterLast<1>(vld1q_u8(p)); - } - template SIMD_INLINE uint8x8_t LoadBeforeFirst(uint8x8_t first) { return vext_u8(vext_u8(first, first, count), first, 8 - count); diff --git a/3rdparty/simdlib/Simd/SimdLoadBlock.h b/3rdparty/simdlib/Simd/SimdLoadBlock.h new file mode 100755 index 0000000000..8a46e07687 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdLoadBlock.h @@ -0,0 +1,251 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdLoadBlock_h__ +#define __SimdLoadBlock_h__ + +#include "Simd/SimdLoad.h" + +namespace Simd +{ +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 + { + template SIMD_INLINE void LoadNose3(const uint8_t * p, __m128i a[3]) + { + a[1] = Load((__m128i*)p); + a[0] = LoadBeforeFirst(a[1]); + a[2] = _mm_loadu_si128((__m128i*)(p + step)); + } + + template SIMD_INLINE void LoadBody3(const uint8_t * p, __m128i a[3]) + { + a[0] = _mm_loadu_si128((__m128i*)(p - step)); + a[1] = Load((__m128i*)p); + a[2] = _mm_loadu_si128((__m128i*)(p + step)); + } + + template SIMD_INLINE void LoadTail3(const uint8_t * p, __m128i a[3]) + { + a[0] = _mm_loadu_si128((__m128i*)(p - step)); + a[1] = Load((__m128i*)p); + a[2] = LoadAfterLast(a[1]); + } + + template SIMD_INLINE void LoadNose5(const uint8_t * p, __m128i a[5]) + { + a[2] = Load((__m128i*)p); + a[1] = LoadBeforeFirst(a[2]); + a[0] = LoadBeforeFirst(a[1]); + a[3] = _mm_loadu_si128((__m128i*)(p + step)); + a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step)); + } + + template SIMD_INLINE void LoadBody5(const uint8_t * p, __m128i a[5]) + { + a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step)); + a[1] = _mm_loadu_si128((__m128i*)(p - step)); + a[2] = Load((__m128i*)p); + a[3] = _mm_loadu_si128((__m128i*)(p + step)); + a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step)); + } + + template SIMD_INLINE void LoadTail5(const uint8_t * p, __m128i a[5]) + { + a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step)); + a[1] = _mm_loadu_si128((__m128i*)(p - step)); + a[2] = Load((__m128i*)p); + a[3] = LoadAfterLast(a[2]); + a[4] = LoadAfterLast(a[3]); + } + + SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m128i a[3]) + { + a[0] = LoadBeforeFirst<1>(_mm_loadu_si128((__m128i*)p)); + a[2] = _mm_loadu_si128((__m128i*)(p + 1)); + } + + SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m128i a[3]) + { + a[0] = _mm_loadu_si128((__m128i*)(p - 1)); + a[2] = _mm_loadu_si128((__m128i*)(p + 1)); + } + + SIMD_INLINE void LoadTailDx(const uint8_t * p, __m128i a[3]) + { + 
a[0] = _mm_loadu_si128((__m128i*)(p - 1)); + a[2] = LoadAfterLast<1>(_mm_loadu_si128((__m128i*)p)); + } + } +#endif//SIMD_SSE2_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + template SIMD_INLINE void LoadNose3(const uint8_t * p, __m256i a[3]) + { + a[0] = LoadBeforeFirst(p); + a[1] = Load((__m256i*)p); + a[2] = _mm256_loadu_si256((__m256i*)(p + step)); + } + + template SIMD_INLINE void LoadBody3(const uint8_t * p, __m256i a[3]) + { + a[0] = _mm256_loadu_si256((__m256i*)(p - step)); + a[1] = Load((__m256i*)p); + a[2] = _mm256_loadu_si256((__m256i*)(p + step)); + } + + template SIMD_INLINE void LoadTail3(const uint8_t * p, __m256i a[3]) + { + a[0] = _mm256_loadu_si256((__m256i*)(p - step)); + a[1] = Load((__m256i*)p); + a[2] = LoadAfterLast(p); + } + + template SIMD_INLINE void LoadNose5(const uint8_t * p, __m256i a[5]) + { + LoadBeforeFirst(p, a[1], a[0]); + a[2] = Load((__m256i*)p); + a[3] = _mm256_loadu_si256((__m256i*)(p + step)); + a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step)); + } + + template SIMD_INLINE void LoadBody5(const uint8_t * p, __m256i a[5]) + { + a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step)); + a[1] = _mm256_loadu_si256((__m256i*)(p - step)); + a[2] = Load((__m256i*)p); + a[3] = _mm256_loadu_si256((__m256i*)(p + step)); + a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step)); + } + + template SIMD_INLINE void LoadTail5(const uint8_t * p, __m256i a[5]) + { + a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step)); + a[1] = _mm256_loadu_si256((__m256i*)(p - step)); + a[2] = Load((__m256i*)p); + LoadAfterLast(p, a[3], a[4]); + } + + SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m256i a[3]) + { + a[0] = LoadBeforeFirst(p); + a[2] = _mm256_loadu_si256((__m256i*)(p + 1)); + } + + SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m256i a[3]) + { + a[0] = _mm256_loadu_si256((__m256i*)(p - 1)); + a[2] = _mm256_loadu_si256((__m256i*)(p + 1)); + } + + SIMD_INLINE void LoadTailDx(const uint8_t * p, __m256i a[3]) + { + a[0] = 
_mm256_loadu_si256((__m256i*)(p - 1)); + a[2] = LoadAfterLast(p); + } + } +#endif//SIMD_AVX2_ENABLE + +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + template SIMD_INLINE void LoadNose3(const uint8_t * p, uint8x16_t a[3]) + { + a[1] = Load(p); + a[0] = LoadBeforeFirst(a[1]); + a[2] = vld1q_u8(p + step); + } + + template SIMD_INLINE void LoadBody3(const uint8_t * p, uint8x16_t a[3]) + { +#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE + __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); +#endif + a[0] = vld1q_u8(p - step); + a[1] = Load(p); + a[2] = vld1q_u8(p + step); + } + + template SIMD_INLINE void LoadTail3(const uint8_t * p, uint8x16_t a[3]) + { + a[0] = vld1q_u8(p - step); + a[1] = Load(p); + a[2] = LoadAfterLast(a[1]); + } + + template SIMD_INLINE void LoadNose5(const uint8_t * p, uint8x16_t a[5]) + { + a[2] = Load(p); + a[1] = LoadBeforeFirst(a[2]); + a[0] = LoadBeforeFirst(a[1]); + a[3] = vld1q_u8(p + step); + a[4] = vld1q_u8(p + 2 * step); + } + + template SIMD_INLINE void LoadBody5(const uint8_t * p, uint8x16_t a[5]) + { +#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE + __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); +#endif + a[0] = vld1q_u8(p - 2 * step); + a[1] = vld1q_u8(p - step); + a[2] = Load(p); + a[3] = vld1q_u8(p + step); + a[4] = vld1q_u8(p + 2 * step); + } + + template SIMD_INLINE void LoadTail5(const uint8_t * p, uint8x16_t a[5]) + { + a[0] = vld1q_u8(p - 2 * step); + a[1] = vld1q_u8(p - step); + a[2] = Load(p); + a[3] = LoadAfterLast(a[2]); + a[4] = LoadAfterLast(a[3]); + } + + SIMD_INLINE void LoadNoseDx(const uint8_t * p, uint8x16_t a[3]) + { + a[0] = LoadBeforeFirst<1>(vld1q_u8(p)); + a[2] = vld1q_u8(p + 1); + } + + SIMD_INLINE void LoadBodyDx(const uint8_t * p, uint8x16_t a[3]) + { +#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE + __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE); +#endif + a[0] = vld1q_u8(p - 1); + a[2] = vld1q_u8(p + 1); + } + + SIMD_INLINE void LoadTailDx(const uint8_t * p, uint8x16_t a[3]) + { + a[0] = vld1q_u8(p - 1); 
+ a[2] = LoadAfterLast<1>(vld1q_u8(p)); + } + } +#endif//SIMD_NEON_ENABLE +} +#endif//__SimdLoadBlock_h__ diff --git a/3rdparty/simdlib/Simd/SimdLog.h b/3rdparty/simdlib/Simd/SimdLog.h old mode 100644 new mode 100755 index 45ba3f3be5..923a16dc70 --- a/3rdparty/simdlib/Simd/SimdLog.h +++ b/3rdparty/simdlib/Simd/SimdLog.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -58,8 +58,8 @@ namespace Simd Log(array.data, array.size, name); } -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { SIMD_INLINE void Log(const __m128 & value, const std::string & name) { @@ -67,12 +67,7 @@ namespace Simd _mm_storeu_ps(buffer, value); Simd::Log(buffer, F, name); } - } -#endif //SIMD_SSE_ENABLE -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { template SIMD_INLINE void Log(const __m128i & value, const std::string & name) { const size_t n = sizeof(__m128i) / sizeof(T); @@ -86,7 +81,7 @@ namespace Simd #ifdef SIMD_SSE41_ENABLE namespace Sse41 { - using namespace Sse; + using namespace Sse2; } #endif //SIMD_SSE41_ENABLE @@ -173,14 +168,15 @@ namespace Simd #define SIMD_LOG2(value) Log(value, #value) #define SIMD_LOG4(value) Log(value, #value) -#define SIMD_LOG_SS(message) \ +#define SIMD_LOG_ERROR(message) \ { \ - std::cout << __FUNCTION__ << " : " << message << std::endl; \ - std::cout.flush(); \ + std::stringstream ss; \ + ss << std::endl << " In function " << SIMD_FUNCTION << ":" << std::endl; \ + ss << " In file " << __FILE__ << ":" << __LINE__ << ":" << std::endl; \ + ss << " Error: " << message << std::endl << std::endl; \ + std::cerr << ss.str() << std::flush; \ } -#define SIMD_LOG_LINE() std::cout << __FUNCTION__ << " : " << __LINE__ << std::endl << std::flush; - 
#else//SIMD_LOG_ENABLE #define SIMD_LOG(value) @@ -188,9 +184,7 @@ namespace Simd #define SIMD_LOG2(value) #define SIMD_LOG4(value) -#define SIMD_LOG_SS(message) - -#define SIMD_LOG_LINE() +#define SIMD_LOG_ERROR(message) #endif//SIMD_LOG_ENABLE diff --git a/3rdparty/simdlib/Simd/SimdMath.h b/3rdparty/simdlib/Simd/SimdMath.h old mode 100644 new mode 100755 index 4b674ea512..0f7425f76e --- a/3rdparty/simdlib/Simd/SimdMath.h +++ b/3rdparty/simdlib/Simd/SimdMath.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, +* Copyright (c) 2011-2021 Yermalayeu Ihar, * 2018-2019 Radchenko Andrey. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -66,11 +66,21 @@ namespace Simd #define SIMD_ROUND SIMD_INLINE int Round(double value) { -#if defined(SIMD_SSE2_ENABLE) && ((defined(_MSC_VER) && defined(_M_X64)) || (defined(__GNUC__) && defined(__x86_64__))) - __m128d t = _mm_set_sd(value); - return _mm_cvtsd_si32(t); +#if defined(SIMD_X64_ENABLE) && !defined(SIMD_SSE2_DISABLE) + __m128d _value = _mm_set_sd(value); + return _mm_cvtsd_si32(_value); #else - return (int)(value + (value >= 0 ? 0.5 : -0.5)); + return (int)(value + (value >= 0.0 ? 0.5 : -0.5)); +#endif + } + + SIMD_INLINE int Round(float value) + { +#if defined(SIMD_X64_ENABLE) && !defined(SIMD_SSE2_DISABLE) + __m128 _value = _mm_set_ss(value); + return _mm_cvtss_si32(_value); +#else + return (int)(value + (value >= 0.0f ? 
0.5f : -0.5f)); #endif } #endif @@ -263,8 +273,8 @@ namespace Simd } } -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { SIMD_INLINE __m128 Square(__m128 value) { @@ -330,12 +340,7 @@ namespace Simd __m128 m = _mm_max_ps(s0, s1); return _mm_store_ss(dst, _mm_max_ss(m, _mm_shuffle_ps(m, m, 1))); } - } -#endif//SIMD_SSE_ENABLE -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { SIMD_INLINE __m128i SaturateI16ToU8(__m128i value) { return _mm_min_epi16(K16_00FF, _mm_max_epi16(value, K_ZERO)); @@ -508,17 +513,8 @@ namespace Simd } #endif// SIMD_SSE2_ENABLE -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 - { -#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug - using Sse::RightNotZero; -#endif - } -#endif//SIMD_SSE3_ENABLE - -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { using namespace Sse2; @@ -538,12 +534,7 @@ namespace Simd { return _mm_maddubs_epi16(UnpackU8(a, b), K8_01_FF); } - } -#endif// SIMD_SSSE3_ENABLE -#ifdef SIMD_SSE41_ENABLE - namespace Sse41 - { #if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::RightNotZero; #endif diff --git a/3rdparty/simdlib/Simd/SimdMemory.h b/3rdparty/simdlib/Simd/SimdMemory.h old mode 100644 new mode 100755 index de45abb291..d7772ffa3c --- a/3rdparty/simdlib/Simd/SimdMemory.h +++ b/3rdparty/simdlib/Simd/SimdMemory.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2018 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * 2016-2016 Sintegrial Technologies. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -28,9 +28,10 @@ #include "Simd/SimdDefs.h" #include "Simd/SimdMath.h" -#if defined(__GNUC__) && defined(SIMD_ALLOCATE_ERROR_MESSAGE) +#if defined(SIMD_ALLOCATE_ERROR_MESSAGE) #include #endif +#include namespace Simd { @@ -88,17 +89,18 @@ namespace Simd align = AlignHi(align, sizeof(void *)); size = AlignHi(size, align); int result = ::posix_memalign(&ptr, align, size); -#ifdef SIMD_ALLOCATE_ERROR_MESSAGE if (result != 0) + ptr = NULL; +#else + ptr = malloc(size); +#endif +#ifdef SIMD_ALLOCATE_ERROR_MESSAGE + if (ptr == NULL) std::cout << "The function posix_memalign can't allocate " << size << " bytes with align " << align << " !" << std::endl << std::flush; #endif #ifdef SIMD_ALLOCATE_ASSERT - assert(result == 0); -#endif -#else - ptr = malloc(size); + assert(ptr); #endif - #ifdef SIMD_NO_MANS_LAND if (ptr) ptr = (char*)ptr + SIMD_NO_MANS_LAND; @@ -121,60 +123,86 @@ namespace Simd #endif } + //--------------------------------------------------------------------------------------------- + struct Deletable { virtual ~Deletable() {} }; -#ifdef SIMD_SSE_ENABLE - namespace Sse + //--------------------------------------------------------------------------------------------- + +#if defined(SIMD_CPP_2011_ENABLE) + template using Holder = std::unique_ptr; +#else + template class Holder { - SIMD_INLINE bool Aligned(size_t size, size_t align = sizeof(__m128)) + T* _ptr; + + public: + Holder(T* ptr) + : _ptr(ptr) { - return Simd::Aligned(size, align); } - SIMD_INLINE bool Aligned(const void * ptr, size_t align = sizeof(__m128)) + ~Holder() { - return Simd::Aligned(ptr, align); + if (_ptr) + delete _ptr; + } + + T& operator * () + { + return *_ptr; + } + + const T& operator * () const + { + return *_ptr; + } + + T* operator -> () + { + return _ptr; } - } -#endif// SIMD_SSE_ENABLE + + const T* operator -> () const + { + return _ptr; + } + + operator bool() const + { + return _ptr != NULL; + } + 
}; +#endif + + //--------------------------------------------------------------------------------------------- + #ifdef SIMD_SSE2_ENABLE namespace Sse2 { - using Sse::Aligned; - } -#endif// SIMD_SSE2_ENABLE - -#ifdef SIMD_SSE3_ENABLE - namespace Sse3 - { - using Sse::Aligned; - } -#endif// SIMD_SSE3_ENABLE + SIMD_INLINE bool Aligned(size_t size, size_t align = sizeof(__m128)) + { + return Simd::Aligned(size, align); + } -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - using Sse::Aligned; + SIMD_INLINE bool Aligned(const void * ptr, size_t align = sizeof(__m128)) + { + return Simd::Aligned(ptr, align); + } } -#endif// SIMD_SSSE3_ENABLE +#endif// SIMD_SSE2_ENABLE #ifdef SIMD_SSE41_ENABLE namespace Sse41 { - using Sse::Aligned; + using Sse2::Aligned; } #endif// SIMD_SSE41_ENABLE -#ifdef SIMD_SSE42_ENABLE - namespace Sse42 - { - } -#endif// SIMD_SSE42_ENABLE - #ifdef SIMD_AVX_ENABLE namespace Avx { diff --git a/3rdparty/simdlib/Simd/SimdNeon.h b/3rdparty/simdlib/Simd/SimdNeon.h old mode 100644 new mode 100755 index 54373b506e..bf2b98be69 --- a/3rdparty/simdlib/Simd/SimdNeon.h +++ b/3rdparty/simdlib/Simd/SimdNeon.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, +* Copyright (c) 2011-2021 Yermalayeu Ihar, * 2018-2018 Radchenko Andrey. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -36,22 +36,18 @@ namespace Simd void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride); - void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride); + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride); - void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha); + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride); - void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride); + void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride); - void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride); - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); + void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride); void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, 
size_t rStride); @@ -93,6 +89,12 @@ namespace Simd void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); + void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); + + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride); + void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); } diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp old mode 100644 new mode 100755 index bb25c0c6e8..98a360b0e6 --- a/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -74,6 +74,8 @@ namespace Simd BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); } + //--------------------------------------------------------------------- + template SIMD_INLINE void Bgr48pToBgra32(uint8_t * bgra, const uint8_t * blue, const uint8_t * green, const uint8_t * red, size_t offset, const uint8x16_t & alpha) { @@ -128,6 +130,47 @@ namespace Simd else Bgr48pToBgra32(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha); } + + //--------------------------------------------------------------------- + + template SIMD_INLINE void RgbToBgra(const uint8_t* rgb, uint8_t* bgra, uint8x16_t alpha) + { + uint8x16x3_t _rgb = Load3(rgb); + uint8x16x4_t _bgra; + _bgra.val[0] = _rgb.val[2]; + _bgra.val[1] = _rgb.val[1]; + _bgra.val[2] = _rgb.val[0]; + _bgra.val[3] = alpha; + Store4(bgra, _bgra); + } + + template void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t alignedWidth = AlignLo(width, A); + uint8x16_t _alpha = vdupq_n_u8(alpha); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0, colBgra = 0, colRgb = 0; col < alignedWidth; col += A, colBgra += A4, colRgb += A3) + RgbToBgra(rgb + colRgb, bgra + colBgra, _alpha); + if (width != alignedWidth) + RgbToBgra(rgb + 3 * (width - A), bgra + 4 * (width - A), _alpha); + rgb += rgbStride; + bgra += bgraStride; + } + } + + void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)) + RgbToBgra(rgb, width, height, 
rgbStride, bgra, bgraStride, alpha); + else + RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + } } #endif// SIMD_NEON_ENABLE } diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp old mode 100644 new mode 100755 index 0b9fdeaedf..57cf19f18d --- a/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -30,29 +30,31 @@ namespace Simd #ifdef SIMD_NEON_ENABLE namespace Neon { - SIMD_INLINE uint8x8_t BgrToGray(uint8x8x3_t bgr) + SIMD_INLINE uint8x16_t BgrToGray(uint8x16x3_t bgr) { - return vmovn_u16(BgrToGray(vmovl_u8(bgr.val[0]), vmovl_u8(bgr.val[1]), vmovl_u8(bgr.val[2]))); + uint8x8_t lo = vmovn_u16(BgrToGray(UnpackU8<0>(bgr.val[0]), UnpackU8<0>(bgr.val[1]), UnpackU8<0>(bgr.val[2]))); + uint8x8_t hi = vmovn_u16(BgrToGray(UnpackU8<1>(bgr.val[0]), UnpackU8<1>(bgr.val[1]), UnpackU8<1>(bgr.val[2]))); + return vcombine_u8(lo, hi); } - template void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) + template void BgrToGray(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* gray, size_t grayStride) { - assert(width >= HA); + assert(width >= A); if (align) assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride)); - size_t alignedWidth = AlignLo(width, HA); + size_t alignedWidth = AlignLo(width, A); for (size_t row = 0; row < height; ++row) { - for (size_t col = 0; col < alignedWidth; col += HA) + for (size_t col = 0; col < alignedWidth; col += A) { - uint8x8x3_t _bgr = LoadHalf3(bgr + 3 * col); + uint8x16x3_t _bgr = Load3(bgr + 3 * col); 
Store(gray + col, BgrToGray(_bgr)); } if (alignedWidth != width) { - uint8x8x3_t _bgr = LoadHalf3(bgr + 3 * (width - HA)); - Store(gray + width - HA, BgrToGray(_bgr)); + uint8x16x3_t _bgr = Load3(bgr + 3 * (width - A)); + Store(gray + width - A, BgrToGray(_bgr)); } bgr += bgrStride; gray += grayStride; @@ -66,6 +68,47 @@ namespace Simd else BgrToGray(bgr, width, height, bgrStride, gray, grayStride); } + + //--------------------------------------------------------------------- + + SIMD_INLINE uint8x16_t RgbToGray(uint8x16x3_t rgb) + { + uint8x8_t lo = vmovn_u16(BgrToGray(UnpackU8<0>(rgb.val[2]), UnpackU8<0>(rgb.val[1]), UnpackU8<0>(rgb.val[0]))); + uint8x8_t hi = vmovn_u16(BgrToGray(UnpackU8<1>(rgb.val[2]), UnpackU8<1>(rgb.val[1]), UnpackU8<1>(rgb.val[0]))); + return vcombine_u8(lo, hi); + } + + template void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) + { + assert(width >= A); + if (align) + assert(Aligned(rgb) && Aligned(rgbStride) && Aligned(gray) && Aligned(grayStride)); + + size_t alignedWidth = AlignLo(width, A); + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + { + uint8x16x3_t _rgb = Load3(rgb + 3 * col); + Store(gray + col, RgbToGray(_rgb)); + } + if (alignedWidth != width) + { + uint8x16x3_t _rgb = Load3(rgb + 3 * (width - A)); + Store(gray + width - A, RgbToGray(_rgb)); + } + rgb += rgbStride; + gray += grayStride; + } + } + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) + { + if (Aligned(rgb) && Aligned(gray) && Aligned(rgbStride) && Aligned(grayStride)) + RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + else + RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + } } #endif// SIMD_NEON_ENABLE } diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp old mode 100644 new mode 100755 index 
fb69a04b5f..b1e69cc3aa --- a/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -38,7 +38,7 @@ namespace Simd Store3(rgb, _bgr); } - template void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) + template void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride) { assert(width >= A); if (align) @@ -59,12 +59,12 @@ namespace Simd } } - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) + void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride) { if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)) - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); + BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); else - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); + BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); } } #endif//SIMD_NEON_ENABLE diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp deleted file mode 100644 index b2950c7da1..0000000000 --- a/3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - const size_t A3 = A * 3; - const size_t A4 = A * 4; - - union Bgra - { - uint8x16x4_t bgra; - uint8x16x3_t bgr; - }; - - template SIMD_INLINE void BgrToRgba(const uint8_t * bgr, uint8_t * rgba, Bgra & _bgra) - { - _bgra.bgr = Load3(bgr); - uint8x16_t tmp = _bgra.bgr.val[0]; - _bgra.bgr.val[0] = _bgra.bgr.val[2]; - _bgra.bgr.val[2] = tmp; - Store4(rgba, _bgra.bgra); - } - - template void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha) - { - assert(width >= A); - if (align) - assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t alignedWidth = AlignLo(width, A); - - Bgra _bgra; - _bgra.bgra.val[3] = vdupq_n_u8(alpha); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colRgba = 0, colBgr = 0; col < alignedWidth; col += A, colRgba += A4, colBgr += A3) - BgrToRgba(bgr + colBgr, rgba + colRgba, _bgra); - if (width != alignedWidth) - BgrToRgba(bgr + 3 * (width - A), rgba + 4 * (width - A), _bgra); - bgr += bgrStride; - rgba += rgbaStride; - } - } - - void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha) - { - if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - else - BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp old mode 100644 new mode 100755 index f95e1a9118..944fe5b45e --- a/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). 
* -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -66,6 +66,87 @@ namespace Simd else BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); } + + //--------------------------------------------------------------------- + + template SIMD_INLINE void BgraToRgb(const uint8_t* bgra, uint8_t* rgb) + { + uint8x16x4_t _bgra = Load4(bgra); + uint8x16x3_t _rgb; + _rgb.val[0] = _bgra.val[2]; + _rgb.val[1] = _bgra.val[1]; + _rgb.val[2] = _bgra.val[0]; + Store3(rgb, _rgb); + } + + template void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t alignedWidth = AlignLo(width, A); + if (width == alignedWidth) + alignedWidth -= A; + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0, colBgra = 0, colRgb = 0; col < alignedWidth; col += A, colBgra += A4, colRgb += A3) + BgraToRgb(bgra + colBgra, rgb + colRgb); + if (width != alignedWidth) + BgraToRgb(bgra + 4 * (width - A), rgb + 3 * (width - A)); + bgra += bgraStride; + rgb += rgbStride; + } + } + + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)) + BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); + else + BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); + } + + //--------------------------------------------------------------------- + + template SIMD_INLINE void BgraToRgba(const uint8_t* bgra, uint8_t* rgba) + { + uint8x16x4_t _bgra = Load4(bgra); + uint8x16_t tmp = _bgra.val[0]; + _bgra.val[0] = _bgra.val[2]; + _bgra.val[2] = tmp; + 
Store4(rgba, _bgra); + } + + template void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride)); + + size_t alignedWidth = AlignLo(width, A); + if (width == alignedWidth) + alignedWidth -= A; + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0, colBgra = 0, colRgba = 0; col < alignedWidth; col += A, colBgra += A4, colRgba += A4) + BgraToRgba(bgra + colBgra, rgba + colRgba); + if (width != alignedWidth) + BgraToRgba(bgra + 4 * (width - A), rgba + 4 * (width - A)); + bgra += bgraStride; + rgba += rgbaStride; + } + } + + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride)) + BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + else + BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + } } #endif// SIMD_NEON_ENABLE } diff --git a/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp old mode 100644 new mode 100755 index 24fc228560..6b2eb4de48 --- a/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -66,6 +66,45 @@ namespace Simd else BgraToGray(bgra, width, height, bgraStride, gray, grayStride); } + + //--------------------------------------------------------------------- + + SIMD_INLINE uint8x8_t RgbaToGray(uint8x8x4_t rgba) + { + return vmovn_u16(BgrToGray(vmovl_u8(rgba.val[2]), vmovl_u8(rgba.val[1]), vmovl_u8(rgba.val[0]))); + } + + template void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) + { + assert(width >= HA); + if (align) + assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride)); + + size_t alignedWidth = AlignLo(width, HA); + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += HA) + { + uint8x8x4_t _rgba = LoadHalf4(rgba + 4 * col); + Store(gray + col, RgbaToGray(_rgba)); + } + if (alignedWidth != width) + { + uint8x8x4_t _rgba = LoadHalf4(rgba + 4 * (width - HA)); + Store(gray + width - HA, RgbaToGray(_rgba)); + } + rgba += rgbaStride; + gray += grayStride; + } + } + + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) + { + if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride)) + RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + else + RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + } } #endif// SIMD_NEON_ENABLE } diff --git a/3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp b/3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp deleted file mode 100644 index d1873eddcb..0000000000 --- a/3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - const size_t A4 = A * 4; - - union Bgra - { - uint8x16x4_t bgra; - }; - - template SIMD_INLINE void BgraToRgba(const uint8_t * bgra, uint8_t * rgba, Bgra & _bgra) - { - _bgra.bgra = Load4(bgra); - uint8x16_t tmp = _bgra.bgra.val[0]; - _bgra.bgra.val[0] = _bgra.bgra.val[2]; - _bgra.bgra.val[2] = tmp; - Store4(rgba, _bgra.bgra); - } - - template void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride) - { - assert(width >= A); - if (align) - assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride)); - - size_t alignedWidth = AlignLo(width, A); - - Bgra _bgra; - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0, colRgba = 0; col < alignedWidth; col += A, colRgba += A4) - BgraToRgba(bgra + colRgba, rgba + colRgba, _bgra); - if (width != alignedWidth) - BgraToRgba(bgra + 4 * (width - A), rgba + 4 * (width - A), _bgra); - bgra += bgraStride; - rgba += rgbaStride; - } - } - - void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride) - { - if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride)) - BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); - else - BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp b/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp old mode 100644 new mode 100755 index 53530a788d..36a623efb5 --- a/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -71,6 +71,8 @@ namespace Simd DeinterleaveUv(uv, uvStride, width, height, u, uStride, v, vStride); } + //--------------------------------------------------------------------- + template void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride) { @@ -118,6 +120,8 @@ namespace Simd DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); } + //--------------------------------------------------------------------- + template void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) { @@ -125,36 +129,65 @@ namespace Simd if (align) { assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)); + assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL)); } size_t bodyWidth = AlignLo(width, A); size_t tail = width - bodyWidth; - for (size_t row = 0; row < height; ++row) + if (a) { - for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += QA) + for (size_t row = 0; row < height; ++row) { - uint8x16x4_t _bgra = Load4(bgra + offset); - Store(b + col, _bgra.val[0]); - Store(g + col, _bgra.val[1]); - Store(r + col, _bgra.val[2]); - Store(a + col, _bgra.val[3]); + for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += QA) + { + uint8x16x4_t _bgra = Load4(bgra + offset); + Store(b + col, _bgra.val[0]); + Store(g + col, _bgra.val[1]); + Store(r + col, _bgra.val[2]); + Store(a + col, _bgra.val[3]); + } + if 
(tail) + { + size_t col = width - A; + size_t offset = 4 * col; + uint8x16x4_t _bgra = Load4(bgra + offset); + Store(b + col, _bgra.val[0]); + Store(g + col, _bgra.val[1]); + Store(r + col, _bgra.val[2]); + Store(a + col, _bgra.val[3]); + } + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; + a += aStride; } - if (tail) + } + else + { + for (size_t row = 0; row < height; ++row) { - size_t col = width - A; - size_t offset = 4 * col; - uint8x16x4_t _bgra = Load4(bgra + offset); - Store(b + col, _bgra.val[0]); - Store(g + col, _bgra.val[1]); - Store(r + col, _bgra.val[2]); - Store(a + col, _bgra.val[3]); + for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += QA) + { + uint8x16x4_t _bgra = Load4(bgra + offset); + Store(b + col, _bgra.val[0]); + Store(g + col, _bgra.val[1]); + Store(r + col, _bgra.val[2]); + } + if (tail) + { + size_t col = width - A; + size_t offset = 4 * col; + uint8x16x4_t _bgra = Load4(bgra + offset); + Store(b + col, _bgra.val[0]); + Store(g + col, _bgra.val[1]); + Store(r + col, _bgra.val[2]); + } + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; } - bgra += bgraStride; - b += bStride; - g += gStride; - r += rStride; - a += aStride; } } @@ -162,7 +195,7 @@ namespace Simd uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) { if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && - Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)) + Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL)) DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); else DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); diff --git a/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp b/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp old mode 
100644 new mode 100755 index 752778be2a..1d63a6510b --- a/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "Simd/SimdMemory.h" +#include "Simd/SimdLoadBlock.h" #include "Simd/SimdStore.h" #include "Simd/SimdGaussianBlur.h" #include "Simd/SimdLog.h" diff --git a/3rdparty/simdlib/Simd/SimdNeonResizer.cpp b/3rdparty/simdlib/Simd/SimdNeonResizer.cpp old mode 100644 new mode 100755 index b2e965200e..d11a0e29a8 --- a/3rdparty/simdlib/Simd/SimdNeonResizer.cpp +++ b/3rdparty/simdlib/Simd/SimdNeonResizer.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -578,11 +578,11 @@ namespace Simd void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) { ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(float32x4_t)); - if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && dstX >= A) + if (param.IsByteBilinear() && dstX >= A) return new ResizerByteBilinear(param); - else if (type == SimdResizeChannelByte && method == SimdResizeMethodArea) + else if (param.IsByteArea()) return new ResizerByteArea(param); - else if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp)) + else if (param.IsFloatBilinear()) return new ResizerFloatBilinear(param); else return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); diff --git a/3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp deleted file mode 100644 index 37b288b277..0000000000 --- a/3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp +++ /dev/null 
@@ -1,71 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE uint8x8_t RgbToGray(uint8x8x3_t rgb) - { - return vmovn_u16(BgrToGray(vmovl_u8(rgb.val[2]), vmovl_u8(rgb.val[1]), vmovl_u8(rgb.val[0]))); - } - - template void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride) - { - assert(width >= HA); - if (align) - assert(Aligned(rgb) && Aligned(rgbStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, HA); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += HA) - { - uint8x8x3_t _rgb = LoadHalf3(rgb + 3 * col); - Store(gray + col, RgbToGray(_rgb)); - } - if (alignedWidth != width) - { - uint8x8x3_t _rgb = LoadHalf3(rgb + 3 * (width - HA)); - Store(gray + width - HA, RgbToGray(_rgb)); - } - rgb += rgbStride; - gray += grayStride; - } - } - - void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride) - { - if (Aligned(rgb) && Aligned(gray) && Aligned(rgbStride) && Aligned(grayStride)) - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - else - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp deleted file mode 100644 index 377d6fcb42..0000000000 --- a/3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" -#include "Simd/SimdConversion.h" - -namespace Simd -{ -#ifdef SIMD_NEON_ENABLE - namespace Neon - { - SIMD_INLINE uint8x8_t RgbaToGray(uint8x8x4_t rgba) - { - return vmovn_u16(BgrToGray(vmovl_u8(rgba.val[2]), vmovl_u8(rgba.val[1]), vmovl_u8(rgba.val[0]))); - } - - template void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride) - { - assert(width >= HA); - if (align) - assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, HA); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += HA) - { - uint8x8x4_t _rgba = LoadHalf4(rgba + 4 * col); - Store(gray + col, RgbaToGray(_rgba)); - } - if (alignedWidth != width) - { - uint8x8x4_t _rgba = LoadHalf4(rgba + 4 * (width - HA)); - Store(gray + width - HA, RgbaToGray(_rgba)); - } - rgba += rgbaStride; - gray += grayStride; - } - } - - void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride) - { - if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride)) - RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); - else - RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); - } - } -#endif// SIMD_NEON_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdPixel.hpp b/3rdparty/simdlib/Simd/SimdPixel.hpp old mode 100644 new mode 100755 index 109c18ec1d..f95ce46ee6 --- a/3rdparty/simdlib/Simd/SimdPixel.hpp +++ b/3rdparty/simdlib/Simd/SimdPixel.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -41,6 +41,7 @@ namespace Simd struct Hsv24; struct Hsl24; struct Rgb24; + struct Rgba32; //------------------------------------------------------------------------- @@ -86,6 +87,13 @@ namespace Simd */ Bgr24(const Rgb24 & p); + /*! + Creates a new 24-bit BGR pixel structure on the base of 32-bit RGBA pixel. + + \param [in] p - 32-bit RGBA pixel. + */ + Bgr24(const Rgba32& p); + /*! Creates a copy of 24-bit BGR pixel structure. @@ -165,6 +173,13 @@ namespace Simd */ Bgra32(const Rgb24 & p, const uint8_t & a = uint8_t(255)); + /*! + Creates a new 32-bit BGRA pixel structure on the base of 32-bit RGBA pixel. + + \param [in] p - 32-bit RGBA pixel. + */ + Bgra32(const Rgba32& p); + /*! Creates a copy of 32-bit BGRA pixel structure. @@ -360,6 +375,13 @@ namespace Simd */ Rgb24(const Bgr24 & p); + /*! + Creates a new 24-bit RGB pixel structure on the base of 32-bit RGBA pixel. + + \param [in] p - 32-bit RGBA pixel. + */ + Rgb24(const Rgba32& p); + /*! Creates a copy of 24-bit RGB pixel structure. @@ -392,6 +414,92 @@ namespace Simd template class A> static Rgb24 & At(View & view, ptrdiff_t col, ptrdiff_t row); }; + /*! @ingroup cpp_pixels + + \short 32-bit RGBA pixel. + + Provides manipulation of 32-bit RGBA (Red, Blue, Green, Alpha) pixels of the View struct. + */ + struct Rgba32 + { + uint8_t red; /*!< \brief 8-bit red channel 32-bit BGRA pixel. */ + uint8_t green; /*!< \brief 8-bit green channel 32-bit BGRA pixel. */ + uint8_t blue; /*!< \brief 8-bit blue channel 32-bit BGRA pixel. */ + uint8_t alpha; /*!< \brief 8-bit alpha channel 32-bit RGBA pixel. */ + + /*! + Creates a new 32-bit RGBA pixel structure with specified channel values. + + \param [in] gray - initial value for all channels. It is equal to 0 by default. + \param [in] a - initial value for alpha channel. It is equal to 255 by default. 
+ */ + Rgba32(const uint8_t& gray = uint8_t(0), const uint8_t& a = uint8_t(255)); + + /*! + Creates a new 32-bit RGBA pixel structure with specified channel values. + + \param [in] r - initial value for red channel. + \param [in] g - initial value for green channel. + \param [in] b - initial value for blue channel. + \param [in] a - initial value for alpha channel. It is equal to 255 by default. + */ + Rgba32(const uint8_t& r, const uint8_t& g, const uint8_t& b, const uint8_t& a = uint8_t(255)); + + /*! + Creates a new 32-bit RGBA pixel structure on the base of 32-bit BGRA pixel. + + \param [in] p - 32-bit BGRA pixel. + */ + Rgba32(const Bgra32& p); + + /*! + Creates a new 32-bit RGBA pixel structure on the base of 24-bit BGR pixel. + + \param [in] p - 24-bit BGR pixel. + \param [in] a - initial value for alpha channel. It is equal to 255 by default. + */ + Rgba32(const Bgr24& p, const uint8_t& a = uint8_t(255)); + + /*! + Creates a new of 32-bit RGBA pixel structure on the base of 24-bit RGB pixel. + + \param [in] p - 24-bit RGB pixel. + \param [in] a - initial value for alpha channel. It is equal to 255 by default. + */ + Rgba32(const Rgb24& p, const uint8_t& a = uint8_t(255)); + + /*! + Creates a copy of 32-bit RGBA pixel structure. + + \param [in] p - 32-bit RGBA pixel. + */ + Rgba32(const Rgba32& p); + + /*! + \fn template class A> static const Rgba32 & At(const View & view, ptrdiff_t col, ptrdiff_t row); + + Gets constant reference to the pixel with specific coordinates at the image view. + + \param [in] view - an image view of 32-bit RGBA pixel format. + \param [in] col - x-coordinate of the pixel. + \param [in] row - y-coordinate of the pixel. + \return a constant reference to the pixel. + */ + template class A> static const Rgba32& At(const View& view, ptrdiff_t col, ptrdiff_t row); + + /*! + \fn template class A> static Rgba32 & At(View & view, ptrdiff_t col, ptrdiff_t row); + + Gets reference to the pixel with specific coordinates at the image view. 
+ + \param [in] view - an image view of 32-bit RGBA pixel format. + \param [in] col - x-coordinate of the pixel. + \param [in] row - y-coordinate of the pixel. + \return a reference to the pixel. + */ + template class A> static Rgba32& At(View& view, ptrdiff_t col, ptrdiff_t row); + }; + //------------------------------------------------------------------------- // struct Bgr24 implementation: @@ -417,14 +525,21 @@ namespace Simd { } - SIMD_INLINE Bgr24::Bgr24(const Bgr24 & p) + SIMD_INLINE Bgr24::Bgr24(const Rgb24 & p) : blue(p.blue) , green(p.green) , red(p.red) { } - SIMD_INLINE Bgr24::Bgr24(const Rgb24 & p) + SIMD_INLINE Bgr24::Bgr24(const Rgba32& p) + : blue(p.blue) + , green(p.green) + , red(p.red) + { + } + + SIMD_INLINE Bgr24::Bgr24(const Bgr24 & p) : blue(p.blue) , green(p.green) , red(p.red) @@ -479,6 +594,14 @@ namespace Simd { } + SIMD_INLINE Bgra32::Bgra32(const Rgba32& p) + : blue(p.blue) + , green(p.green) + , red(p.red) + , alpha(p.alpha) + { + } + SIMD_INLINE Bgra32::Bgra32(const Bgra32 & p) : blue(p.blue) , green(p.green) @@ -605,6 +728,13 @@ namespace Simd { } + SIMD_INLINE Rgb24::Rgb24(const Rgba32& p) + : red(p.red) + , green(p.green) + , blue(p.blue) + { + } + SIMD_INLINE Rgb24::Rgb24(const Rgb24 & p) : red(p.red) , green(p.green) @@ -625,6 +755,70 @@ namespace Simd return Simd::At(view, col, row); } + + // struct Rgba32 implementation: + + SIMD_INLINE Rgba32::Rgba32(const uint8_t& gray, const uint8_t& a) + : red(gray) + , green(gray) + , blue(gray) + , alpha(a) + { + } + + SIMD_INLINE Rgba32::Rgba32(const uint8_t& r, const uint8_t& g, const uint8_t& b, const uint8_t& a) + : red(r) + , green(g) + , blue(b) + , alpha(a) + { + } + + SIMD_INLINE Rgba32::Rgba32(const Bgra32& p) + : red(p.red) + , green(p.green) + , blue(p.blue) + , alpha(p.alpha) + { + } + + SIMD_INLINE Rgba32::Rgba32(const Bgr24& p, const uint8_t& a) + : red(p.red) + , green(p.green) + , blue(p.blue) + , alpha(a) + { + } + + SIMD_INLINE Rgba32::Rgba32(const Rgb24& p, const 
uint8_t& a) + : red(p.red) + , green(p.green) + , blue(p.blue) + , alpha(a) + { + } + + SIMD_INLINE Rgba32::Rgba32(const Rgba32& p) + : red(p.red) + , green(p.green) + , blue(p.blue) + , alpha(p.alpha) + { + } + + template class A> SIMD_INLINE const Rgba32& Rgba32::At(const View& view, ptrdiff_t col, ptrdiff_t row) + { + assert(view.format == View::Rgba32); + + return Simd::At(view, col, row); + } + + template class A> SIMD_INLINE Rgba32& Rgba32::At(View& view, ptrdiff_t col, ptrdiff_t row) + { + assert(view.format == View::Rgba32); + + return Simd::At(view, col, row); + } } } diff --git a/3rdparty/simdlib/Simd/SimdPow.h b/3rdparty/simdlib/Simd/SimdPow.h old mode 100644 new mode 100755 index 309e3104f0..ca0db18eb5 --- a/3rdparty/simdlib/Simd/SimdPow.h +++ b/3rdparty/simdlib/Simd/SimdPow.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/3rdparty/simdlib/Simd/SimdResizer.h b/3rdparty/simdlib/Simd/SimdResizer.h old mode 100644 new mode 100755 index 0a70ee0ad6..15dacfcd0c --- a/3rdparty/simdlib/Simd/SimdResizer.h +++ b/3rdparty/simdlib/Simd/SimdResizer.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -46,6 +46,43 @@ namespace Simd this->channels = channels; this->align = align; } + + bool IsByteBilinear() const + { + return type == SimdResizeChannelByte && method == SimdResizeMethodBilinear; + } + + bool IsByteArea() const + { + return type == SimdResizeChannelByte && method == SimdResizeMethodArea; + } + + bool IsShortBilinear() const + { + return type == SimdResizeChannelShort && method == SimdResizeMethodBilinear; + } + + bool IsFloatBilinear() const + { + return type == SimdResizeChannelFloat && + (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp || method == SimdResizeMethodInferenceEngineInterp); + } + + bool IsNearest() const + { + return method == SimdResizeMethodNearest; + } + + size_t ChannelSize() const + { + static const size_t sizes[3] = { 1, 2, 4 }; + return sizes[(int)type]; + } + + size_t PixelSize() const + { + return ChannelSize() * channels; + } }; class Resizer : Deletable @@ -94,13 +131,32 @@ namespace Simd virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); }; + class ResizerShortBilinear : public Resizer + { + protected: + Array32i _ix, _iy; + Array32f _ax, _ay, _bx[2]; + + void EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t* indices, float* alphas); + + template void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + template void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + + virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + + public: + ResizerShortBilinear(const ResParam& param); + + virtual void Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride); + }; + class ResizerFloatBilinear : public Resizer { protected: Array32i _ix, _iy; Array32f _ax, _ay, _bx[2]; 
- void EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, bool caffeInterp, int32_t * indices, float * alphas); + void EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t * indices, float * alphas); virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride); @@ -110,22 +166,23 @@ namespace Simd virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); }; - void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); - } - -#ifdef SIMD_SSE_ENABLE - namespace Sse - { - class ResizerFloatBilinear : public Base::ResizerFloatBilinear + class ResizerNearest : public Resizer { - virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride); + void Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride); + template void Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride); + protected: + size_t _pixelSize; + Array32i _ix, _iy; + + void EstimateIndex(size_t srcSize, size_t dstSize, size_t pixelSize, int32_t* indices); public: - ResizerFloatBilinear(const ResParam & param); - }; + ResizerNearest(const ResParam& param); + virtual void Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride); + }; + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); } -#endif //SIMD_SSE_ENABLE #ifdef SIMD_SSE2_ENABLE namespace Sse2 @@ -156,12 +213,19 @@ namespace Simd virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); }; + class ResizerFloatBilinear : public Base::ResizerFloatBilinear + { + virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride); + public: + ResizerFloatBilinear(const ResParam & param); + }; + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, 
size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); } #endif //SIMD_SSE2_ENABLE -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { class ResizerByteBilinear : public Sse2::ResizerByteBilinear { @@ -183,15 +247,8 @@ namespace Simd ResizerByteBilinear(const ResParam & param); virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); - }; - - void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); - } -#endif //SIMD_SSSE3_ENABLE - -#ifdef SIMD_SSE41_ENABLE - namespace Sse41 - { + }; + class ResizerByteArea : public Sse2::ResizerByteArea { protected: @@ -202,6 +259,17 @@ namespace Simd virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); }; + class ResizerShortBilinear : public Base::ResizerShortBilinear + { + protected: + template void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + template void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + + virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + public: + ResizerShortBilinear(const ResParam& param); + }; + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); } #endif //SIMD_SSE41_ENABLE @@ -223,15 +291,7 @@ namespace Simd #ifdef SIMD_AVX2_ENABLE namespace Avx2 { - template SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t * src, const Idx & index, const uint8_t * alpha, uint8_t * dst) - { - __m256i _src = _mm256_loadu_si256((__m256i*)(src + index.src)); - __m256i _shuffle = _mm256_loadu_si256((__m256i*)&index.shuffle); - __m256i _alpha = _mm256_loadu_si256((__m256i*)(alpha + index.dst)); - _mm256_storeu_si256((__m256i*)(dst + index.dst), 
_mm256_maddubs_epi16(Avx2::Shuffle(_src, _shuffle), _alpha)); - } - - class ResizerByteBilinear : public Ssse3::ResizerByteBilinear + class ResizerByteBilinear : public Sse41::ResizerByteBilinear { protected: struct Idx @@ -260,6 +320,17 @@ namespace Simd virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); }; + class ResizerShortBilinear : public Sse41::ResizerShortBilinear + { + protected: + template void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + template void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + + virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + public: + ResizerShortBilinear(const ResParam& param); + }; + class ResizerFloatBilinear : public Base::ResizerFloatBilinear { virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride); @@ -308,6 +379,17 @@ namespace Simd virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); }; + class ResizerShortBilinear : public Base::ResizerShortBilinear + { + protected: + template void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + template void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + + virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride); + public: + ResizerShortBilinear(const ResParam& param); + }; + class ResizerFloatBilinear : public Base::ResizerFloatBilinear { virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride); diff --git a/3rdparty/simdlib/Simd/SimdResizerCommon.h b/3rdparty/simdlib/Simd/SimdResizerCommon.h new file mode 100755 index 0000000000..3e6ab00ffa --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdResizerCommon.h @@ -0,0 +1,97 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdResizerCommon_h__ +#define __SimdResizerCommon_h__ + +#include "Simd/SimdLoad.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + const __m128i RSB_1_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x4, 0x5, -1, -1, 0x8, 0x9, -1, -1, 0xC, 0xD, -1, -1); + const __m128i RSB_1_1 = SIMD_MM_SETR_EPI8(0x2, 0x3, -1, -1, 0x6, 0x7, -1, -1, 0xA, 0xB, -1, -1, 0xE, 0xF, -1, -1); + + SIMD_INLINE __m128 BilColS1(const uint16_t* src, const int32_t* idx, __m128 fx0, __m128 fx1) + { + __m128i s = _mm_setr_epi32( + *(uint32_t*)(src + idx[0]), *(uint32_t*)(src + idx[1]), + *(uint32_t*)(src + idx[2]), *(uint32_t*)(src + idx[3])); + __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_1_0))); + __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_1_1))); + return _mm_add_ps(m0, m1); + } + + const __m128i RSB_2_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x2, 0x3, -1, -1, 0x8, 0x9, -1, -1, 0xA, 0xB, -1, -1); + const __m128i RSB_2_1 = SIMD_MM_SETR_EPI8(0x4, 0x5, -1, -1, 0x6, 0x7, -1, -1, 0xC, 0xD, -1, -1, 0xE, 0xF, -1, -1); + + SIMD_INLINE __m128 BilColS2(const uint16_t* src, const int32_t* idx, __m128 fx0, __m128 fx1) + { + __m128i s = Sse2::Load((__m128i*)(src + idx[0]), (__m128i*)(src + idx[2])); + __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_2_0))); + __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_2_1))); + return _mm_add_ps(m0, m1); + } + + const __m128i RSB_3_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x2, 0x3, -1, -1, 0x4, 0x5, -1, -1, -1, -1, -1, -1); + const __m128i RSB_3_1 = SIMD_MM_SETR_EPI8(0x6, 0x7, -1, -1, 0x8, 0x9, -1, -1, 0xA, 0xB, -1, -1, -1, -1, -1, -1); + + SIMD_INLINE __m128 BilColS3(const uint16_t* src, __m128 fx0, __m128 fx1) + { + __m128i s = _mm_loadu_si128((__m128i*)src); + __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_3_0))); + __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_3_1))); + return _mm_add_ps(m0, m1); + } + + 
const __m128i RSB_4_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x2, 0x3, -1, -1, 0x4, 0x5, -1, -1, 0x6, 0x7, -1, -1); + const __m128i RSB_4_1 = SIMD_MM_SETR_EPI8(0x8, 0x9, -1, -1, 0xA, 0xB, -1, -1, 0xC, 0xD, -1, -1, 0xE, 0xF, -1, -1); + + SIMD_INLINE __m128 BilColS4(const uint16_t* src, __m128 fx0, __m128 fx1) + { + __m128i s = _mm_loadu_si128((__m128i*)src); + __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_4_0))); + __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_4_1))); + return _mm_add_ps(m0, m1); + } + + const __m128i RSB_3_P = SIMD_MM_SETR_EPI8(0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, -1, -1, -1, -1); + } +#endif //SIMD_SSE41_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + template SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t * src, const Idx & index, const uint8_t * alpha, uint8_t * dst) + { + __m256i _src = _mm256_loadu_si256((__m256i*)(src + index.src)); + __m256i _shuffle = _mm256_loadu_si256((__m256i*)&index.shuffle); + __m256i _alpha = _mm256_loadu_si256((__m256i*)(alpha + index.dst)); + _mm256_storeu_si256((__m256i*)(dst + index.dst), _mm256_maddubs_epi16(Avx2::Shuffle(_src, _shuffle), _alpha)); + } + } +#endif //SIMD_AVX2_ENABLE +} +#endif//__SimdResizerCommon_h__ diff --git a/3rdparty/simdlib/Simd/SimdRuntime.h b/3rdparty/simdlib/Simd/SimdRuntime.h old mode 100644 new mode 100755 index 5fb82ebd00..de098cdb94 --- a/3rdparty/simdlib/Simd/SimdRuntime.h +++ b/3rdparty/simdlib/Simd/SimdRuntime.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -54,10 +54,13 @@ namespace Simd if (!_info.empty()) { std::sort(_candidates.begin(), _candidates.end(), [](const Candidate & a, const Candidate & b) { return a.Mean() < b.Mean(); }); - std::cout << std::setprecision(3) << std::fixed; std::cout << "Simd::Runtime " << _info << " : "; + int64_t f = TimeFrequency(); for (size_t i = 0; i < _candidates.size(); ++i) - std::cout << _candidates[i].func.Name() << ": " << _candidates[i].Mean()*1000.0 << " "; + { + int64_t t = _candidates[i].Mean(); + std::cout << _candidates[i].func.Name() << ": " << t * 1000 / f << "." << (t * 1000000 / f) % 1000 << " "; + } std::cout << std::endl; } #endif @@ -104,18 +107,18 @@ namespace Simd { Func func; size_t count; - double sum, min, max; + int64_t sum, min, max; SIMD_INLINE Candidate(const Func & f) : func(f) , count(0) , sum(0) - , min(std::numeric_limits::max()) - , max(std::numeric_limits::min()) + , min(std::numeric_limits::max()) + , max(0) { } - SIMD_INLINE void Update(const double & value) + SIMD_INLINE void Update(int64_t value) { count += 1; sum += value; @@ -123,9 +126,14 @@ namespace Simd max = std::max(max, value); } - SIMD_INLINE double Mean() const + SIMD_INLINE int64_t Mean() const { - return (sum - min - max) / (count - 2); + if( count > 2) + return (sum - min - max) / (count - 2); + else if (count > 0) + return sum / count; + else + return sum; } }; typedef std::vector Candidates; @@ -144,9 +152,9 @@ namespace Simd if (_info.empty()) _info = current->func.Info(args); #endif - double start = Simd::Time(); + int64_t start = Simd::TimeCounter(); current->func.Run(args); - current->Update(Simd::Time() - start); + current->Update(Simd::TimeCounter() - start); } else { @@ -173,10 +181,10 @@ namespace Simd SIMD_INLINE Candidate * Best() { Candidate * best = &_candidates[0]; - double min = best->Mean(); + int64_t 
min = best->Mean(); for (size_t i = 1; i < _candidates.size(); ++i) { - double mean = _candidates[i].Mean(); + int64_t mean = _candidates[i].Mean(); if (mean < min) { min = mean; diff --git a/3rdparty/simdlib/Simd/SimdSet.h b/3rdparty/simdlib/Simd/SimdSet.h old mode 100644 new mode 100755 index ae1bb6066a..22b5622e73 --- a/3rdparty/simdlib/Simd/SimdSet.h +++ b/3rdparty/simdlib/Simd/SimdSet.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2018 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -109,6 +109,12 @@ namespace Simd const float a[4] = { a0, a1, a2, a3 }; return vld1q_f32(a); } + + SIMD_INLINE int32x4_t SetI32(int32_t a0, int32_t a1, int32_t a2, int32_t a3) + { + const int32_t a[4] = { a0, a1, a2, a3 }; + return vld1q_s32(a); + } } #endif// SIMD_NEON_ENABLE } diff --git a/3rdparty/simdlib/Simd/SimdSse1Resizer.cpp b/3rdparty/simdlib/Simd/SimdSse1Resizer.cpp deleted file mode 100644 index 405ee03f4f..0000000000 --- a/3rdparty/simdlib/Simd/SimdSse1Resizer.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdResizer.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_SSE_ENABLE - namespace Sse - { - ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param) - : Base::ResizerFloatBilinear(param) - { - } - - void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) - { - size_t cn = _param.channels; - size_t rs = _param.dstW * cn; - float * pbx[2] = { _bx[0].data, _bx[1].data }; - int32_t prev = -2; - size_t rsa = AlignLo(rs, Sse::F); - for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) - { - float fy1 = _ay[dy]; - float fy0 = 1.0f - fy1; - int32_t sy = _iy[dy]; - int32_t k = 0; - - if (sy == prev) - k = 2; - else if (sy == prev + 1) - { - Swap(pbx[0], pbx[1]); - k = 1; - } - - prev = sy; - - for (; k < 2; k++) - { - float * pb = pbx[k]; - const float * ps = src + (sy + k)*srcStride; - size_t dx = 0; - if (cn == 1) - { - __m128 _1 = _mm_set1_ps(1.0f); - for (; dx < rsa; dx += Sse::F) - { - __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); - __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); - __m128 fx1 = _mm_load_ps(_ax.data + dx); - __m128 fx0 = _mm_sub_ps(_1, fx1); - __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88)); - __m128 m1 = _mm_mul_ps(fx1, _mm_shuffle_ps(s01, s23, 0xDD)); - _mm_store_ps(pb + dx, _mm_add_ps(m0, m1)); - } - } - if (cn == 3 && rs > 3) - { - __m128 _1 = _mm_set1_ps(1.0f); - size_t rs3 = rs - 3; - for (; dx < rs3; dx += 
3) - { - __m128 s0 = _mm_loadu_ps(ps + _ix[dx] + 0); - __m128 s1 = _mm_loadu_ps(ps + _ix[dx] + 3); - __m128 fx1 = _mm_set1_ps(_ax.data[dx]); - __m128 fx0 = _mm_sub_ps(_1, fx1); - _mm_storeu_ps(pb + dx, _mm_add_ps(_mm_mul_ps(fx0, s0), _mm_mul_ps(fx1, s1))); - } - } - for (; dx < rs; dx++) - { - int32_t sx = _ix[dx]; - float fx = _ax[dx]; - pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + cn] * fx; - } - } - - size_t dx = 0; - __m128 _fy0 = _mm_set1_ps(fy0); - __m128 _fy1 = _mm_set1_ps(fy1); - for (; dx < rsa; dx += Sse::F) - { - __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _fy0); - __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _fy1); - _mm_storeu_ps(dst + dx, _mm_add_ps(m0, m1)); - } - for (; dx < rs; dx++) - dst[dx] = pbx[0][dx] * fy0 + pbx[1][dx] * fy1; - } - } - - //--------------------------------------------------------------------- - - void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) - { - ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m128)); - if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp)) - return new ResizerFloatBilinear(param); - else - return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - } - } -#endif //SIMD_SSE_ENABLE -} - diff --git a/3rdparty/simdlib/Simd/SimdSse2.h b/3rdparty/simdlib/Simd/SimdSse2.h old mode 100644 new mode 100755 index ce304774f5..66a0d22700 --- a/3rdparty/simdlib/Simd/SimdSse2.h +++ b/3rdparty/simdlib/Simd/SimdSse2.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,15 +33,11 @@ namespace Simd { void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride); - void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride); - void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); void BgrToGray(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *gray, size_t grayStride); - void RgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride); - void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); @@ -68,6 +64,8 @@ namespace Simd void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride); + void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); diff --git a/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp b/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp old mode 100644 new mode 100755 index c150220b82..b818225858 --- a/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -88,6 +88,58 @@ namespace Simd else BgraToGray(bgra, width, height, bgraStride, gray, grayStride); } + + //--------------------------------------------------------------------- + + const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); + + SIMD_INLINE __m128i RgbaToGray32(__m128i rgba) + { + const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF); + const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF); + const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_0000), _mm_madd_epi16(r0b0, K16_RED_BLUE)); + return _mm_srli_epi32(_mm_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT); + } + + SIMD_INLINE __m128i RgbaToGray(__m128i rgba[4]) + { + const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1])); + const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); + return _mm_packus_epi16(lo, hi); + } + + template void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) + { + assert(width >= A); + if (align) + assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride)); + + size_t alignedWidth = AlignLo(width, A); + __m128i a[4]; + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + { + Load(rgba + 4 * col, a); + Store((__m128i*)(gray + col), RgbaToGray(a)); + } + if (alignedWidth != width) + { + Load(rgba + 4 * (width - A), a); + Store((__m128i*)(gray + width - A), RgbaToGray(a)); + } + rgba += rgbaStride; + gray += grayStride; + } + } + + void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride) + { + if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && 
Aligned(grayStride)) + RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + else + RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); + } } #else // Work arround to avoid warning: libvisp_simdlib.a(SimdSse2BgraToGray.cpp.o) has no symbols diff --git a/3rdparty/simdlib/Simd/SimdBaseBgraToRgba.cpp b/3rdparty/simdlib/Simd/SimdSse2Cpu.cpp similarity index 62% rename from 3rdparty/simdlib/Simd/SimdBaseBgraToRgba.cpp rename to 3rdparty/simdlib/Simd/SimdSse2Cpu.cpp index 8ada2f6a2c..3d1dfe00fb 100644 --- a/3rdparty/simdlib/Simd/SimdBaseBgraToRgba.cpp +++ b/3rdparty/simdlib/Simd/SimdSse2Cpu.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2020 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -21,30 +21,44 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "Simd/SimdDefs.h" -#include +#include "Simd/SimdEnable.h" +#include "Simd/SimdCpu.h" + +#if defined(_MSC_VER) +#include +#endif namespace Simd { - namespace Base +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { - void BgraToRgba(const uint8_t *bgra, size_t size, uint8_t *rgba) + SIMD_INLINE bool SupportedByCPU() { - for (size_t i = 0; i < size; ++i, bgra += 4, rgba += 4) - { - *(int32_t*)rgba = (*(int32_t*)bgra); - std::swap(rgba[0], rgba[2]); - } + return Base::CheckBit(Cpuid::Ordinary, Cpuid::Edx, Cpuid::SSE2); } - void BgraToRgba(const uint8_t *bgra, size_t width, size_t height, size_t bgraStride, uint8_t *rgba, size_t rgbaStride) + SIMD_INLINE bool SupportedByOS() { - for (size_t row = 0; row < height; ++row) +#if defined(_MSC_VER) + __try + { + __m128d value = _mm_set1_pd(1.0);// try to execute of SSE2 instructions; + return true; + } + __except (EXCEPTION_EXECUTE_HANDLER) { - BgraToRgba(bgra, width, rgba); - bgra += bgraStride; - rgba += rgbaStride; + return false; } +#else + return true; +#endif + } + + bool GetEnable() + { + return SupportedByCPU() && SupportedByOS(); } } +#endif } diff --git a/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp b/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp old mode 100644 new mode 100755 index 394488a804..70e4f139ea --- a/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp +++ b/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "Simd/SimdMemory.h" +#include "Simd/SimdLoadBlock.h" #include "Simd/SimdStore.h" namespace Simd diff --git a/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp b/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp old mode 100644 new mode 100755 index f29d96eeb1..c289ab7f75 --- a/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp +++ b/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -297,12 +297,12 @@ namespace Simd void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) { ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m128i)); - if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && (channels == 1 || channels == 2) && dstX >= A) + if (param.IsByteBilinear() && (channels == 1 || channels == 2) && dstX >= A) return new ResizerByteBilinear(param); - else if (type == SimdResizeChannelByte && method == SimdResizeMethodArea) + else if (param.IsByteArea()) return new ResizerByteArea(param); else - return Sse::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); } } #else diff --git a/3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp b/3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp deleted file mode 100644 index 927dde0dae..0000000000 --- a/3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdSse2.h" - -namespace Simd -{ -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - namespace - { - struct Buffer - { - Buffer(size_t width) - { - _p = Allocate(sizeof(uint8_t) * 4 * width); - rgba = (uint8_t*)_p; - } - - ~Buffer() - { - Free(_p); - } - - uint8_t * rgba; - private: - void *_p; - }; - } - - void RgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride) - { - assert(width >= A); - - Buffer buffer(width); - - for (size_t row = 1; row < height; ++row) - { - Base::BgrToBgra(rgb, width, buffer.rgba, false, false, 0xFF); - Sse2::RgbaToGray(buffer.rgba, width, 1, 4 * width, gray, width); - rgb += rgbStride; - gray += grayStride; - } - Base::BgrToBgra(rgb, width, buffer.rgba, false, true, 0xFF); - Sse2::RgbaToGray(buffer.rgba, width, 1, 4 * width, gray, width); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSse2RgbToGray.cpp.o) has no symbols - void dummy_SimdSse2RgbToGray(){}; -#endif//SIMD_SSE2_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp deleted file mode 100644 index 884f09924b..0000000000 --- a/3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp +++ /dev/null @@ -1,96 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); - const __m128i K16_GREEN_0000 = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, 0x0000); - const __m128i K32_ROUND_TERM = SIMD_MM_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m128i RgbaToGray32(__m128i rgba) - { - const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF); - const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF); - const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_0000), _mm_madd_epi16(r0b0, K16_RED_BLUE)); - return _mm_srli_epi32(_mm_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - SIMD_INLINE __m128i RgbaToGray(__m128i rgba[4]) - { - const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1])); - const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); - return _mm_packus_epi16(lo, hi); - } - - template SIMD_INLINE void Load(const uint8_t* p, __m128i a[4]) - { - a[0] = Load((__m128i*)p + 0); - a[1] = Load((__m128i*)p + 1); - a[2] = Load((__m128i*)p + 2); - a[3] = Load((__m128i*)p + 3); - } - - template void RgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride) - { - assert(width >= A); - if (align) - assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, A); - __m128i a[4]; - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - Load(rgba + 4 * col, a); - Store((__m128i*)(gray + col), RgbaToGray(a)); - } - if (alignedWidth != width) - { - Load(rgba + 4 * (width - A), a); - Store((__m128i*)(gray + width - A), RgbaToGray(a)); - } - rgba += rgbaStride; - gray += grayStride; - } - } - - void RgbaToGray(const uint8_t 
*rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride) - { - if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride)) - RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); - else - RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSse2RgbaToGray.cpp.o) has no symbols - void dummy_SimdSse2RgbaToGray(){}; -#endif// SIMD_SSE2_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdSse41.h b/3rdparty/simdlib/Simd/SimdSse41.h new file mode 100755 index 0000000000..958fc11bc5 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdSse41.h @@ -0,0 +1,76 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdSse41_h__ +#define __SimdSse41_h__ + +#include "Simd/SimdDefs.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + void BgraToBgr(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* bgr, size_t bgrStride); + + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride); + + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride); + + void BgrToBgra(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + + void BgrToGray(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* gray, size_t grayStride); + + void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride); + + void DeinterleaveBgr(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride); + + void DeinterleaveBgra(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height, uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride); + + void GaussianBlur3x3(const uint8_t* src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t* dst, size_t dstStride); + + void GrayToBgr(const uint8_t* gray, size_t width, size_t height, size_t grayStride, uint8_t* bgr, size_t bgrStride); + + void InterleaveBgr(const uint8_t* b, size_t bStride, const uint8_t* g, size_t gStride, const uint8_t* r, size_t rStride, size_t width, size_t height, uint8_t* bgr, size_t bgrStride); + + void InterleaveBgra(const uint8_t* b, size_t bStride, const uint8_t* g, size_t gStride, const uint8_t* r, size_t rStride, const uint8_t* a, size_t aStride, size_t width, size_t height, uint8_t* bgra, size_t bgraStride); + + void ReduceColor2x2(const uint8_t* src, 
size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); + + void ReduceGray2x2(const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride); + + void ReduceGray4x4(const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride); + + void ResizeBilinear(const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); + + void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); + } +#endif// SIMD_SSE41_ENABLE +} +#endif//__SimdSse41_h__ diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdSse41BgrToBgra.cpp old mode 100644 new mode 100755 similarity index 57% rename from 3rdparty/simdlib/Simd/SimdSsse3BgrToBgra.cpp rename to 3rdparty/simdlib/Simd/SimdSse41BgrToBgra.cpp index 2c7f277758..65787e1a45 --- a/3rdparty/simdlib/Simd/SimdSsse3BgrToBgra.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41BgrToBgra.cpp @@ -1,74 +1,111 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - template SIMD_INLINE void BgrToBgra(const uint8_t * bgr, uint8_t * bgra, __m128i alpha, __m128i shuffle) - { - Store((__m128i*)bgra + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 0)), shuffle))); - Store((__m128i*)bgra + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 12)), shuffle))); - Store((__m128i*)bgra + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 24)), shuffle))); - Store((__m128i*)bgra + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(bgr + 32)), 4), shuffle))); - } - - template void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - assert(width >= A); - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t alignedWidth = AlignLo(width, A); - - __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3); - __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BgrToBgra(bgr + 3 * col, bgra + 4 * col, _alpha, _shuffle); - if (width != alignedWidth) - BgrToBgra(bgr + 3 * (width - A), bgra + 4 * (width - A), _alpha, _shuffle); - bgr += bgrStride; - bgra += bgraStride; - } - } - - void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); - else - BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToBgra.cpp.o) has no symbols - void 
dummy_SimdSsse3BgrToBgra(){}; -#endif// SIMD_SSSE3_ENABLE -} +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdMemory.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + template SIMD_INLINE void BgrToBgra(const uint8_t * bgr, uint8_t * bgra, __m128i alpha, __m128i shuffle) + { + Store((__m128i*)bgra + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 0)), shuffle))); + Store((__m128i*)bgra + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 12)), shuffle))); + Store((__m128i*)bgra + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 24)), shuffle))); + Store((__m128i*)bgra + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(bgr + 32)), 4), shuffle))); + } + + template void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); + + size_t alignedWidth = AlignLo(width, A); + + __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3); + __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + BgrToBgra(bgr + 3 * col, bgra + 4 * col, _alpha, _shuffle); + if (width != alignedWidth) + BgrToBgra(bgr + 3 * (width - A), bgra + 4 * (width - A), _alpha, _shuffle); + bgr += bgrStride; + bgra += bgraStride; + } + } + + void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) + BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); + else + BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha); + } + + //--------------------------------------------------------------------- + + template SIMD_INLINE void RgbToBgra(const 
uint8_t* rgb, uint8_t* bgra, __m128i alpha, __m128i shuffle) + { + Store((__m128i*)bgra + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(rgb + 0)), shuffle))); + Store((__m128i*)bgra + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(rgb + 12)), shuffle))); + Store((__m128i*)bgra + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(rgb + 24)), shuffle))); + Store((__m128i*)bgra + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(rgb + 32)), 4), shuffle))); + } + + template void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t alignedWidth = AlignLo(width, A); + + __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3); + __m128i _shuffle = _mm_setr_epi8(0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + RgbToBgra(rgb + 3 * col, bgra + 4 * col, _alpha, _shuffle); + if (width != alignedWidth) + RgbToBgra(rgb + 3 * (width - A), bgra + 4 * (width - A), _alpha, _shuffle); + rgb += rgbStride; + bgra += bgraStride; + } + } + + void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)) + RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + else + RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToGray.cpp b/3rdparty/simdlib/Simd/SimdSse41BgrToGray.cpp old mode 100644 new mode 100755 similarity index 56% rename from 3rdparty/simdlib/Simd/SimdSsse3BgrToGray.cpp rename to 3rdparty/simdlib/Simd/SimdSse41BgrToGray.cpp index 
224a87bbce..b089e35631 --- a/3rdparty/simdlib/Simd/SimdSsse3BgrToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41BgrToGray.cpp @@ -1,93 +1,148 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - const __m128i K16_BLUE_RED = SIMD_MM_SET2_EPI16(Base::BLUE_TO_GRAY_WEIGHT, Base::RED_TO_GRAY_WEIGHT); - const __m128i K16_GREEN_ROUND = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m128i BgraToGray32(__m128i bgra) - { - const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(bgra, 1), K16_00FF); - const __m128i b0r0 = _mm_and_si128(bgra, K16_00FF); - const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(b0r0, K16_BLUE_RED)); - return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - SIMD_INLINE __m128i BgraToGray(__m128i bgra[4]) - { - const __m128i lo = _mm_packs_epi32(BgraToGray32(bgra[0]), BgraToGray32(bgra[1])); - const __m128i hi = _mm_packs_epi32(BgraToGray32(bgra[2]), BgraToGray32(bgra[3])); - return _mm_packus_epi16(lo, hi); - } - - template SIMD_INLINE __m128i BgrToGray(const uint8_t * bgr, __m128i shuffle) - { - __m128i bgra[4]; - bgra[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(bgr + 0)), shuffle)); - bgra[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(bgr + 12)), shuffle)); - bgra[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(bgr + 24)), shuffle)); - bgra[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(bgr + 32)), 4), shuffle)); - return BgraToGray(bgra); - } - - template void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) - { - assert(width >= A); - if (align) - assert(Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t alignedWidth = AlignLo(width, A); - - __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); - - for (size_t row = 0; row < height; ++row) - { - for 
(size_t col = 0; col < alignedWidth; col += A) - Store((__m128i*)(gray + col), BgrToGray(bgr + 3 * col, _shuffle)); - if (width != alignedWidth) - Store((__m128i*)(gray + width - A), BgrToGray(bgr + 3 * (width - A), _shuffle)); - bgr += bgrStride; - gray += grayStride; - } - } - - void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) - { - if (Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - else - BgrToGray(bgr, width, height, bgrStride, gray, grayStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToGray.cpp.o) has no symbols - void dummy_SimdSsse3BgrToGray(){}; -#endif// SIMD_SSSE3_ENABLE -} +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdMemory.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + const __m128i K16_BLUE_RED = SIMD_MM_SET2_EPI16(Base::BLUE_TO_GRAY_WEIGHT, Base::RED_TO_GRAY_WEIGHT); + const __m128i K16_GREEN_ROUND = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM); + + SIMD_INLINE __m128i BgraToGray32(__m128i bgra) + { + const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(bgra, 1), K16_00FF); + const __m128i b0r0 = _mm_and_si128(bgra, K16_00FF); + const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(b0r0, K16_BLUE_RED)); + return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT); + } + + SIMD_INLINE __m128i BgraToGray(__m128i bgra[4]) + { + const __m128i lo = _mm_packs_epi32(BgraToGray32(bgra[0]), BgraToGray32(bgra[1])); + const __m128i hi = _mm_packs_epi32(BgraToGray32(bgra[2]), BgraToGray32(bgra[3])); + return _mm_packus_epi16(lo, hi); + } + + template SIMD_INLINE __m128i BgrToGray(const uint8_t * bgr, __m128i shuffle) + { + __m128i bgra[4]; + bgra[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(bgr + 0)), shuffle)); + bgra[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(bgr + 12)), shuffle)); + bgra[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(bgr + 24)), shuffle)); + bgra[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(bgr + 32)), 4), shuffle)); + return BgraToGray(bgra); + } + + template void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) + { + assert(width >= A); + if (align) + assert(Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride)); + + size_t alignedWidth = AlignLo(width, A); + + __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); + + for (size_t row = 0; row < height; ++row) + { + for 
(size_t col = 0; col < alignedWidth; col += A) + Store((__m128i*)(gray + col), BgrToGray(bgr + 3 * col, _shuffle)); + if (width != alignedWidth) + Store((__m128i*)(gray + width - A), BgrToGray(bgr + 3 * (width - A), _shuffle)); + bgr += bgrStride; + gray += grayStride; + } + } + + void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) + { + if (Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride)) + BgrToGray(bgr, width, height, bgrStride, gray, grayStride); + else + BgrToGray(bgr, width, height, bgrStride, gray, grayStride); + } + + //--------------------------------------------------------------------- + + const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); + + SIMD_INLINE __m128i RgbaToGray32(__m128i rgba) + { + const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF); + const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF); + const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(r0b0, K16_RED_BLUE)); + return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT); + } + + SIMD_INLINE __m128i RgbaToGray(__m128i rgba[4]) + { + const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1])); + const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); + return _mm_packus_epi16(lo, hi); + } + + template SIMD_INLINE __m128i RgbToGray(const uint8_t* rgb, __m128i shuffle) + { + __m128i rgba[4]; + rgba[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(rgb + 0)), shuffle)); + rgba[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(rgb + 12)), shuffle)); + rgba[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(rgb + 24)), shuffle)); + rgba[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(rgb + 32)), 4), shuffle)); + return RgbaToGray(rgba); + } + + template void 
RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) + { + assert(width >= A); + if (align) + assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t alignedWidth = AlignLo(width, A); + + __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + Store((__m128i*)(gray + col), RgbToGray(rgb + 3 * col, _shuffle)); + if (width != alignedWidth) + Store((__m128i*)(gray + width - A), RgbToGray(rgb + 3 * (width - A), _shuffle)); + rgb += rgbStride; + gray += grayStride; + } + } + + void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride) + { + if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)) + RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + else + RgbToGray(rgb, width, height, rgbStride, gray, grayStride); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdSse41BgrToRgb.cpp old mode 100644 new mode 100755 similarity index 84% rename from 3rdparty/simdlib/Simd/SimdSsse3BgrToRgb.cpp rename to 3rdparty/simdlib/Simd/SimdSse41BgrToRgb.cpp index 0f74b41b91..14a351a5c9 --- a/3rdparty/simdlib/Simd/SimdSsse3BgrToRgb.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41BgrToRgb.cpp @@ -1,83 +1,80 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - const __m128i K8_CVT_00 = SIMD_MM_SETR_EPI8(0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1); - const __m128i K8_CVT_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); - const __m128i K8_CVT_10 = SIMD_MM_SETR_EPI8(-1, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_CVT_11 = SIMD_MM_SETR_EPI8(0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF); - const __m128i K8_CVT_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, -1); - const __m128i K8_CVT_21 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_CVT_22 = SIMD_MM_SETR_EPI8(-1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD); - - template SIMD_INLINE void BgrToRgb(const uint8_t * src, uint8_t * dst) - { - __m128i s0 = Load((__m128i*)src + 0); - __m128i s1 = Load((__m128i*)src + 1); - __m128i s2 = Load((__m128i*)src + 2); - Store((__m128i*)dst + 0, _mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_00), _mm_shuffle_epi8(s1, K8_CVT_01))); - Store((__m128i*)dst + 1, _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_10), _mm_shuffle_epi8(s1, K8_CVT_11)), _mm_shuffle_epi8(s2, K8_CVT_12))); - Store((__m128i*)dst + 2, _mm_or_si128(_mm_shuffle_epi8(s1, K8_CVT_21), _mm_shuffle_epi8(s2, K8_CVT_22))); - } - - template void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)); - - const size_t A3 = A * 3; - size_t size = width * 3; - size_t aligned = AlignLo(width, A) * 3; - - for (size_t row = 0; row < height; ++row) - { - for (size_t i = 0; i < aligned; i += A3) - BgrToRgb(bgr + i, rgb + 
i); - if (aligned < size) - BgrToRgb(bgr + size - A3, rgb + size - A3); - bgr += bgrStride; - rgb += rgbStride; - } - } - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride) - { - if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)) - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); - else - BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToRgb.cpp.o) has no symbols - void dummy_SimdSsse3BgrToRgb(){}; -#endif//SIMD_SSSE3_ENABLE -} +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + const __m128i K8_CVT_00 = SIMD_MM_SETR_EPI8(0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1); + const __m128i K8_CVT_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); + const __m128i K8_CVT_10 = SIMD_MM_SETR_EPI8(-1, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + const __m128i K8_CVT_11 = SIMD_MM_SETR_EPI8(0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF); + const __m128i K8_CVT_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, -1); + const __m128i K8_CVT_21 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + const __m128i K8_CVT_22 = SIMD_MM_SETR_EPI8(-1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD); + + template SIMD_INLINE void BgrToRgb(const uint8_t * src, uint8_t * dst) + { + __m128i s0 = Load((__m128i*)src + 0); + __m128i s1 = Load((__m128i*)src + 1); + __m128i s2 = Load((__m128i*)src + 2); + Store((__m128i*)dst + 0, _mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_00), _mm_shuffle_epi8(s1, K8_CVT_01))); + Store((__m128i*)dst + 1, _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_10), _mm_shuffle_epi8(s1, K8_CVT_11)), _mm_shuffle_epi8(s2, K8_CVT_12))); + Store((__m128i*)dst + 2, _mm_or_si128(_mm_shuffle_epi8(s1, K8_CVT_21), _mm_shuffle_epi8(s2, K8_CVT_22))); + } + + template void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)); + + const size_t A3 = A * 3; + size_t size = width * 3; + size_t aligned = AlignLo(width, A) * 3; + + for (size_t row = 0; row < height; ++row) + { + for (size_t i = 0; i < aligned; i += A3) + BgrToRgb(bgr + i, rgb + 
i); + if (aligned < size) + BgrToRgb(bgr + size - A3, rgb + size - A3); + bgr += bgrStride; + rgb += rgbStride; + } + } + + void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride) + { + if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)) + BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); + else + BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdSse41BgraToBgr.cpp old mode 100644 new mode 100755 similarity index 53% rename from 3rdparty/simdlib/Simd/SimdSsse3BgraToBgr.cpp rename to 3rdparty/simdlib/Simd/SimdSse41BgraToBgr.cpp index ccf4c51c97..a3000972e6 --- a/3rdparty/simdlib/Simd/SimdSsse3BgraToBgr.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41BgraToBgr.cpp @@ -1,92 +1,165 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - template SIMD_INLINE void BgraToBgrBody(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2]) - { - Store((__m128i*)(bgr + 0), _mm_shuffle_epi8(Load((__m128i*)bgra + 0), k[0][0])); - Store((__m128i*)(bgr + 12), _mm_shuffle_epi8(Load((__m128i*)bgra + 1), k[0][0])); - Store((__m128i*)(bgr + 24), _mm_shuffle_epi8(Load((__m128i*)bgra + 2), k[0][0])); - Store((__m128i*)(bgr + 36), _mm_shuffle_epi8(Load((__m128i*)bgra + 3), k[0][0])); - } - - template SIMD_INLINE void BgraToBgr(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2]) - { - __m128i bgra0 = Load((__m128i*)bgra + 0); - __m128i bgra1 = Load((__m128i*)bgra + 1); - __m128i bgra2 = Load((__m128i*)bgra + 2); - __m128i bgra3 = Load((__m128i*)bgra + 3); - Store((__m128i*)bgr + 0, _mm_or_si128(_mm_shuffle_epi8(bgra0, k[0][0]), _mm_shuffle_epi8(bgra1, k[0][1]))); - Store((__m128i*)bgr + 1, _mm_or_si128(_mm_shuffle_epi8(bgra1, k[1][0]), _mm_shuffle_epi8(bgra2, k[1][1]))); - Store((__m128i*)bgr + 2, _mm_or_si128(_mm_shuffle_epi8(bgra2, k[2][0]), _mm_shuffle_epi8(bgra3, k[2][1]))); - } - - template void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t alignedWidth = AlignLo(width, A); - if (width == alignedWidth) - alignedWidth -= A; - - __m128i k[3][2]; - k[0][0] = _mm_setr_epi8(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1); - k[0][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
0x0, 0x1, 0x2, 0x4); - k[1][0] = _mm_setr_epi8(0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1); - k[1][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9); - k[2][0] = _mm_setr_epi8(0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - k[2][1] = _mm_setr_epi8(-1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BgraToBgrBody(bgra + 4 * col, bgr + 3 * col, k); - if (width != alignedWidth) - BgraToBgr(bgra + 4 * (width - A), bgr + 3 * (width - A), k); - bgra += bgraStride; - bgr += bgrStride; - } - } - - void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) - { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) - BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); - else - BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgraToBgr.cpp.o) has no symbols - void dummy_SimdSsse3BgraToBgr(){}; -#endif// SIMD_SSSE3_ENABLE -} +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdMemory.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + template SIMD_INLINE void BgraToBgrBody(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2]) + { + Store((__m128i*)(bgr + 0), _mm_shuffle_epi8(Load((__m128i*)bgra + 0), k[0][0])); + Store((__m128i*)(bgr + 12), _mm_shuffle_epi8(Load((__m128i*)bgra + 1), k[0][0])); + Store((__m128i*)(bgr + 24), _mm_shuffle_epi8(Load((__m128i*)bgra + 2), k[0][0])); + Store((__m128i*)(bgr + 36), _mm_shuffle_epi8(Load((__m128i*)bgra + 3), k[0][0])); + } + + template SIMD_INLINE void BgraToBgr(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2]) + { + __m128i bgra0 = Load((__m128i*)bgra + 0); + __m128i bgra1 = Load((__m128i*)bgra + 1); + __m128i bgra2 = Load((__m128i*)bgra + 2); + __m128i bgra3 = Load((__m128i*)bgra + 3); + Store((__m128i*)bgr + 0, _mm_or_si128(_mm_shuffle_epi8(bgra0, k[0][0]), _mm_shuffle_epi8(bgra1, k[0][1]))); + Store((__m128i*)bgr + 1, _mm_or_si128(_mm_shuffle_epi8(bgra1, k[1][0]), _mm_shuffle_epi8(bgra2, k[1][1]))); + Store((__m128i*)bgr + 2, _mm_or_si128(_mm_shuffle_epi8(bgra2, k[2][0]), _mm_shuffle_epi8(bgra3, k[2][1]))); + } + + template void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); + + size_t alignedWidth = AlignLo(width, A); + if (width == alignedWidth) + 
alignedWidth -= A; + + __m128i k[3][2]; + k[0][0] = _mm_setr_epi8(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1); + k[0][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4); + k[1][0] = _mm_setr_epi8(0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1); + k[1][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9); + k[2][0] = _mm_setr_epi8(0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + k[2][1] = _mm_setr_epi8(-1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + BgraToBgrBody(bgra + 4 * col, bgr + 3 * col, k); + if (width != alignedWidth) + BgraToBgr(bgra + 4 * (width - A), bgr + 3 * (width - A), k); + bgra += bgraStride; + bgr += bgrStride; + } + } + + void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)) + BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); + else + BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride); + } + + //--------------------------------------------------------------------- + + template void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)); + + size_t alignedWidth = AlignLo(width, A); + if (width == alignedWidth) + alignedWidth -= A; + + __m128i k[3][2]; + k[0][0] = _mm_setr_epi8(0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1); + k[0][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x1, 0x0, 0x6); + k[1][0] = _mm_setr_epi8(0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1, 
-1, -1, -1, -1); + k[1][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9); + k[2][0] = _mm_setr_epi8(0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + k[2][1] = _mm_setr_epi8(-1, -1, -1, -1, 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC); + + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + BgraToBgrBody(bgra + 4 * col, rgb + 3 * col, k); + if (width != alignedWidth) + BgraToBgr(bgra + 4 * (width - A), rgb + 3 * (width - A), k); + bgra += bgraStride; + rgb += rgbStride; + } + } + + void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride)) + BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); + else + BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride); + } + + //--------------------------------------------------------------------- + + const __m128i K8_BGRA_TO_RGBA = SIMD_MM_SETR_EPI8(0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF); + + template SIMD_INLINE void BgraToRgba(const uint8_t* bgra, uint8_t* rgba) + { + Store((__m128i*)rgba, _mm_shuffle_epi8(Load((__m128i*)bgra), K8_BGRA_TO_RGBA)); + } + + template void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride)); + + size_t size = width * 4; + size_t sizeA = AlignLo(size, A); + + for (size_t row = 0; row < height; ++row) + { + for (size_t i = 0; i < size; i += A) + BgraToRgba(bgra + i, rgba + i); + if (size != sizeA) + BgraToRgba(bgra + size - sizeA, rgba + size - sizeA); + bgra += bgraStride; + rgba += rgbaStride; + } + } + + void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, 
uint8_t* rgba, size_t rgbaStride) + { + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride)) + BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + else + BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdBaseRgbToGray.cpp b/3rdparty/simdlib/Simd/SimdSse41Cpu.cpp similarity index 54% rename from 3rdparty/simdlib/Simd/SimdBaseRgbToGray.cpp rename to 3rdparty/simdlib/Simd/SimdSse41Cpu.cpp index 6ac7f88791..9b5719ce97 100644 --- a/3rdparty/simdlib/Simd/SimdBaseRgbToGray.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41Cpu.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -21,23 +21,47 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "Simd/SimdConversion.h" +#include "Simd/SimdEnable.h" +#include "Simd/SimdCpu.h" + +#if defined(_MSC_VER) +#include +#endif namespace Simd { - namespace Base +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { - void RgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride) + SIMD_INLINE bool SupportedByCPU() { - for (size_t row = 0; row < height; ++row) + return + Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE41) && + Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE42); + } + + SIMD_INLINE bool SupportedByOS() + { +#if defined(_MSC_VER) + __try { - const uint8_t * pRgb = rgb + row*rgbStride; - uint8_t * pGray = gray + row*grayStride; - for (const uint8_t *pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgb += 3) - { - *pGray = RgbToGray(pRgb[0], pRgb[1], pRgb[2]); - } + int value = _mm_testz_si128(_mm_set1_epi8(0), _mm_set1_epi8(-1)); // try to execute of SSE41 instructions; + uint32_t crc = _mm_crc32_u8(0, 1); // try to execute of SSE42 instructions; + return true; } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } +#else + return true; +#endif + } + + bool GetEnable() + { + return SupportedByCPU() && SupportedByOS(); } } +#endif } diff --git a/3rdparty/simdlib/Simd/SimdSsse3Deinterleave.cpp b/3rdparty/simdlib/Simd/SimdSse41Deinterleave.cpp similarity index 74% rename from 3rdparty/simdlib/Simd/SimdSsse3Deinterleave.cpp rename to 3rdparty/simdlib/Simd/SimdSse41Deinterleave.cpp index 45ff364d03..68ae14efc5 100644 --- a/3rdparty/simdlib/Simd/SimdSsse3Deinterleave.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41Deinterleave.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,8 +27,8 @@ namespace Simd { -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { template SIMD_INLINE void DeinterleaveBgr(const uint8_t * bgr, uint8_t * b, uint8_t * g, uint8_t * r, size_t offset) { @@ -69,9 +69,11 @@ namespace Simd DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride); } + //--------------------------------------------------------------------- + const __m128i K8_SHUFFLE_BGRA = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF); - template SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset) + template SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset) { __m128i _bgra[4]; _bgra[0] = _mm_shuffle_epi8(Load((__m128i*)bgra + 0), K8_SHUFFLE_BGRA); @@ -89,7 +91,8 @@ namespace Simd __m128i rraa1 = _mm_unpackhi_epi32(_bgra[2], _bgra[3]); Store((__m128i*)(r + offset), _mm_unpacklo_epi64(rraa0, rraa1)); - Store((__m128i*)(a + offset), _mm_unpackhi_epi64(rraa0, rraa1)); + if(alpha) + Store((__m128i*)(a + offset), _mm_unpackhi_epi64(rraa0, rraa1)); } template void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, @@ -99,36 +102,51 @@ namespace Simd if (align) { assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride)); - assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)); + assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL)); } size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) + if (a) { - for (size_t col = 0; col < alignedWidth; 
col += A) - DeinterleaveBgra(bgra + col * 4, b, g, r, a, col); - if (width != alignedWidth) - DeinterleaveBgra(bgra + 4 * (width - A), b, g, r, a, width - A); - bgra += bgraStride; - b += bStride; - g += gStride; - r += rStride; - a += aStride; + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + DeinterleaveBgra(bgra + col * 4, b, g, r, a, col); + if (width != alignedWidth) + DeinterleaveBgra(bgra + 4 * (width - A), b, g, r, a, width - A); + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; + a += aStride; + } + } + else + { + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + DeinterleaveBgra(bgra + col * 4, b, g, r, NULL, col); + if (width != alignedWidth) + DeinterleaveBgra(bgra + 4 * (width - A), b, g, r, NULL, width - A); + bgra += bgraStride; + b += bStride; + g += gStride; + r += rStride; + } } } void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride) { - if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride)) + if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && + Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL)) DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); else DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride); } } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Deinterleave.cpp.o) has no symbols - void dummy_SimdSsse3Deinterleave(){}; -#endif// SIMD_SSSE3_ENABLE +#endif } diff --git a/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp 
b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp old mode 100644 new mode 100755 index bacd2f7d91..73334c635d --- a/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2020 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "Simd/SimdMemory.h" +#include "Simd/SimdLoadBlock.h" #include "Simd/SimdStore.h" #include "Simd/SimdGaussianBlur.h" diff --git a/3rdparty/simdlib/Simd/SimdSsse3GaussianBlur3x3.cpp b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur3x3.cpp similarity index 95% rename from 3rdparty/simdlib/Simd/SimdSsse3GaussianBlur3x3.cpp rename to 3rdparty/simdlib/Simd/SimdSse41GaussianBlur3x3.cpp index 74ff76aa8a..11573a696b 100644 --- a/3rdparty/simdlib/Simd/SimdSsse3GaussianBlur3x3.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur3x3.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -22,12 +22,13 @@ * SOFTWARE. 
*/ #include "Simd/SimdMemory.h" +#include "Simd/SimdLoadBlock.h" #include "Simd/SimdStore.h" namespace Simd { -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { namespace { @@ -154,8 +155,5 @@ namespace Simd GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); } } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3GaussianBlur3x3.cpp.o) has no symbols - void dummy_SimdSsse3GaussianBlur3x3(){}; -#endif// SIMD_SSSE3_ENABLE +#endif } diff --git a/3rdparty/simdlib/Simd/SimdSsse3GrayToBgr.cpp b/3rdparty/simdlib/Simd/SimdSse41GrayToBgr.cpp old mode 100644 new mode 100755 similarity index 92% rename from 3rdparty/simdlib/Simd/SimdSsse3GrayToBgr.cpp rename to 3rdparty/simdlib/Simd/SimdSse41GrayToBgr.cpp index 8106f6451a..db79b3e4f0 --- a/3rdparty/simdlib/Simd/SimdSsse3GrayToBgr.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41GrayToBgr.cpp @@ -1,75 +1,72 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - template SIMD_INLINE void GrayToBgr(uint8_t * bgr, __m128i gray) - { - Store((__m128i*)bgr + 0, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR0)); - Store((__m128i*)bgr + 1, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR1)); - Store((__m128i*)bgr + 2, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR2)); - } - - template void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride) - { - assert(width >= A); - if (align) - assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride)); - - size_t alignedWidth = AlignLo(width, A); - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - { - __m128i _gray = Load((__m128i*)(gray + col)); - GrayToBgr(bgr + 3 * col, _gray); - } - if (alignedWidth != width) - { - __m128i _gray = Load((__m128i*)(gray + width - A)); - GrayToBgr(bgr + 3 * (width - A), _gray); - } - gray += grayStride; - bgr += bgrStride; - } - } - - void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride) - { - if (Aligned(bgr) && Aligned(gray) && Aligned(bgrStride) && Aligned(grayStride)) - GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); - else - GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3GrayToBgr.cpp.o) has no symbols - void dummy_SimdSsse3GrayToBgr(){}; -#endif// SIMD_SSSE3_ENABLE -} +/* +* Simd Library (http://ermig1979.github.io/Simd). 
+* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdMemory.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + template SIMD_INLINE void GrayToBgr(uint8_t * bgr, __m128i gray) + { + Store((__m128i*)bgr + 0, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR0)); + Store((__m128i*)bgr + 1, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR1)); + Store((__m128i*)bgr + 2, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR2)); + } + + template void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride) + { + assert(width >= A); + if (align) + assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride)); + + size_t alignedWidth = AlignLo(width, A); + for (size_t row = 0; row < height; ++row) + { + for (size_t col = 0; col < alignedWidth; col += A) + { + __m128i _gray = Load((__m128i*)(gray + col)); + GrayToBgr(bgr + 3 * col, _gray); + } + if (alignedWidth != width) + { + __m128i _gray = Load((__m128i*)(gray + width - A)); + GrayToBgr(bgr + 3 * (width - A), _gray); + } + gray += grayStride; + bgr += bgrStride; + } + } + + void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride) + { + if (Aligned(bgr) && Aligned(gray) && Aligned(bgrStride) && Aligned(grayStride)) + GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); + else + GrayToBgr(gray, width, height, grayStride, bgr, bgrStride); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdSsse3Interleave.cpp b/3rdparty/simdlib/Simd/SimdSse41Interleave.cpp similarity index 96% rename from 3rdparty/simdlib/Simd/SimdSsse3Interleave.cpp rename to 3rdparty/simdlib/Simd/SimdSse41Interleave.cpp index c7213577fd..bb6354405e 100644 --- a/3rdparty/simdlib/Simd/SimdSsse3Interleave.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41Interleave.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
+* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,8 +27,8 @@ namespace Simd { -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { template SIMD_INLINE void InterleaveBgr(const uint8_t * b, const uint8_t * g, const uint8_t * r, size_t offset, uint8_t * bgr) { @@ -124,8 +124,5 @@ namespace Simd InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride); } } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Interleave.cpp.o) has no symbols - void dummy_SimdSsse3Interleave(){}; -#endif// SIMD_SSSE3_ENABLE +#endif } diff --git a/3rdparty/simdlib/Simd/SimdSsse3Reduce.cpp b/3rdparty/simdlib/Simd/SimdSse41Reduce.cpp old mode 100644 new mode 100755 similarity index 96% rename from 3rdparty/simdlib/Simd/SimdSsse3Reduce.cpp rename to 3rdparty/simdlib/Simd/SimdSse41Reduce.cpp index faded50ec7..9905a6f171 --- a/3rdparty/simdlib/Simd/SimdSsse3Reduce.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41Reduce.cpp @@ -1,202 +1,199 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2018 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1) - { - return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2); - } - - SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) - { - return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11)); - } - - template __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11); - - template<> SIMD_INLINE __m128i Average8<1>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) - { - return Average8(s00, s01, s10, s11); - } - - const __m128i K8_RC2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF); - - template<> SIMD_INLINE __m128i Average8<2>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) - { - return Average8(_mm_shuffle_epi8(s00, K8_RC2), _mm_shuffle_epi8(s01, K8_RC2), _mm_shuffle_epi8(s10, K8_RC2), _mm_shuffle_epi8(s11, K8_RC2)); - } - - const __m128i K8_RC4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF); - - template<> SIMD_INLINE __m128i Average8<4>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) - { - return 
Average8(_mm_shuffle_epi8(s00, K8_RC4), _mm_shuffle_epi8(s01, K8_RC4), _mm_shuffle_epi8(s10, K8_RC4), _mm_shuffle_epi8(s11, K8_RC4)); - } - - template SIMD_INLINE void ReduceColor2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - __m128i s00 = Load((__m128i*)src0 + 0); - __m128i s01 = Load((__m128i*)src0 + 1); - __m128i s10 = Load((__m128i*)src1 + 0); - __m128i s11 = Load((__m128i*)src1 + 1); - Store((__m128i*)dst, Average8(s00, s01, s10, s11)); - } - - template void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t evenWidth = AlignLo(srcWidth, 2); - size_t evenSize = evenWidth * channelCount; - size_t alignedSize = AlignLo(evenSize, DA); - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t *src0 = src; - const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); - size_t srcOffset = 0, dstOffset = 0; - for (; srcOffset < alignedSize; srcOffset += DA, dstOffset += A) - ReduceColor2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); - if (alignedSize != evenSize) - { - srcOffset = evenSize - DA; - dstOffset = srcOffset / 2; - ReduceColor2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); - } - if (evenWidth != srcWidth) - { - for (size_t c = 0; c < channelCount; ++c) - dst[evenSize/2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]); - } - src += 2 * srcStride; - dst += dstStride; - } - } - - const __m128i K8_BGR0 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1); - const __m128i K8_BGR1 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0); - const __m128i K8_BGR2 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_BGR3 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1); - const __m128i K8_BGR4 = SIMD_MM_SETR_EPI8(-1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); - const __m128i K8_BGR5 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_BGR6 = SIMD_MM_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF); - - template SIMD_INLINE void ReduceBgr2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) - { - __m128i s00 = Load((__m128i*)src0 + 0); - __m128i s01 = Load((__m128i*)src0 + 1); - __m128i s02 = Load((__m128i*)src0 + 2); - __m128i s10 = Load((__m128i*)src1 + 0); - __m128i s11 = Load((__m128i*)src1 + 1); - __m128i s12 = Load((__m128i*)src1 + 2); - __m128i m00 = _mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR0), _mm_shuffle_epi8(s01, K8_BGR1)); - __m128i m01 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR2), _mm_shuffle_epi8(s01, K8_BGR3)), _mm_shuffle_epi8(s02, K8_BGR4)); - __m128i m10 = _mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR0), _mm_shuffle_epi8(s11, K8_BGR1)); - __m128i m11 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR2), _mm_shuffle_epi8(s11, K8_BGR3)), _mm_shuffle_epi8(s12, K8_BGR4)); - Store((__m128i*)dst + 0, Average8(m00, m01, m10, m11)); - __m128i s03 = Load((__m128i*)src0 + 3); - __m128i s04 = Load((__m128i*)src0 + 4); - __m128i s13 = Load((__m128i*)src1 + 3); - __m128i s14 = Load((__m128i*)src1 + 4); - __m128i m02 = _mm_or_si128(_mm_shuffle_epi8(s01, K8_BGR5), _mm_shuffle_epi8(s02, K8_BGR6)); - __m128i m03 = _mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR0), _mm_shuffle_epi8(s04, K8_BGR1)); - __m128i m12 = _mm_or_si128(_mm_shuffle_epi8(s11, K8_BGR5), _mm_shuffle_epi8(s12, K8_BGR6)); - __m128i m13 = _mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR0), _mm_shuffle_epi8(s14, K8_BGR1)); - Store((__m128i*)dst + 1, Average8(m02, m03, m12, m13)); - __m128i s05 = Load((__m128i*)src0 + 5); - __m128i s15 = Load((__m128i*)src1 + 5); - __m128i m04 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR2), _mm_shuffle_epi8(s04, K8_BGR3)), _mm_shuffle_epi8(s05, K8_BGR4)); - 
__m128i m05 = _mm_or_si128(_mm_shuffle_epi8(s04, K8_BGR5), _mm_shuffle_epi8(s05, K8_BGR6)); - __m128i m14 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR2), _mm_shuffle_epi8(s14, K8_BGR3)), _mm_shuffle_epi8(s15, K8_BGR4)); - __m128i m15 = _mm_or_si128(_mm_shuffle_epi8(s14, K8_BGR5), _mm_shuffle_epi8(s15, K8_BGR6)); - Store((__m128i*)dst + 2, Average8(m04, m05, m14, m15)); - } - - template void ReduceBgr2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t evenWidth = AlignLo(srcWidth, 2); - size_t alignedWidth = AlignLo(srcWidth, DA); - size_t evenSize = evenWidth * 3; - size_t alignedSize = alignedWidth*3; - size_t srcStep = DA * 3, dstStep = A*3; - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t *src0 = src; - const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); - size_t srcOffset = 0, dstOffset = 0; - for (; srcOffset < alignedSize; srcOffset += srcStep, dstOffset += dstStep) - ReduceBgr2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); - if (alignedSize != evenSize) - { - srcOffset = evenSize - srcStep; - dstOffset = srcOffset / 2; - ReduceBgr2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); - } - if (evenWidth != srcWidth) - { - for (size_t c = 0; c < 3; ++c) - dst[evenSize / 2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]); - } - src += 2 * srcStride; - dst += dstStride; - } - } - - template void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA); - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(dst) && Aligned(dstStride)); - } - - switch (channelCount) - { - case 1: ReduceColor2x2<1, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; 
- case 2: ReduceColor2x2<2, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 3: ReduceBgr2x2(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - case 4: ReduceColor2x2<4, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; - default: assert(0); - } - } - - void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) - { - if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) - ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - else - ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Reduce.cpp.o) has no symbols - void dummy_SimdSsse3Reduce(){}; -#endif// SIMD_SSSE3_ENABLE -} +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1) + { + return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2); + } + + SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) + { + return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11)); + } + + template __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11); + + template<> SIMD_INLINE __m128i Average8<1>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) + { + return Average8(s00, s01, s10, s11); + } + + const __m128i K8_RC2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF); + + template<> SIMD_INLINE __m128i Average8<2>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) + { + return Average8(_mm_shuffle_epi8(s00, K8_RC2), _mm_shuffle_epi8(s01, K8_RC2), _mm_shuffle_epi8(s10, K8_RC2), _mm_shuffle_epi8(s11, K8_RC2)); + } + + const __m128i K8_RC4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF); + + template<> SIMD_INLINE __m128i Average8<4>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) + { + return Average8(_mm_shuffle_epi8(s00, K8_RC4), _mm_shuffle_epi8(s01, K8_RC4), _mm_shuffle_epi8(s10, K8_RC4), _mm_shuffle_epi8(s11, K8_RC4)); + } + + template SIMD_INLINE void ReduceColor2x2(const uint8_t * src0, const uint8_t * 
src1, uint8_t * dst) + { + __m128i s00 = Load((__m128i*)src0 + 0); + __m128i s01 = Load((__m128i*)src0 + 1); + __m128i s10 = Load((__m128i*)src1 + 0); + __m128i s11 = Load((__m128i*)src1 + 1); + Store((__m128i*)dst, Average8(s00, s01, s10, s11)); + } + + template void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride) + { + size_t evenWidth = AlignLo(srcWidth, 2); + size_t evenSize = evenWidth * channelCount; + size_t alignedSize = AlignLo(evenSize, DA); + for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) + { + const uint8_t *src0 = src; + const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); + size_t srcOffset = 0, dstOffset = 0; + for (; srcOffset < alignedSize; srcOffset += DA, dstOffset += A) + ReduceColor2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); + if (alignedSize != evenSize) + { + srcOffset = evenSize - DA; + dstOffset = srcOffset / 2; + ReduceColor2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); + } + if (evenWidth != srcWidth) + { + for (size_t c = 0; c < channelCount; ++c) + dst[evenSize/2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]); + } + src += 2 * srcStride; + dst += dstStride; + } + } + + const __m128i K8_BGR0 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1); + const __m128i K8_BGR1 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0); + const __m128i K8_BGR2 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + const __m128i K8_BGR3 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1); + const __m128i K8_BGR4 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); + const __m128i K8_BGR5 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + const __m128i K8_BGR6 = SIMD_MM_SETR_EPI8(-1, 0x2, 
0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF); + + template SIMD_INLINE void ReduceBgr2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst) + { + __m128i s00 = Load((__m128i*)src0 + 0); + __m128i s01 = Load((__m128i*)src0 + 1); + __m128i s02 = Load((__m128i*)src0 + 2); + __m128i s10 = Load((__m128i*)src1 + 0); + __m128i s11 = Load((__m128i*)src1 + 1); + __m128i s12 = Load((__m128i*)src1 + 2); + __m128i m00 = _mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR0), _mm_shuffle_epi8(s01, K8_BGR1)); + __m128i m01 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR2), _mm_shuffle_epi8(s01, K8_BGR3)), _mm_shuffle_epi8(s02, K8_BGR4)); + __m128i m10 = _mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR0), _mm_shuffle_epi8(s11, K8_BGR1)); + __m128i m11 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR2), _mm_shuffle_epi8(s11, K8_BGR3)), _mm_shuffle_epi8(s12, K8_BGR4)); + Store((__m128i*)dst + 0, Average8(m00, m01, m10, m11)); + __m128i s03 = Load((__m128i*)src0 + 3); + __m128i s04 = Load((__m128i*)src0 + 4); + __m128i s13 = Load((__m128i*)src1 + 3); + __m128i s14 = Load((__m128i*)src1 + 4); + __m128i m02 = _mm_or_si128(_mm_shuffle_epi8(s01, K8_BGR5), _mm_shuffle_epi8(s02, K8_BGR6)); + __m128i m03 = _mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR0), _mm_shuffle_epi8(s04, K8_BGR1)); + __m128i m12 = _mm_or_si128(_mm_shuffle_epi8(s11, K8_BGR5), _mm_shuffle_epi8(s12, K8_BGR6)); + __m128i m13 = _mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR0), _mm_shuffle_epi8(s14, K8_BGR1)); + Store((__m128i*)dst + 1, Average8(m02, m03, m12, m13)); + __m128i s05 = Load((__m128i*)src0 + 5); + __m128i s15 = Load((__m128i*)src1 + 5); + __m128i m04 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR2), _mm_shuffle_epi8(s04, K8_BGR3)), _mm_shuffle_epi8(s05, K8_BGR4)); + __m128i m05 = _mm_or_si128(_mm_shuffle_epi8(s04, K8_BGR5), _mm_shuffle_epi8(s05, K8_BGR6)); + __m128i m14 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR2), _mm_shuffle_epi8(s14, K8_BGR3)), 
_mm_shuffle_epi8(s15, K8_BGR4)); + __m128i m15 = _mm_or_si128(_mm_shuffle_epi8(s14, K8_BGR5), _mm_shuffle_epi8(s15, K8_BGR6)); + Store((__m128i*)dst + 2, Average8(m04, m05, m14, m15)); + } + + template void ReduceBgr2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride) + { + size_t evenWidth = AlignLo(srcWidth, 2); + size_t alignedWidth = AlignLo(srcWidth, DA); + size_t evenSize = evenWidth * 3; + size_t alignedSize = alignedWidth*3; + size_t srcStep = DA * 3, dstStep = A*3; + for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) + { + const uint8_t *src0 = src; + const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); + size_t srcOffset = 0, dstOffset = 0; + for (; srcOffset < alignedSize; srcOffset += srcStep, dstOffset += dstStep) + ReduceBgr2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); + if (alignedSize != evenSize) + { + srcOffset = evenSize - srcStep; + dstOffset = srcOffset / 2; + ReduceBgr2x2(src0 + srcOffset, src1 + srcOffset, dst + dstOffset); + } + if (evenWidth != srcWidth) + { + for (size_t c = 0; c < 3; ++c) + dst[evenSize / 2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]); + } + src += 2 * srcStride; + dst += dstStride; + } + } + + template void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) + { + assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA); + if (align) + { + assert(Aligned(src) && Aligned(srcStride)); + assert(Aligned(dst) && Aligned(dstStride)); + } + + switch (channelCount) + { + case 1: ReduceColor2x2<1, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; + case 2: ReduceColor2x2<2, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; + case 3: ReduceBgr2x2(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; + case 4: 
ReduceColor2x2<4, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break; + default: assert(0); + } + } + + void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) + { + if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)) + ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); + else + ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray2x2.cpp b/3rdparty/simdlib/Simd/SimdSse41ReduceGray2x2.cpp old mode 100644 new mode 100755 similarity index 94% rename from 3rdparty/simdlib/Simd/SimdSsse3ReduceGray2x2.cpp rename to 3rdparty/simdlib/Simd/SimdSse41ReduceGray2x2.cpp index 24d071182d..dd8bd5b0e3 --- a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray2x2.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41ReduceGray2x2.cpp @@ -1,96 +1,93 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1) - { - return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2); - } - - SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) - { - return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11)); - } - - template void ReduceGray2x2( - const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA); - if (align) - { - assert(Aligned(src) && Aligned(srcStride)); - assert(Aligned(dst) && Aligned(dstStride) && Aligned(dstWidth)); - } - - size_t alignedWidth = AlignLo(srcWidth, DA); - size_t evenWidth = AlignLo(srcWidth, 2); - for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) - { - const uint8_t *src0 = src; - const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); - size_t srcOffset = 0, dstOffset = 0; - for (; srcOffset < alignedWidth; srcOffset += DA, dstOffset += A) - { - Store((__m128i*)(dst + dstOffset), Average8( - Load((__m128i*)(src0 + srcOffset)), Load((__m128i*)(src0 + srcOffset + A)), - Load((__m128i*)(src1 + srcOffset)), Load((__m128i*)(src1 + srcOffset + A)))); - } - if (alignedWidth != srcWidth) - { - dstOffset = dstWidth - A - (evenWidth != srcWidth ? 
1 : 0); - srcOffset = evenWidth - DA; - Store((__m128i*)(dst + dstOffset), Average8( - Load((__m128i*)(src0 + srcOffset)), Load((__m128i*)(src0 + srcOffset + A)), - Load((__m128i*)(src1 + srcOffset)), Load((__m128i*)(src1 + srcOffset + A)))); - if (evenWidth != srcWidth) - { - dst[dstWidth - 1] = Base::Average(src0[evenWidth], src1[evenWidth]); - } - } - src += 2 * srcStride; - dst += dstStride; - } - } - - void ReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) - { - if (Aligned(src) && Aligned(srcWidth) && Aligned(srcStride) && Aligned(dst) && Aligned(dstWidth) && Aligned(dstStride)) - ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - else - ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Reduce2x2.cpp.o) has no symbols - void dummy_SimdSsse3Reduce2x2(){}; -#endif// SIMD_SSSE3_ENABLE -} +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1) + { + return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2); + } + + SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11) + { + return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11)); + } + + template void ReduceGray2x2( + const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) + { + assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA); + if (align) + { + assert(Aligned(src) && Aligned(srcStride)); + assert(Aligned(dst) && Aligned(dstStride) && Aligned(dstWidth)); + } + + size_t alignedWidth = AlignLo(srcWidth, DA); + size_t evenWidth = AlignLo(srcWidth, 2); + for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2) + { + const uint8_t *src0 = src; + const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride); + size_t srcOffset = 0, dstOffset = 0; + for (; srcOffset < alignedWidth; srcOffset += DA, dstOffset += A) + { + Store((__m128i*)(dst + dstOffset), Average8( + Load((__m128i*)(src0 + srcOffset)), Load((__m128i*)(src0 + srcOffset + A)), + Load((__m128i*)(src1 + srcOffset)), Load((__m128i*)(src1 + srcOffset + A)))); + } + if (alignedWidth != srcWidth) + { + dstOffset = dstWidth - A - (evenWidth != srcWidth ? 
1 : 0); + srcOffset = evenWidth - DA; + Store((__m128i*)(dst + dstOffset), Average8( + Load((__m128i*)(src0 + srcOffset)), Load((__m128i*)(src0 + srcOffset + A)), + Load((__m128i*)(src1 + srcOffset)), Load((__m128i*)(src1 + srcOffset + A)))); + if (evenWidth != srcWidth) + { + dst[dstWidth - 1] = Base::Average(src0[evenWidth], src1[evenWidth]); + } + } + src += 2 * srcStride; + dst += dstStride; + } + } + + void ReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride) + { + if (Aligned(src) && Aligned(srcWidth) && Aligned(srcStride) && Aligned(dst) && Aligned(dstWidth) && Aligned(dstStride)) + ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); + else + ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray4x4.cpp b/3rdparty/simdlib/Simd/SimdSse41ReduceGray4x4.cpp old mode 100644 new mode 100755 similarity index 96% rename from 3rdparty/simdlib/Simd/SimdSsse3ReduceGray4x4.cpp rename to 3rdparty/simdlib/Simd/SimdSse41ReduceGray4x4.cpp index 261e84c918..7754b290ba --- a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray4x4.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41ReduceGray4x4.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,8 +26,8 @@ namespace Simd { -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { namespace { @@ -170,8 +170,5 @@ namespace Simd ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); } } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Reduce4x4.cpp.o) has no symbols - void dummy_SimdSsse3Reduce4x4(){}; -#endif// SIMD_SSSE3_ENABLE +#endif } diff --git a/3rdparty/simdlib/Simd/SimdSsse3ResizeBilinear.cpp b/3rdparty/simdlib/Simd/SimdSse41ResizeBilinear.cpp old mode 100644 new mode 100755 similarity index 98% rename from 3rdparty/simdlib/Simd/SimdSsse3ResizeBilinear.cpp rename to 3rdparty/simdlib/Simd/SimdSse41ResizeBilinear.cpp index b39f619005..50a708aa20 --- a/3rdparty/simdlib/Simd/SimdSsse3ResizeBilinear.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41ResizeBilinear.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,8 +27,8 @@ namespace Simd { -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 { namespace { @@ -401,9 +401,6 @@ namespace Simd } } } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3ResizeBilinear.cpp.o) has no symbols - void dummy_SimdSsse3ResizeBilinear(){}; #endif } diff --git a/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp b/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp old mode 100644 new mode 100755 index b766a8a209..e3e8e7b360 --- a/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp +++ b/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,6 +32,309 @@ namespace Simd #ifdef SIMD_SSE41_ENABLE namespace Sse41 { + ResizerByteBilinear::ResizerByteBilinear(const ResParam& param) + : Sse2::ResizerByteBilinear(param) + , _blocks(0) + { + } + + size_t ResizerByteBilinear::BlockCountMax(size_t align) + { + return (size_t)Simd::Max(::ceil(float(_param.srcW) / (align - 1)), ::ceil(float(_param.dstW) * 2.0f / align)); + } + + void ResizerByteBilinear::EstimateParams() + { + if (_ax.data) + return; + if (_param.channels == 1 && _param.srcW < 4 * _param.dstW) + _blocks = BlockCountMax(A); + float scale = (float)_param.srcW / _param.dstW; + _ax.Resize(AlignHi(_param.dstW, A) * _param.channels * 2, false, _param.align); + uint8_t* alphas = _ax.data; + if (_blocks) + { + _ixg.Resize(_blocks); + int block = 0; + _ixg[0].src = 0; + _ixg[0].dst = 0; + for (int dstIndex = 0; dstIndex < (int)_param.dstW; ++dstIndex) + { + float alpha = (float)((dstIndex + 
0.5) * scale - 0.5); + int srcIndex = (int)::floor(alpha); + alpha -= srcIndex; + + if (srcIndex < 0) + { + srcIndex = 0; + alpha = 0; + } + + if (srcIndex > (int)_param.srcW - 2) + { + srcIndex = (int)_param.srcW - 2; + alpha = 1; + } + + int dst = 2 * dstIndex - _ixg[block].dst; + int src = srcIndex - _ixg[block].src; + if (src >= A - 1 || dst >= A) + { + block++; + _ixg[block].src = Simd::Min(srcIndex, int(_param.srcW - A)); + _ixg[block].dst = 2 * dstIndex; + dst = 0; + src = srcIndex - _ixg[block].src; + } + _ixg[block].shuffle[dst] = src; + _ixg[block].shuffle[dst + 1] = src + 1; + + alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); + alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); + alphas += 2; + } + _blocks = block + 1; + } + else + { + _ix.Resize(_param.dstW); + for (size_t i = 0; i < _param.dstW; ++i) + { + float alpha = (float)((i + 0.5) * scale - 0.5); + ptrdiff_t index = (ptrdiff_t)::floor(alpha); + alpha -= index; + + if (index < 0) + { + index = 0; + alpha = 0; + } + + if (index > (ptrdiff_t)_param.srcW - 2) + { + index = _param.srcW - 2; + alpha = 1; + } + + _ix[i] = (int)index; + alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); + alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); + for (size_t channel = 1; channel < _param.channels; channel++) + ((uint16_t*)alphas)[channel] = *(uint16_t*)alphas; + alphas += 2 * _param.channels; + } + } + size_t size = AlignHi(_param.dstW, _param.align) * _param.channels * 2; + _bx[0].Resize(size, false, _param.align); + _bx[1].Resize(size, false, _param.align); + } + + template void ResizerByteBilinearInterpolateX(const __m128i* alpha, __m128i* buffer); + + template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<1>(const __m128i* alpha, __m128i* buffer) + { + _mm_store_si128(buffer, _mm_maddubs_epi16(_mm_load_si128(buffer), _mm_load_si128(alpha))); + } + + const __m128i K8_SHUFFLE_X2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 
0xD, 0xF); + + SIMD_INLINE void ResizerByteBilinearInterpolateX2(const __m128i* alpha, __m128i* buffer) + { + __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X2); + _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha))); + } + + template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<2>(const __m128i* alpha, __m128i* buffer) + { + ResizerByteBilinearInterpolateX2(alpha + 0, buffer + 0); + ResizerByteBilinearInterpolateX2(alpha + 1, buffer + 1); + } + + const __m128i K8_SHUFFLE_X3_00 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1); + const __m128i K8_SHUFFLE_X3_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0); + const __m128i K8_SHUFFLE_X3_10 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + const __m128i K8_SHUFFLE_X3_11 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1); + const __m128i K8_SHUFFLE_X3_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); + const __m128i K8_SHUFFLE_X3_21 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + const __m128i K8_SHUFFLE_X3_22 = SIMD_MM_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF); + + template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<3>(const __m128i* alpha, __m128i* buffer) + { + __m128i src[3], shuffled[3]; + src[0] = _mm_load_si128(buffer + 0); + src[1] = _mm_load_si128(buffer + 1); + src[2] = _mm_load_si128(buffer + 2); + shuffled[0] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_00); + shuffled[0] = _mm_or_si128(shuffled[0], _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_01)); + _mm_store_si128(buffer + 0, _mm_maddubs_epi16(shuffled[0], _mm_load_si128(alpha + 0))); + shuffled[1] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_10); + shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[1], 
K8_SHUFFLE_X3_11)); + shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_12)); + _mm_store_si128(buffer + 1, _mm_maddubs_epi16(shuffled[1], _mm_load_si128(alpha + 1))); + shuffled[2] = _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_21); + shuffled[2] = _mm_or_si128(shuffled[2], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_22)); + _mm_store_si128(buffer + 2, _mm_maddubs_epi16(shuffled[2], _mm_load_si128(alpha + 2))); + } + + const __m128i K8_SHUFFLE_X4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF); + + SIMD_INLINE void ResizerByteBilinearInterpolateX4(const __m128i* alpha, __m128i* buffer) + { + __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X4); + _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha))); + } + + template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<4>(const __m128i* alpha, __m128i* buffer) + { + ResizerByteBilinearInterpolateX4(alpha + 0, buffer + 0); + ResizerByteBilinearInterpolateX4(alpha + 1, buffer + 1); + ResizerByteBilinearInterpolateX4(alpha + 2, buffer + 2); + ResizerByteBilinearInterpolateX4(alpha + 3, buffer + 3); + } + + const __m128i K16_FRACTION_ROUND_TERM = SIMD_MM_SET1_EPI16(Base::BILINEAR_ROUND_TERM); + + template SIMD_INLINE __m128i ResizerByteBilinearInterpolateY(const __m128i* pbx0, const __m128i* pbx1, __m128i alpha[2]) + { + __m128i sum = _mm_add_epi16(_mm_mullo_epi16(Load(pbx0), alpha[0]), _mm_mullo_epi16(Load(pbx1), alpha[1])); + return _mm_srli_epi16(_mm_add_epi16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT); + } + + template SIMD_INLINE void ResizerByteBilinearInterpolateY(const uint8_t* bx0, const uint8_t* bx1, __m128i alpha[2], uint8_t* dst) + { + __m128i lo = ResizerByteBilinearInterpolateY((__m128i*)bx0 + 0, (__m128i*)bx1 + 0, alpha); + __m128i hi = ResizerByteBilinearInterpolateY((__m128i*)bx0 + 1, (__m128i*)bx1 + 1, alpha); + Store((__m128i*)dst, _mm_packus_epi16(lo, hi)); + } + + template void 
ResizerByteBilinear::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride) + { + struct One { uint8_t val[N * 1]; }; + struct Two { uint8_t val[N * 2]; }; + + size_t size = 2 * _param.dstW * N; + size_t aligned = AlignHi(size, DA) - DA; + const size_t step = A * N; + ptrdiff_t previous = -2; + __m128i a[2]; + uint8_t* bx[2] = { _bx[0].data, _bx[1].data }; + const uint8_t* ax = _ax.data; + const int32_t* ix = _ix.data; + size_t dstW = _param.dstW; + + for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride) + { + a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst])); + a[1] = _mm_set1_epi16(int16_t(_ay[yDst])); + + ptrdiff_t sy = _iy[yDst]; + int k = 0; + + if (sy == previous) + k = 2; + else if (sy == previous + 1) + { + Swap(bx[0], bx[1]); + k = 1; + } + + previous = sy; + + for (; k < 2; k++) + { + Two* pb = (Two*)bx[k]; + const One* psrc = (const One*)(src + (sy + k) * srcStride); + for (size_t x = 0; x < dstW; x++) + pb[x] = *(Two*)(psrc + ix[x]); + + uint8_t* pbx = bx[k]; + for (size_t i = 0; i < size; i += step) + ResizerByteBilinearInterpolateX((__m128i*)(ax + i), (__m128i*)(pbx + i)); + } + + for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A) + ResizerByteBilinearInterpolateY(bx[0] + ib, bx[1] + ib, a, dst + id); + size_t i = size - DA; + ResizerByteBilinearInterpolateY(bx[0] + i, bx[1] + i, a, dst + i / 2); + } + } + + template SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t* src, const Idx& index, const uint8_t* alpha, uint8_t* dst) + { + __m128i _src = _mm_loadu_si128((__m128i*)(src + index.src)); + __m128i _shuffle = _mm_loadu_si128((__m128i*) & index.shuffle); + __m128i _alpha = _mm_loadu_si128((__m128i*)(alpha + index.dst)); + _mm_storeu_si128((__m128i*)(dst + index.dst), _mm_maddubs_epi16(_mm_shuffle_epi8(_src, _shuffle), _alpha)); + } + + void ResizerByteBilinear::RunG(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride) + { + size_t bufW = 
AlignHi(_param.dstW, A) * 2; + size_t size = 2 * _param.dstW; + size_t aligned = AlignHi(size, DA) - DA; + size_t blocks = _blocks; + ptrdiff_t previous = -2; + __m128i a[2]; + uint8_t* bx[2] = { _bx[0].data, _bx[1].data }; + const uint8_t* ax = _ax.data; + const Idx* ixg = _ixg.data; + + for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride) + { + a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst])); + a[1] = _mm_set1_epi16(int16_t(_ay[yDst])); + + ptrdiff_t sy = _iy[yDst]; + int k = 0; + + if (sy == previous) + k = 2; + else if (sy == previous + 1) + { + Swap(bx[0], bx[1]); + k = 1; + } + + previous = sy; + + for (; k < 2; k++) + { + const uint8_t* psrc = src + (sy + k) * srcStride; + uint8_t* pdst = bx[k]; + for (size_t i = 0; i < blocks; ++i) + ResizerByteBilinearLoadGrayInterpolated(psrc, ixg[i], ax, pdst); + } + + for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A) + ResizerByteBilinearInterpolateY(bx[0] + ib, bx[1] + ib, a, dst + id); + size_t i = size - DA; + ResizerByteBilinearInterpolateY(bx[0] + i, bx[1] + i, a, dst + i / 2); + } + } + + void ResizerByteBilinear::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride) + { + assert(_param.dstW >= A); + + EstimateParams(); + switch (_param.channels) + { + case 1: + if (_blocks) + RunG(src, srcStride, dst, dstStride); + else + Run<1>(src, srcStride, dst, dstStride); + break; + case 2: Run<2>(src, srcStride, dst, dstStride); break; + case 3: Run<3>(src, srcStride, dst, dstStride); break; + case 4: Run<4>(src, srcStride, dst, dstStride); break; + default: + assert(0); + } + } + + //--------------------------------------------------------------------- + ResizerByteArea::ResizerByteArea(const ResParam & param) : Sse2::ResizerByteArea(param) { @@ -200,10 +503,12 @@ namespace Simd void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) { ResParam param(srcX, srcY, dstX, 
dstY, channels, type, method, sizeof(__m128i)); - if (type == SimdResizeChannelByte && method == SimdResizeMethodArea) + if (param.IsByteBilinear() && dstX >= A) + return new ResizerByteBilinear(param); + else if (param.IsByteArea()) return new ResizerByteArea(param); else - return Ssse3::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + return Sse2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); } } #else diff --git a/3rdparty/simdlib/Simd/SimdSsse3.h b/3rdparty/simdlib/Simd/SimdSsse3.h deleted file mode 100644 index ed7849f39d..0000000000 --- a/3rdparty/simdlib/Simd/SimdSsse3.h +++ /dev/null @@ -1,77 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#ifndef __SimdSsse3_h__ -#define __SimdSsse3_h__ - -#include "Simd/SimdDefs.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride); - - void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha); - - void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha); - - void BgraToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride); - - void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride); - - void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride); - - void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride); - - void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride); - - void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride); - - void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); - - void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride); - - void InterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride); - - void InterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const 
uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride); - - void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); - - void ReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void ReduceGray4x4(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride); - - void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, - uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); - - // ViSP custom SIMD code - void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff); - } -#endif// SIMD_SSSE3_ENABLE -} -#endif//__SimdSsse3_h__ diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp b/3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp deleted file mode 100644 index bb01107812..0000000000 --- a/3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - template SIMD_INLINE void BgrToRgba(const uint8_t * bgr, uint8_t * rgba, __m128i alpha, __m128i shuffle) - { - Store((__m128i*)rgba + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 0)), shuffle))); - Store((__m128i*)rgba + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 12)), shuffle))); - Store((__m128i*)rgba + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load((__m128i*)(bgr + 24)), shuffle))); - Store((__m128i*)rgba + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(bgr + 32)), 4), shuffle))); - } - - template void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha) - { - assert(width >= A); - if (align) - assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride)); - - size_t alignedWidth = AlignLo(width, A); - - __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3); - __m128i _shuffle = _mm_setr_epi8(0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BgrToRgba(bgr + 3 * col, rgba + 4 * col, _alpha, _shuffle); - if (width != alignedWidth) - BgrToRgba(bgr + 3 * (width - A), rgba + 4 * (width - A), _alpha, _shuffle); - bgr += bgrStride; - rgba += rgbaStride; - } - } - - void 
BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha) - { - if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride)) - BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - else - BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToRGBa.cpp.o) has no symbols - void dummy_SimdSsse3BgrToRGBa(){}; -#endif// SIMD_SSSE3_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp b/3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp deleted file mode 100644 index d455781ed3..0000000000 --- a/3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - template SIMD_INLINE void BgraToRgba(const uint8_t * bgra, uint8_t * rgba, __m128i shuffle) - { - Store((__m128i*)rgba + 0, _mm_shuffle_epi8(Load((__m128i*)(bgra + 0)), shuffle)); - Store((__m128i*)rgba + 1, _mm_shuffle_epi8(Load((__m128i*)(bgra + 16)), shuffle)); - Store((__m128i*)rgba + 2, _mm_shuffle_epi8(Load((__m128i*)(bgra + 32)), shuffle)); - Store((__m128i*)rgba + 3, _mm_shuffle_epi8(Load((__m128i*)(bgra + 48)), shuffle)); - } - - template void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride) - { - assert(width >= A); - if (align) - assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride)); - - size_t alignedWidth = AlignLo(width, A); - - __m128i _shuffle = _mm_setr_epi8(0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF); - - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < alignedWidth; col += A) - BgraToRgba(bgra + 4 * col, rgba + 4 * col, _shuffle); - if (width != alignedWidth) - BgraToRgba(bgra + 4 * (width - A), rgba + 4 * (width - A), _shuffle); - bgra += bgraStride; - rgba += rgbaStride; - } - } - - void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride) - { - if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride)) - BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); - else - BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToRGBa.cpp.o) has no symbols - void dummy_SimdSsse3BgraToRGBa(){}; -#endif// SIMD_SSSE3_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp b/3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp deleted file mode 100644 index 
985a772d47..0000000000 --- a/3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdBase.h" -#include "Simd/SimdStore.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff) - { - const __m128i mask1 = _mm_set_epi8(-1, 14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0); - const __m128i mask2 = _mm_set_epi8(-1, 15, -1, 13, -1, 11, -1, 9, -1, 7, -1, 5, -1, 3, -1, 1); - const __m128i mask_out2 = _mm_set_epi8(14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0, -1); - - size_t i = 0; - for (; i <= size-16; i+= 16) { - const __m128i vdata1 = _mm_loadu_si128(reinterpret_cast(img1 + i)); - const __m128i vdata2 = _mm_loadu_si128(reinterpret_cast(img2 + i)); - - __m128i vdata1_reorg = _mm_shuffle_epi8(vdata1, mask1); - __m128i vdata2_reorg = _mm_shuffle_epi8(vdata2, mask1); - - const __m128i vshift = _mm_set1_epi16(128); - __m128i vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift); - - const __m128i v255 = _mm_set1_epi16(255); - const __m128i vzero = _mm_setzero_si128(); - const __m128i vdata_diff_min_max1 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero); - - vdata1_reorg = _mm_shuffle_epi8(vdata1, mask2); - vdata2_reorg = _mm_shuffle_epi8(vdata2, mask2); - - vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift); - const __m128i vdata_diff_min_max2 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero); - - _mm_storeu_si128(reinterpret_cast<__m128i *>(imgDiff + i), _mm_or_si128(_mm_shuffle_epi8(vdata_diff_min_max1, mask1), - _mm_shuffle_epi8(vdata_diff_min_max2, mask_out2))); - } - - if (i < size) { - Base::SimdImageDifference(img1 + i, img2 + i, size - i, imgDiff + i); - } - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3CustomFunctions.cpp.o) has no symbols - void dummy_SimdSsse3CustomFunctions(){}; -#endif// SIMD_SSSE3_ENABLE -} diff --git 
a/3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp b/3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp deleted file mode 100644 index 37f2eca6c1..0000000000 --- a/3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp +++ /dev/null @@ -1,350 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2019 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdMemory.h" -#include "Simd/SimdStore.h" -#include "Simd/SimdResizer.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - ResizerByteBilinear::ResizerByteBilinear(const ResParam & param) - : Sse2::ResizerByteBilinear(param) - , _blocks(0) - { - } - - size_t ResizerByteBilinear::BlockCountMax(size_t align) - { - return (size_t)Simd::Max(::ceil(float(_param.srcW) / (align - 1)), ::ceil(float(_param.dstW) * 2.0f / align )); - } - - void ResizerByteBilinear::EstimateParams() - { - if (_ax.data) - return; - if (_param.channels == 1 && _param.srcW < 4 * _param.dstW) - _blocks = BlockCountMax(A); - float scale = (float)_param.srcW / _param.dstW; - _ax.Resize(AlignHi(_param.dstW, A) * _param.channels * 2, false, _param.align); - uint8_t * alphas = _ax.data; - if (_blocks) - { - _ixg.Resize(_blocks); - int block = 0; - _ixg[0].src = 0; - _ixg[0].dst = 0; - for (int dstIndex = 0; dstIndex < (int)_param.dstW; ++dstIndex) - { - float alpha = (float)((dstIndex + 0.5)*scale - 0.5); - int srcIndex = (int)::floor(alpha); - alpha -= srcIndex; - - if (srcIndex < 0) - { - srcIndex = 0; - alpha = 0; - } - - if (srcIndex > (int)_param.srcW - 2) - { - srcIndex = (int)_param.srcW - 2; - alpha = 1; - } - - int dst = 2 * dstIndex - _ixg[block].dst; - int src = srcIndex - _ixg[block].src; - if (src >= A - 1 || dst >= A) - { - block++; - _ixg[block].src = Simd::Min(srcIndex, int(_param.srcW - A)); - _ixg[block].dst = 2 * dstIndex; - dst = 0; - src = srcIndex - _ixg[block].src; - } - _ixg[block].shuffle[dst] = src; - _ixg[block].shuffle[dst + 1] = src + 1; - - alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); - alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); - alphas += 2; - } - _blocks = block + 1; - } - else - { - _ix.Resize(_param.dstW); - for (size_t i = 0; i < _param.dstW; ++i) - { - float alpha = (float)((i + 0.5)*scale - 0.5); - ptrdiff_t index = (ptrdiff_t)::floor(alpha); - alpha -= index; - - if (index < 0) - { - index = 0; 
- alpha = 0; - } - - if (index >(ptrdiff_t)_param.srcW - 2) - { - index = _param.srcW - 2; - alpha = 1; - } - - _ix[i] = (int)index; - alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); - alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); - for (size_t channel = 1; channel < _param.channels; channel++) - ((uint16_t*)alphas)[channel] = *(uint16_t*)alphas; - alphas += 2 * _param.channels; - } - } - size_t size = AlignHi(_param.dstW, _param.align)*_param.channels * 2; - _bx[0].Resize(size, false, _param.align); - _bx[1].Resize(size, false, _param.align); - } - - template void ResizerByteBilinearInterpolateX(const __m128i * alpha, __m128i * buffer); - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<1>(const __m128i * alpha, __m128i * buffer) - { - _mm_store_si128(buffer, _mm_maddubs_epi16(_mm_load_si128(buffer), _mm_load_si128(alpha))); - } - - const __m128i K8_SHUFFLE_X2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF); - - SIMD_INLINE void ResizerByteBilinearInterpolateX2(const __m128i * alpha, __m128i * buffer) - { - __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X2); - _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha))); - } - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<2>(const __m128i * alpha, __m128i * buffer) - { - ResizerByteBilinearInterpolateX2(alpha + 0, buffer + 0); - ResizerByteBilinearInterpolateX2(alpha + 1, buffer + 1); - } - - const __m128i K8_SHUFFLE_X3_00 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1); - const __m128i K8_SHUFFLE_X3_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0); - const __m128i K8_SHUFFLE_X3_10 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_X3_11 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, 
-1); - const __m128i K8_SHUFFLE_X3_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1); - const __m128i K8_SHUFFLE_X3_21 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - const __m128i K8_SHUFFLE_X3_22 = SIMD_MM_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF); - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<3>(const __m128i * alpha, __m128i * buffer) - { - __m128i src[3], shuffled[3]; - src[0] = _mm_load_si128(buffer + 0); - src[1] = _mm_load_si128(buffer + 1); - src[2] = _mm_load_si128(buffer + 2); - shuffled[0] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_00); - shuffled[0] = _mm_or_si128(shuffled[0], _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_01)); - _mm_store_si128(buffer + 0, _mm_maddubs_epi16(shuffled[0], _mm_load_si128(alpha + 0))); - shuffled[1] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_10); - shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_11)); - shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_12)); - _mm_store_si128(buffer + 1, _mm_maddubs_epi16(shuffled[1], _mm_load_si128(alpha + 1))); - shuffled[2] = _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_21); - shuffled[2] = _mm_or_si128(shuffled[2], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_22)); - _mm_store_si128(buffer + 2, _mm_maddubs_epi16(shuffled[2], _mm_load_si128(alpha + 2))); - } - - const __m128i K8_SHUFFLE_X4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF); - - SIMD_INLINE void ResizerByteBilinearInterpolateX4(const __m128i * alpha, __m128i * buffer) - { - __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X4); - _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha))); - } - - template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<4>(const __m128i * alpha, __m128i * buffer) - { - ResizerByteBilinearInterpolateX4(alpha + 0, buffer + 0); - 
ResizerByteBilinearInterpolateX4(alpha + 1, buffer + 1); - ResizerByteBilinearInterpolateX4(alpha + 2, buffer + 2); - ResizerByteBilinearInterpolateX4(alpha + 3, buffer + 3); - } - - const __m128i K16_FRACTION_ROUND_TERM = SIMD_MM_SET1_EPI16(Base::BILINEAR_ROUND_TERM); - - template SIMD_INLINE __m128i ResizerByteBilinearInterpolateY(const __m128i * pbx0, const __m128i * pbx1, __m128i alpha[2]) - { - __m128i sum = _mm_add_epi16(_mm_mullo_epi16(Load(pbx0), alpha[0]), _mm_mullo_epi16(Load(pbx1), alpha[1])); - return _mm_srli_epi16(_mm_add_epi16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT); - } - - template SIMD_INLINE void ResizerByteBilinearInterpolateY(const uint8_t * bx0, const uint8_t * bx1, __m128i alpha[2], uint8_t * dst) - { - __m128i lo = ResizerByteBilinearInterpolateY((__m128i*)bx0 + 0, (__m128i*)bx1 + 0, alpha); - __m128i hi = ResizerByteBilinearInterpolateY((__m128i*)bx0 + 1, (__m128i*)bx1 + 1, alpha); - Store((__m128i*)dst, _mm_packus_epi16(lo, hi)); - } - - template void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - struct One { uint8_t val[N * 1]; }; - struct Two { uint8_t val[N * 2]; }; - - size_t size = 2 * _param.dstW*N; - size_t aligned = AlignHi(size, DA) - DA; - const size_t step = A * N; - ptrdiff_t previous = -2; - __m128i a[2]; - uint8_t * bx[2] = { _bx[0].data, _bx[1].data }; - const uint8_t * ax = _ax.data; - const int32_t * ix = _ix.data; - size_t dstW = _param.dstW; - - for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride) - { - a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst])); - a[1] = _mm_set1_epi16(int16_t(_ay[yDst])); - - ptrdiff_t sy = _iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(bx[0], bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - Two * pb = (Two *)bx[k]; - const One * psrc = (const One *)(src + (sy + k)*srcStride); - for (size_t x = 0; x < dstW; x++) - pb[x] = 
*(Two *)(psrc + ix[x]); - - uint8_t * pbx = bx[k]; - for (size_t i = 0; i < size; i += step) - ResizerByteBilinearInterpolateX((__m128i*)(ax + i), (__m128i*)(pbx + i)); - } - - for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A) - ResizerByteBilinearInterpolateY(bx[0] + ib, bx[1] + ib, a, dst + id); - size_t i = size - DA; - ResizerByteBilinearInterpolateY(bx[0] + i, bx[1] + i, a, dst + i / 2); - } - } - - template SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t * src, const Idx & index, const uint8_t * alpha, uint8_t * dst) - { - __m128i _src = _mm_loadu_si128((__m128i*)(src + index.src)); - __m128i _shuffle = _mm_loadu_si128((__m128i*)&index.shuffle); - __m128i _alpha = _mm_loadu_si128((__m128i*)(alpha + index.dst)); - _mm_storeu_si128((__m128i*)(dst + index.dst), _mm_maddubs_epi16(_mm_shuffle_epi8(_src, _shuffle), _alpha)); - } - - void ResizerByteBilinear::RunG(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - size_t bufW = AlignHi(_param.dstW, A) * 2; - size_t size = 2 * _param.dstW; - size_t aligned = AlignHi(size, DA) - DA; - size_t blocks = _blocks; - ptrdiff_t previous = -2; - __m128i a[2]; - uint8_t * bx[2] = { _bx[0].data, _bx[1].data }; - const uint8_t * ax = _ax.data; - const Idx * ixg = _ixg.data; - - for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride) - { - a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst])); - a[1] = _mm_set1_epi16(int16_t(_ay[yDst])); - - ptrdiff_t sy = _iy[yDst]; - int k = 0; - - if (sy == previous) - k = 2; - else if (sy == previous + 1) - { - Swap(bx[0], bx[1]); - k = 1; - } - - previous = sy; - - for (; k < 2; k++) - { - const uint8_t * psrc = src + (sy + k)*srcStride; - uint8_t * pdst = bx[k]; - for (size_t i = 0; i < blocks; ++i) - ResizerByteBilinearLoadGrayInterpolated(psrc, ixg[i], ax, pdst); - } - - for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A) - ResizerByteBilinearInterpolateY(bx[0] + ib, bx[1] + ib, a, dst + id); - 
size_t i = size - DA; - ResizerByteBilinearInterpolateY(bx[0] + i, bx[1] + i, a, dst + i / 2); - } - } - - void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) - { - assert(_param.dstW >= A); - - EstimateParams(); - switch (_param.channels) - { - case 1: - if(_blocks) - RunG(src, srcStride, dst, dstStride); - else - Run<1>(src, srcStride, dst, dstStride); - break; - case 2: Run<2>(src, srcStride, dst, dstStride); break; - case 3: Run<3>(src, srcStride, dst, dstStride); break; - case 4: Run<4>(src, srcStride, dst, dstStride); break; - default: - assert(0); - } - } - - //--------------------------------------------------------------------- - - void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) - { - ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m128i)); - if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && dstX >= A) - return new ResizerByteBilinear(param); - else - return Sse2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Resizer.cpp.o) has no symbols - void dummy_SimdSsse3Resizer(){}; -#endif//SIMD_SSSE3_ENABLE -} - diff --git a/3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp b/3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp deleted file mode 100644 index cf79dd55bd..0000000000 --- a/3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2017 Yermalayeu Ihar. 
-* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ -#include "Simd/SimdStore.h" -#include "Simd/SimdMemory.h" - -namespace Simd -{ -#ifdef SIMD_SSSE3_ENABLE - namespace Ssse3 - { - const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); - const __m128i K16_GREEN_ROUND = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM); - - SIMD_INLINE __m128i RgbaToGray32(__m128i rgba) - { - const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF); - const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF); - const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(r0b0, K16_RED_BLUE)); - return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT); - } - - SIMD_INLINE __m128i RgbToGray(__m128i rgba[4]) - { - const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1])); - const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3])); - return _mm_packus_epi16(lo, hi); - } - - template SIMD_INLINE __m128i RgbToGray(const uint8_t * rgb, __m128i shuffle) - { - __m128i rgba[4]; - rgba[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(rgb + 0)), shuffle)); - rgba[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(rgb + 12)), shuffle)); - rgba[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(rgb + 24)), shuffle)); - rgba[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(rgb + 32)), 4), shuffle)); - return RgbToGray(rgba); - } - - template void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride) - { - assert(width >= A); - if (align) - assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)); - - size_t alignedWidth = AlignLo(width, A); - - __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); - - for (size_t row = 0; row < height; ++row) - { - for (size_t 
col = 0; col < alignedWidth; col += A) - Store((__m128i*)(gray + col), RgbToGray(rgb + 3 * col, _shuffle)); - if (width != alignedWidth) - Store((__m128i*)(gray + width - A), RgbToGray(rgb + 3 * (width - A), _shuffle)); - rgb += rgbStride; - gray += grayStride; - } - } - - void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride) - { - if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride)) - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - else - RgbToGray(rgb, width, height, rgbStride, gray, grayStride); - } - } -#else - // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3RgbToGray.cpp.o) has no symbols - void dummy_SimdSsse3RgbToGray(){}; -#endif// SIMD_SSSE3_ENABLE -} diff --git a/3rdparty/simdlib/Simd/SimdStore.h b/3rdparty/simdlib/Simd/SimdStore.h old mode 100644 new mode 100755 index 11ae3f7815..2b22a9616d --- a/3rdparty/simdlib/Simd/SimdStore.h +++ b/3rdparty/simdlib/Simd/SimdStore.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -31,8 +31,8 @@ namespace Simd { -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { template SIMD_INLINE void Store(float * p, __m128 a); @@ -63,13 +63,6 @@ namespace Simd __m128 old = Load(p); Store(p, Combine(mask, value, old)); } - } -#endif//SIMD_SSE_ENABLE - -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { - using namespace Sse; template SIMD_INLINE void Store(__m128i * p, __m128i a); @@ -83,6 +76,11 @@ namespace Simd _mm_store_si128(p, a); } + template SIMD_INLINE void StoreHalf(__m128i* p, __m128i a) + { + StoreHalf((float*)p, _mm_castsi128_ps(a)); + } + template SIMD_INLINE void StoreMasked(__m128i * p, __m128i value, __m128i mask) { __m128i old = Load(p); @@ -95,7 +93,6 @@ namespace Simd namespace Sse41 { #if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug - using Sse::Store; using Sse2::Store; #endif } @@ -118,8 +115,8 @@ namespace Simd template SIMD_INLINE void Store(float * p0, float * p1, __m256 a) { - Sse::Store(p0, _mm256_extractf128_ps(a, 0)); - Sse::Store(p1, _mm256_extractf128_ps(a, 1)); + Sse2::Store(p0, _mm256_extractf128_ps(a, 0)); + Sse2::Store(p1, _mm256_extractf128_ps(a, 1)); } template SIMD_INLINE void StoreMasked(float * p, __m256 value, __m256 mask) @@ -163,11 +160,6 @@ namespace Simd return _mm256_permute4x64_epi64(_mm256_packus_epi16(lo, hi), 0xD8); } - SIMD_INLINE __m256i PackU16ToU8(__m256i lo, __m256i hi) - { - return _mm256_permute4x64_epi64(_mm256_packus_epi16(lo, hi), 0xD8); - } - SIMD_INLINE __m256i PackI32ToI16(__m256i lo, __m256i hi) { return _mm256_permute4x64_epi64(_mm256_packs_epi32(lo, hi), 0xD8); @@ -184,6 +176,12 @@ namespace Simd lo = _mm256_permute2x128_si256(lo, hi, 0x20); hi = _mm256_permute2x128_si256(_lo, hi, 0x31); } + + template SIMD_INLINE void Store24(uint8_t * p, __m256i a) + { 
+ Sse2::Store((__m128i*)p, _mm256_extractf128_si256(a, 0)); + Sse2::StoreHalf<0>((__m128i*)p + 1, _mm256_extractf128_si256(a, 1)); + } } #endif//SIMD_SAVX2_ENABLE @@ -230,27 +228,27 @@ namespace Simd template SIMD_INLINE void Store(uint16_t * p, uint16x8_t a) { - Store((uint8_t*)p, (uint8x16_t)a); + Store((uint8_t*)p, vreinterpretq_u8_u16(a)); } template SIMD_INLINE void Store(uint16_t * p, uint16x4_t a) { - Store((uint8_t*)p, (uint8x8_t)a); + Store((uint8_t*)p, vreinterpret_u8_u16(a)); } template SIMD_INLINE void Store(int16_t * p, int16x8_t a) { - Store((uint8_t*)p, (uint8x16_t)a); + Store((uint8_t*)p, vreinterpretq_u8_s16(a)); } template SIMD_INLINE void Store(uint32_t * p, uint32x4_t a) { - Store((uint8_t*)p, (uint8x16_t)a); + Store((uint8_t*)p, vreinterpretq_u8_u32(a)); } template SIMD_INLINE void Store(int32_t * p, int32x4_t a) { - Store((uint8_t*)p, (uint8x16_t)a); + Store((uint8_t*)p, vreinterpretq_u8_s32(a)); } template SIMD_INLINE void Store2(uint8_t * p, uint8x16x2_t a); @@ -310,7 +308,6 @@ namespace Simd #endif } - template SIMD_INLINE void Store3(uint8_t * p, uint8x16x3_t a); template <> SIMD_INLINE void Store3(uint8_t * p, uint8x16x3_t a) diff --git a/3rdparty/simdlib/Simd/SimdStream.h b/3rdparty/simdlib/Simd/SimdStream.h old mode 100644 new mode 100755 index b6399bd1f1..6abf65cf68 --- a/3rdparty/simdlib/Simd/SimdStream.h +++ b/3rdparty/simdlib/Simd/SimdStream.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -30,36 +30,31 @@ namespace Simd { const size_t STREAM_SIZE_MIN = 0x00100000; -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { - template SIMD_INLINE void Stream(float * p, __m128 a); + template SIMD_INLINE void Stream(float* p, __m128 a); - template <> SIMD_INLINE void Stream(float * p, __m128 a) + template <> SIMD_INLINE void Stream(float* p, __m128 a) { _mm_storeu_ps(p, a); } - template <> SIMD_INLINE void Stream(float * p, __m128 a) + template <> SIMD_INLINE void Stream(float* p, __m128 a) { _mm_storeu_ps(p, a); } - template <> SIMD_INLINE void Stream(float * p, __m128 a) + template <> SIMD_INLINE void Stream(float* p, __m128 a) { _mm_store_ps(p, a); } - template <> SIMD_INLINE void Stream(float * p, __m128 a) + template <> SIMD_INLINE void Stream(float* p, __m128 a) { _mm_stream_ps(p, a); } - } -#endif//SIMD_SSE_ENABLE -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { template SIMD_INLINE void Stream(__m128i * p, __m128i a); template <> SIMD_INLINE void Stream(__m128i * p, __m128i a) diff --git a/3rdparty/simdlib/Simd/SimdUpdate.h b/3rdparty/simdlib/Simd/SimdUpdate.h old mode 100644 new mode 100755 index 47e9b22dc2..4c4d64b1c0 --- a/3rdparty/simdlib/Simd/SimdUpdate.h +++ b/3rdparty/simdlib/Simd/SimdUpdate.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar. +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -47,8 +47,8 @@ namespace Simd } } -#ifdef SIMD_SSE_ENABLE - namespace Sse +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 { template SIMD_INLINE void Update(float * p, __m128 a) { @@ -63,13 +63,10 @@ namespace Simd template <> SIMD_INLINE void Update(float * p, __m128 a) { Store(p, _mm_add_ps(Load(p), a)); - } - } -#endif//SIMD_SSE_ENABLE + } -#ifdef SIMD_SSE2_ENABLE - namespace Sse2 - { + //----------------------------------------------------------------------------------------- + template SIMD_INLINE void Update(int32_t * p, __m128i a) { Store((__m128i*)p, a); @@ -160,6 +157,6 @@ namespace Simd Store(p, vaddq_f32(Load(p), a)); } } -#endif//SIMD_SSE_ENABLE +#endif//SIMD_NEON_ENABLE } #endif//__SimdUpdate_h__ diff --git a/3rdparty/simdlib/Simd/SimdVersion.h b/3rdparty/simdlib/Simd/SimdVersion.h index 72ae751ade..09efd5de91 100644 --- a/3rdparty/simdlib/Simd/SimdVersion.h +++ b/3rdparty/simdlib/Simd/SimdVersion.h @@ -34,7 +34,7 @@ #ifndef __SimdVersion_h__ #define __SimdVersion_h__ -#define SIMD_VERSION "4.4.82" +#define SIMD_VERSION "4.9.107" #endif//__SimdVersion_h__ diff --git a/3rdparty/simdlib/Simd/SimdView.hpp b/3rdparty/simdlib/Simd/SimdView.hpp old mode 100644 new mode 100755 index c9a51c5f61..0c61a0e6e8 --- a/3rdparty/simdlib/Simd/SimdView.hpp +++ b/3rdparty/simdlib/Simd/SimdView.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2019 Yermalayeu Ihar, +* Copyright (c) 2011-2021 Yermalayeu Ihar, * 2014-2019 Antonenka Mikhail, * 2018-2019 Dmitry Fedorov, * 2019-2019 Artur Voronkov. @@ -95,7 +95,9 @@ namespace Simd /*! A single channel 64-bit float point pixel format. */ Double, /*! A 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */ - Rgb24 + Rgb24, + /*! A 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. 
*/ + Rgba32, }; /*! diff --git a/modules/core/src/image/vpImageConvert.cpp b/modules/core/src/image/vpImageConvert.cpp index cc9ecd4853..62c2b751cf 100644 --- a/modules/core/src/image/vpImageConvert.cpp +++ b/modules/core/src/image/vpImageConvert.cpp @@ -744,7 +744,7 @@ vpImageConvert::convert( const cv::Mat &src, vpImage< vpRGBa > &dest, bool flip { if ( src.isContinuous() && !flip ) { - SimdBgrToRgba( src.data, src.cols, src.rows, src.step[0], reinterpret_cast< uint8_t * >( dest.bitmap ), + SimdRgbToBgra( src.data, src.cols, src.rows, src.step[0], reinterpret_cast< uint8_t * >( dest.bitmap ), dest.getWidth() * sizeof( vpRGBa ), vpRGBa::alpha_default ); } else @@ -3864,7 +3864,7 @@ vpImageConvert::BGRToRGBa( unsigned char *bgr, unsigned char *rgba, unsigned int { if ( !flip ) { - SimdBgrToRgba( bgr, width, height, width * 3, rgba, width * sizeof( vpRGBa ), vpRGBa::alpha_default ); + SimdRgbToBgra( bgr, width, height, width * 3, rgba, width * sizeof( vpRGBa ), vpRGBa::alpha_default ); } else { From 94fee711854e17aee14fef33324b7e9e349c3473 Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Wed, 3 Nov 2021 10:17:07 +0100 Subject: [PATCH 08/18] Add missing file. --- 3rdparty/simdlib/Simd/SimdNeonCpu.cpp | 59 +++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 3rdparty/simdlib/Simd/SimdNeonCpu.cpp diff --git a/3rdparty/simdlib/Simd/SimdNeonCpu.cpp b/3rdparty/simdlib/Simd/SimdNeonCpu.cpp new file mode 100644 index 0000000000..8b644c04f6 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdNeonCpu.cpp @@ -0,0 +1,59 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2020 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdEnable.h" +#include "Simd/SimdCpu.h" + +#if defined(__GNUC__) && (defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)) +#include +#include +#include +#endif + +namespace Simd +{ +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + SIMD_INLINE bool SupportedByCPU() + { +#if defined(_MSC_VER) + return true; +#elif defined(__GNUC__) +#if defined(SIMD_ARM64_ENABLE) + return true; +#else + return Base::CheckBit(AT_HWCAP, HWCAP_NEON); +#endif +#else +#error Do not know how to detect NEON support! +#endif + } + + bool GetEnable() + { + return SupportedByCPU(); + } + } +#endif +} From bd6dd785ad64d8405b45408bdb4da0ae339ad8b3 Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Wed, 3 Nov 2021 11:18:25 +0100 Subject: [PATCH 09/18] Remove not used SSE flags. Add missing SSE 4.1 implementation. 
--- 3rdparty/simdlib/CMakeLists.txt | 64 ++--------------- 3rdparty/simdlib/Simd/SimdLib.cpp | 7 +- 3rdparty/simdlib/Simd/SimdSse41.h | 3 + .../simdlib/Simd/SimdSse41CustomFunctions.cpp | 69 +++++++++++++++++++ modules/io/src/image/vpImageIo.cpp | 2 +- 5 files changed, 83 insertions(+), 62 deletions(-) create mode 100644 3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp diff --git a/3rdparty/simdlib/CMakeLists.txt b/3rdparty/simdlib/CMakeLists.txt index dc6d111aae..1acb1341be 100644 --- a/3rdparty/simdlib/CMakeLists.txt +++ b/3rdparty/simdlib/CMakeLists.txt @@ -20,46 +20,31 @@ file(GLOB_RECURSE SIMD_BASE_HDR ${CMAKE_CURRENT_SOURCE_DIR}/Simd/*.h ${CMAKE_CUR if(X86 OR X86_64) # Flags check - set(SSE_FLAG "") set(SSE2_FLAG "") - set(SSE3_FLAG "") - set(SSSE3_FLAG "") - set(SSE4_1_FLAG "") set(SSE4_2_FLAG "") set(AVX_FLAG "") set(AVX2_FLAG "") if(MSVC) if(NOT MSVC64) - vp_check_compiler_flag(CXX "/arch:SSE" HAVE_SSE_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse.cpp") vp_check_compiler_flag(CXX "/arch:SSE2" HAVE_SSE2_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp") endif() vp_check_compiler_flag(CXX "/arch:AVX" HAVE_AVX_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx.cpp") vp_check_compiler_flag(CXX "/arch:AVX2" HAVE_AVX2_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp") - if(HAVE_SSE_FLAG) - set(SSE_FLAG "/arch:SSE") - endif() if(HAVE_SSE2_FLAG) set(SSE2_FLAG "/arch:SSE2") endif() if(HAVE_AVX_FLAG) set(AVX_FLAG "/arch:AVX") set(SSE4_2_FLAG "/arch:AVX") - set(SSE4_1_FLAG "/arch:AVX") - set(SSSE3_FLAG "/arch:AVX") - set(SSE3_FLAG "/arch:AVX") endif() if(HAVE_AVX2_FLAG) set(AVX2_FLAG "/arch:AVX2") endif() else() - vp_check_compiler_flag(CXX "-msse" HAVE_SSE_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse.cpp") vp_check_compiler_flag(CXX "-msse2" HAVE_SSE2_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp") - vp_check_compiler_flag(CXX "-msse3" HAVE_SSE3_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse3.cpp") - vp_check_compiler_flag(CXX "-mssse3" 
HAVE_SSSE3_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_ssse3.cpp") - vp_check_compiler_flag(CXX "-msse4.1" HAVE_SSE4_1_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse41.cpp") vp_check_compiler_flag(CXX "-msse4.2" HAVE_SSE4_2_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse42.cpp") vp_check_compiler_flag(CXX "-mavx" HAVE_AVX_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx.cpp") vp_check_compiler_flag(CXX "-mavx2" HAVE_AVX2_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp") @@ -68,23 +53,11 @@ if(X86 OR X86_64) vp_check_compiler_flag(CXX "-Wno-sign-compare" HAVE_NO_SIGN_COMPARE_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_warning.cpp") vp_check_compiler_flag(CXX "-Wno-ignored-qualifiers" HAVE_NO_IGNORED_QUALIFIERS "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_warning.cpp") - if(HAVE_SSE_FLAG) - set(SSE_FLAG "-msse") - endif() if(HAVE_SSE2_FLAG) - set(SSE2_FLAG "-msse2") - endif() - if(HAVE_SSE3_FLAG) - set(SSE3_FLAG "-msse3") - endif() - if(HAVE_SSSE3_FLAG) - set(SSSE3_FLAG "-mssse3") - endif() - if(HAVE_SSE4_1_FLAG) - set(SSE4_1_FLAG "-msse4.1") + set(SSE2_FLAG "-msse -msse2") endif() if(HAVE_SSE4_2_FLAG) - set(SSE4_2_FLAG "-msse4.2") + set(SSE4_2_FLAG "-msse3 -mssse3 -msse4.1 -msse4.2") endif() if(HAVE_AVX_FLAG) set(AVX_FLAG "-mavx") @@ -110,10 +83,10 @@ if(X86 OR X86_64) set_source_files_properties(${SIMD_BASE_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS}") file(GLOB_RECURSE SIMD_SSE2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse2*.cpp) - set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG} ${SSE2_FLAG}") + set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE2_FLAG}") file(GLOB_RECURSE SIMD_SSE41_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse41*.cpp) - set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG} ${SSSE3_FLAG} ${SSE4_1_FLAG} ${SSE4_2_FLAG}") + set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES 
COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}") file(GLOB_RECURSE SIMD_AVX1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx1*.cpp) set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}") @@ -126,7 +99,7 @@ if(X86 OR X86_64) endif() set(SIMD_LIB_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}") - set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE1_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE3_SRC} ${SIMD_SSSE3_SRC} ${SIMD_SSE41_SRC} ${SIMD_SSE42_SRC} ${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC}) + set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE41_SRC} ${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC}) file(GLOB_RECURSE SIMD_LIB_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdLib.cpp) set_source_files_properties(${SIMD_LIB_SRC} PROPERTIES COMPILE_FLAGS "${SIMD_LIB_FLAGS}") @@ -171,32 +144,21 @@ elseif(WINRT) add_library(${SIMD_LIBRARY} STATIC ${SIMD_LIB_SRC} ${SIMD_BASE_SRC} ${SIMD_NEON_SRC} ${SIMD_BASE_HDR}) else() # Flags check - set(SSE_FLAG "") set(SSE2_FLAG "") - set(SSE3_FLAG "") - set(SSSE3_FLAG "") - set(SSE4_1_FLAG "") set(SSE4_2_FLAG "") set(AVX_FLAG "") set(AVX2_FLAG "") - vp_check_compiler_flag(CXX "/arch:SSE" HAVE_SSE_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse.cpp") vp_check_compiler_flag(CXX "/arch:SSE2" HAVE_SSE2_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp") vp_check_compiler_flag(CXX "/arch:AVX" HAVE_AVX_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx.cpp") vp_check_compiler_flag(CXX "/arch:AVX2" HAVE_AVX2_FLAG "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp") - if(HAVE_SSE_FLAG) - set(SSE_FLAG "/arch:SSE") - endif() if(HAVE_SSE2_FLAG) set(SSE2_FLAG "/arch:SSE2") endif() if(HAVE_AVX_FLAG) set(AVX_FLAG "/arch:AVX") set(SSE4_2_FLAG "/arch:AVX") - set(SSE4_1_FLAG "/arch:AVX") - set(SSSE3_FLAG "/arch:AVX") - set(SSE3_FLAG "/arch:AVX") endif() if(HAVE_AVX2_FLAG) set(AVX2_FLAG "/arch:AVX2") @@ -205,23 +167,11 @@ elseif(WINRT) file(GLOB_RECURSE SIMD_BASE_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdBase*.cpp) 
set_source_files_properties(${SIMD_BASE_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS}") - file(GLOB_RECURSE SIMD_SSE1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse1*.cpp) - set_source_files_properties(${SIMD_SSE1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG}") - file(GLOB_RECURSE SIMD_SSE2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse2*.cpp) set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE2_FLAG}") - file(GLOB_RECURSE SIMD_SSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse3*.cpp) - set_source_files_properties(${SIMD_SSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG}") - - file(GLOB_RECURSE SIMD_SSSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSsse3*.cpp) - set_source_files_properties(${SIMD_SSSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSSE3_FLAG}") - file(GLOB_RECURSE SIMD_SSE41_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse41*.cpp) - set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_1_FLAG}") - - file(GLOB_RECURSE SIMD_SSE42_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse42*.cpp) - set_source_files_properties(${SIMD_SSE42_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}") + set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}") file(GLOB_RECURSE SIMD_AVX1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx1*.cpp) set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}") @@ -230,7 +180,7 @@ elseif(WINRT) set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}") set(SIMD_LIB_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}") - set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE1_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE3_SRC} ${SIMD_SSSE3_SRC} ${SIMD_SSE41_SRC} ${SIMD_SSE42_SRC} ${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC}) + set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE41_SRC} 
${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC}) file(GLOB_RECURSE SIMD_LIB_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdLib.cpp) set_source_files_properties(${SIMD_LIB_SRC} PROPERTIES COMPILE_FLAGS "${SIMD_LIB_FLAGS}") diff --git a/3rdparty/simdlib/Simd/SimdLib.cpp b/3rdparty/simdlib/Simd/SimdLib.cpp index b1cac8b1ba..89718bb80e 100755 --- a/3rdparty/simdlib/Simd/SimdLib.cpp +++ b/3rdparty/simdlib/Simd/SimdLib.cpp @@ -862,10 +862,9 @@ SIMD_API void SimdMatTranspose(const double * mat, size_t rows, size_t cols, dou SIMD_API void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff) { - //TODO: -#ifdef SIMD_SSSE3_ENABLE - if (Ssse3::Enable && size >= Ssse3::A) - Ssse3::SimdImageDifference(img1,img2, size, imgDiff); +#ifdef SIMD_SSE41_ENABLE + if (Sse41::Enable && size >= Sse41::A) + Sse41::SimdImageDifference(img1,img2, size, imgDiff); else #endif Base::SimdImageDifference(img1, img2, size, imgDiff); diff --git a/3rdparty/simdlib/Simd/SimdSse41.h b/3rdparty/simdlib/Simd/SimdSse41.h index 958fc11bc5..7a4bb04ad8 100755 --- a/3rdparty/simdlib/Simd/SimdSse41.h +++ b/3rdparty/simdlib/Simd/SimdSse41.h @@ -70,6 +70,9 @@ namespace Simd void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride); + + // ViSP custom SIMD code + void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff); } #endif// SIMD_SSE41_ENABLE } diff --git a/3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp b/3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp new file mode 100644 index 0000000000..f34a29329d --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp @@ -0,0 +1,69 @@ +/* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated 
documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdBase.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff) + { /* Writes to imgDiff[k] the difference of img1[k] and img2[k], shifted by 128 and clamped to [0, 255], for every k below size; 16 bytes are handled per iteration and the remainder is delegated to Base::SimdImageDifference. */ + const __m128i mask1 = _mm_set_epi8(-1, 14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0); /* even bytes, zero-extended into 16-bit lanes */ + const __m128i mask2 = _mm_set_epi8(-1, 15, -1, 13, -1, 11, -1, 9, -1, 7, -1, 5, -1, 3, -1, 1); /* odd bytes, zero-extended into 16-bit lanes */ + const __m128i mask_out2 = _mm_set_epi8(14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0, -1); /* low byte of each 16-bit lane moved back to the odd byte positions */ + + size_t i = 0; + for (; size - i >= 16; i+= 16) { /* i never exceeds size, so size - i cannot wrap; the former size-16 wrapped around for size below 16 */ + const __m128i vdata1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(img1 + i)); + const __m128i vdata2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(img2 + i)); + + __m128i vdata1_reorg = _mm_shuffle_epi8(vdata1, mask1); + __m128i vdata2_reorg = _mm_shuffle_epi8(vdata2, mask1); + + const __m128i vshift = _mm_set1_epi16(128); + __m128i vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift); + + const
__m128i v255 = _mm_set1_epi16(255); + const __m128i vzero = _mm_setzero_si128(); + const __m128i vdata_diff_min_max1 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero); + + vdata1_reorg = _mm_shuffle_epi8(vdata1, mask2); + vdata2_reorg = _mm_shuffle_epi8(vdata2, mask2); + + vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift); + const __m128i vdata_diff_min_max2 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero); + + _mm_storeu_si128(reinterpret_cast<__m128i *>(imgDiff + i), _mm_or_si128(_mm_shuffle_epi8(vdata_diff_min_max1, mask1), + _mm_shuffle_epi8(vdata_diff_min_max2, mask_out2))); + } + + if (i < size) { + Base::SimdImageDifference(img1 + i, img2 + i, size - i, imgDiff + i); + } + } + } +#else + // Workaround to avoid warning: libvisp_simdlib.a(SimdSse41CustomFunctions.cpp.o) has no symbols + void dummy_SimdSse41CustomFunctions(){} +#endif// SIMD_SSE41_ENABLE +} diff --git a/modules/io/src/image/vpImageIo.cpp b/modules/io/src/image/vpImageIo.cpp index 633503389c..ab290fa5f7 100644 --- a/modules/io/src/image/vpImageIo.cpp +++ b/modules/io/src/image/vpImageIo.cpp @@ -102,7 +102,7 @@ void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const st while (cpt_elt != nb_elt) { // Skip empty lines or lines starting with # (comment) while (std::getline(fd, line) && (line.compare(0, 1, "#") == 0 || line.size() == 0)) { - }; + } if (fd.eof()) { fd.close(); From b75d20064519c27e8e70336ebf53ce4fbad026fd Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Wed, 3 Nov 2021 18:07:41 +0100 Subject: [PATCH 10/18] WIP code to add and test image loading/saving using Simd and for JPEG and PNG image format.
--- 3rdparty/simdlib/CMakeLists.txt | 4 +- 3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp | 158 ++ 3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp | 138 + .../simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp | 351 +++ .../simdlib/Simd/SimdAvx2ImageSavePng.cpp | 369 +++ 3rdparty/simdlib/Simd/SimdBase.h | 4 + 3rdparty/simdlib/Simd/SimdBaseCrc32.cpp | 978 +++++++ 3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp | 371 +++ .../simdlib/Simd/SimdBaseImageLoadJpeg.cpp | 2456 +++++++++++++++++ .../simdlib/Simd/SimdBaseImageLoadPng.cpp | 1317 +++++++++ 3rdparty/simdlib/Simd/SimdBaseImageSave.cpp | 340 +++ .../simdlib/Simd/SimdBaseImageSaveJpeg.cpp | 451 +++ .../simdlib/Simd/SimdBaseImageSavePng.cpp | 379 +++ 3rdparty/simdlib/Simd/SimdImageLoad.h | 396 +++ 3rdparty/simdlib/Simd/SimdImageSave.h | 386 +++ 3rdparty/simdlib/Simd/SimdImageSaveJpeg.h | 649 +++++ 3rdparty/simdlib/Simd/SimdImageSavePng.h | 235 ++ 3rdparty/simdlib/Simd/SimdLib.cpp | 32 +- 3rdparty/simdlib/Simd/SimdLib.h | 109 +- 3rdparty/simdlib/Simd/SimdMath.h | 5 + 3rdparty/simdlib/Simd/SimdMemory.h | 19 + 3rdparty/simdlib/Simd/SimdMemoryStream.h | 510 ++++ 3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp | 154 ++ 3rdparty/simdlib/Simd/SimdNeonImageSave.cpp | 134 + 3rdparty/simdlib/Simd/SimdPerformance.h | 197 ++ 3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp | 159 ++ .../simdlib/Simd/SimdSse41ImageLoadPng.cpp | 1805 ++++++++++++ 3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp | 139 + .../simdlib/Simd/SimdSse41ImageSaveJpeg.cpp | 431 +++ .../simdlib/Simd/SimdSse41ImageSavePng.cpp | 370 +++ 3rdparty/simdlib/Simd/SimdView.hpp | 209 +- CMakeLists.txt | 2 + modules/io/CMakeLists.txt | 14 +- modules/io/include/visp3/io/vpImageIo.h | 8 + modules/io/src/image/vpImageIo.cpp | 63 + modules/io/test/perfImageLoadSave.cpp | 461 ++++ 36 files changed, 13646 insertions(+), 157 deletions(-) create mode 100644 3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp create mode 100644 3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp create mode 100644 
3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp create mode 100644 3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseCrc32.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageSave.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp create mode 100644 3rdparty/simdlib/Simd/SimdImageLoad.h create mode 100644 3rdparty/simdlib/Simd/SimdImageSave.h create mode 100644 3rdparty/simdlib/Simd/SimdImageSaveJpeg.h create mode 100644 3rdparty/simdlib/Simd/SimdImageSavePng.h create mode 100644 3rdparty/simdlib/Simd/SimdMemoryStream.h create mode 100644 3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp create mode 100644 3rdparty/simdlib/Simd/SimdNeonImageSave.cpp create mode 100644 3rdparty/simdlib/Simd/SimdPerformance.h create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp create mode 100644 modules/io/test/perfImageLoadSave.cpp diff --git a/3rdparty/simdlib/CMakeLists.txt b/3rdparty/simdlib/CMakeLists.txt index 1acb1341be..95b3358ad2 100644 --- a/3rdparty/simdlib/CMakeLists.txt +++ b/3rdparty/simdlib/CMakeLists.txt @@ -93,9 +93,9 @@ if(X86 OR X86_64) file(GLOB_RECURSE SIMD_AVX2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx2*.cpp) if(MSVC) - set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}") + set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}") #[[ the GCC-only -mno-avx256-split-unaligned-load/-store options are not passed to MSVC; they are added in the else() branch only ]]
else() - set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mfma") + set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mfma -mbmi -mbmi2 -mlzcnt -fabi-version=4 -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store") endif() set(SIMD_LIB_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}") diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp new file mode 100644 index 0000000000..aad4785761 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp @@ -0,0 +1,158 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE.
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageLoad.h" +#include "Simd/SimdAvx2.h" + +#include <memory> + +namespace Simd +{ +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param) + : Sse41::ImagePgmTxtLoader(param) + { + } + + void ImagePgmTxtLoader::SetConverters() + { /* Installs the Sse41 converters first, then overrides them with Avx2 kernels when the image width is at least A. */ + Sse41::ImagePgmTxtLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _toAny = Avx2::GrayToBgr; break; + case SimdPixelFormatBgra32: _toBgra = Avx2::GrayToBgra; break; + case SimdPixelFormatRgb24: _toAny = Avx2::GrayToBgr; break; + case SimdPixelFormatRgba32: _toBgra = Avx2::GrayToBgra; break; + default: break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param) + : Sse41::ImagePgmBinLoader(param) + { + } + + void ImagePgmBinLoader::SetConverters() + { /* Same scheme as the text PGM loader: Sse41 defaults, Avx2 overrides for width at least A. */ + Sse41::ImagePgmBinLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _toAny = Avx2::GrayToBgr; break; + case SimdPixelFormatBgra32: _toBgra = Avx2::GrayToBgra; break; + case SimdPixelFormatRgb24: _toAny = Avx2::GrayToBgr; break; + case SimdPixelFormatRgba32: _toBgra = Avx2::GrayToBgra; break; + default: break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param) + : Sse41::ImagePpmTxtLoader(param) + { + } + + void ImagePpmTxtLoader::SetConverters() + { /* Sse41 defaults, then Avx2 overrides for width at least A. */ + Sse41::ImagePpmTxtLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _toAny = Avx2::RgbToGray; break; + case SimdPixelFormatBgr24: _toAny = Avx2::BgrToRgb; break; + case SimdPixelFormatBgra32: _toBgra = Avx2::RgbToBgra; break; + case SimdPixelFormatRgba32: _toBgra = Avx2::BgrToBgra; break; + default: break; + } + } + } + +
//--------------------------------------------------------------------- + + ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param) + : Sse41::ImagePpmBinLoader(param) + { + } + + void ImagePpmBinLoader::SetConverters() + { /* Sse41 defaults, then Avx2 overrides for width at least A. */ + Sse41::ImagePpmBinLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _toAny = Avx2::RgbToGray; break; + case SimdPixelFormatBgr24: _toAny = Avx2::BgrToRgb; break; + case SimdPixelFormatBgra32: _toBgra = Avx2::RgbToBgra; break; + case SimdPixelFormatRgba32: _toBgra = Avx2::BgrToBgra; break; + default: break; + } + } + } + + //--------------------------------------------------------------------- + + ImageLoader* CreateImageLoader(const ImageLoaderParam& param) + { /* Returns a newly allocated loader selected by param.file, or NULL for any other file type; PNG and JPEG use the Sse41 and Base loaders. */ + switch (param.file) + { + case SimdImageFilePgmTxt: return new ImagePgmTxtLoader(param); + case SimdImageFilePgmBin: return new ImagePgmBinLoader(param); + case SimdImageFilePpmTxt: return new ImagePpmTxtLoader(param); + case SimdImageFilePpmBin: return new ImagePpmBinLoader(param); + case SimdImageFilePng: return new Sse41::ImagePngLoader(param); + case SimdImageFileJpeg: return new Base::ImageJpegLoader(param); + default: + return NULL; + } + } + + uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format) + { /* Returns what loader->Release() yields on success; NULL when param validation, loader creation or FromStream() fails. */ + ImageLoaderParam param(data, size, *format); + if (param.Validate()) + { + Holder<ImageLoader> loader(CreateImageLoader(param)); + if (loader) + { + if (loader->FromStream()) + return loader->Release(stride, width, height, format); + } + } + return NULL; + } + } +#endif// SIMD_AVX2_ENABLE +} diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp new file mode 100644 index 0000000000..bd7e057092 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp @@ -0,0 +1,138 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar.
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdAvx2.h" + +#include <memory> + +namespace Simd +{ +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param) + : Sse41::ImagePgmTxtSaver(param) + { /* Keeps the Sse41 base set-up and overrides _convert with an Avx2 kernel when the image width is at least A. */ + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _convert = Avx2::BgrToGray; break; + case SimdPixelFormatBgra32: _convert = Avx2::BgraToGray; break; + case SimdPixelFormatRgb24: _convert = Avx2::RgbToGray; break; + case SimdPixelFormatRgba32: _convert = Avx2::RgbaToGray; break; + default: break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param) + : Sse41::ImagePgmBinSaver(param) + { + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _convert = Avx2::BgrToGray; break; + case SimdPixelFormatBgra32: _convert = Avx2::BgraToGray; break; + case SimdPixelFormatRgb24: _convert = Avx2::RgbToGray; break; + case SimdPixelFormatRgba32: _convert = Avx2::RgbaToGray; break; + default: break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param) + : Sse41::ImagePpmTxtSaver(param) + { + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _convert = Avx2::GrayToBgr; break; + case SimdPixelFormatBgr24: _convert = Avx2::BgrToRgb; break; + case SimdPixelFormatBgra32: _convert = Avx2::BgraToRgb; break; + case SimdPixelFormatRgba32: _convert = Avx2::BgraToBgr; break; + default: break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param) + : Sse41::ImagePpmBinSaver(param) + { + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _convert = Avx2::GrayToBgr; break; +
case SimdPixelFormatBgr24: _convert = Avx2::BgrToRgb; break; + case SimdPixelFormatBgra32: _convert = Avx2::BgraToRgb; break; + case SimdPixelFormatRgba32: _convert = Avx2::BgraToBgr; break; + default: break; + } + } + } + + //--------------------------------------------------------------------- + + ImageSaver* CreateImageSaver(const ImageSaverParam& param) + { /* Returns a newly allocated saver selected by param.file, or NULL for any other file type. */ + switch (param.file) + { + case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param); + case SimdImageFilePgmBin: return new ImagePgmBinSaver(param); + case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param); + case SimdImageFilePpmBin: return new ImagePpmBinSaver(param); + case SimdImageFilePng: return new ImagePngSaver(param); + case SimdImageFileJpeg: return new ImageJpegSaver(param); + default: + return NULL; + } + } + + uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size) + { /* Returns what saver->Release(size) yields on success; NULL when param validation, saver creation or ToStream() fails. */ + ImageSaverParam param(width, height, format, file, quality); + if (param.Validate()) + { + Holder<ImageSaver> saver(CreateImageSaver(param)); + if (saver) + { + if (saver->ToStream(src, stride)) + return saver->Release(size); + } + } + return NULL; + } + } +#endif// SIMD_AVX2_ENABLE +} diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp new file mode 100644 index 0000000000..2ff51e4dc1 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp @@ -0,0 +1,351 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar.
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdImageSaveJpeg.h" +#include "Simd/SimdLoad.h" +#include "Simd/SimdAvx2.h" + +namespace Simd +{ +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + const uint32_t JpegZigZagTi32[64] = { + 0, 8, 1, 2, 9, 16, 24, 17, + 10, 3, 4, 11, 18, 25, 32, 40, + 33, 26, 19, 12, 5, 6, 13, 20, + 27, 34, 41, 48, 56, 49, 42, 35, + 28, 21, 14, 7, 15, 22, 29, 36, + 43, 50, 57, 58, 51, 44, 37, 30, + 23, 31, 38, 45, 52, 59, 60, 53, + 46, 39, 47, 54, 61, 62, 55, 63 }; + + //--------------------------------------------------------------------- + + static int JpegProcessDu(Base::BitBuf& bitBuf, float* CDU, int stride, const float* fdtbl, int DC, const uint16_t HTDC[256][2], const uint16_t HTAC[256][2]) + { + SIMD_ALIGNED(32) int DUO[64], DU[64]; + JpegDct(CDU, stride, fdtbl, DUO); + union + { + uint64_t u64[1]; + uint32_t u32[2]; + uint8_t u8[8]; + } dum; + for (int i = 0, j = 0; i < 64; i += 8, j++) + { + __m256i du = _mm256_i32gather_epi32(DUO, _mm256_loadu_si256((__m256i*)(JpegZigZagTi32 + i)), 4); + dum.u8[j] = ~_mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpeq_epi32(du, Avx2::K_ZERO))); + _mm256_storeu_si256((__m256i*)(DU + i), du); + } + int diff = DU[0] - DC; + if (diff == 0) + bitBuf.Push(HTDC[0]); + else + { + uint16_t bits[2]; + Base::JpegCalcBits(diff, bits); + bitBuf.Push(HTDC[bits[1]]); + bitBuf.Push(bits); + } +#if defined(SIMD_X64_ENABLE) + if (dum.u64[0] == 0) + { + bitBuf.Push(HTAC[0x00]); + return DU[0]; + } + dum.u64[0] >>= 1; + int i = 1; + for (; dum.u64[0]; ++i, dum.u64[0] >>= 1) + { + int nrzeroes = (int)_tzcnt_u64(dum.u64[0]); + i += nrzeroes; + dum.u64[0] >>= nrzeroes; + if (nrzeroes >= 16) + { + for (int nrmarker = 16; nrmarker <= nrzeroes; nrmarker += 16) + bitBuf.Push(HTAC[0xF0]); + nrzeroes &= 15; + } + uint16_t bits[2]; + Base::JpegCalcBits(DU[i], bits); + bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]); + bitBuf.Push(bits); + } + if (i < 64) + bitBuf.Push(HTAC[0x00]); +#else + int 
end0pos = 64; + do + { + end0pos -= 8; + int mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_loadu_si256((__m256i*)(DU + end0pos)), Avx2::K_ZERO)); + if (mask) + { + end0pos += 7 - _lzcnt_u32(mask) / 4; + break; + } + } + while (end0pos > 0); + if (end0pos == 0) + { + bitBuf.Push(HTAC[0x00]); + return DU[0]; + } + for (int i = 1; i <= end0pos; ++i) + { + int startpos = i; + for (; DU[i] == 0 && i <= end0pos; ++i); + int nrzeroes = i - startpos; + if (nrzeroes >= 16) + { + int lng = nrzeroes >> 4; + int nrmarker; + for (nrmarker = 1; nrmarker <= lng; ++nrmarker) + bitBuf.Push(HTAC[0xF0]); + nrzeroes &= 15; + } + uint16_t bits[2]; + Base::JpegCalcBits(DU[i], bits); + bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]); + bitBuf.Push(bits); + } + if (end0pos != 63) + bitBuf.Push(HTAC[0x00]); +#endif + return DU[0]; + } + + SIMD_INLINE void RgbToYuvInit(__m256 k[10]) + { + k[0] = _mm256_set1_ps(+0.29900f); + k[1] = _mm256_set1_ps(+0.58700f); + k[2] = _mm256_set1_ps(+0.11400f); + k[3] = _mm256_set1_ps(-128.000f); + k[4] = _mm256_set1_ps(-0.16874f); + k[5] = _mm256_set1_ps(-0.33126f); + k[6] = _mm256_set1_ps(+0.50000f); + k[7] = _mm256_set1_ps(+0.50000f); + k[8] = _mm256_set1_ps(-0.41869f); + k[9] = _mm256_set1_ps(-0.08131f); + } + + SIMD_INLINE void RgbToYuv(const uint8_t* r, const uint8_t* g, const uint8_t* b, int stride, int height, + const __m256 k[10], float* y, float* u, float* v, int size) + { + for (int row = 0; row < size;) + { + for (int col = 0; col < size; col += 8) + { + __m256 _r = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(r + col)))); + __m256 _g = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(g + col)))); + __m256 _b = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(b + col)))); + _mm256_storeu_ps(y + col, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, k[0]), _mm256_mul_ps(_g, k[1])), _mm256_mul_ps(_b, k[2])), k[3])); + //_mm256_storeu_ps(y + col, 
_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, _yr), _mm256_mul_ps(_g, _yg)), _mm256_add_ps(_mm256_mul_ps(_b, _yb), _yt))); + _mm256_storeu_ps(u + col, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, k[4]), _mm256_mul_ps(_g, k[5])), _mm256_mul_ps(_b, k[6]))); + _mm256_storeu_ps(v + col, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, k[7]), _mm256_mul_ps(_g, k[8])), _mm256_mul_ps(_b, k[9]))); + } + if(++row < height) + r += stride, g += stride, b += stride; + y += size, u += size, v += size; + } + } + + SIMD_INLINE void GrayToY(const uint8_t* g, int stride, int height, const __m256 k[10], float* y, int size) + { + for (int row = 0; row < size;) + { + for (int col = 0; col < size; col += 8) + { + __m256 _g = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(g + col)))); + _mm256_storeu_ps(y + col, _mm256_add_ps(_g, k[3])); + } + if (++row < height) + g += stride; + y += size; + } + } + + SIMD_INLINE void SubUv(const float * src, float * dst) + { + __m256 _0_25 = _mm256_set1_ps(0.25f), s0, s1; + for (int yy = 0; yy < 8; yy += 1) + { + s0 = _mm256_add_ps(_mm256_loadu_ps(src + 0), _mm256_loadu_ps(src + 16)); + s1 = _mm256_add_ps(_mm256_loadu_ps(src + 8), _mm256_loadu_ps(src + 24)); + _mm256_storeu_ps(dst + 0, _mm256_mul_ps(PermutedHorizontalAdd(s0, s1), _0_25)); + src += 32; + dst += 8; + } + } + + void JpegWriteBlockSubs(OutputMemoryStream& stream, int width, int height, const uint8_t* red, + const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3]) + { + __m256 k[10]; + RgbToYuvInit(k); + int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2]; + int width16 = width & (~15); + bool gray = red == green && red == blue; + Base::BitBuf bitBuf; + for (int y = 0; y < height; y += 16) + { + int x = 0; + SIMD_ALIGNED(16) float Y[256], U[256], V[256]; + SIMD_ALIGNED(16) float subU[64], subV[64]; + for (; x < width16; x += 16) + { + if (gray) + GrayToY(red + x, stride, height - y, k, Y, 16); + else + RgbToYuv(red + x, green + x, 
blue + x, stride, height - y, k, Y, U, V, 16); + DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + if (gray) + Base::JpegProcessDuGrayUv(bitBuf); + else + { + SubUv(U, subU); + SubUv(V, subV); + DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac); + DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac); + } + if (bitBuf.Full()) + { + Base::WriteBits(stream, bitBuf.data, bitBuf.size); + bitBuf.Clear(); + } + } + for (; x < width; x += 16) + { + if (gray) + Base::GrayToY(red + x, stride, height - y, width - x, Y, 16); + else + Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 16); + DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + if (gray) + Base::JpegProcessDuGrayUv(bitBuf); + else + { + SubUv(U, subU); + SubUv(V, subV); + DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac); + DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac); + } + } + } + Base::WriteBits(stream, bitBuf.data, bitBuf.size); + bitBuf.Clear(); + } + + void JpegWriteBlockFull(OutputMemoryStream& stream, int width, int height, const uint8_t* red, + const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3]) + { + __m256 k[10]; + RgbToYuvInit(k); + int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2]; + int 
width8 = width & (~7); + bool gray = red == green && red == blue; + Base::BitBuf bitBuf; + for (int y = 0; y < height; y += 8) + { + int x = 0; + SIMD_ALIGNED(16) float Y[64], U[64], V[64]; + for (; x < width8; x += 8) + { + if (gray) + GrayToY(red + x, stride, height - y, k, Y, 8); + else + RgbToYuv(red + x, green + x, blue + x, stride, height - y, k, Y, U, V, 8); + DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + if (gray) + Base::JpegProcessDuGrayUv(bitBuf); + else + { + DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac); + DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac); + } + if (bitBuf.Full()) + { + Base::WriteBits(stream, bitBuf.data, bitBuf.size); + bitBuf.Clear(); + } + } + for (; x < width; x += 8) + { + if (gray) + Base::GrayToY(red + x, stride, height - y, width - x, Y, 8); + else + Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 8); + DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + if (gray) + Base::JpegProcessDuGrayUv(bitBuf); + else + { + DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac); + DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac); + } + } + Base::WriteBits(stream, bitBuf.data, bitBuf.size); + bitBuf.Clear(); + } + } + + //--------------------------------------------------------------------- + + ImageJpegSaver::ImageJpegSaver(const ImageSaverParam& param) + : Sse41::ImageJpegSaver(param) + { + } + + void ImageJpegSaver::Init() + { + Sse41::ImageJpegSaver::Init(); + if (_param.width >= 32) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: + case SimdPixelFormatRgb24: + _deintBgr = Avx2::DeinterleaveBgr; + break; + case SimdPixelFormatBgra32: + case SimdPixelFormatRgba32: + _deintBgra = Avx2::DeinterleaveBgra; + break; + default: + break; + } + } + _writeBlock = _subSample ? 
JpegWriteBlockSubs : JpegWriteBlockFull; + } + } +#endif// SIMD_AVX2_ENABLE +} diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp new file mode 100644 index 0000000000..3cfa79fc62 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp @@ -0,0 +1,369 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdImageSavePng.h" +#include "Simd/SimdAvx2.h" +#include "Simd/SimdExtract.h" + +namespace Simd +{ +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + static uint32_t ZlibAdler32(uint8_t* data, int size) + { /* Returns (hi << 16) | lo, where lo and hi are running sums over data[0..size) reduced modulo 65521 after each chunk of at most 5552 bytes. */ + __m256i _i0 = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7), _8 = _mm256_set1_epi32(8); + uint32_t lo = 1, hi = 0; + for (int b = 0, n = (int)(size % 5552); b < size;) + { + int n8 = n & (~7), i = 0; + __m256i _i = _mm256_add_epi32(_i0, _mm256_set1_epi32(n)); + __m256i _l = _mm256_setzero_si256(), _h = _mm256_setzero_si256(); + for (; i < n8; i += 8) + { + __m256i d = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(data + b + i))); + _l = _mm256_add_epi32(_l, d); + _h = _mm256_add_epi32(_h, _mm256_mullo_epi32(d, _i)); + _i = _mm256_sub_epi32(_i, _8); + } + int l = Avx2::ExtractSum<uint32_t>(_l), h = Avx2::ExtractSum<uint32_t>(_h); + for (; i < n; ++i) + { + l += data[b + i]; + h += data[b + i]*(n - i); + } + hi = (hi + h + lo*n) % 65521; + lo = (lo + l) % 65521; + b += n; + n = 5552; + } + return (hi << 16) | lo; + } + + void ZlibCompress(uint8_t* data, int size, int quality, OutputMemoryStream& stream) + { /* Writes a zlib stream for data[0..size) to stream; match candidates are kept in ZHASH hash buckets of basket entries each, with -1 marking a free slot. */ + const int ZHASH = 16384; + if (quality < 5) + quality = 5; + const int basket = quality * 2; + Array32i hashTable(ZHASH * basket); + memset(hashTable.data, -1, hashTable.RawSize()); + + stream.Write(uint8_t(0x78)); + stream.Write(uint8_t(0x5e)); + stream.WriteBits(1, 1); + stream.WriteBits(1, 2); + + int i = 0, j; + while (i < size - 3) + { + int h = Base::ZlibHash(data + i) & (ZHASH - 1), best = 3; + uint8_t* bestLoc = 0; + int* hList = hashTable.data + h * basket; + for (j = 0; j < basket && hList[j] != -1; ++j) /* bound is tested before hList[j] is read, so a full bucket never reads hList[basket] */ + { + if (hList[j] > i - 32768) + { + int d = Avx2::ZlibCount(data + hList[j], data + i, size - i); + if (d >= best) + { + best = d; + bestLoc = data + hList[j]; + } + } + } + if (j == basket) + { + memcpy(hList, hList + quality, quality * sizeof(int)); + memset(hList + quality, -1,
quality * sizeof(int)); + j = quality; + } + hList[j] = i; + + if (bestLoc) + { + h = Base::ZlibHash(data + i + 1) & (ZHASH - 1); + int* hList = hashTable.data + h * basket; + for (j = 0; j < basket && hList[j] != -1; ++j) /* same ordering as above: bound first, then the slot */ + { + if (hList[j] > i - 32767) + { + int e = Avx2::ZlibCount(data + hList[j], data + i + 1, size - i - 1); + if (e > best) + { + bestLoc = NULL; + break; + } + } + } + } + + if (bestLoc) + { + int d = (int)(data + i - bestLoc); + assert(d <= 32767 && best <= 258); + for (j = 0; best > Base::ZlibLenC[j + 1] - 1; ++j); + Base::ZlibHuff(j + 257, stream); + if (Base::ZlibLenEb[j]) + stream.WriteBits(best - Base::ZlibLenC[j], Base::ZlibLenEb[j]); + for (j = 0; d > Base::ZlibDistC[j + 1] - 1; ++j); + stream.WriteBits(Base::ZlibBitRev(j, 5), 5); + if (Base::ZlibDistEb[j]) + stream.WriteBits(d - Base::ZlibDistC[j], Base::ZlibDistEb[j]); + i += best; + } + else + { + Base::ZlibHuffB(data[i], stream); + ++i; + } + } + for (; i < size; ++i) + Base::ZlibHuffB(data[i], stream); + Base::ZlibHuff(256, stream); + stream.FlushBits(); + stream.WriteBe32u(ZlibAdler32(data, size)); + } + + uint32_t EncodeLine0(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { /* Copies src to dst unchanged and returns the sum of the absolute values of the stored signed bytes. */ + size_t i = 0, sizeA = AlignLo(size, A); + __m256i _sum = _mm256_setzero_si256(); + for (; i < sizeA; i += A) + { + __m256i _src = _mm256_loadu_si256((__m256i*)(src + i)); + _mm256_storeu_si256((__m256i*)(dst + i), _src); + _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_src))); + } + uint32_t sum = Avx2::ExtractSum<uint32_t>(_sum); + for (; i < size; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine1(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { /* Stores src[i] minus src[i - n] for i from n upwards (the first n bytes are copied) and returns the sum of absolute stored values. */ + size_t i = 0, sizeA = AlignLo(size - n, A) + n; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + __m256i _sum = _mm256_setzero_si256(); + for (; i < sizeA; i += A) + { + __m256i _src0 =
_mm256_loadu_si256((__m256i*)(src + i)); + __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n)); + __m256i _dst = _mm256_sub_epi8(_src0, _src1); + _mm256_storeu_si256((__m256i*)(dst + i), _dst); + _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst))); + } + sum += Avx2::ExtractSum<uint32_t>(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - src[i - n]; + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine2(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { /* Stores src[i] minus src[i - stride] for every byte and returns the sum of absolute stored values. */ + size_t i = 0, sizeA = AlignLo(size - n, A) + n; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = src[i] - src[i - stride]; + sum += ::abs(dst[i]); + } + __m256i _sum = _mm256_setzero_si256(); + for (; i < sizeA; i += A) + { + __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i)); + __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - stride)); + __m256i _dst = _mm256_sub_epi8(_src0, _src1); + _mm256_storeu_si256((__m256i*)(dst + i), _dst); + _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst))); + } + sum += Avx2::ExtractSum<uint32_t>(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - src[i - stride]; + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine3(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = src[i] - (src[i - stride] >> 1); + sum += ::abs(dst[i]); + } + __m256i _sum = _mm256_setzero_si256(); + for (; i < sizeA; i += A) + { + __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i)); + __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n)); + __m256i _src2 = _mm256_loadu_si256((__m256i*)(src + i - stride)); + __m256i lo = _mm256_srli_epi16(_mm256_add_epi16(UnpackU8<0>(_src1), UnpackU8<0>(_src2)), 1); + __m256i hi = _mm256_srli_epi16(_mm256_add_epi16(UnpackU8<1>(_src1), UnpackU8<1>(_src2)), 1); + __m256i _dst =
_mm256_sub_epi8(_src0, _mm256_packus_epi16(lo, hi)); + _mm256_storeu_si256((__m256i*)(dst + i), _dst); + _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst))); + } + sum += Avx2::ExtractSum(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - ((src[i - n] + src[i - stride]) >> 1); + sum += ::abs(dst[i]); + } + return sum; + } + + SIMD_INLINE __m256i Paeth(__m256i a, __m256i b, __m256i c) + { + __m256i p = _mm256_sub_epi16(_mm256_add_epi16(a, b), c); + __m256i pa = _mm256_abs_epi16(_mm256_sub_epi16(p, a)); + __m256i pb = _mm256_abs_epi16(_mm256_sub_epi16(p, b)); + __m256i pc = _mm256_abs_epi16(_mm256_sub_epi16(p, c)); + __m256i mbc = _mm256_or_si256(_mm256_cmpgt_epi16(pa, pb), _mm256_cmpgt_epi16(pa, pc)); + __m256i mc = _mm256_cmpgt_epi16(pb, pc); + return _mm256_blendv_epi8(a, _mm256_blendv_epi8(b, c, mc), mbc); + } + + uint32_t EncodeLine4(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = (int8_t)(src[i] - src[i - stride]); + sum += ::abs(dst[i]); + } + __m256i _sum = _mm256_setzero_si256(); + for (; i < sizeA; i += A) + { + __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i)); + __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n)); + __m256i _src2 = _mm256_loadu_si256((__m256i*)(src + i - stride)); + __m256i _src3 = _mm256_loadu_si256((__m256i*)(src + i - stride - n)); + __m256i lo = Paeth(UnpackU8<0>(_src1), UnpackU8<0>(_src2), UnpackU8<0>(_src3)); + __m256i hi = Paeth(UnpackU8<1>(_src1), UnpackU8<1>(_src2), UnpackU8<1>(_src3)); + __m256i _dst = _mm256_sub_epi8(_src0, _mm256_packus_epi16(lo, hi)); + _mm256_storeu_si256((__m256i*)(dst + i), _dst); + _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst))); + } + sum += Avx2::ExtractSum(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - Base::Paeth(src[i - n], src[i - stride], src[i - stride - 
n]); + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine5(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + __m256i _sum = _mm256_setzero_si256(); + for (; i < sizeA; i += A) + { + __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i)); + __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n)); + __m256i lo = _mm256_srli_epi16(UnpackU8<0>(_src1), 1); + __m256i hi = _mm256_srli_epi16(UnpackU8<1>(_src1), 1); + __m256i _dst = _mm256_sub_epi8(_src0, _mm256_packus_epi16(lo, hi)); + _mm256_storeu_si256((__m256i*)(dst + i), _dst); + _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst))); + } + sum += Avx2::ExtractSum(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - (src[i - n] >> 1); + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine6(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + __m256i _sum = _mm256_setzero_si256(); + for (; i < sizeA; i += A) + { + __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i)); + __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n)); + __m256i _dst = _mm256_sub_epi8(_src0, _src1); + _mm256_storeu_si256((__m256i*)(dst + i), _dst); + _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst))); + } + sum += Avx2::ExtractSum(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - src[i - n]; + sum += ::abs(dst[i]); + } + return sum; + } + + ImagePngSaver::ImagePngSaver(const ImageSaverParam& param) + : Sse41::ImagePngSaver(param) + { + if (_param.format == SimdPixelFormatBgr24) + _convert = Avx2::BgrToRgb; + else if (_param.format == SimdPixelFormatBgra32) + _convert = Avx2::BgraToRgba; 
+ _encode[0] = Avx2::EncodeLine0; + _encode[1] = Avx2::EncodeLine1; + _encode[2] = Avx2::EncodeLine2; + _encode[3] = Avx2::EncodeLine3; + _encode[4] = Avx2::EncodeLine4; + _encode[5] = Avx2::EncodeLine5; + _encode[6] = Avx2::EncodeLine6; + _compress = Avx2::ZlibCompress; + } + } +#endif// SIMD_AVX2_ENABLE +} diff --git a/3rdparty/simdlib/Simd/SimdBase.h b/3rdparty/simdlib/Simd/SimdBase.h index 998a7b7cbe..3ad6e60d96 100755 --- a/3rdparty/simdlib/Simd/SimdBase.h +++ b/3rdparty/simdlib/Simd/SimdBase.h @@ -32,6 +32,10 @@ namespace Simd { namespace Base { + uint32_t Crc32(const void* src, size_t size); + + uint32_t Crc32c(const void * src, size_t size); + void BgraToBgr(const uint8_t * bgra, size_t size, uint8_t * bgr, bool lastRow); void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride); diff --git a/3rdparty/simdlib/Simd/SimdBaseCrc32.cpp b/3rdparty/simdlib/Simd/SimdBaseCrc32.cpp new file mode 100644 index 0000000000..4008b0f0d8 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdBaseCrc32.cpp @@ -0,0 +1,978 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdDefs.h" + +namespace Simd +{ + namespace Base + { + static SIMD_INLINE uint32_t Reorder32(uint32_t x) + { +#if defined(__GNUC__) || defined(__clang__) + return __builtin_bswap32(x); +#else + return (x >> 24) | + ((x >> 8) & 0x0000FF00) | + ((x << 8) & 0x00FF0000) | + (x << 24); +#endif + } + + // Precalculated CRC32c lookup table for polynomial 0xEDB88320. + static const uint32_t Crc32Table[16][256] = + { + { + 0x00000000,0x77073096,0xEE0E612C,0x990951BA,0x076DC419,0x706AF48F,0xE963A535,0x9E6495A3, + 0x0EDB8832,0x79DCB8A4,0xE0D5E91E,0x97D2D988,0x09B64C2B,0x7EB17CBD,0xE7B82D07,0x90BF1D91, + 0x1DB71064,0x6AB020F2,0xF3B97148,0x84BE41DE,0x1ADAD47D,0x6DDDE4EB,0xF4D4B551,0x83D385C7, + 0x136C9856,0x646BA8C0,0xFD62F97A,0x8A65C9EC,0x14015C4F,0x63066CD9,0xFA0F3D63,0x8D080DF5, + 0x3B6E20C8,0x4C69105E,0xD56041E4,0xA2677172,0x3C03E4D1,0x4B04D447,0xD20D85FD,0xA50AB56B, + 0x35B5A8FA,0x42B2986C,0xDBBBC9D6,0xACBCF940,0x32D86CE3,0x45DF5C75,0xDCD60DCF,0xABD13D59, + 0x26D930AC,0x51DE003A,0xC8D75180,0xBFD06116,0x21B4F4B5,0x56B3C423,0xCFBA9599,0xB8BDA50F, + 0x2802B89E,0x5F058808,0xC60CD9B2,0xB10BE924,0x2F6F7C87,0x58684C11,0xC1611DAB,0xB6662D3D, + 0x76DC4190,0x01DB7106,0x98D220BC,0xEFD5102A,0x71B18589,0x06B6B51F,0x9FBFE4A5,0xE8B8D433, + 0x7807C9A2,0x0F00F934,0x9609A88E,0xE10E9818,0x7F6A0DBB,0x086D3D2D,0x91646C97,0xE6635C01, + 0x6B6B51F4,0x1C6C6162,0x856530D8,0xF262004E,0x6C0695ED,0x1B01A57B,0x8208F4C1,0xF50FC457, + 0x65B0D9C6,0x12B7E950,0x8BBEB8EA,0xFCB9887C,0x62DD1DDF,0x15DA2D49,0x8CD37CF3,0xFBD44C65, + 0x4DB26158,0x3AB551CE,0xA3BC0074,0xD4BB30E2,0x4ADFA541,0x3DD895D7,0xA4D1C46D,0xD3D6F4FB, + 0x4369E96A,0x346ED9FC,0xAD678846,0xDA60B8D0,0x44042D73,0x33031DE5,0xAA0A4C5F,0xDD0D7CC9, + 
0x5005713C,0x270241AA,0xBE0B1010,0xC90C2086,0x5768B525,0x206F85B3,0xB966D409,0xCE61E49F, + 0x5EDEF90E,0x29D9C998,0xB0D09822,0xC7D7A8B4,0x59B33D17,0x2EB40D81,0xB7BD5C3B,0xC0BA6CAD, + 0xEDB88320,0x9ABFB3B6,0x03B6E20C,0x74B1D29A,0xEAD54739,0x9DD277AF,0x04DB2615,0x73DC1683, + 0xE3630B12,0x94643B84,0x0D6D6A3E,0x7A6A5AA8,0xE40ECF0B,0x9309FF9D,0x0A00AE27,0x7D079EB1, + 0xF00F9344,0x8708A3D2,0x1E01F268,0x6906C2FE,0xF762575D,0x806567CB,0x196C3671,0x6E6B06E7, + 0xFED41B76,0x89D32BE0,0x10DA7A5A,0x67DD4ACC,0xF9B9DF6F,0x8EBEEFF9,0x17B7BE43,0x60B08ED5, + 0xD6D6A3E8,0xA1D1937E,0x38D8C2C4,0x4FDFF252,0xD1BB67F1,0xA6BC5767,0x3FB506DD,0x48B2364B, + 0xD80D2BDA,0xAF0A1B4C,0x36034AF6,0x41047A60,0xDF60EFC3,0xA867DF55,0x316E8EEF,0x4669BE79, + 0xCB61B38C,0xBC66831A,0x256FD2A0,0x5268E236,0xCC0C7795,0xBB0B4703,0x220216B9,0x5505262F, + 0xC5BA3BBE,0xB2BD0B28,0x2BB45A92,0x5CB36A04,0xC2D7FFA7,0xB5D0CF31,0x2CD99E8B,0x5BDEAE1D, + 0x9B64C2B0,0xEC63F226,0x756AA39C,0x026D930A,0x9C0906A9,0xEB0E363F,0x72076785,0x05005713, + 0x95BF4A82,0xE2B87A14,0x7BB12BAE,0x0CB61B38,0x92D28E9B,0xE5D5BE0D,0x7CDCEFB7,0x0BDBDF21, + 0x86D3D2D4,0xF1D4E242,0x68DDB3F8,0x1FDA836E,0x81BE16CD,0xF6B9265B,0x6FB077E1,0x18B74777, + 0x88085AE6,0xFF0F6A70,0x66063BCA,0x11010B5C,0x8F659EFF,0xF862AE69,0x616BFFD3,0x166CCF45, + 0xA00AE278,0xD70DD2EE,0x4E048354,0x3903B3C2,0xA7672661,0xD06016F7,0x4969474D,0x3E6E77DB, + 0xAED16A4A,0xD9D65ADC,0x40DF0B66,0x37D83BF0,0xA9BCAE53,0xDEBB9EC5,0x47B2CF7F,0x30B5FFE9, + 0xBDBDF21C,0xCABAC28A,0x53B39330,0x24B4A3A6,0xBAD03605,0xCDD70693,0x54DE5729,0x23D967BF, + 0xB3667A2E,0xC4614AB8,0x5D681B02,0x2A6F2B94,0xB40BBE37,0xC30C8EA1,0x5A05DF1B,0x2D02EF8D, + }, + { + 0x00000000,0x191B3141,0x32366282,0x2B2D53C3,0x646CC504,0x7D77F445,0x565AA786,0x4F4196C7, + 0xC8D98A08,0xD1C2BB49,0xFAEFE88A,0xE3F4D9CB,0xACB54F0C,0xB5AE7E4D,0x9E832D8E,0x87981CCF, + 0x4AC21251,0x53D92310,0x78F470D3,0x61EF4192,0x2EAED755,0x37B5E614,0x1C98B5D7,0x05838496, + 
0x821B9859,0x9B00A918,0xB02DFADB,0xA936CB9A,0xE6775D5D,0xFF6C6C1C,0xD4413FDF,0xCD5A0E9E, + 0x958424A2,0x8C9F15E3,0xA7B24620,0xBEA97761,0xF1E8E1A6,0xE8F3D0E7,0xC3DE8324,0xDAC5B265, + 0x5D5DAEAA,0x44469FEB,0x6F6BCC28,0x7670FD69,0x39316BAE,0x202A5AEF,0x0B07092C,0x121C386D, + 0xDF4636F3,0xC65D07B2,0xED705471,0xF46B6530,0xBB2AF3F7,0xA231C2B6,0x891C9175,0x9007A034, + 0x179FBCFB,0x0E848DBA,0x25A9DE79,0x3CB2EF38,0x73F379FF,0x6AE848BE,0x41C51B7D,0x58DE2A3C, + 0xF0794F05,0xE9627E44,0xC24F2D87,0xDB541CC6,0x94158A01,0x8D0EBB40,0xA623E883,0xBF38D9C2, + 0x38A0C50D,0x21BBF44C,0x0A96A78F,0x138D96CE,0x5CCC0009,0x45D73148,0x6EFA628B,0x77E153CA, + 0xBABB5D54,0xA3A06C15,0x888D3FD6,0x91960E97,0xDED79850,0xC7CCA911,0xECE1FAD2,0xF5FACB93, + 0x7262D75C,0x6B79E61D,0x4054B5DE,0x594F849F,0x160E1258,0x0F152319,0x243870DA,0x3D23419B, + 0x65FD6BA7,0x7CE65AE6,0x57CB0925,0x4ED03864,0x0191AEA3,0x188A9FE2,0x33A7CC21,0x2ABCFD60, + 0xAD24E1AF,0xB43FD0EE,0x9F12832D,0x8609B26C,0xC94824AB,0xD05315EA,0xFB7E4629,0xE2657768, + 0x2F3F79F6,0x362448B7,0x1D091B74,0x04122A35,0x4B53BCF2,0x52488DB3,0x7965DE70,0x607EEF31, + 0xE7E6F3FE,0xFEFDC2BF,0xD5D0917C,0xCCCBA03D,0x838A36FA,0x9A9107BB,0xB1BC5478,0xA8A76539, + 0x3B83984B,0x2298A90A,0x09B5FAC9,0x10AECB88,0x5FEF5D4F,0x46F46C0E,0x6DD93FCD,0x74C20E8C, + 0xF35A1243,0xEA412302,0xC16C70C1,0xD8774180,0x9736D747,0x8E2DE606,0xA500B5C5,0xBC1B8484, + 0x71418A1A,0x685ABB5B,0x4377E898,0x5A6CD9D9,0x152D4F1E,0x0C367E5F,0x271B2D9C,0x3E001CDD, + 0xB9980012,0xA0833153,0x8BAE6290,0x92B553D1,0xDDF4C516,0xC4EFF457,0xEFC2A794,0xF6D996D5, + 0xAE07BCE9,0xB71C8DA8,0x9C31DE6B,0x852AEF2A,0xCA6B79ED,0xD37048AC,0xF85D1B6F,0xE1462A2E, + 0x66DE36E1,0x7FC507A0,0x54E85463,0x4DF36522,0x02B2F3E5,0x1BA9C2A4,0x30849167,0x299FA026, + 0xE4C5AEB8,0xFDDE9FF9,0xD6F3CC3A,0xCFE8FD7B,0x80A96BBC,0x99B25AFD,0xB29F093E,0xAB84387F, + 0x2C1C24B0,0x350715F1,0x1E2A4632,0x07317773,0x4870E1B4,0x516BD0F5,0x7A468336,0x635DB277, + 0xCBFAD74E,0xD2E1E60F,0xF9CCB5CC,0xE0D7848D,0xAF96124A,0xB68D230B,0x9DA070C8,0x84BB4189, 
+ 0x03235D46,0x1A386C07,0x31153FC4,0x280E0E85,0x674F9842,0x7E54A903,0x5579FAC0,0x4C62CB81, + 0x8138C51F,0x9823F45E,0xB30EA79D,0xAA1596DC,0xE554001B,0xFC4F315A,0xD7626299,0xCE7953D8, + 0x49E14F17,0x50FA7E56,0x7BD72D95,0x62CC1CD4,0x2D8D8A13,0x3496BB52,0x1FBBE891,0x06A0D9D0, + 0x5E7EF3EC,0x4765C2AD,0x6C48916E,0x7553A02F,0x3A1236E8,0x230907A9,0x0824546A,0x113F652B, + 0x96A779E4,0x8FBC48A5,0xA4911B66,0xBD8A2A27,0xF2CBBCE0,0xEBD08DA1,0xC0FDDE62,0xD9E6EF23, + 0x14BCE1BD,0x0DA7D0FC,0x268A833F,0x3F91B27E,0x70D024B9,0x69CB15F8,0x42E6463B,0x5BFD777A, + 0xDC656BB5,0xC57E5AF4,0xEE530937,0xF7483876,0xB809AEB1,0xA1129FF0,0x8A3FCC33,0x9324FD72, + }, + { + 0x00000000,0x01C26A37,0x0384D46E,0x0246BE59,0x0709A8DC,0x06CBC2EB,0x048D7CB2,0x054F1685, + 0x0E1351B8,0x0FD13B8F,0x0D9785D6,0x0C55EFE1,0x091AF964,0x08D89353,0x0A9E2D0A,0x0B5C473D, + 0x1C26A370,0x1DE4C947,0x1FA2771E,0x1E601D29,0x1B2F0BAC,0x1AED619B,0x18ABDFC2,0x1969B5F5, + 0x1235F2C8,0x13F798FF,0x11B126A6,0x10734C91,0x153C5A14,0x14FE3023,0x16B88E7A,0x177AE44D, + 0x384D46E0,0x398F2CD7,0x3BC9928E,0x3A0BF8B9,0x3F44EE3C,0x3E86840B,0x3CC03A52,0x3D025065, + 0x365E1758,0x379C7D6F,0x35DAC336,0x3418A901,0x3157BF84,0x3095D5B3,0x32D36BEA,0x331101DD, + 0x246BE590,0x25A98FA7,0x27EF31FE,0x262D5BC9,0x23624D4C,0x22A0277B,0x20E69922,0x2124F315, + 0x2A78B428,0x2BBADE1F,0x29FC6046,0x283E0A71,0x2D711CF4,0x2CB376C3,0x2EF5C89A,0x2F37A2AD, + 0x709A8DC0,0x7158E7F7,0x731E59AE,0x72DC3399,0x7793251C,0x76514F2B,0x7417F172,0x75D59B45, + 0x7E89DC78,0x7F4BB64F,0x7D0D0816,0x7CCF6221,0x798074A4,0x78421E93,0x7A04A0CA,0x7BC6CAFD, + 0x6CBC2EB0,0x6D7E4487,0x6F38FADE,0x6EFA90E9,0x6BB5866C,0x6A77EC5B,0x68315202,0x69F33835, + 0x62AF7F08,0x636D153F,0x612BAB66,0x60E9C151,0x65A6D7D4,0x6464BDE3,0x662203BA,0x67E0698D, + 0x48D7CB20,0x4915A117,0x4B531F4E,0x4A917579,0x4FDE63FC,0x4E1C09CB,0x4C5AB792,0x4D98DDA5, + 0x46C49A98,0x4706F0AF,0x45404EF6,0x448224C1,0x41CD3244,0x400F5873,0x4249E62A,0x438B8C1D, + 
0x54F16850,0x55330267,0x5775BC3E,0x56B7D609,0x53F8C08C,0x523AAABB,0x507C14E2,0x51BE7ED5, + 0x5AE239E8,0x5B2053DF,0x5966ED86,0x58A487B1,0x5DEB9134,0x5C29FB03,0x5E6F455A,0x5FAD2F6D, + 0xE1351B80,0xE0F771B7,0xE2B1CFEE,0xE373A5D9,0xE63CB35C,0xE7FED96B,0xE5B86732,0xE47A0D05, + 0xEF264A38,0xEEE4200F,0xECA29E56,0xED60F461,0xE82FE2E4,0xE9ED88D3,0xEBAB368A,0xEA695CBD, + 0xFD13B8F0,0xFCD1D2C7,0xFE976C9E,0xFF5506A9,0xFA1A102C,0xFBD87A1B,0xF99EC442,0xF85CAE75, + 0xF300E948,0xF2C2837F,0xF0843D26,0xF1465711,0xF4094194,0xF5CB2BA3,0xF78D95FA,0xF64FFFCD, + 0xD9785D60,0xD8BA3757,0xDAFC890E,0xDB3EE339,0xDE71F5BC,0xDFB39F8B,0xDDF521D2,0xDC374BE5, + 0xD76B0CD8,0xD6A966EF,0xD4EFD8B6,0xD52DB281,0xD062A404,0xD1A0CE33,0xD3E6706A,0xD2241A5D, + 0xC55EFE10,0xC49C9427,0xC6DA2A7E,0xC7184049,0xC25756CC,0xC3953CFB,0xC1D382A2,0xC011E895, + 0xCB4DAFA8,0xCA8FC59F,0xC8C97BC6,0xC90B11F1,0xCC440774,0xCD866D43,0xCFC0D31A,0xCE02B92D, + 0x91AF9640,0x906DFC77,0x922B422E,0x93E92819,0x96A63E9C,0x976454AB,0x9522EAF2,0x94E080C5, + 0x9FBCC7F8,0x9E7EADCF,0x9C381396,0x9DFA79A1,0x98B56F24,0x99770513,0x9B31BB4A,0x9AF3D17D, + 0x8D893530,0x8C4B5F07,0x8E0DE15E,0x8FCF8B69,0x8A809DEC,0x8B42F7DB,0x89044982,0x88C623B5, + 0x839A6488,0x82580EBF,0x801EB0E6,0x81DCDAD1,0x8493CC54,0x8551A663,0x8717183A,0x86D5720D, + 0xA9E2D0A0,0xA820BA97,0xAA6604CE,0xABA46EF9,0xAEEB787C,0xAF29124B,0xAD6FAC12,0xACADC625, + 0xA7F18118,0xA633EB2F,0xA4755576,0xA5B73F41,0xA0F829C4,0xA13A43F3,0xA37CFDAA,0xA2BE979D, + 0xB5C473D0,0xB40619E7,0xB640A7BE,0xB782CD89,0xB2CDDB0C,0xB30FB13B,0xB1490F62,0xB08B6555, + 0xBBD72268,0xBA15485F,0xB853F606,0xB9919C31,0xBCDE8AB4,0xBD1CE083,0xBF5A5EDA,0xBE9834ED, + }, + { + 0x00000000,0xB8BC6765,0xAA09C88B,0x12B5AFEE,0x8F629757,0x37DEF032,0x256B5FDC,0x9DD738B9, + 0xC5B428EF,0x7D084F8A,0x6FBDE064,0xD7018701,0x4AD6BFB8,0xF26AD8DD,0xE0DF7733,0x58631056, + 0x5019579F,0xE8A530FA,0xFA109F14,0x42ACF871,0xDF7BC0C8,0x67C7A7AD,0x75720843,0xCDCE6F26, + 
0x95AD7F70,0x2D111815,0x3FA4B7FB,0x8718D09E,0x1ACFE827,0xA2738F42,0xB0C620AC,0x087A47C9, + 0xA032AF3E,0x188EC85B,0x0A3B67B5,0xB28700D0,0x2F503869,0x97EC5F0C,0x8559F0E2,0x3DE59787, + 0x658687D1,0xDD3AE0B4,0xCF8F4F5A,0x7733283F,0xEAE41086,0x525877E3,0x40EDD80D,0xF851BF68, + 0xF02BF8A1,0x48979FC4,0x5A22302A,0xE29E574F,0x7F496FF6,0xC7F50893,0xD540A77D,0x6DFCC018, + 0x359FD04E,0x8D23B72B,0x9F9618C5,0x272A7FA0,0xBAFD4719,0x0241207C,0x10F48F92,0xA848E8F7, + 0x9B14583D,0x23A83F58,0x311D90B6,0x89A1F7D3,0x1476CF6A,0xACCAA80F,0xBE7F07E1,0x06C36084, + 0x5EA070D2,0xE61C17B7,0xF4A9B859,0x4C15DF3C,0xD1C2E785,0x697E80E0,0x7BCB2F0E,0xC377486B, + 0xCB0D0FA2,0x73B168C7,0x6104C729,0xD9B8A04C,0x446F98F5,0xFCD3FF90,0xEE66507E,0x56DA371B, + 0x0EB9274D,0xB6054028,0xA4B0EFC6,0x1C0C88A3,0x81DBB01A,0x3967D77F,0x2BD27891,0x936E1FF4, + 0x3B26F703,0x839A9066,0x912F3F88,0x299358ED,0xB4446054,0x0CF80731,0x1E4DA8DF,0xA6F1CFBA, + 0xFE92DFEC,0x462EB889,0x549B1767,0xEC277002,0x71F048BB,0xC94C2FDE,0xDBF98030,0x6345E755, + 0x6B3FA09C,0xD383C7F9,0xC1366817,0x798A0F72,0xE45D37CB,0x5CE150AE,0x4E54FF40,0xF6E89825, + 0xAE8B8873,0x1637EF16,0x048240F8,0xBC3E279D,0x21E91F24,0x99557841,0x8BE0D7AF,0x335CB0CA, + 0xED59B63B,0x55E5D15E,0x47507EB0,0xFFEC19D5,0x623B216C,0xDA874609,0xC832E9E7,0x708E8E82, + 0x28ED9ED4,0x9051F9B1,0x82E4565F,0x3A58313A,0xA78F0983,0x1F336EE6,0x0D86C108,0xB53AA66D, + 0xBD40E1A4,0x05FC86C1,0x1749292F,0xAFF54E4A,0x322276F3,0x8A9E1196,0x982BBE78,0x2097D91D, + 0x78F4C94B,0xC048AE2E,0xD2FD01C0,0x6A4166A5,0xF7965E1C,0x4F2A3979,0x5D9F9697,0xE523F1F2, + 0x4D6B1905,0xF5D77E60,0xE762D18E,0x5FDEB6EB,0xC2098E52,0x7AB5E937,0x680046D9,0xD0BC21BC, + 0x88DF31EA,0x3063568F,0x22D6F961,0x9A6A9E04,0x07BDA6BD,0xBF01C1D8,0xADB46E36,0x15080953, + 0x1D724E9A,0xA5CE29FF,0xB77B8611,0x0FC7E174,0x9210D9CD,0x2AACBEA8,0x38191146,0x80A57623, + 0xD8C66675,0x607A0110,0x72CFAEFE,0xCA73C99B,0x57A4F122,0xEF189647,0xFDAD39A9,0x45115ECC, + 0x764DEE06,0xCEF18963,0xDC44268D,0x64F841E8,0xF92F7951,0x41931E34,0x5326B1DA,0xEB9AD6BF, 
+ 0xB3F9C6E9,0x0B45A18C,0x19F00E62,0xA14C6907,0x3C9B51BE,0x842736DB,0x96929935,0x2E2EFE50, + 0x2654B999,0x9EE8DEFC,0x8C5D7112,0x34E11677,0xA9362ECE,0x118A49AB,0x033FE645,0xBB838120, + 0xE3E09176,0x5B5CF613,0x49E959FD,0xF1553E98,0x6C820621,0xD43E6144,0xC68BCEAA,0x7E37A9CF, + 0xD67F4138,0x6EC3265D,0x7C7689B3,0xC4CAEED6,0x591DD66F,0xE1A1B10A,0xF3141EE4,0x4BA87981, + 0x13CB69D7,0xAB770EB2,0xB9C2A15C,0x017EC639,0x9CA9FE80,0x241599E5,0x36A0360B,0x8E1C516E, + 0x866616A7,0x3EDA71C2,0x2C6FDE2C,0x94D3B949,0x090481F0,0xB1B8E695,0xA30D497B,0x1BB12E1E, + 0x43D23E48,0xFB6E592D,0xE9DBF6C3,0x516791A6,0xCCB0A91F,0x740CCE7A,0x66B96194,0xDE0506F1, + }, + { + 0x00000000,0x3D6029B0,0x7AC05360,0x47A07AD0,0xF580A6C0,0xC8E08F70,0x8F40F5A0,0xB220DC10, + 0x30704BC1,0x0D106271,0x4AB018A1,0x77D03111,0xC5F0ED01,0xF890C4B1,0xBF30BE61,0x825097D1, + 0x60E09782,0x5D80BE32,0x1A20C4E2,0x2740ED52,0x95603142,0xA80018F2,0xEFA06222,0xD2C04B92, + 0x5090DC43,0x6DF0F5F3,0x2A508F23,0x1730A693,0xA5107A83,0x98705333,0xDFD029E3,0xE2B00053, + 0xC1C12F04,0xFCA106B4,0xBB017C64,0x866155D4,0x344189C4,0x0921A074,0x4E81DAA4,0x73E1F314, + 0xF1B164C5,0xCCD14D75,0x8B7137A5,0xB6111E15,0x0431C205,0x3951EBB5,0x7EF19165,0x4391B8D5, + 0xA121B886,0x9C419136,0xDBE1EBE6,0xE681C256,0x54A11E46,0x69C137F6,0x2E614D26,0x13016496, + 0x9151F347,0xAC31DAF7,0xEB91A027,0xD6F18997,0x64D15587,0x59B17C37,0x1E1106E7,0x23712F57, + 0x58F35849,0x659371F9,0x22330B29,0x1F532299,0xAD73FE89,0x9013D739,0xD7B3ADE9,0xEAD38459, + 0x68831388,0x55E33A38,0x124340E8,0x2F236958,0x9D03B548,0xA0639CF8,0xE7C3E628,0xDAA3CF98, + 0x3813CFCB,0x0573E67B,0x42D39CAB,0x7FB3B51B,0xCD93690B,0xF0F340BB,0xB7533A6B,0x8A3313DB, + 0x0863840A,0x3503ADBA,0x72A3D76A,0x4FC3FEDA,0xFDE322CA,0xC0830B7A,0x872371AA,0xBA43581A, + 0x9932774D,0xA4525EFD,0xE3F2242D,0xDE920D9D,0x6CB2D18D,0x51D2F83D,0x167282ED,0x2B12AB5D, + 0xA9423C8C,0x9422153C,0xD3826FEC,0xEEE2465C,0x5CC29A4C,0x61A2B3FC,0x2602C92C,0x1B62E09C, + 
0xF9D2E0CF,0xC4B2C97F,0x8312B3AF,0xBE729A1F,0x0C52460F,0x31326FBF,0x7692156F,0x4BF23CDF, + 0xC9A2AB0E,0xF4C282BE,0xB362F86E,0x8E02D1DE,0x3C220DCE,0x0142247E,0x46E25EAE,0x7B82771E, + 0xB1E6B092,0x8C869922,0xCB26E3F2,0xF646CA42,0x44661652,0x79063FE2,0x3EA64532,0x03C66C82, + 0x8196FB53,0xBCF6D2E3,0xFB56A833,0xC6368183,0x74165D93,0x49767423,0x0ED60EF3,0x33B62743, + 0xD1062710,0xEC660EA0,0xABC67470,0x96A65DC0,0x248681D0,0x19E6A860,0x5E46D2B0,0x6326FB00, + 0xE1766CD1,0xDC164561,0x9BB63FB1,0xA6D61601,0x14F6CA11,0x2996E3A1,0x6E369971,0x5356B0C1, + 0x70279F96,0x4D47B626,0x0AE7CCF6,0x3787E546,0x85A73956,0xB8C710E6,0xFF676A36,0xC2074386, + 0x4057D457,0x7D37FDE7,0x3A978737,0x07F7AE87,0xB5D77297,0x88B75B27,0xCF1721F7,0xF2770847, + 0x10C70814,0x2DA721A4,0x6A075B74,0x576772C4,0xE547AED4,0xD8278764,0x9F87FDB4,0xA2E7D404, + 0x20B743D5,0x1DD76A65,0x5A7710B5,0x67173905,0xD537E515,0xE857CCA5,0xAFF7B675,0x92979FC5, + 0xE915E8DB,0xD475C16B,0x93D5BBBB,0xAEB5920B,0x1C954E1B,0x21F567AB,0x66551D7B,0x5B3534CB, + 0xD965A31A,0xE4058AAA,0xA3A5F07A,0x9EC5D9CA,0x2CE505DA,0x11852C6A,0x562556BA,0x6B457F0A, + 0x89F57F59,0xB49556E9,0xF3352C39,0xCE550589,0x7C75D999,0x4115F029,0x06B58AF9,0x3BD5A349, + 0xB9853498,0x84E51D28,0xC34567F8,0xFE254E48,0x4C059258,0x7165BBE8,0x36C5C138,0x0BA5E888, + 0x28D4C7DF,0x15B4EE6F,0x521494BF,0x6F74BD0F,0xDD54611F,0xE03448AF,0xA794327F,0x9AF41BCF, + 0x18A48C1E,0x25C4A5AE,0x6264DF7E,0x5F04F6CE,0xED242ADE,0xD044036E,0x97E479BE,0xAA84500E, + 0x4834505D,0x755479ED,0x32F4033D,0x0F942A8D,0xBDB4F69D,0x80D4DF2D,0xC774A5FD,0xFA148C4D, + 0x78441B9C,0x4524322C,0x028448FC,0x3FE4614C,0x8DC4BD5C,0xB0A494EC,0xF704EE3C,0xCA64C78C, + }, + { + 0x00000000,0xCB5CD3A5,0x4DC8A10B,0x869472AE,0x9B914216,0x50CD91B3,0xD659E31D,0x1D0530B8, + 0xEC53826D,0x270F51C8,0xA19B2366,0x6AC7F0C3,0x77C2C07B,0xBC9E13DE,0x3A0A6170,0xF156B2D5, + 0x03D6029B,0xC88AD13E,0x4E1EA390,0x85427035,0x9847408D,0x531B9328,0xD58FE186,0x1ED33223, + 
0xEF8580F6,0x24D95353,0xA24D21FD,0x6911F258,0x7414C2E0,0xBF481145,0x39DC63EB,0xF280B04E, + 0x07AC0536,0xCCF0D693,0x4A64A43D,0x81387798,0x9C3D4720,0x57619485,0xD1F5E62B,0x1AA9358E, + 0xEBFF875B,0x20A354FE,0xA6372650,0x6D6BF5F5,0x706EC54D,0xBB3216E8,0x3DA66446,0xF6FAB7E3, + 0x047A07AD,0xCF26D408,0x49B2A6A6,0x82EE7503,0x9FEB45BB,0x54B7961E,0xD223E4B0,0x197F3715, + 0xE82985C0,0x23755665,0xA5E124CB,0x6EBDF76E,0x73B8C7D6,0xB8E41473,0x3E7066DD,0xF52CB578, + 0x0F580A6C,0xC404D9C9,0x4290AB67,0x89CC78C2,0x94C9487A,0x5F959BDF,0xD901E971,0x125D3AD4, + 0xE30B8801,0x28575BA4,0xAEC3290A,0x659FFAAF,0x789ACA17,0xB3C619B2,0x35526B1C,0xFE0EB8B9, + 0x0C8E08F7,0xC7D2DB52,0x4146A9FC,0x8A1A7A59,0x971F4AE1,0x5C439944,0xDAD7EBEA,0x118B384F, + 0xE0DD8A9A,0x2B81593F,0xAD152B91,0x6649F834,0x7B4CC88C,0xB0101B29,0x36846987,0xFDD8BA22, + 0x08F40F5A,0xC3A8DCFF,0x453CAE51,0x8E607DF4,0x93654D4C,0x58399EE9,0xDEADEC47,0x15F13FE2, + 0xE4A78D37,0x2FFB5E92,0xA96F2C3C,0x6233FF99,0x7F36CF21,0xB46A1C84,0x32FE6E2A,0xF9A2BD8F, + 0x0B220DC1,0xC07EDE64,0x46EAACCA,0x8DB67F6F,0x90B34FD7,0x5BEF9C72,0xDD7BEEDC,0x16273D79, + 0xE7718FAC,0x2C2D5C09,0xAAB92EA7,0x61E5FD02,0x7CE0CDBA,0xB7BC1E1F,0x31286CB1,0xFA74BF14, + 0x1EB014D8,0xD5ECC77D,0x5378B5D3,0x98246676,0x852156CE,0x4E7D856B,0xC8E9F7C5,0x03B52460, + 0xF2E396B5,0x39BF4510,0xBF2B37BE,0x7477E41B,0x6972D4A3,0xA22E0706,0x24BA75A8,0xEFE6A60D, + 0x1D661643,0xD63AC5E6,0x50AEB748,0x9BF264ED,0x86F75455,0x4DAB87F0,0xCB3FF55E,0x006326FB, + 0xF135942E,0x3A69478B,0xBCFD3525,0x77A1E680,0x6AA4D638,0xA1F8059D,0x276C7733,0xEC30A496, + 0x191C11EE,0xD240C24B,0x54D4B0E5,0x9F886340,0x828D53F8,0x49D1805D,0xCF45F2F3,0x04192156, + 0xF54F9383,0x3E134026,0xB8873288,0x73DBE12D,0x6EDED195,0xA5820230,0x2316709E,0xE84AA33B, + 0x1ACA1375,0xD196C0D0,0x5702B27E,0x9C5E61DB,0x815B5163,0x4A0782C6,0xCC93F068,0x07CF23CD, + 0xF6999118,0x3DC542BD,0xBB513013,0x700DE3B6,0x6D08D30E,0xA65400AB,0x20C07205,0xEB9CA1A0, + 0x11E81EB4,0xDAB4CD11,0x5C20BFBF,0x977C6C1A,0x8A795CA2,0x41258F07,0xC7B1FDA9,0x0CED2E0C, 
+ 0xFDBB9CD9,0x36E74F7C,0xB0733DD2,0x7B2FEE77,0x662ADECF,0xAD760D6A,0x2BE27FC4,0xE0BEAC61, + 0x123E1C2F,0xD962CF8A,0x5FF6BD24,0x94AA6E81,0x89AF5E39,0x42F38D9C,0xC467FF32,0x0F3B2C97, + 0xFE6D9E42,0x35314DE7,0xB3A53F49,0x78F9ECEC,0x65FCDC54,0xAEA00FF1,0x28347D5F,0xE368AEFA, + 0x16441B82,0xDD18C827,0x5B8CBA89,0x90D0692C,0x8DD55994,0x46898A31,0xC01DF89F,0x0B412B3A, + 0xFA1799EF,0x314B4A4A,0xB7DF38E4,0x7C83EB41,0x6186DBF9,0xAADA085C,0x2C4E7AF2,0xE712A957, + 0x15921919,0xDECECABC,0x585AB812,0x93066BB7,0x8E035B0F,0x455F88AA,0xC3CBFA04,0x089729A1, + 0xF9C19B74,0x329D48D1,0xB4093A7F,0x7F55E9DA,0x6250D962,0xA90C0AC7,0x2F987869,0xE4C4ABCC, + }, + { + 0x00000000,0xA6770BB4,0x979F1129,0x31E81A9D,0xF44F2413,0x52382FA7,0x63D0353A,0xC5A73E8E, + 0x33EF4E67,0x959845D3,0xA4705F4E,0x020754FA,0xC7A06A74,0x61D761C0,0x503F7B5D,0xF64870E9, + 0x67DE9CCE,0xC1A9977A,0xF0418DE7,0x56368653,0x9391B8DD,0x35E6B369,0x040EA9F4,0xA279A240, + 0x5431D2A9,0xF246D91D,0xC3AEC380,0x65D9C834,0xA07EF6BA,0x0609FD0E,0x37E1E793,0x9196EC27, + 0xCFBD399C,0x69CA3228,0x582228B5,0xFE552301,0x3BF21D8F,0x9D85163B,0xAC6D0CA6,0x0A1A0712, + 0xFC5277FB,0x5A257C4F,0x6BCD66D2,0xCDBA6D66,0x081D53E8,0xAE6A585C,0x9F8242C1,0x39F54975, + 0xA863A552,0x0E14AEE6,0x3FFCB47B,0x998BBFCF,0x5C2C8141,0xFA5B8AF5,0xCBB39068,0x6DC49BDC, + 0x9B8CEB35,0x3DFBE081,0x0C13FA1C,0xAA64F1A8,0x6FC3CF26,0xC9B4C492,0xF85CDE0F,0x5E2BD5BB, + 0x440B7579,0xE27C7ECD,0xD3946450,0x75E36FE4,0xB044516A,0x16335ADE,0x27DB4043,0x81AC4BF7, + 0x77E43B1E,0xD19330AA,0xE07B2A37,0x460C2183,0x83AB1F0D,0x25DC14B9,0x14340E24,0xB2430590, + 0x23D5E9B7,0x85A2E203,0xB44AF89E,0x123DF32A,0xD79ACDA4,0x71EDC610,0x4005DC8D,0xE672D739, + 0x103AA7D0,0xB64DAC64,0x87A5B6F9,0x21D2BD4D,0xE47583C3,0x42028877,0x73EA92EA,0xD59D995E, + 0x8BB64CE5,0x2DC14751,0x1C295DCC,0xBA5E5678,0x7FF968F6,0xD98E6342,0xE86679DF,0x4E11726B, + 0xB8590282,0x1E2E0936,0x2FC613AB,0x89B1181F,0x4C162691,0xEA612D25,0xDB8937B8,0x7DFE3C0C, + 
0xEC68D02B,0x4A1FDB9F,0x7BF7C102,0xDD80CAB6,0x1827F438,0xBE50FF8C,0x8FB8E511,0x29CFEEA5, + 0xDF879E4C,0x79F095F8,0x48188F65,0xEE6F84D1,0x2BC8BA5F,0x8DBFB1EB,0xBC57AB76,0x1A20A0C2, + 0x8816EAF2,0x2E61E146,0x1F89FBDB,0xB9FEF06F,0x7C59CEE1,0xDA2EC555,0xEBC6DFC8,0x4DB1D47C, + 0xBBF9A495,0x1D8EAF21,0x2C66B5BC,0x8A11BE08,0x4FB68086,0xE9C18B32,0xD82991AF,0x7E5E9A1B, + 0xEFC8763C,0x49BF7D88,0x78576715,0xDE206CA1,0x1B87522F,0xBDF0599B,0x8C184306,0x2A6F48B2, + 0xDC27385B,0x7A5033EF,0x4BB82972,0xEDCF22C6,0x28681C48,0x8E1F17FC,0xBFF70D61,0x198006D5, + 0x47ABD36E,0xE1DCD8DA,0xD034C247,0x7643C9F3,0xB3E4F77D,0x1593FCC9,0x247BE654,0x820CEDE0, + 0x74449D09,0xD23396BD,0xE3DB8C20,0x45AC8794,0x800BB91A,0x267CB2AE,0x1794A833,0xB1E3A387, + 0x20754FA0,0x86024414,0xB7EA5E89,0x119D553D,0xD43A6BB3,0x724D6007,0x43A57A9A,0xE5D2712E, + 0x139A01C7,0xB5ED0A73,0x840510EE,0x22721B5A,0xE7D525D4,0x41A22E60,0x704A34FD,0xD63D3F49, + 0xCC1D9F8B,0x6A6A943F,0x5B828EA2,0xFDF58516,0x3852BB98,0x9E25B02C,0xAFCDAAB1,0x09BAA105, + 0xFFF2D1EC,0x5985DA58,0x686DC0C5,0xCE1ACB71,0x0BBDF5FF,0xADCAFE4B,0x9C22E4D6,0x3A55EF62, + 0xABC30345,0x0DB408F1,0x3C5C126C,0x9A2B19D8,0x5F8C2756,0xF9FB2CE2,0xC813367F,0x6E643DCB, + 0x982C4D22,0x3E5B4696,0x0FB35C0B,0xA9C457BF,0x6C636931,0xCA146285,0xFBFC7818,0x5D8B73AC, + 0x03A0A617,0xA5D7ADA3,0x943FB73E,0x3248BC8A,0xF7EF8204,0x519889B0,0x6070932D,0xC6079899, + 0x304FE870,0x9638E3C4,0xA7D0F959,0x01A7F2ED,0xC400CC63,0x6277C7D7,0x539FDD4A,0xF5E8D6FE, + 0x647E3AD9,0xC209316D,0xF3E12BF0,0x55962044,0x90311ECA,0x3646157E,0x07AE0FE3,0xA1D90457, + 0x579174BE,0xF1E67F0A,0xC00E6597,0x66796E23,0xA3DE50AD,0x05A95B19,0x34414184,0x92364A30, + }, + { + 0x00000000,0xCCAA009E,0x4225077D,0x8E8F07E3,0x844A0EFA,0x48E00E64,0xC66F0987,0x0AC50919, + 0xD3E51BB5,0x1F4F1B2B,0x91C01CC8,0x5D6A1C56,0x57AF154F,0x9B0515D1,0x158A1232,0xD92012AC, + 0x7CBB312B,0xB01131B5,0x3E9E3656,0xF23436C8,0xF8F13FD1,0x345B3F4F,0xBAD438AC,0x767E3832, + 
0xAF5E2A9E,0x63F42A00,0xED7B2DE3,0x21D12D7D,0x2B142464,0xE7BE24FA,0x69312319,0xA59B2387, + 0xF9766256,0x35DC62C8,0xBB53652B,0x77F965B5,0x7D3C6CAC,0xB1966C32,0x3F196BD1,0xF3B36B4F, + 0x2A9379E3,0xE639797D,0x68B67E9E,0xA41C7E00,0xAED97719,0x62737787,0xECFC7064,0x205670FA, + 0x85CD537D,0x496753E3,0xC7E85400,0x0B42549E,0x01875D87,0xCD2D5D19,0x43A25AFA,0x8F085A64, + 0x562848C8,0x9A824856,0x140D4FB5,0xD8A74F2B,0xD2624632,0x1EC846AC,0x9047414F,0x5CED41D1, + 0x299DC2ED,0xE537C273,0x6BB8C590,0xA712C50E,0xADD7CC17,0x617DCC89,0xEFF2CB6A,0x2358CBF4, + 0xFA78D958,0x36D2D9C6,0xB85DDE25,0x74F7DEBB,0x7E32D7A2,0xB298D73C,0x3C17D0DF,0xF0BDD041, + 0x5526F3C6,0x998CF358,0x1703F4BB,0xDBA9F425,0xD16CFD3C,0x1DC6FDA2,0x9349FA41,0x5FE3FADF, + 0x86C3E873,0x4A69E8ED,0xC4E6EF0E,0x084CEF90,0x0289E689,0xCE23E617,0x40ACE1F4,0x8C06E16A, + 0xD0EBA0BB,0x1C41A025,0x92CEA7C6,0x5E64A758,0x54A1AE41,0x980BAEDF,0x1684A93C,0xDA2EA9A2, + 0x030EBB0E,0xCFA4BB90,0x412BBC73,0x8D81BCED,0x8744B5F4,0x4BEEB56A,0xC561B289,0x09CBB217, + 0xAC509190,0x60FA910E,0xEE7596ED,0x22DF9673,0x281A9F6A,0xE4B09FF4,0x6A3F9817,0xA6959889, + 0x7FB58A25,0xB31F8ABB,0x3D908D58,0xF13A8DC6,0xFBFF84DF,0x37558441,0xB9DA83A2,0x7570833C, + 0x533B85DA,0x9F918544,0x111E82A7,0xDDB48239,0xD7718B20,0x1BDB8BBE,0x95548C5D,0x59FE8CC3, + 0x80DE9E6F,0x4C749EF1,0xC2FB9912,0x0E51998C,0x04949095,0xC83E900B,0x46B197E8,0x8A1B9776, + 0x2F80B4F1,0xE32AB46F,0x6DA5B38C,0xA10FB312,0xABCABA0B,0x6760BA95,0xE9EFBD76,0x2545BDE8, + 0xFC65AF44,0x30CFAFDA,0xBE40A839,0x72EAA8A7,0x782FA1BE,0xB485A120,0x3A0AA6C3,0xF6A0A65D, + 0xAA4DE78C,0x66E7E712,0xE868E0F1,0x24C2E06F,0x2E07E976,0xE2ADE9E8,0x6C22EE0B,0xA088EE95, + 0x79A8FC39,0xB502FCA7,0x3B8DFB44,0xF727FBDA,0xFDE2F2C3,0x3148F25D,0xBFC7F5BE,0x736DF520, + 0xD6F6D6A7,0x1A5CD639,0x94D3D1DA,0x5879D144,0x52BCD85D,0x9E16D8C3,0x1099DF20,0xDC33DFBE, + 0x0513CD12,0xC9B9CD8C,0x4736CA6F,0x8B9CCAF1,0x8159C3E8,0x4DF3C376,0xC37CC495,0x0FD6C40B, + 0x7AA64737,0xB60C47A9,0x3883404A,0xF42940D4,0xFEEC49CD,0x32464953,0xBCC94EB0,0x70634E2E, 
+ 0xA9435C82,0x65E95C1C,0xEB665BFF,0x27CC5B61,0x2D095278,0xE1A352E6,0x6F2C5505,0xA386559B, + 0x061D761C,0xCAB77682,0x44387161,0x889271FF,0x825778E6,0x4EFD7878,0xC0727F9B,0x0CD87F05, + 0xD5F86DA9,0x19526D37,0x97DD6AD4,0x5B776A4A,0x51B26353,0x9D1863CD,0x1397642E,0xDF3D64B0, + 0x83D02561,0x4F7A25FF,0xC1F5221C,0x0D5F2282,0x079A2B9B,0xCB302B05,0x45BF2CE6,0x89152C78, + 0x50353ED4,0x9C9F3E4A,0x121039A9,0xDEBA3937,0xD47F302E,0x18D530B0,0x965A3753,0x5AF037CD, + 0xFF6B144A,0x33C114D4,0xBD4E1337,0x71E413A9,0x7B211AB0,0xB78B1A2E,0x39041DCD,0xF5AE1D53, + 0x2C8E0FFF,0xE0240F61,0x6EAB0882,0xA201081C,0xA8C40105,0x646E019B,0xEAE10678,0x264B06E6, + }, + { + 0x00000000,0x177B1443,0x2EF62886,0x398D3CC5,0x5DEC510C,0x4A97454F,0x731A798A,0x64616DC9, + 0xBBD8A218,0xACA3B65B,0x952E8A9E,0x82559EDD,0xE634F314,0xF14FE757,0xC8C2DB92,0xDFB9CFD1, + 0xACC04271,0xBBBB5632,0x82366AF7,0x954D7EB4,0xF12C137D,0xE657073E,0xDFDA3BFB,0xC8A12FB8, + 0x1718E069,0x0063F42A,0x39EEC8EF,0x2E95DCAC,0x4AF4B165,0x5D8FA526,0x640299E3,0x73798DA0, + 0x82F182A3,0x958A96E0,0xAC07AA25,0xBB7CBE66,0xDF1DD3AF,0xC866C7EC,0xF1EBFB29,0xE690EF6A, + 0x392920BB,0x2E5234F8,0x17DF083D,0x00A41C7E,0x64C571B7,0x73BE65F4,0x4A335931,0x5D484D72, + 0x2E31C0D2,0x394AD491,0x00C7E854,0x17BCFC17,0x73DD91DE,0x64A6859D,0x5D2BB958,0x4A50AD1B, + 0x95E962CA,0x82927689,0xBB1F4A4C,0xAC645E0F,0xC80533C6,0xDF7E2785,0xE6F31B40,0xF1880F03, + 0xDE920307,0xC9E91744,0xF0642B81,0xE71F3FC2,0x837E520B,0x94054648,0xAD887A8D,0xBAF36ECE, + 0x654AA11F,0x7231B55C,0x4BBC8999,0x5CC79DDA,0x38A6F013,0x2FDDE450,0x1650D895,0x012BCCD6, + 0x72524176,0x65295535,0x5CA469F0,0x4BDF7DB3,0x2FBE107A,0x38C50439,0x014838FC,0x16332CBF, + 0xC98AE36E,0xDEF1F72D,0xE77CCBE8,0xF007DFAB,0x9466B262,0x831DA621,0xBA909AE4,0xADEB8EA7, + 0x5C6381A4,0x4B1895E7,0x7295A922,0x65EEBD61,0x018FD0A8,0x16F4C4EB,0x2F79F82E,0x3802EC6D, + 0xE7BB23BC,0xF0C037FF,0xC94D0B3A,0xDE361F79,0xBA5772B0,0xAD2C66F3,0x94A15A36,0x83DA4E75, + 
0xF0A3C3D5,0xE7D8D796,0xDE55EB53,0xC92EFF10,0xAD4F92D9,0xBA34869A,0x83B9BA5F,0x94C2AE1C, + 0x4B7B61CD,0x5C00758E,0x658D494B,0x72F65D08,0x169730C1,0x01EC2482,0x38611847,0x2F1A0C04, + 0x6655004F,0x712E140C,0x48A328C9,0x5FD83C8A,0x3BB95143,0x2CC24500,0x154F79C5,0x02346D86, + 0xDD8DA257,0xCAF6B614,0xF37B8AD1,0xE4009E92,0x8061F35B,0x971AE718,0xAE97DBDD,0xB9ECCF9E, + 0xCA95423E,0xDDEE567D,0xE4636AB8,0xF3187EFB,0x97791332,0x80020771,0xB98F3BB4,0xAEF42FF7, + 0x714DE026,0x6636F465,0x5FBBC8A0,0x48C0DCE3,0x2CA1B12A,0x3BDAA569,0x025799AC,0x152C8DEF, + 0xE4A482EC,0xF3DF96AF,0xCA52AA6A,0xDD29BE29,0xB948D3E0,0xAE33C7A3,0x97BEFB66,0x80C5EF25, + 0x5F7C20F4,0x480734B7,0x718A0872,0x66F11C31,0x029071F8,0x15EB65BB,0x2C66597E,0x3B1D4D3D, + 0x4864C09D,0x5F1FD4DE,0x6692E81B,0x71E9FC58,0x15889191,0x02F385D2,0x3B7EB917,0x2C05AD54, + 0xF3BC6285,0xE4C776C6,0xDD4A4A03,0xCA315E40,0xAE503389,0xB92B27CA,0x80A61B0F,0x97DD0F4C, + 0xB8C70348,0xAFBC170B,0x96312BCE,0x814A3F8D,0xE52B5244,0xF2504607,0xCBDD7AC2,0xDCA66E81, + 0x031FA150,0x1464B513,0x2DE989D6,0x3A929D95,0x5EF3F05C,0x4988E41F,0x7005D8DA,0x677ECC99, + 0x14074139,0x037C557A,0x3AF169BF,0x2D8A7DFC,0x49EB1035,0x5E900476,0x671D38B3,0x70662CF0, + 0xAFDFE321,0xB8A4F762,0x8129CBA7,0x9652DFE4,0xF233B22D,0xE548A66E,0xDCC59AAB,0xCBBE8EE8, + 0x3A3681EB,0x2D4D95A8,0x14C0A96D,0x03BBBD2E,0x67DAD0E7,0x70A1C4A4,0x492CF861,0x5E57EC22, + 0x81EE23F3,0x969537B0,0xAF180B75,0xB8631F36,0xDC0272FF,0xCB7966BC,0xF2F45A79,0xE58F4E3A, + 0x96F6C39A,0x818DD7D9,0xB800EB1C,0xAF7BFF5F,0xCB1A9296,0xDC6186D5,0xE5ECBA10,0xF297AE53, + 0x2D2E6182,0x3A5575C1,0x03D84904,0x14A35D47,0x70C2308E,0x67B924CD,0x5E341808,0x494F0C4B, + }, + { + 0x00000000,0xEFC26B3E,0x04F5D03D,0xEB37BB03,0x09EBA07A,0xE629CB44,0x0D1E7047,0xE2DC1B79, + 0x13D740F4,0xFC152BCA,0x172290C9,0xF8E0FBF7,0x1A3CE08E,0xF5FE8BB0,0x1EC930B3,0xF10B5B8D, + 0x27AE81E8,0xC86CEAD6,0x235B51D5,0xCC993AEB,0x2E452192,0xC1874AAC,0x2AB0F1AF,0xC5729A91, + 
0x3479C11C,0xDBBBAA22,0x308C1121,0xDF4E7A1F,0x3D926166,0xD2500A58,0x3967B15B,0xD6A5DA65, + 0x4F5D03D0,0xA09F68EE,0x4BA8D3ED,0xA46AB8D3,0x46B6A3AA,0xA974C894,0x42437397,0xAD8118A9, + 0x5C8A4324,0xB348281A,0x587F9319,0xB7BDF827,0x5561E35E,0xBAA38860,0x51943363,0xBE56585D, + 0x68F38238,0x8731E906,0x6C065205,0x83C4393B,0x61182242,0x8EDA497C,0x65EDF27F,0x8A2F9941, + 0x7B24C2CC,0x94E6A9F2,0x7FD112F1,0x901379CF,0x72CF62B6,0x9D0D0988,0x763AB28B,0x99F8D9B5, + 0x9EBA07A0,0x71786C9E,0x9A4FD79D,0x758DBCA3,0x9751A7DA,0x7893CCE4,0x93A477E7,0x7C661CD9, + 0x8D6D4754,0x62AF2C6A,0x89989769,0x665AFC57,0x8486E72E,0x6B448C10,0x80733713,0x6FB15C2D, + 0xB9148648,0x56D6ED76,0xBDE15675,0x52233D4B,0xB0FF2632,0x5F3D4D0C,0xB40AF60F,0x5BC89D31, + 0xAAC3C6BC,0x4501AD82,0xAE361681,0x41F47DBF,0xA32866C6,0x4CEA0DF8,0xA7DDB6FB,0x481FDDC5, + 0xD1E70470,0x3E256F4E,0xD512D44D,0x3AD0BF73,0xD80CA40A,0x37CECF34,0xDCF97437,0x333B1F09, + 0xC2304484,0x2DF22FBA,0xC6C594B9,0x2907FF87,0xCBDBE4FE,0x24198FC0,0xCF2E34C3,0x20EC5FFD, + 0xF6498598,0x198BEEA6,0xF2BC55A5,0x1D7E3E9B,0xFFA225E2,0x10604EDC,0xFB57F5DF,0x14959EE1, + 0xE59EC56C,0x0A5CAE52,0xE16B1551,0x0EA97E6F,0xEC756516,0x03B70E28,0xE880B52B,0x0742DE15, + 0xE6050901,0x09C7623F,0xE2F0D93C,0x0D32B202,0xEFEEA97B,0x002CC245,0xEB1B7946,0x04D91278, + 0xF5D249F5,0x1A1022CB,0xF12799C8,0x1EE5F2F6,0xFC39E98F,0x13FB82B1,0xF8CC39B2,0x170E528C, + 0xC1AB88E9,0x2E69E3D7,0xC55E58D4,0x2A9C33EA,0xC8402893,0x278243AD,0xCCB5F8AE,0x23779390, + 0xD27CC81D,0x3DBEA323,0xD6891820,0x394B731E,0xDB976867,0x34550359,0xDF62B85A,0x30A0D364, + 0xA9580AD1,0x469A61EF,0xADADDAEC,0x426FB1D2,0xA0B3AAAB,0x4F71C195,0xA4467A96,0x4B8411A8, + 0xBA8F4A25,0x554D211B,0xBE7A9A18,0x51B8F126,0xB364EA5F,0x5CA68161,0xB7913A62,0x5853515C, + 0x8EF68B39,0x6134E007,0x8A035B04,0x65C1303A,0x871D2B43,0x68DF407D,0x83E8FB7E,0x6C2A9040, + 0x9D21CBCD,0x72E3A0F3,0x99D41BF0,0x761670CE,0x94CA6BB7,0x7B080089,0x903FBB8A,0x7FFDD0B4, + 0x78BF0EA1,0x977D659F,0x7C4ADE9C,0x9388B5A2,0x7154AEDB,0x9E96C5E5,0x75A17EE6,0x9A6315D8, 
+ 0x6B684E55,0x84AA256B,0x6F9D9E68,0x805FF556,0x6283EE2F,0x8D418511,0x66763E12,0x89B4552C, + 0x5F118F49,0xB0D3E477,0x5BE45F74,0xB426344A,0x56FA2F33,0xB938440D,0x520FFF0E,0xBDCD9430, + 0x4CC6CFBD,0xA304A483,0x48331F80,0xA7F174BE,0x452D6FC7,0xAAEF04F9,0x41D8BFFA,0xAE1AD4C4, + 0x37E20D71,0xD820664F,0x3317DD4C,0xDCD5B672,0x3E09AD0B,0xD1CBC635,0x3AFC7D36,0xD53E1608, + 0x24354D85,0xCBF726BB,0x20C09DB8,0xCF02F686,0x2DDEEDFF,0xC21C86C1,0x292B3DC2,0xC6E956FC, + 0x104C8C99,0xFF8EE7A7,0x14B95CA4,0xFB7B379A,0x19A72CE3,0xF66547DD,0x1D52FCDE,0xF29097E0, + 0x039BCC6D,0xEC59A753,0x076E1C50,0xE8AC776E,0x0A706C17,0xE5B20729,0x0E85BC2A,0xE147D714, + }, + { + 0x00000000,0xC18EDFC0,0x586CB9C1,0x99E26601,0xB0D97382,0x7157AC42,0xE8B5CA43,0x293B1583, + 0xBAC3E145,0x7B4D3E85,0xE2AF5884,0x23218744,0x0A1A92C7,0xCB944D07,0x52762B06,0x93F8F4C6, + 0xAEF6C4CB,0x6F781B0B,0xF69A7D0A,0x3714A2CA,0x1E2FB749,0xDFA16889,0x46430E88,0x87CDD148, + 0x1435258E,0xD5BBFA4E,0x4C599C4F,0x8DD7438F,0xA4EC560C,0x656289CC,0xFC80EFCD,0x3D0E300D, + 0x869C8FD7,0x47125017,0xDEF03616,0x1F7EE9D6,0x3645FC55,0xF7CB2395,0x6E294594,0xAFA79A54, + 0x3C5F6E92,0xFDD1B152,0x6433D753,0xA5BD0893,0x8C861D10,0x4D08C2D0,0xD4EAA4D1,0x15647B11, + 0x286A4B1C,0xE9E494DC,0x7006F2DD,0xB1882D1D,0x98B3389E,0x593DE75E,0xC0DF815F,0x01515E9F, + 0x92A9AA59,0x53277599,0xCAC51398,0x0B4BCC58,0x2270D9DB,0xE3FE061B,0x7A1C601A,0xBB92BFDA, + 0xD64819EF,0x17C6C62F,0x8E24A02E,0x4FAA7FEE,0x66916A6D,0xA71FB5AD,0x3EFDD3AC,0xFF730C6C, + 0x6C8BF8AA,0xAD05276A,0x34E7416B,0xF5699EAB,0xDC528B28,0x1DDC54E8,0x843E32E9,0x45B0ED29, + 0x78BEDD24,0xB93002E4,0x20D264E5,0xE15CBB25,0xC867AEA6,0x09E97166,0x900B1767,0x5185C8A7, + 0xC27D3C61,0x03F3E3A1,0x9A1185A0,0x5B9F5A60,0x72A44FE3,0xB32A9023,0x2AC8F622,0xEB4629E2, + 0x50D49638,0x915A49F8,0x08B82FF9,0xC936F039,0xE00DE5BA,0x21833A7A,0xB8615C7B,0x79EF83BB, + 0xEA17777D,0x2B99A8BD,0xB27BCEBC,0x73F5117C,0x5ACE04FF,0x9B40DB3F,0x02A2BD3E,0xC32C62FE, + 
0xFE2252F3,0x3FAC8D33,0xA64EEB32,0x67C034F2,0x4EFB2171,0x8F75FEB1,0x169798B0,0xD7194770, + 0x44E1B3B6,0x856F6C76,0x1C8D0A77,0xDD03D5B7,0xF438C034,0x35B61FF4,0xAC5479F5,0x6DDAA635, + 0x77E1359F,0xB66FEA5F,0x2F8D8C5E,0xEE03539E,0xC738461D,0x06B699DD,0x9F54FFDC,0x5EDA201C, + 0xCD22D4DA,0x0CAC0B1A,0x954E6D1B,0x54C0B2DB,0x7DFBA758,0xBC757898,0x25971E99,0xE419C159, + 0xD917F154,0x18992E94,0x817B4895,0x40F59755,0x69CE82D6,0xA8405D16,0x31A23B17,0xF02CE4D7, + 0x63D41011,0xA25ACFD1,0x3BB8A9D0,0xFA367610,0xD30D6393,0x1283BC53,0x8B61DA52,0x4AEF0592, + 0xF17DBA48,0x30F36588,0xA9110389,0x689FDC49,0x41A4C9CA,0x802A160A,0x19C8700B,0xD846AFCB, + 0x4BBE5B0D,0x8A3084CD,0x13D2E2CC,0xD25C3D0C,0xFB67288F,0x3AE9F74F,0xA30B914E,0x62854E8E, + 0x5F8B7E83,0x9E05A143,0x07E7C742,0xC6691882,0xEF520D01,0x2EDCD2C1,0xB73EB4C0,0x76B06B00, + 0xE5489FC6,0x24C64006,0xBD242607,0x7CAAF9C7,0x5591EC44,0x941F3384,0x0DFD5585,0xCC738A45, + 0xA1A92C70,0x6027F3B0,0xF9C595B1,0x384B4A71,0x11705FF2,0xD0FE8032,0x491CE633,0x889239F3, + 0x1B6ACD35,0xDAE412F5,0x430674F4,0x8288AB34,0xABB3BEB7,0x6A3D6177,0xF3DF0776,0x3251D8B6, + 0x0F5FE8BB,0xCED1377B,0x5733517A,0x96BD8EBA,0xBF869B39,0x7E0844F9,0xE7EA22F8,0x2664FD38, + 0xB59C09FE,0x7412D63E,0xEDF0B03F,0x2C7E6FFF,0x05457A7C,0xC4CBA5BC,0x5D29C3BD,0x9CA71C7D, + 0x2735A3A7,0xE6BB7C67,0x7F591A66,0xBED7C5A6,0x97ECD025,0x56620FE5,0xCF8069E4,0x0E0EB624, + 0x9DF642E2,0x5C789D22,0xC59AFB23,0x041424E3,0x2D2F3160,0xECA1EEA0,0x754388A1,0xB4CD5761, + 0x89C3676C,0x484DB8AC,0xD1AFDEAD,0x1021016D,0x391A14EE,0xF894CB2E,0x6176AD2F,0xA0F872EF, + 0x33008629,0xF28E59E9,0x6B6C3FE8,0xAAE2E028,0x83D9F5AB,0x42572A6B,0xDBB54C6A,0x1A3B93AA, + }, + { + 0x00000000,0x9BA54C6F,0xEC3B9E9F,0x779ED2F0,0x03063B7F,0x98A37710,0xEF3DA5E0,0x7498E98F, + 0x060C76FE,0x9DA93A91,0xEA37E861,0x7192A40E,0x050A4D81,0x9EAF01EE,0xE931D31E,0x72949F71, + 0x0C18EDFC,0x97BDA193,0xE0237363,0x7B863F0C,0x0F1ED683,0x94BB9AEC,0xE325481C,0x78800473, + 
0x0A149B02,0x91B1D76D,0xE62F059D,0x7D8A49F2,0x0912A07D,0x92B7EC12,0xE5293EE2,0x7E8C728D, + 0x1831DBF8,0x83949797,0xF40A4567,0x6FAF0908,0x1B37E087,0x8092ACE8,0xF70C7E18,0x6CA93277, + 0x1E3DAD06,0x8598E169,0xF2063399,0x69A37FF6,0x1D3B9679,0x869EDA16,0xF10008E6,0x6AA54489, + 0x14293604,0x8F8C7A6B,0xF812A89B,0x63B7E4F4,0x172F0D7B,0x8C8A4114,0xFB1493E4,0x60B1DF8B, + 0x122540FA,0x89800C95,0xFE1EDE65,0x65BB920A,0x11237B85,0x8A8637EA,0xFD18E51A,0x66BDA975, + 0x3063B7F0,0xABC6FB9F,0xDC58296F,0x47FD6500,0x33658C8F,0xA8C0C0E0,0xDF5E1210,0x44FB5E7F, + 0x366FC10E,0xADCA8D61,0xDA545F91,0x41F113FE,0x3569FA71,0xAECCB61E,0xD95264EE,0x42F72881, + 0x3C7B5A0C,0xA7DE1663,0xD040C493,0x4BE588FC,0x3F7D6173,0xA4D82D1C,0xD346FFEC,0x48E3B383, + 0x3A772CF2,0xA1D2609D,0xD64CB26D,0x4DE9FE02,0x3971178D,0xA2D45BE2,0xD54A8912,0x4EEFC57D, + 0x28526C08,0xB3F72067,0xC469F297,0x5FCCBEF8,0x2B545777,0xB0F11B18,0xC76FC9E8,0x5CCA8587, + 0x2E5E1AF6,0xB5FB5699,0xC2658469,0x59C0C806,0x2D582189,0xB6FD6DE6,0xC163BF16,0x5AC6F379, + 0x244A81F4,0xBFEFCD9B,0xC8711F6B,0x53D45304,0x274CBA8B,0xBCE9F6E4,0xCB772414,0x50D2687B, + 0x2246F70A,0xB9E3BB65,0xCE7D6995,0x55D825FA,0x2140CC75,0xBAE5801A,0xCD7B52EA,0x56DE1E85, + 0x60C76FE0,0xFB62238F,0x8CFCF17F,0x1759BD10,0x63C1549F,0xF86418F0,0x8FFACA00,0x145F866F, + 0x66CB191E,0xFD6E5571,0x8AF08781,0x1155CBEE,0x65CD2261,0xFE686E0E,0x89F6BCFE,0x1253F091, + 0x6CDF821C,0xF77ACE73,0x80E41C83,0x1B4150EC,0x6FD9B963,0xF47CF50C,0x83E227FC,0x18476B93, + 0x6AD3F4E2,0xF176B88D,0x86E86A7D,0x1D4D2612,0x69D5CF9D,0xF27083F2,0x85EE5102,0x1E4B1D6D, + 0x78F6B418,0xE353F877,0x94CD2A87,0x0F6866E8,0x7BF08F67,0xE055C308,0x97CB11F8,0x0C6E5D97, + 0x7EFAC2E6,0xE55F8E89,0x92C15C79,0x09641016,0x7DFCF999,0xE659B5F6,0x91C76706,0x0A622B69, + 0x74EE59E4,0xEF4B158B,0x98D5C77B,0x03708B14,0x77E8629B,0xEC4D2EF4,0x9BD3FC04,0x0076B06B, + 0x72E22F1A,0xE9476375,0x9ED9B185,0x057CFDEA,0x71E41465,0xEA41580A,0x9DDF8AFA,0x067AC695, + 0x50A4D810,0xCB01947F,0xBC9F468F,0x273A0AE0,0x53A2E36F,0xC807AF00,0xBF997DF0,0x243C319F, 
+ 0x56A8AEEE,0xCD0DE281,0xBA933071,0x21367C1E,0x55AE9591,0xCE0BD9FE,0xB9950B0E,0x22304761, + 0x5CBC35EC,0xC7197983,0xB087AB73,0x2B22E71C,0x5FBA0E93,0xC41F42FC,0xB381900C,0x2824DC63, + 0x5AB04312,0xC1150F7D,0xB68BDD8D,0x2D2E91E2,0x59B6786D,0xC2133402,0xB58DE6F2,0x2E28AA9D, + 0x489503E8,0xD3304F87,0xA4AE9D77,0x3F0BD118,0x4B933897,0xD03674F8,0xA7A8A608,0x3C0DEA67, + 0x4E997516,0xD53C3979,0xA2A2EB89,0x3907A7E6,0x4D9F4E69,0xD63A0206,0xA1A4D0F6,0x3A019C99, + 0x448DEE14,0xDF28A27B,0xA8B6708B,0x33133CE4,0x478BD56B,0xDC2E9904,0xABB04BF4,0x3015079B, + 0x428198EA,0xD924D485,0xAEBA0675,0x351F4A1A,0x4187A395,0xDA22EFFA,0xADBC3D0A,0x36197165, + }, + { + 0x00000000,0xDD96D985,0x605CB54B,0xBDCA6CCE,0xC0B96A96,0x1D2FB313,0xA0E5DFDD,0x7D730658, + 0x5A03D36D,0x87950AE8,0x3A5F6626,0xE7C9BFA3,0x9ABAB9FB,0x472C607E,0xFAE60CB0,0x2770D535, + 0xB407A6DA,0x69917F5F,0xD45B1391,0x09CDCA14,0x74BECC4C,0xA92815C9,0x14E27907,0xC974A082, + 0xEE0475B7,0x3392AC32,0x8E58C0FC,0x53CE1979,0x2EBD1F21,0xF32BC6A4,0x4EE1AA6A,0x937773EF, + 0xB37E4BF5,0x6EE89270,0xD322FEBE,0x0EB4273B,0x73C72163,0xAE51F8E6,0x139B9428,0xCE0D4DAD, + 0xE97D9898,0x34EB411D,0x89212DD3,0x54B7F456,0x29C4F20E,0xF4522B8B,0x49984745,0x940E9EC0, + 0x0779ED2F,0xDAEF34AA,0x67255864,0xBAB381E1,0xC7C087B9,0x1A565E3C,0xA79C32F2,0x7A0AEB77, + 0x5D7A3E42,0x80ECE7C7,0x3D268B09,0xE0B0528C,0x9DC354D4,0x40558D51,0xFD9FE19F,0x2009381A, + 0xBD8D91AB,0x601B482E,0xDDD124E0,0x0047FD65,0x7D34FB3D,0xA0A222B8,0x1D684E76,0xC0FE97F3, + 0xE78E42C6,0x3A189B43,0x87D2F78D,0x5A442E08,0x27372850,0xFAA1F1D5,0x476B9D1B,0x9AFD449E, + 0x098A3771,0xD41CEEF4,0x69D6823A,0xB4405BBF,0xC9335DE7,0x14A58462,0xA96FE8AC,0x74F93129, + 0x5389E41C,0x8E1F3D99,0x33D55157,0xEE4388D2,0x93308E8A,0x4EA6570F,0xF36C3BC1,0x2EFAE244, + 0x0EF3DA5E,0xD36503DB,0x6EAF6F15,0xB339B690,0xCE4AB0C8,0x13DC694D,0xAE160583,0x7380DC06, + 0x54F00933,0x8966D0B6,0x34ACBC78,0xE93A65FD,0x944963A5,0x49DFBA20,0xF415D6EE,0x29830F6B, + 
0xBAF47C84,0x6762A501,0xDAA8C9CF,0x073E104A,0x7A4D1612,0xA7DBCF97,0x1A11A359,0xC7877ADC, + 0xE0F7AFE9,0x3D61766C,0x80AB1AA2,0x5D3DC327,0x204EC57F,0xFDD81CFA,0x40127034,0x9D84A9B1, + 0xA06A2517,0x7DFCFC92,0xC036905C,0x1DA049D9,0x60D34F81,0xBD459604,0x008FFACA,0xDD19234F, + 0xFA69F67A,0x27FF2FFF,0x9A354331,0x47A39AB4,0x3AD09CEC,0xE7464569,0x5A8C29A7,0x871AF022, + 0x146D83CD,0xC9FB5A48,0x74313686,0xA9A7EF03,0xD4D4E95B,0x094230DE,0xB4885C10,0x691E8595, + 0x4E6E50A0,0x93F88925,0x2E32E5EB,0xF3A43C6E,0x8ED73A36,0x5341E3B3,0xEE8B8F7D,0x331D56F8, + 0x13146EE2,0xCE82B767,0x7348DBA9,0xAEDE022C,0xD3AD0474,0x0E3BDDF1,0xB3F1B13F,0x6E6768BA, + 0x4917BD8F,0x9481640A,0x294B08C4,0xF4DDD141,0x89AED719,0x54380E9C,0xE9F26252,0x3464BBD7, + 0xA713C838,0x7A8511BD,0xC74F7D73,0x1AD9A4F6,0x67AAA2AE,0xBA3C7B2B,0x07F617E5,0xDA60CE60, + 0xFD101B55,0x2086C2D0,0x9D4CAE1E,0x40DA779B,0x3DA971C3,0xE03FA846,0x5DF5C488,0x80631D0D, + 0x1DE7B4BC,0xC0716D39,0x7DBB01F7,0xA02DD872,0xDD5EDE2A,0x00C807AF,0xBD026B61,0x6094B2E4, + 0x47E467D1,0x9A72BE54,0x27B8D29A,0xFA2E0B1F,0x875D0D47,0x5ACBD4C2,0xE701B80C,0x3A976189, + 0xA9E01266,0x7476CBE3,0xC9BCA72D,0x142A7EA8,0x695978F0,0xB4CFA175,0x0905CDBB,0xD493143E, + 0xF3E3C10B,0x2E75188E,0x93BF7440,0x4E29ADC5,0x335AAB9D,0xEECC7218,0x53061ED6,0x8E90C753, + 0xAE99FF49,0x730F26CC,0xCEC54A02,0x13539387,0x6E2095DF,0xB3B64C5A,0x0E7C2094,0xD3EAF911, + 0xF49A2C24,0x290CF5A1,0x94C6996F,0x495040EA,0x342346B2,0xE9B59F37,0x547FF3F9,0x89E92A7C, + 0x1A9E5993,0xC7088016,0x7AC2ECD8,0xA754355D,0xDA273305,0x07B1EA80,0xBA7B864E,0x67ED5FCB, + 0x409D8AFE,0x9D0B537B,0x20C13FB5,0xFD57E630,0x8024E068,0x5DB239ED,0xE0785523,0x3DEE8CA6, + }, + { + 0x00000000,0x9D0FE176,0xE16EC4AD,0x7C6125DB,0x19AC8F1B,0x84A36E6D,0xF8C24BB6,0x65CDAAC0, + 0x33591E36,0xAE56FF40,0xD237DA9B,0x4F383BED,0x2AF5912D,0xB7FA705B,0xCB9B5580,0x5694B4F6, + 0x66B23C6C,0xFBBDDD1A,0x87DCF8C1,0x1AD319B7,0x7F1EB377,0xE2115201,0x9E7077DA,0x037F96AC, + 
0x55EB225A,0xC8E4C32C,0xB485E6F7,0x298A0781,0x4C47AD41,0xD1484C37,0xAD2969EC,0x3026889A, + 0xCD6478D8,0x506B99AE,0x2C0ABC75,0xB1055D03,0xD4C8F7C3,0x49C716B5,0x35A6336E,0xA8A9D218, + 0xFE3D66EE,0x63328798,0x1F53A243,0x825C4335,0xE791E9F5,0x7A9E0883,0x06FF2D58,0x9BF0CC2E, + 0xABD644B4,0x36D9A5C2,0x4AB88019,0xD7B7616F,0xB27ACBAF,0x2F752AD9,0x53140F02,0xCE1BEE74, + 0x988F5A82,0x0580BBF4,0x79E19E2F,0xE4EE7F59,0x8123D599,0x1C2C34EF,0x604D1134,0xFD42F042, + 0x41B9F7F1,0xDCB61687,0xA0D7335C,0x3DD8D22A,0x581578EA,0xC51A999C,0xB97BBC47,0x24745D31, + 0x72E0E9C7,0xEFEF08B1,0x938E2D6A,0x0E81CC1C,0x6B4C66DC,0xF64387AA,0x8A22A271,0x172D4307, + 0x270BCB9D,0xBA042AEB,0xC6650F30,0x5B6AEE46,0x3EA74486,0xA3A8A5F0,0xDFC9802B,0x42C6615D, + 0x1452D5AB,0x895D34DD,0xF53C1106,0x6833F070,0x0DFE5AB0,0x90F1BBC6,0xEC909E1D,0x719F7F6B, + 0x8CDD8F29,0x11D26E5F,0x6DB34B84,0xF0BCAAF2,0x95710032,0x087EE144,0x741FC49F,0xE91025E9, + 0xBF84911F,0x228B7069,0x5EEA55B2,0xC3E5B4C4,0xA6281E04,0x3B27FF72,0x4746DAA9,0xDA493BDF, + 0xEA6FB345,0x77605233,0x0B0177E8,0x960E969E,0xF3C33C5E,0x6ECCDD28,0x12ADF8F3,0x8FA21985, + 0xD936AD73,0x44394C05,0x385869DE,0xA55788A8,0xC09A2268,0x5D95C31E,0x21F4E6C5,0xBCFB07B3, + 0x8373EFE2,0x1E7C0E94,0x621D2B4F,0xFF12CA39,0x9ADF60F9,0x07D0818F,0x7BB1A454,0xE6BE4522, + 0xB02AF1D4,0x2D2510A2,0x51443579,0xCC4BD40F,0xA9867ECF,0x34899FB9,0x48E8BA62,0xD5E75B14, + 0xE5C1D38E,0x78CE32F8,0x04AF1723,0x99A0F655,0xFC6D5C95,0x6162BDE3,0x1D039838,0x800C794E, + 0xD698CDB8,0x4B972CCE,0x37F60915,0xAAF9E863,0xCF3442A3,0x523BA3D5,0x2E5A860E,0xB3556778, + 0x4E17973A,0xD318764C,0xAF795397,0x3276B2E1,0x57BB1821,0xCAB4F957,0xB6D5DC8C,0x2BDA3DFA, + 0x7D4E890C,0xE041687A,0x9C204DA1,0x012FACD7,0x64E20617,0xF9EDE761,0x858CC2BA,0x188323CC, + 0x28A5AB56,0xB5AA4A20,0xC9CB6FFB,0x54C48E8D,0x3109244D,0xAC06C53B,0xD067E0E0,0x4D680196, + 0x1BFCB560,0x86F35416,0xFA9271CD,0x679D90BB,0x02503A7B,0x9F5FDB0D,0xE33EFED6,0x7E311FA0, + 0xC2CA1813,0x5FC5F965,0x23A4DCBE,0xBEAB3DC8,0xDB669708,0x4669767E,0x3A0853A5,0xA707B2D3, 
+ 0xF1930625,0x6C9CE753,0x10FDC288,0x8DF223FE,0xE83F893E,0x75306848,0x09514D93,0x945EACE5, + 0xA478247F,0x3977C509,0x4516E0D2,0xD81901A4,0xBDD4AB64,0x20DB4A12,0x5CBA6FC9,0xC1B58EBF, + 0x97213A49,0x0A2EDB3F,0x764FFEE4,0xEB401F92,0x8E8DB552,0x13825424,0x6FE371FF,0xF2EC9089, + 0x0FAE60CB,0x92A181BD,0xEEC0A466,0x73CF4510,0x1602EFD0,0x8B0D0EA6,0xF76C2B7D,0x6A63CA0B, + 0x3CF77EFD,0xA1F89F8B,0xDD99BA50,0x40965B26,0x255BF1E6,0xB8541090,0xC435354B,0x593AD43D, + 0x691C5CA7,0xF413BDD1,0x8872980A,0x157D797C,0x70B0D3BC,0xEDBF32CA,0x91DE1711,0x0CD1F667, + 0x5A454291,0xC74AA3E7,0xBB2B863C,0x2624674A,0x43E9CD8A,0xDEE62CFC,0xA2870927,0x3F88E851, + }, + { + 0x00000000,0xB9FBDBE8,0xA886B191,0x117D6A79,0x8A7C6563,0x3387BE8B,0x22FAD4F2,0x9B010F1A, + 0xCF89CC87,0x7672176F,0x670F7D16,0xDEF4A6FE,0x45F5A9E4,0xFC0E720C,0xED731875,0x5488C39D, + 0x44629F4F,0xFD9944A7,0xECE42EDE,0x551FF536,0xCE1EFA2C,0x77E521C4,0x66984BBD,0xDF639055, + 0x8BEB53C8,0x32108820,0x236DE259,0x9A9639B1,0x019736AB,0xB86CED43,0xA911873A,0x10EA5CD2, + 0x88C53E9E,0x313EE576,0x20438F0F,0x99B854E7,0x02B95BFD,0xBB428015,0xAA3FEA6C,0x13C43184, + 0x474CF219,0xFEB729F1,0xEFCA4388,0x56319860,0xCD30977A,0x74CB4C92,0x65B626EB,0xDC4DFD03, + 0xCCA7A1D1,0x755C7A39,0x64211040,0xDDDACBA8,0x46DBC4B2,0xFF201F5A,0xEE5D7523,0x57A6AECB, + 0x032E6D56,0xBAD5B6BE,0xABA8DCC7,0x1253072F,0x89520835,0x30A9D3DD,0x21D4B9A4,0x982F624C, + 0xCAFB7B7D,0x7300A095,0x627DCAEC,0xDB861104,0x40871E1E,0xF97CC5F6,0xE801AF8F,0x51FA7467, + 0x0572B7FA,0xBC896C12,0xADF4066B,0x140FDD83,0x8F0ED299,0x36F50971,0x27886308,0x9E73B8E0, + 0x8E99E432,0x37623FDA,0x261F55A3,0x9FE48E4B,0x04E58151,0xBD1E5AB9,0xAC6330C0,0x1598EB28, + 0x411028B5,0xF8EBF35D,0xE9969924,0x506D42CC,0xCB6C4DD6,0x7297963E,0x63EAFC47,0xDA1127AF, + 0x423E45E3,0xFBC59E0B,0xEAB8F472,0x53432F9A,0xC8422080,0x71B9FB68,0x60C49111,0xD93F4AF9, + 0x8DB78964,0x344C528C,0x253138F5,0x9CCAE31D,0x07CBEC07,0xBE3037EF,0xAF4D5D96,0x16B6867E, + 
0x065CDAAC,0xBFA70144,0xAEDA6B3D,0x1721B0D5,0x8C20BFCF,0x35DB6427,0x24A60E5E,0x9D5DD5B6, + 0xC9D5162B,0x702ECDC3,0x6153A7BA,0xD8A87C52,0x43A97348,0xFA52A8A0,0xEB2FC2D9,0x52D41931, + 0x4E87F0BB,0xF77C2B53,0xE601412A,0x5FFA9AC2,0xC4FB95D8,0x7D004E30,0x6C7D2449,0xD586FFA1, + 0x810E3C3C,0x38F5E7D4,0x29888DAD,0x90735645,0x0B72595F,0xB28982B7,0xA3F4E8CE,0x1A0F3326, + 0x0AE56FF4,0xB31EB41C,0xA263DE65,0x1B98058D,0x80990A97,0x3962D17F,0x281FBB06,0x91E460EE, + 0xC56CA373,0x7C97789B,0x6DEA12E2,0xD411C90A,0x4F10C610,0xF6EB1DF8,0xE7967781,0x5E6DAC69, + 0xC642CE25,0x7FB915CD,0x6EC47FB4,0xD73FA45C,0x4C3EAB46,0xF5C570AE,0xE4B81AD7,0x5D43C13F, + 0x09CB02A2,0xB030D94A,0xA14DB333,0x18B668DB,0x83B767C1,0x3A4CBC29,0x2B31D650,0x92CA0DB8, + 0x8220516A,0x3BDB8A82,0x2AA6E0FB,0x935D3B13,0x085C3409,0xB1A7EFE1,0xA0DA8598,0x19215E70, + 0x4DA99DED,0xF4524605,0xE52F2C7C,0x5CD4F794,0xC7D5F88E,0x7E2E2366,0x6F53491F,0xD6A892F7, + 0x847C8BC6,0x3D87502E,0x2CFA3A57,0x9501E1BF,0x0E00EEA5,0xB7FB354D,0xA6865F34,0x1F7D84DC, + 0x4BF54741,0xF20E9CA9,0xE373F6D0,0x5A882D38,0xC1892222,0x7872F9CA,0x690F93B3,0xD0F4485B, + 0xC01E1489,0x79E5CF61,0x6898A518,0xD1637EF0,0x4A6271EA,0xF399AA02,0xE2E4C07B,0x5B1F1B93, + 0x0F97D80E,0xB66C03E6,0xA711699F,0x1EEAB277,0x85EBBD6D,0x3C106685,0x2D6D0CFC,0x9496D714, + 0x0CB9B558,0xB5426EB0,0xA43F04C9,0x1DC4DF21,0x86C5D03B,0x3F3E0BD3,0x2E4361AA,0x97B8BA42, + 0xC33079DF,0x7ACBA237,0x6BB6C84E,0xD24D13A6,0x494C1CBC,0xF0B7C754,0xE1CAAD2D,0x583176C5, + 0x48DB2A17,0xF120F1FF,0xE05D9B86,0x59A6406E,0xC2A74F74,0x7B5C949C,0x6A21FEE5,0xD3DA250D, + 0x8752E690,0x3EA93D78,0x2FD45701,0x962F8CE9,0x0D2E83F3,0xB4D5581B,0xA5A83262,0x1C53E98A, + }, + { + 0x00000000,0xAE689191,0x87A02563,0x29C8B4F2,0xD4314C87,0x7A59DD16,0x539169E4,0xFDF9F875, + 0x73139F4F,0xDD7B0EDE,0xF4B3BA2C,0x5ADB2BBD,0xA722D3C8,0x094A4259,0x2082F6AB,0x8EEA673A, + 0xE6273E9E,0x484FAF0F,0x61871BFD,0xCFEF8A6C,0x32167219,0x9C7EE388,0xB5B6577A,0x1BDEC6EB, + 
0x9534A1D1,0x3B5C3040,0x129484B2,0xBCFC1523,0x4105ED56,0xEF6D7CC7,0xC6A5C835,0x68CD59A4, + 0x173F7B7D,0xB957EAEC,0x909F5E1E,0x3EF7CF8F,0xC30E37FA,0x6D66A66B,0x44AE1299,0xEAC68308, + 0x642CE432,0xCA4475A3,0xE38CC151,0x4DE450C0,0xB01DA8B5,0x1E753924,0x37BD8DD6,0x99D51C47, + 0xF11845E3,0x5F70D472,0x76B86080,0xD8D0F111,0x25290964,0x8B4198F5,0xA2892C07,0x0CE1BD96, + 0x820BDAAC,0x2C634B3D,0x05ABFFCF,0xABC36E5E,0x563A962B,0xF85207BA,0xD19AB348,0x7FF222D9, + 0x2E7EF6FA,0x8016676B,0xA9DED399,0x07B64208,0xFA4FBA7D,0x54272BEC,0x7DEF9F1E,0xD3870E8F, + 0x5D6D69B5,0xF305F824,0xDACD4CD6,0x74A5DD47,0x895C2532,0x2734B4A3,0x0EFC0051,0xA09491C0, + 0xC859C864,0x663159F5,0x4FF9ED07,0xE1917C96,0x1C6884E3,0xB2001572,0x9BC8A180,0x35A03011, + 0xBB4A572B,0x1522C6BA,0x3CEA7248,0x9282E3D9,0x6F7B1BAC,0xC1138A3D,0xE8DB3ECF,0x46B3AF5E, + 0x39418D87,0x97291C16,0xBEE1A8E4,0x10893975,0xED70C100,0x43185091,0x6AD0E463,0xC4B875F2, + 0x4A5212C8,0xE43A8359,0xCDF237AB,0x639AA63A,0x9E635E4F,0x300BCFDE,0x19C37B2C,0xB7ABEABD, + 0xDF66B319,0x710E2288,0x58C6967A,0xF6AE07EB,0x0B57FF9E,0xA53F6E0F,0x8CF7DAFD,0x229F4B6C, + 0xAC752C56,0x021DBDC7,0x2BD50935,0x85BD98A4,0x784460D1,0xD62CF140,0xFFE445B2,0x518CD423, + 0x5CFDEDF4,0xF2957C65,0xDB5DC897,0x75355906,0x88CCA173,0x26A430E2,0x0F6C8410,0xA1041581, + 0x2FEE72BB,0x8186E32A,0xA84E57D8,0x0626C649,0xFBDF3E3C,0x55B7AFAD,0x7C7F1B5F,0xD2178ACE, + 0xBADAD36A,0x14B242FB,0x3D7AF609,0x93126798,0x6EEB9FED,0xC0830E7C,0xE94BBA8E,0x47232B1F, + 0xC9C94C25,0x67A1DDB4,0x4E696946,0xE001F8D7,0x1DF800A2,0xB3909133,0x9A5825C1,0x3430B450, + 0x4BC29689,0xE5AA0718,0xCC62B3EA,0x620A227B,0x9FF3DA0E,0x319B4B9F,0x1853FF6D,0xB63B6EFC, + 0x38D109C6,0x96B99857,0xBF712CA5,0x1119BD34,0xECE04541,0x4288D4D0,0x6B406022,0xC528F1B3, + 0xADE5A817,0x038D3986,0x2A458D74,0x842D1CE5,0x79D4E490,0xD7BC7501,0xFE74C1F3,0x501C5062, + 0xDEF63758,0x709EA6C9,0x5956123B,0xF73E83AA,0x0AC77BDF,0xA4AFEA4E,0x8D675EBC,0x230FCF2D, + 0x72831B0E,0xDCEB8A9F,0xF5233E6D,0x5B4BAFFC,0xA6B25789,0x08DAC618,0x211272EA,0x8F7AE37B, 
+ 0x01908441,0xAFF815D0,0x8630A122,0x285830B3,0xD5A1C8C6,0x7BC95957,0x5201EDA5,0xFC697C34, + 0x94A42590,0x3ACCB401,0x130400F3,0xBD6C9162,0x40956917,0xEEFDF886,0xC7354C74,0x695DDDE5, + 0xE7B7BADF,0x49DF2B4E,0x60179FBC,0xCE7F0E2D,0x3386F658,0x9DEE67C9,0xB426D33B,0x1A4E42AA, + 0x65BC6073,0xCBD4F1E2,0xE21C4510,0x4C74D481,0xB18D2CF4,0x1FE5BD65,0x362D0997,0x98459806, + 0x16AFFF3C,0xB8C76EAD,0x910FDA5F,0x3F674BCE,0xC29EB3BB,0x6CF6222A,0x453E96D8,0xEB560749, + 0x839B5EED,0x2DF3CF7C,0x043B7B8E,0xAA53EA1F,0x57AA126A,0xF9C283FB,0xD00A3709,0x7E62A698, + 0xF088C1A2,0x5EE05033,0x7728E4C1,0xD9407550,0x24B98D25,0x8AD11CB4,0xA319A846,0x0D7139D7, + } + };
+    /*
+     * Computes a 32-bit CRC over the 'size' bytes starting at 'src', using the
+     * precalculated Crc32Table sub-tables declared above.
+     *
+     * The CRC register starts at 0xFFFFFFFF and the result is returned
+     * bit-inverted, so an empty buffer (size == 0) yields 0.
+     *
+     * src  - pointer to the first input byte; the buffer is only read.
+     * size - number of bytes to process.
+     * Returns the inverted CRC register (~crc).
+     *
+     * Processing is done in three phases: a byte-wise head until the read
+     * pointer is 4-byte aligned, a main loop consuming 16 bytes per iteration
+     * with one lookup in each of the sub-tables 0x0..0xF, and a byte-wise tail.
+     *
+     * NOTE(review): the polynomial is fixed by the contents of Crc32Table, whose
+     * declaration is outside this section - confirm there which CRC-32 variant
+     * is produced.
+     */
+    uint32_t Crc32(const void* src, size_t size)
+    {
+        const uint8_t* p8 = (const uint8_t*)src;
+        uint32_t crc = 0xFFFFFFFF; // initial register value: all bits set
+        // Head: one byte per step until p8 is aligned to sizeof(uint32_t) or the input is exhausted.
+        for (; ((uintptr_t)p8 & (sizeof(uint32_t) - 1)) != 0 && size > 0; ++p8, --size)
+            crc = Crc32Table[0][(crc ^ *p8) & 0xFF] ^ (crc >> 8);
+        // Main loop: the (now aligned, unless size ran out) data is read as four 32-bit words per iteration.
+        const uint32_t* p32 = (const uint32_t*)p8;
+        for (; size >= 16; size -= 16)
+        {
+#ifdef SIMD_BIG_ENDIAN
+            uint32_t v0 = *p32++ ^ Reorder32(crc); // Reorder32 is defined elsewhere; presumably a 32-bit byte swap - confirm
+            uint32_t v1 = *p32++;
+            uint32_t v2 = *p32++;
+            uint32_t v3 = *p32++;
+            crc = // same table-per-word assignment as the #else branch, with the byte order inside each word mirrored
+                Crc32Table[0x0][v3 & 0xFF] ^
+                Crc32Table[0x1][(v3 >> 8) & 0xFF] ^
+                Crc32Table[0x2][(v3 >> 16) & 0xFF] ^
+                Crc32Table[0x3][(v3 >> 24) & 0xFF] ^
+                Crc32Table[0x4][v2 & 0xFF] ^
+                Crc32Table[0x5][(v2 >> 8) & 0xFF] ^
+                Crc32Table[0x6][(v2 >> 16) & 0xFF] ^
+                Crc32Table[0x7][(v2 >> 24) & 0xFF] ^
+                Crc32Table[0x8][v1 & 0xFF] ^
+                Crc32Table[0x9][(v1 >> 8) & 0xFF] ^
+                Crc32Table[0xA][(v1 >> 16) & 0xFF] ^
+                Crc32Table[0xB][(v1 >> 24) & 0xFF] ^
+                Crc32Table[0xC][v0 & 0xFF] ^
+                Crc32Table[0xD][(v0 >> 8) & 0xFF] ^
+                Crc32Table[0xE][(v0 >> 16) & 0xFF] ^
+                Crc32Table[0xF][(v0 >> 24) & 0xFF];
+#else
+            uint32_t v0 = *p32++ ^ crc; // only the first word is combined with the running CRC
+            uint32_t v1 = *p32++;
+            uint32_t v2 = *p32++;
+            uint32_t v3 = *p32++;
+            crc = // last word read (v3) uses sub-tables 0x0..0x3, first word (v0) uses 0xC..0xF
+                Crc32Table[0x0][(v3 >> 24) & 0xFF] ^
+                Crc32Table[0x1][(v3 >> 16) & 0xFF] ^
+                Crc32Table[0x2][(v3 >> 8) & 0xFF] ^
+                Crc32Table[0x3][v3 & 0xFF] ^
+                Crc32Table[0x4][(v2 >> 24) & 0xFF] ^
+                Crc32Table[0x5][(v2 >> 16) & 0xFF] ^
+                Crc32Table[0x6][(v2 >> 8) & 0xFF] ^
+                Crc32Table[0x7][v2 & 0xFF] ^
+                Crc32Table[0x8][(v1 >> 24) & 0xFF] ^
+                Crc32Table[0x9][(v1 >> 16) & 0xFF] ^
+                Crc32Table[0xA][(v1 >> 8) & 0xFF] ^
+                Crc32Table[0xB][v1 & 0xFF] ^
+                Crc32Table[0xC][(v0 >> 24) & 0xFF] ^
+                Crc32Table[0xD][(v0 >> 16) & 0xFF] ^
+                Crc32Table[0xE][(v0 >> 8) & 0xFF] ^
+                Crc32Table[0xF][v0 & 0xFF];
+#endif
+        }
+        // Tail: fewer than 16 bytes remain; finish one byte at a time from where the word pointer stopped.
+        for (p8 = (const uint8_t*)p32; size > 0; ++p8, size--)
+            crc = Crc32Table[0][(crc ^ *p8) & 0xFF] ^ (crc >> 8);
+        // Final step: the register is inverted before being returned.
+        return (~crc);
+    }
+
+ //--------------------------------------------------------------------- + + // Precalculated CRC32c lookup table for polynomial 0x1EDC6F41 (castagnoli-crc). + static const uint32_t Crc32cTable[8][256] = + { + { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362,
0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 + }, + { + 0x00000000, 0x13a29877, 0x274530ee, 
0x34e7a899, 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, + 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, + 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, + 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, + 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, + 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, + 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, + 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, + 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, + 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, + 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, + 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, + 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, + 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, + 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, 0xcac27827, 0xd960e050, 
0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, + 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, + 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, + 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, + 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, + 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, + 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, + 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 + }, + { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, + 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, + 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, + 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, + 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, + 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, + 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 
0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, + 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, + 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, + 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, + 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, + 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, + 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, + 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, + 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, + 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, + 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, + 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, + 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, + 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, + 0x9557324b, 0x3016a035, 0xda386046, 
0x7f79f238, 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, + 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 + }, + { + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, + 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, + 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, + 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, + 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, + 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, + 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, + 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, + 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, + 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, + 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, + 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, + 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, 0xdfc69b2a, 
0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, + 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, + 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, + 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, + 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, + 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, + 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, + 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, + 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, + 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 + }, + { + 0x00000000, 0x38116fac, 0x7022df58, 0x4833b0f4, 0xe045beb0, 0xd854d11c, 0x906761e8, 0xa8760e44, + 0xc5670b91, 0xfd76643d, 0xb545d4c9, 0x8d54bb65, 0x2522b521, 0x1d33da8d, 0x55006a79, 0x6d1105d5, + 0x8f2261d3, 0xb7330e7f, 0xff00be8b, 0xc711d127, 0x6f67df63, 0x5776b0cf, 0x1f45003b, 0x27546f97, + 0x4a456a42, 0x725405ee, 0x3a67b51a, 0x0276dab6, 0xaa00d4f2, 0x9211bb5e, 0xda220baa, 0xe2336406, + 0x1ba8b557, 0x23b9dafb, 0x6b8a6a0f, 0x539b05a3, 0xfbed0be7, 0xc3fc644b, 0x8bcfd4bf, 0xb3debb13, + 0xdecfbec6, 0xe6ded16a, 0xaeed619e, 0x96fc0e32, 0x3e8a0076, 0x069b6fda, 0x4ea8df2e, 
0x76b9b082, + 0x948ad484, 0xac9bbb28, 0xe4a80bdc, 0xdcb96470, 0x74cf6a34, 0x4cde0598, 0x04edb56c, 0x3cfcdac0, + 0x51eddf15, 0x69fcb0b9, 0x21cf004d, 0x19de6fe1, 0xb1a861a5, 0x89b90e09, 0xc18abefd, 0xf99bd151, + 0x37516aae, 0x0f400502, 0x4773b5f6, 0x7f62da5a, 0xd714d41e, 0xef05bbb2, 0xa7360b46, 0x9f2764ea, + 0xf236613f, 0xca270e93, 0x8214be67, 0xba05d1cb, 0x1273df8f, 0x2a62b023, 0x625100d7, 0x5a406f7b, + 0xb8730b7d, 0x806264d1, 0xc851d425, 0xf040bb89, 0x5836b5cd, 0x6027da61, 0x28146a95, 0x10050539, + 0x7d1400ec, 0x45056f40, 0x0d36dfb4, 0x3527b018, 0x9d51be5c, 0xa540d1f0, 0xed736104, 0xd5620ea8, + 0x2cf9dff9, 0x14e8b055, 0x5cdb00a1, 0x64ca6f0d, 0xccbc6149, 0xf4ad0ee5, 0xbc9ebe11, 0x848fd1bd, + 0xe99ed468, 0xd18fbbc4, 0x99bc0b30, 0xa1ad649c, 0x09db6ad8, 0x31ca0574, 0x79f9b580, 0x41e8da2c, + 0xa3dbbe2a, 0x9bcad186, 0xd3f96172, 0xebe80ede, 0x439e009a, 0x7b8f6f36, 0x33bcdfc2, 0x0badb06e, + 0x66bcb5bb, 0x5eadda17, 0x169e6ae3, 0x2e8f054f, 0x86f90b0b, 0xbee864a7, 0xf6dbd453, 0xcecabbff, + 0x6ea2d55c, 0x56b3baf0, 0x1e800a04, 0x269165a8, 0x8ee76bec, 0xb6f60440, 0xfec5b4b4, 0xc6d4db18, + 0xabc5decd, 0x93d4b161, 0xdbe70195, 0xe3f66e39, 0x4b80607d, 0x73910fd1, 0x3ba2bf25, 0x03b3d089, + 0xe180b48f, 0xd991db23, 0x91a26bd7, 0xa9b3047b, 0x01c50a3f, 0x39d46593, 0x71e7d567, 0x49f6bacb, + 0x24e7bf1e, 0x1cf6d0b2, 0x54c56046, 0x6cd40fea, 0xc4a201ae, 0xfcb36e02, 0xb480def6, 0x8c91b15a, + 0x750a600b, 0x4d1b0fa7, 0x0528bf53, 0x3d39d0ff, 0x954fdebb, 0xad5eb117, 0xe56d01e3, 0xdd7c6e4f, + 0xb06d6b9a, 0x887c0436, 0xc04fb4c2, 0xf85edb6e, 0x5028d52a, 0x6839ba86, 0x200a0a72, 0x181b65de, + 0xfa2801d8, 0xc2396e74, 0x8a0ade80, 0xb21bb12c, 0x1a6dbf68, 0x227cd0c4, 0x6a4f6030, 0x525e0f9c, + 0x3f4f0a49, 0x075e65e5, 0x4f6dd511, 0x777cbabd, 0xdf0ab4f9, 0xe71bdb55, 0xaf286ba1, 0x9739040d, + 0x59f3bff2, 0x61e2d05e, 0x29d160aa, 0x11c00f06, 0xb9b60142, 0x81a76eee, 0xc994de1a, 0xf185b1b6, + 0x9c94b463, 0xa485dbcf, 0xecb66b3b, 0xd4a70497, 0x7cd10ad3, 0x44c0657f, 0x0cf3d58b, 0x34e2ba27, + 0xd6d1de21, 0xeec0b18d, 
0xa6f30179, 0x9ee26ed5, 0x36946091, 0x0e850f3d, 0x46b6bfc9, 0x7ea7d065, + 0x13b6d5b0, 0x2ba7ba1c, 0x63940ae8, 0x5b856544, 0xf3f36b00, 0xcbe204ac, 0x83d1b458, 0xbbc0dbf4, + 0x425b0aa5, 0x7a4a6509, 0x3279d5fd, 0x0a68ba51, 0xa21eb415, 0x9a0fdbb9, 0xd23c6b4d, 0xea2d04e1, + 0x873c0134, 0xbf2d6e98, 0xf71ede6c, 0xcf0fb1c0, 0x6779bf84, 0x5f68d028, 0x175b60dc, 0x2f4a0f70, + 0xcd796b76, 0xf56804da, 0xbd5bb42e, 0x854adb82, 0x2d3cd5c6, 0x152dba6a, 0x5d1e0a9e, 0x650f6532, + 0x081e60e7, 0x300f0f4b, 0x783cbfbf, 0x402dd013, 0xe85bde57, 0xd04ab1fb, 0x9879010f, 0xa0686ea3 + }, + { + 0x00000000, 0xef306b19, 0xdb8ca0c3, 0x34bccbda, 0xb2f53777, 0x5dc55c6e, 0x697997b4, 0x8649fcad, + 0x6006181f, 0x8f367306, 0xbb8ab8dc, 0x54bad3c5, 0xd2f32f68, 0x3dc34471, 0x097f8fab, 0xe64fe4b2, + 0xc00c303e, 0x2f3c5b27, 0x1b8090fd, 0xf4b0fbe4, 0x72f90749, 0x9dc96c50, 0xa975a78a, 0x4645cc93, + 0xa00a2821, 0x4f3a4338, 0x7b8688e2, 0x94b6e3fb, 0x12ff1f56, 0xfdcf744f, 0xc973bf95, 0x2643d48c, + 0x85f4168d, 0x6ac47d94, 0x5e78b64e, 0xb148dd57, 0x370121fa, 0xd8314ae3, 0xec8d8139, 0x03bdea20, + 0xe5f20e92, 0x0ac2658b, 0x3e7eae51, 0xd14ec548, 0x570739e5, 0xb83752fc, 0x8c8b9926, 0x63bbf23f, + 0x45f826b3, 0xaac84daa, 0x9e748670, 0x7144ed69, 0xf70d11c4, 0x183d7add, 0x2c81b107, 0xc3b1da1e, + 0x25fe3eac, 0xcace55b5, 0xfe729e6f, 0x1142f576, 0x970b09db, 0x783b62c2, 0x4c87a918, 0xa3b7c201, + 0x0e045beb, 0xe13430f2, 0xd588fb28, 0x3ab89031, 0xbcf16c9c, 0x53c10785, 0x677dcc5f, 0x884da746, + 0x6e0243f4, 0x813228ed, 0xb58ee337, 0x5abe882e, 0xdcf77483, 0x33c71f9a, 0x077bd440, 0xe84bbf59, + 0xce086bd5, 0x213800cc, 0x1584cb16, 0xfab4a00f, 0x7cfd5ca2, 0x93cd37bb, 0xa771fc61, 0x48419778, + 0xae0e73ca, 0x413e18d3, 0x7582d309, 0x9ab2b810, 0x1cfb44bd, 0xf3cb2fa4, 0xc777e47e, 0x28478f67, + 0x8bf04d66, 0x64c0267f, 0x507ceda5, 0xbf4c86bc, 0x39057a11, 0xd6351108, 0xe289dad2, 0x0db9b1cb, + 0xebf65579, 0x04c63e60, 0x307af5ba, 0xdf4a9ea3, 0x5903620e, 0xb6330917, 0x828fc2cd, 0x6dbfa9d4, + 0x4bfc7d58, 0xa4cc1641, 0x9070dd9b, 0x7f40b682, 
0xf9094a2f, 0x16392136, 0x2285eaec, 0xcdb581f5, + 0x2bfa6547, 0xc4ca0e5e, 0xf076c584, 0x1f46ae9d, 0x990f5230, 0x763f3929, 0x4283f2f3, 0xadb399ea, + 0x1c08b7d6, 0xf338dccf, 0xc7841715, 0x28b47c0c, 0xaefd80a1, 0x41cdebb8, 0x75712062, 0x9a414b7b, + 0x7c0eafc9, 0x933ec4d0, 0xa7820f0a, 0x48b26413, 0xcefb98be, 0x21cbf3a7, 0x1577387d, 0xfa475364, + 0xdc0487e8, 0x3334ecf1, 0x0788272b, 0xe8b84c32, 0x6ef1b09f, 0x81c1db86, 0xb57d105c, 0x5a4d7b45, + 0xbc029ff7, 0x5332f4ee, 0x678e3f34, 0x88be542d, 0x0ef7a880, 0xe1c7c399, 0xd57b0843, 0x3a4b635a, + 0x99fca15b, 0x76ccca42, 0x42700198, 0xad406a81, 0x2b09962c, 0xc439fd35, 0xf08536ef, 0x1fb55df6, + 0xf9fab944, 0x16cad25d, 0x22761987, 0xcd46729e, 0x4b0f8e33, 0xa43fe52a, 0x90832ef0, 0x7fb345e9, + 0x59f09165, 0xb6c0fa7c, 0x827c31a6, 0x6d4c5abf, 0xeb05a612, 0x0435cd0b, 0x308906d1, 0xdfb96dc8, + 0x39f6897a, 0xd6c6e263, 0xe27a29b9, 0x0d4a42a0, 0x8b03be0d, 0x6433d514, 0x508f1ece, 0xbfbf75d7, + 0x120cec3d, 0xfd3c8724, 0xc9804cfe, 0x26b027e7, 0xa0f9db4a, 0x4fc9b053, 0x7b757b89, 0x94451090, + 0x720af422, 0x9d3a9f3b, 0xa98654e1, 0x46b63ff8, 0xc0ffc355, 0x2fcfa84c, 0x1b736396, 0xf443088f, + 0xd200dc03, 0x3d30b71a, 0x098c7cc0, 0xe6bc17d9, 0x60f5eb74, 0x8fc5806d, 0xbb794bb7, 0x544920ae, + 0xb206c41c, 0x5d36af05, 0x698a64df, 0x86ba0fc6, 0x00f3f36b, 0xefc39872, 0xdb7f53a8, 0x344f38b1, + 0x97f8fab0, 0x78c891a9, 0x4c745a73, 0xa344316a, 0x250dcdc7, 0xca3da6de, 0xfe816d04, 0x11b1061d, + 0xf7fee2af, 0x18ce89b6, 0x2c72426c, 0xc3422975, 0x450bd5d8, 0xaa3bbec1, 0x9e87751b, 0x71b71e02, + 0x57f4ca8e, 0xb8c4a197, 0x8c786a4d, 0x63480154, 0xe501fdf9, 0x0a3196e0, 0x3e8d5d3a, 0xd1bd3623, + 0x37f2d291, 0xd8c2b988, 0xec7e7252, 0x034e194b, 0x8507e5e6, 0x6a378eff, 0x5e8b4525, 0xb1bb2e3c + }, + { + 0x00000000, 0x68032cc8, 0xd0065990, 0xb8057558, 0xa5e0c5d1, 0xcde3e919, 0x75e69c41, 0x1de5b089, + 0x4e2dfd53, 0x262ed19b, 0x9e2ba4c3, 0xf628880b, 0xebcd3882, 0x83ce144a, 0x3bcb6112, 0x53c84dda, + 0x9c5bfaa6, 0xf458d66e, 0x4c5da336, 0x245e8ffe, 0x39bb3f77, 0x51b813bf, 
0xe9bd66e7, 0x81be4a2f, + 0xd27607f5, 0xba752b3d, 0x02705e65, 0x6a7372ad, 0x7796c224, 0x1f95eeec, 0xa7909bb4, 0xcf93b77c, + 0x3d5b83bd, 0x5558af75, 0xed5dda2d, 0x855ef6e5, 0x98bb466c, 0xf0b86aa4, 0x48bd1ffc, 0x20be3334, + 0x73767eee, 0x1b755226, 0xa370277e, 0xcb730bb6, 0xd696bb3f, 0xbe9597f7, 0x0690e2af, 0x6e93ce67, + 0xa100791b, 0xc90355d3, 0x7106208b, 0x19050c43, 0x04e0bcca, 0x6ce39002, 0xd4e6e55a, 0xbce5c992, + 0xef2d8448, 0x872ea880, 0x3f2bddd8, 0x5728f110, 0x4acd4199, 0x22ce6d51, 0x9acb1809, 0xf2c834c1, + 0x7ab7077a, 0x12b42bb2, 0xaab15eea, 0xc2b27222, 0xdf57c2ab, 0xb754ee63, 0x0f519b3b, 0x6752b7f3, + 0x349afa29, 0x5c99d6e1, 0xe49ca3b9, 0x8c9f8f71, 0x917a3ff8, 0xf9791330, 0x417c6668, 0x297f4aa0, + 0xe6ecfddc, 0x8eefd114, 0x36eaa44c, 0x5ee98884, 0x430c380d, 0x2b0f14c5, 0x930a619d, 0xfb094d55, + 0xa8c1008f, 0xc0c22c47, 0x78c7591f, 0x10c475d7, 0x0d21c55e, 0x6522e996, 0xdd279cce, 0xb524b006, + 0x47ec84c7, 0x2fefa80f, 0x97eadd57, 0xffe9f19f, 0xe20c4116, 0x8a0f6dde, 0x320a1886, 0x5a09344e, + 0x09c17994, 0x61c2555c, 0xd9c72004, 0xb1c40ccc, 0xac21bc45, 0xc422908d, 0x7c27e5d5, 0x1424c91d, + 0xdbb77e61, 0xb3b452a9, 0x0bb127f1, 0x63b20b39, 0x7e57bbb0, 0x16549778, 0xae51e220, 0xc652cee8, + 0x959a8332, 0xfd99affa, 0x459cdaa2, 0x2d9ff66a, 0x307a46e3, 0x58796a2b, 0xe07c1f73, 0x887f33bb, + 0xf56e0ef4, 0x9d6d223c, 0x25685764, 0x4d6b7bac, 0x508ecb25, 0x388de7ed, 0x808892b5, 0xe88bbe7d, + 0xbb43f3a7, 0xd340df6f, 0x6b45aa37, 0x034686ff, 0x1ea33676, 0x76a01abe, 0xcea56fe6, 0xa6a6432e, + 0x6935f452, 0x0136d89a, 0xb933adc2, 0xd130810a, 0xccd53183, 0xa4d61d4b, 0x1cd36813, 0x74d044db, + 0x27180901, 0x4f1b25c9, 0xf71e5091, 0x9f1d7c59, 0x82f8ccd0, 0xeafbe018, 0x52fe9540, 0x3afdb988, + 0xc8358d49, 0xa036a181, 0x1833d4d9, 0x7030f811, 0x6dd54898, 0x05d66450, 0xbdd31108, 0xd5d03dc0, + 0x8618701a, 0xee1b5cd2, 0x561e298a, 0x3e1d0542, 0x23f8b5cb, 0x4bfb9903, 0xf3feec5b, 0x9bfdc093, + 0x546e77ef, 0x3c6d5b27, 0x84682e7f, 0xec6b02b7, 0xf18eb23e, 0x998d9ef6, 0x2188ebae, 0x498bc766, + 0x1a438abc, 
0x7240a674, 0xca45d32c, 0xa246ffe4, 0xbfa34f6d, 0xd7a063a5, 0x6fa516fd, 0x07a63a35, + 0x8fd9098e, 0xe7da2546, 0x5fdf501e, 0x37dc7cd6, 0x2a39cc5f, 0x423ae097, 0xfa3f95cf, 0x923cb907, + 0xc1f4f4dd, 0xa9f7d815, 0x11f2ad4d, 0x79f18185, 0x6414310c, 0x0c171dc4, 0xb412689c, 0xdc114454, + 0x1382f328, 0x7b81dfe0, 0xc384aab8, 0xab878670, 0xb66236f9, 0xde611a31, 0x66646f69, 0x0e6743a1, + 0x5daf0e7b, 0x35ac22b3, 0x8da957eb, 0xe5aa7b23, 0xf84fcbaa, 0x904ce762, 0x2849923a, 0x404abef2, + 0xb2828a33, 0xda81a6fb, 0x6284d3a3, 0x0a87ff6b, 0x17624fe2, 0x7f61632a, 0xc7641672, 0xaf673aba, + 0xfcaf7760, 0x94ac5ba8, 0x2ca92ef0, 0x44aa0238, 0x594fb2b1, 0x314c9e79, 0x8949eb21, 0xe14ac7e9, + 0x2ed97095, 0x46da5c5d, 0xfedf2905, 0x96dc05cd, 0x8b39b544, 0xe33a998c, 0x5b3fecd4, 0x333cc01c, + 0x60f48dc6, 0x08f7a10e, 0xb0f2d456, 0xd8f1f89e, 0xc5144817, 0xad1764df, 0x15121187, 0x7d113d4f + }, + { + 0x00000000, 0x493c7d27, 0x9278fa4e, 0xdb448769, 0x211d826d, 0x6821ff4a, 0xb3657823, 0xfa590504, + 0x423b04da, 0x0b0779fd, 0xd043fe94, 0x997f83b3, 0x632686b7, 0x2a1afb90, 0xf15e7cf9, 0xb86201de, + 0x847609b4, 0xcd4a7493, 0x160ef3fa, 0x5f328edd, 0xa56b8bd9, 0xec57f6fe, 0x37137197, 0x7e2f0cb0, + 0xc64d0d6e, 0x8f717049, 0x5435f720, 0x1d098a07, 0xe7508f03, 0xae6cf224, 0x7528754d, 0x3c14086a, + 0x0d006599, 0x443c18be, 0x9f789fd7, 0xd644e2f0, 0x2c1de7f4, 0x65219ad3, 0xbe651dba, 0xf759609d, + 0x4f3b6143, 0x06071c64, 0xdd439b0d, 0x947fe62a, 0x6e26e32e, 0x271a9e09, 0xfc5e1960, 0xb5626447, + 0x89766c2d, 0xc04a110a, 0x1b0e9663, 0x5232eb44, 0xa86bee40, 0xe1579367, 0x3a13140e, 0x732f6929, + 0xcb4d68f7, 0x827115d0, 0x593592b9, 0x1009ef9e, 0xea50ea9a, 0xa36c97bd, 0x782810d4, 0x31146df3, + 0x1a00cb32, 0x533cb615, 0x8878317c, 0xc1444c5b, 0x3b1d495f, 0x72213478, 0xa965b311, 0xe059ce36, + 0x583bcfe8, 0x1107b2cf, 0xca4335a6, 0x837f4881, 0x79264d85, 0x301a30a2, 0xeb5eb7cb, 0xa262caec, + 0x9e76c286, 0xd74abfa1, 0x0c0e38c8, 0x453245ef, 0xbf6b40eb, 0xf6573dcc, 0x2d13baa5, 0x642fc782, + 0xdc4dc65c, 0x9571bb7b, 0x4e353c12, 
0x07094135, 0xfd504431, 0xb46c3916, 0x6f28be7f, 0x2614c358, + 0x1700aeab, 0x5e3cd38c, 0x857854e5, 0xcc4429c2, 0x361d2cc6, 0x7f2151e1, 0xa465d688, 0xed59abaf, + 0x553baa71, 0x1c07d756, 0xc743503f, 0x8e7f2d18, 0x7426281c, 0x3d1a553b, 0xe65ed252, 0xaf62af75, + 0x9376a71f, 0xda4ada38, 0x010e5d51, 0x48322076, 0xb26b2572, 0xfb575855, 0x2013df3c, 0x692fa21b, + 0xd14da3c5, 0x9871dee2, 0x4335598b, 0x0a0924ac, 0xf05021a8, 0xb96c5c8f, 0x6228dbe6, 0x2b14a6c1, + 0x34019664, 0x7d3deb43, 0xa6796c2a, 0xef45110d, 0x151c1409, 0x5c20692e, 0x8764ee47, 0xce589360, + 0x763a92be, 0x3f06ef99, 0xe44268f0, 0xad7e15d7, 0x572710d3, 0x1e1b6df4, 0xc55fea9d, 0x8c6397ba, + 0xb0779fd0, 0xf94be2f7, 0x220f659e, 0x6b3318b9, 0x916a1dbd, 0xd856609a, 0x0312e7f3, 0x4a2e9ad4, + 0xf24c9b0a, 0xbb70e62d, 0x60346144, 0x29081c63, 0xd3511967, 0x9a6d6440, 0x4129e329, 0x08159e0e, + 0x3901f3fd, 0x703d8eda, 0xab7909b3, 0xe2457494, 0x181c7190, 0x51200cb7, 0x8a648bde, 0xc358f6f9, + 0x7b3af727, 0x32068a00, 0xe9420d69, 0xa07e704e, 0x5a27754a, 0x131b086d, 0xc85f8f04, 0x8163f223, + 0xbd77fa49, 0xf44b876e, 0x2f0f0007, 0x66337d20, 0x9c6a7824, 0xd5560503, 0x0e12826a, 0x472eff4d, + 0xff4cfe93, 0xb67083b4, 0x6d3404dd, 0x240879fa, 0xde517cfe, 0x976d01d9, 0x4c2986b0, 0x0515fb97, + 0x2e015d56, 0x673d2071, 0xbc79a718, 0xf545da3f, 0x0f1cdf3b, 0x4620a21c, 0x9d642575, 0xd4585852, + 0x6c3a598c, 0x250624ab, 0xfe42a3c2, 0xb77edee5, 0x4d27dbe1, 0x041ba6c6, 0xdf5f21af, 0x96635c88, + 0xaa7754e2, 0xe34b29c5, 0x380faeac, 0x7133d38b, 0x8b6ad68f, 0xc256aba8, 0x19122cc1, 0x502e51e6, + 0xe84c5038, 0xa1702d1f, 0x7a34aa76, 0x3308d751, 0xc951d255, 0x806daf72, 0x5b29281b, 0x1215553c, + 0x230138cf, 0x6a3d45e8, 0xb179c281, 0xf845bfa6, 0x021cbaa2, 0x4b20c785, 0x906440ec, 0xd9583dcb, + 0x613a3c15, 0x28064132, 0xf342c65b, 0xba7ebb7c, 0x4027be78, 0x091bc35f, 0xd25f4436, 0x9b633911, + 0xa777317b, 0xee4b4c5c, 0x350fcb35, 0x7c33b612, 0x866ab316, 0xcf56ce31, 0x14124958, 0x5d2e347f, + 0xe54c35a1, 0xac704886, 0x7734cfef, 0x3e08b2c8, 0xc451b7cc, 0x8d6dcaeb, 
0x56294d82, 0x1f1530a5 + } + }; + + uint32_t Crc32c(const void* src, size_t size) + { + const uint8_t* p8 = (const uint8_t*)src; + uint32_t crc = 0xFFFFFFFF; + + for (; ((uintptr_t)p8 & (sizeof(uint32_t) - 1)) != 0 && size > 0; ++p8, --size) + crc = Crc32cTable[0][(crc ^ *p8) & 0xFF] ^ (crc >> 8); + + const uint32_t* p32 = (const uint32_t*)p8; + for (; size >= 8; size -= 8) + { +#ifdef SIMD_BIG_ENDIAN + uint32_t v0 = *p32++ ^ Reorder32(crc); + uint32_t v1 = *p32++; + crc = + Crc32cTable[0x0][v1 & 0xFF] ^ + Crc32cTable[0x1][(v1 >> 8) & 0xFF] ^ + Crc32cTable[0x2][(v1 >> 16) & 0xFF] ^ + Crc32cTable[0x3][(v1 >> 24) & 0xFF] ^ + Crc32cTable[0x4][v0 & 0xFF] ^ + Crc32cTable[0x5][(v0 >> 8) & 0xFF] ^ + Crc32cTable[0x6][(v0 >> 16) & 0xFF] ^ + Crc32cTable[0x7][(v0 >> 24) & 0xFF]; +#else + uint32_t v0 = *p32++ ^ crc; + uint32_t v1 = *p32++; + crc = + Crc32cTable[0x0][(v1 >> 24) & 0xFF] ^ + Crc32cTable[0x1][(v1 >> 16) & 0xFF] ^ + Crc32cTable[0x2][(v1 >> 8) & 0xFF] ^ + Crc32cTable[0x3][v1 & 0xFF] ^ + Crc32cTable[0x4][(v0 >> 24) & 0xFF] ^ + Crc32cTable[0x5][(v0 >> 16) & 0xFF] ^ + Crc32cTable[0x6][(v0 >> 8) & 0xFF] ^ + Crc32cTable[0x7][v0 & 0xFF]; +#endif + } + + for (p8 = (const uint8_t*)p32; size > 0; ++p8, size--) + crc = Crc32cTable[0][(crc ^ *p8) & 0xFF] ^ (crc >> 8); + + return (~crc); + } + } +} diff --git a/3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp b/3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp new file mode 100644 index 0000000000..b064ca50a2 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp @@ -0,0 +1,371 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdImageLoad.h" +#include "Simd/SimdArray.h" +#include "Simd/SimdCpu.h" +#include "Simd/SimdBase.h" + +#include + +#if defined(_MSC_VER) +#pragma warning (push) +#pragma warning (disable: 4996) +#endif + +namespace Simd +{ + uint8_t* ImageLoadFromFile(const ImageLoadFromMemoryPtr loader, const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format) + { + uint8_t* data = NULL; + ::FILE* file = ::fopen(path, "rb"); + if (file) + { + ::fseek(file, 0, SEEK_END); + Array8u buffer(::ftell(file)); + ::fseek(file, 0, SEEK_SET); + if (::fread(buffer.data, 1, buffer.size, file) == buffer.size) + data = loader(buffer.data, buffer.size, stride, width, height, format); + ::fclose(file); + } + return data; + } + + //------------------------------------------------------------------------- + + ImageLoaderParam::ImageLoaderParam(const uint8_t* d, size_t s, SimdPixelFormatType f) + : data(d) + , size(s) + , format(f) + , file(SimdImageFileUndefined) + { + } + + bool ImageLoaderParam::Validate() + { + if (size >= 3) + { + if (data[0] == 'P' && data[2] == '\n') + { + if (data[1] == '2') + file = SimdImageFilePgmTxt; + if (data[1] == '3') + file = SimdImageFilePpmTxt; + if (data[1] == '5') + file = SimdImageFilePgmBin; + if (data[1] == '6') + file = SimdImageFilePpmBin; + } + } + if (size >= 8) + { + const uint8_t SIGNATURE[8] = { 137, 80, 78, 71, 13, 10, 26, 10 }; + if(memcmp(data, SIGNATURE, 8) == 0) + file = SimdImageFilePng; + } + if (size >= 2) + { + if (data[0] == 0xFF && data[1] == 0xD8) + file = SimdImageFileJpeg; + } + return + file != SimdImageFileUndefined && + (format == SimdPixelFormatNone || format == SimdPixelFormatGray8 || + format == SimdPixelFormatBgr24 || format == SimdPixelFormatBgra32 || + format == SimdPixelFormatRgb24 || format == SimdPixelFormatRgba32); + } + + namespace Base + { + ImagePxmLoader::ImagePxmLoader(const ImageLoaderParam& param) + : ImageLoader(param) + , _toAny(NULL) + , _toBgra(NULL) + { + } + 
+ bool ImagePxmLoader::ReadHeader(size_t version) + { + if (_stream.Size() < 3 || + _stream.Data()[0] != 'P' || + _stream.Data()[1] != '0' + version || + _stream.Data()[2] != '\n') + return false; + _stream.Seek(3); + uint32_t width, height, max; + if (!(_stream.ReadUnsigned(width) && _stream.ReadUnsigned(height) && _stream.ReadUnsigned(max))) + return false; + if (!(width > 0 && height > 0 && max == 255)) + return false; + uint8_t byte; + if (!(_stream.Read(byte) && byte == '\n')) + return false; + _image.Recreate(width, height, (Image::Format)_param.format); + _block = height; + if (_param.file == SimdImageFilePgmTxt || _param.file == SimdImageFilePgmBin) + { + _size = width * 1; + if (_param.format != SimdPixelFormatGray8) + { + _block = Simd::RestrictRange(Base::AlgCacheL1() / _size, 1, height); + _buffer.Resize(_block * _size); + } + } + else if (_param.file == SimdImageFilePpmTxt || _param.file == SimdImageFilePpmBin) + { + _size = width * 3; + if (_param.format != SimdPixelFormatRgb24) + { + _block = Simd::RestrictRange(Base::AlgCacheL1() / _size, 1, height); + _buffer.Resize(_block * _size); + } + } + else + return false; + SetConverters(); + return true; + } + + //------------------------------------------------------------------------- + + ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param) + : ImagePxmLoader(param) + { + if (_param.format == SimdPixelFormatNone) + _param.format = SimdPixelFormatGray8; + } + + bool ImagePgmTxtLoader::FromStream() + { + if (!ReadHeader(2)) + return false; + size_t grayStride = _param.format == SimdPixelFormatGray8 ? _image.stride : _size; + for (size_t row = 0; row < _image.height;) + { + size_t block = Simd::Min(row + _block, _image.height) - row; + uint8_t * gray = _param.format == SimdPixelFormatGray8 ? 
_image.Row(row) : _buffer.data; + for (size_t b = 0; b < block; ++b) + { + for (size_t i = 0; i < _size; ++i) + { + if (!_stream.ReadUnsigned(gray[i])) + return false; + } + gray += grayStride; + } + if(_param.format == SimdPixelFormatBgr24 || _param.format == SimdPixelFormatRgb24) + _toAny(_buffer.data, _image.width, block, _size, _image.Row(row), _image.stride); + if (_param.format == SimdPixelFormatBgra32 || _param.format == SimdPixelFormatRgba32) + _toBgra(_buffer.data, _image.width, block, _size, _image.Row(row), _image.stride, 0xFF); + row += block; + } + return true; + } + + void ImagePgmTxtLoader::SetConverters() + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _toAny = Base::GrayToBgr; break; + case SimdPixelFormatBgra32: _toBgra = Base::GrayToBgra; break; + case SimdPixelFormatRgb24: _toAny = Base::GrayToBgr; break; + case SimdPixelFormatRgba32: _toBgra = Base::GrayToBgra; break; + default: break; + } + } + + //------------------------------------------------------------------------- + + ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param) + : ImagePxmLoader(param) + { + if (_param.format == SimdPixelFormatNone) + _param.format = SimdPixelFormatGray8; + } + + bool ImagePgmBinLoader::FromStream() + { + if (!ReadHeader(5)) + return false; + size_t grayStride = _param.format == SimdPixelFormatGray8 ? _image.stride : _size; + for (size_t row = 0; row < _image.height;) + { + size_t block = Simd::Min(row + _block, _image.height) - row; + uint8_t* gray = _param.format == SimdPixelFormatGray8 ? 
_image.Row(row) : _buffer.data; + for (size_t b = 0; b < block; ++b) + { + if (_stream.Read(_size, gray) != _size) + return false; + gray += grayStride; + } + if (_param.format == SimdPixelFormatBgr24 || _param.format == SimdPixelFormatRgb24) + _toAny(_buffer.data, _image.width, block, _size, _image.Row(row), _image.stride); + if (_param.format == SimdPixelFormatBgra32 || _param.format == SimdPixelFormatRgba32) + _toBgra(_buffer.data, _image.width, block, _size, _image.Row(row), _image.stride, 0xFF); + row += block; + } + return true; + } + + void ImagePgmBinLoader::SetConverters() + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _toAny = Base::GrayToBgr; break; + case SimdPixelFormatBgra32: _toBgra = Base::GrayToBgra; break; + case SimdPixelFormatRgb24: _toAny = Base::GrayToBgr; break; + case SimdPixelFormatRgba32: _toBgra = Base::GrayToBgra; break; + default: break; + } + } + + //------------------------------------------------------------------------- + + ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param) + : ImagePxmLoader(param) + { + if (_param.format == SimdPixelFormatNone) + _param.format = SimdPixelFormatRgb24; + } + + bool ImagePpmTxtLoader::FromStream() + { + if (!ReadHeader(3)) + return false; + size_t rgbStride = _param.format == SimdPixelFormatRgb24 ? _image.stride : _size; + for (size_t row = 0; row < _image.height;) + { + size_t block = Simd::Min(row + _block, _image.height) - row; + uint8_t* rgb = _param.format == SimdPixelFormatRgb24 ? 
_image.Row(row) : _buffer.data; + for (size_t b = 0; b < block; ++b) + { + for (size_t i = 0; i < _size; ++i) + { + if (!_stream.ReadUnsigned(rgb[i])) + return false; + } + rgb += rgbStride; + } + if (_param.format == SimdPixelFormatGray8 || _param.format == SimdPixelFormatBgr24) + _toAny(_buffer.data, _image.width, block, _size, _image.Row(row), _image.stride); + if (_param.format == SimdPixelFormatBgra32 || _param.format == SimdPixelFormatRgba32) + _toBgra(_buffer.data, _image.width, block, _size, _image.Row(row), _image.stride, 0xFF); + row += block; + } + return true; + } + + void ImagePpmTxtLoader::SetConverters() + { + switch (_param.format) + { + case SimdPixelFormatGray8: _toAny = Base::RgbToGray; break; + case SimdPixelFormatBgr24: _toAny = Base::BgrToRgb; break; + case SimdPixelFormatBgra32: _toBgra = Base::RgbToBgra; break; + case SimdPixelFormatRgba32: _toBgra = Base::BgrToBgra; break; + default: break; + } + } + + //------------------------------------------------------------------------- + + ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param) + : ImagePxmLoader(param) + { + if (_param.format == SimdPixelFormatNone) + _param.format = SimdPixelFormatRgb24; + } + + bool ImagePpmBinLoader::FromStream() + { + if (!ReadHeader(6)) + return false; + size_t rgbStride = _param.format == SimdPixelFormatRgb24 ? _image.stride : _size; + for (size_t row = 0; row < _image.height;) + { + size_t block = Simd::Min(row + _block, _image.height) - row; + uint8_t* rgb = _param.format == SimdPixelFormatRgb24 ? 
_image.Row(row) : _buffer.data; + for (size_t b = 0; b < block; ++b) + { + if (_stream.Read(_size, rgb) != _size) + return false; + rgb += rgbStride; + } + if (_param.format == SimdPixelFormatGray8 || _param.format == SimdPixelFormatBgr24) + _toAny(_buffer.data, _image.width, block, _size, _image.Row(row), _image.stride); + if (_param.format == SimdPixelFormatBgra32 || _param.format == SimdPixelFormatRgba32) + _toBgra(_buffer.data, _image.width, block, _size, _image.Row(row), _image.stride, 0xFF); + row += block; + } + return true; + } + + void ImagePpmBinLoader::SetConverters() + { + switch (_param.format) + { + case SimdPixelFormatGray8: _toAny = Base::RgbToGray; break; + case SimdPixelFormatBgr24: _toAny = Base::BgrToRgb; break; + case SimdPixelFormatBgra32: _toBgra = Base::RgbToBgra; break; + case SimdPixelFormatRgba32: _toBgra = Base::BgrToBgra; break; + default: break; + } + } + + //------------------------------------------------------------------------- + + ImageLoader* CreateImageLoader(const ImageLoaderParam& param) + { + switch (param.file) + { + case SimdImageFilePgmTxt: return new ImagePgmTxtLoader(param); + case SimdImageFilePgmBin: return new ImagePgmBinLoader(param); + case SimdImageFilePpmTxt: return new ImagePpmTxtLoader(param); + case SimdImageFilePpmBin: return new ImagePpmBinLoader(param); + case SimdImageFilePng: return new ImagePngLoader(param); + case SimdImageFileJpeg: return new ImageJpegLoader(param); + default: + return NULL; + } + } + + uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format) + { + ImageLoaderParam param(data, size, *format); + if (param.Validate()) + { + Holder loader(CreateImageLoader(param)); + if (loader) + { + if (loader->FromStream()) + return loader->Release(stride, width, height, format); + } + } + return NULL; + } + } +} + +#if defined(_MSC_VER) +#pragma warning (pop) +#endif diff --git 
a/3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp b/3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp new file mode 100644 index 0000000000..88c5da73d0 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp @@ -0,0 +1,2456 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdImageLoad.h" +#include "Simd/SimdArray.h" +#include "Simd/SimdCpu.h" +#include "Simd/SimdBase.h" + +namespace Simd +{ + namespace Base + { +#if defined(SIMD_X64_ENABLE) && !defined(SIMD_SSE2_DISABLE) +#define JPEG_SSE2 + static int jpeg__sse2_available(void) + { + return 1; + } +#endif + +#if defined(SIMD_ARM64_ENABLE) && !defined(SIMD_NEON_DISABLE) +#define JPEG_NEON +#endif + + typedef unsigned char jpeg_uc; + typedef unsigned short jpeg_us; + typedef unsigned short jpeg__uint16; + typedef signed short jpeg__int16; + typedef unsigned int jpeg__uint32; + typedef signed int jpeg__int32; + + typedef struct + { + int (*read) (void* user, char* data, int size); // fill 'data' with 'size' bytes. return number of bytes actually read + void (*skip) (void* user, int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative + int (*eof) (void* user); // returns nonzero if we are at end of file/data + } jpeg_io_callbacks; + +#define jpeg_inline SIMD_INLINE +#define JPEG_ASSERT assert + +#ifdef _MSC_VER +#define JPEG_NOTUSED(v) (void)(v) +#else +#define JPEG_NOTUSED(v) (void)sizeof(v) +#endif + + typedef struct + { + jpeg__uint32 img_x, img_y; + int img_n, img_out_n; + + jpeg_io_callbacks io; + void* io_user_data; + + int read_from_callbacks; + int buflen; + jpeg_uc buffer_start[128]; + int callback_already_read; + + jpeg_uc* img_buffer, * img_buffer_end; + jpeg_uc* img_buffer_original, * img_buffer_original_end; + } jpeg__context; + + static int jpeg__err(const char* str) + { + //jpeg__g_failure_reason = str; + return 0; + } + + static int jpeg__err(const char* str1, const char* str2) + { + //jpeg__g_failure_reason = str; + return 0; + } + +#define jpeg__errpuc(x,y) ((unsigned char *)(size_t) (jpeg__err(x,y)?NULL:NULL)) + + static void jpeg__refill_buffer(jpeg__context* s) + { + int n = (s->io.read)(s->io_user_data, (char*)s->buffer_start, s->buflen); + s->callback_already_read += (int)(s->img_buffer - s->img_buffer_original); + if (n 
== 0) { + // at end of file, treat same as if from memory, but need to handle case + // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file + s->read_from_callbacks = 0; + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + 1; + *s->img_buffer = 0; + } + else { + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + n; + } + } + + jpeg_inline static jpeg_uc jpeg__get8(jpeg__context* s) + { + if (s->img_buffer < s->img_buffer_end) + return *s->img_buffer++; + if (s->read_from_callbacks) { + jpeg__refill_buffer(s); + return *s->img_buffer++; + } + return 0; + } + +#define jpeg_lrot(x,y) (((x) << (y)) | ((x) >> (32 - (y)))) + +#define JPEG_SIMD_ALIGN(type, name) SIMD_ALIGNED(16) type name + + static int jpeg__get16be(jpeg__context* s) + { + int z = jpeg__get8(s); + return (z << 8) + jpeg__get8(s); + } + + static void jpeg__skip(jpeg__context* s, int n) + { + if (n == 0) return; // already there! + if (n < 0) { + s->img_buffer = s->img_buffer_end; + return; + } + if (s->io.read) { + int blen = (int)(s->img_buffer_end - s->img_buffer); + if (blen < n) { + s->img_buffer = s->img_buffer_end; + (s->io.skip)(s->io_user_data, n - blen); + return; + } + } + s->img_buffer += n; + } + + jpeg_inline static int jpeg__at_eof(jpeg__context* s) + { + if (s->io.read) { + if (!(s->io.eof)(s->io_user_data)) return 0; + // if feof() is true, check if buffer = end + // special case: we've only got the special 0 character at the end + if (s->read_from_callbacks == 0) return 1; + } + + return s->img_buffer >= s->img_buffer_end; + } + +#define JPEG_MALLOC(sz) malloc(sz) +#define JPEG_REALLOC(p,newsz) realloc(p,newsz) +#define JPEG_FREE(p) free(p) + +#define JPEG_MAX_DIMENSIONS (1 << 24) + + enum + { + JPEG__SCAN_load = 0, + JPEG__SCAN_type, + JPEG__SCAN_header + }; + + static void* jpeg__malloc(size_t size) + { + return JPEG_MALLOC(size); + } + + static int jpeg__addsizes_valid(int a, int b) + { + if (b < 0) return 0; + // now 0 <= b 
<= INT_MAX, hence also + // 0 <= INT_MAX - b <= INTMAX. + // And "a + b <= INT_MAX" (which might overflow) is the + // same as a <= INT_MAX - b (no overflow) + return a <= INT_MAX - b; + } + + static int jpeg__mul2sizes_valid(int a, int b) + { + if (a < 0 || b < 0) return 0; + if (b == 0) return 1; // mul-by-0 is always safe + // portable way to check for no overflows in a*b + return a <= INT_MAX / b; + } + + static int jpeg__mad2sizes_valid(int a, int b, int add) + { + return jpeg__mul2sizes_valid(a, b) && jpeg__addsizes_valid(a * b, add); + } + + // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow + static int jpeg__mad3sizes_valid(int a, int b, int c, int add) + { + return jpeg__mul2sizes_valid(a, b) && jpeg__mul2sizes_valid(a * b, c) && + jpeg__addsizes_valid(a * b * c, add); + } + + static int jpeg__mad4sizes_valid(int a, int b, int c, int d, int add) + { + return jpeg__mul2sizes_valid(a, b) && jpeg__mul2sizes_valid(a * b, c) && + jpeg__mul2sizes_valid(a * b * c, d) && jpeg__addsizes_valid(a * b * c * d, add); + } + + static void* jpeg__malloc_mad2(int a, int b, int add) + { + if (!jpeg__mad2sizes_valid(a, b, add)) return NULL; + return jpeg__malloc(a * b + add); + } + + static void* jpeg__malloc_mad3(int a, int b, int c, int add) + { + if (!jpeg__mad3sizes_valid(a, b, c, add)) return NULL; + return jpeg__malloc(a * b * c + add); + } + + static jpeg_uc jpeg__compute_y(int r, int g, int b) + { + return (jpeg_uc)(((r * 77) + (g * 150) + (29 * b)) >> 8); + } + + typedef struct + { + int bits_per_channel; + int num_channels; + int channel_order; + } jpeg__result_info; + + static void jpeg__rewind(jpeg__context* s) + { + // conceptually rewind SHOULD rewind to the beginning of the stream, + // but we just rewind to the beginning of the initial buffer, because + // we only use it after doing 'test', which only ever looks at at most 92 bytes + s->img_buffer = s->img_buffer_original; + s->img_buffer_end = s->img_buffer_original_end; + } + 
+ //------------------------------------------------------------------------------ + + // huffman decoding acceleration +#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache + + typedef struct + { + jpeg_uc fast[1 << FAST_BITS]; + // weirdly, repacking this into AoS is a 10% speed loss, instead of a win + jpeg__uint16 code[256]; + jpeg_uc values[256]; + jpeg_uc size[257]; + unsigned int maxcode[18]; + int delta[17]; // old 'firstsymbol' - old 'firstcode' + } jpeg__huffman; + + typedef struct + { + jpeg__context* s; + jpeg__huffman huff_dc[4]; + jpeg__huffman huff_ac[4]; + jpeg__uint16 dequant[4][64]; + jpeg__int16 fast_ac[4][1 << FAST_BITS]; + + // sizes for components, interleaved MCUs + int img_h_max, img_v_max; + int img_mcu_x, img_mcu_y; + int img_mcu_w, img_mcu_h; + + // definition of jpeg image component + struct + { + int id; + int h, v; + int tq; + int hd, ha; + int dc_pred; + + int x, y, w2, h2; + jpeg_uc* data; + void* raw_data, * raw_coeff; + jpeg_uc* linebuf; + short* coeff; // progressive only + int coeff_w, coeff_h; // number of 8x8 coefficient blocks + } img_comp[4]; + + jpeg__uint32 code_buffer; // jpeg entropy-coded buffer + int code_bits; // number of valid bits + unsigned char marker; // marker seen while filling entropy buffer + int nomore; // flag if we saw a marker so must stop + + int progressive; + int spec_start; + int spec_end; + int succ_high; + int succ_low; + int eob_run; + int jfif; + int app14_color_transform; // Adobe APP14 tag + int rgb; + + int scan_n, order[4]; + int restart_interval, todo; + + // kernels + void (*idct_block_kernel)(jpeg_uc* out, int out_stride, short data[64]); + void (*YCbCr_to_RGB_kernel)(jpeg_uc* out, const jpeg_uc* y, const jpeg_uc* pcb, const jpeg_uc* pcr, int count, int step); + jpeg_uc* (*resample_row_hv_2_kernel)(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs); + } jpeg__jpeg; + + static int jpeg__build_huffman(jpeg__huffman* h, int* count) + { + int i, j, k = 0; + 
unsigned int code; + // build size list for each symbol (from JPEG spec) + for (i = 0; i < 16; ++i) + for (j = 0; j < count[i]; ++j) + h->size[k++] = (jpeg_uc)(i + 1); + h->size[k] = 0; + + // compute actual symbols (from jpeg spec) + code = 0; + k = 0; + for (j = 1; j <= 16; ++j) { + // compute delta to add to code to compute symbol id + h->delta[j] = k - code; + if (h->size[k] == j) { + while (h->size[k] == j) + h->code[k++] = (jpeg__uint16)(code++); + if (code - 1 >= (1u << j)) return jpeg__err("bad code lengths", "Corrupt JPEG"); + } + // compute largest code + 1 for this size, preshifted as needed later + h->maxcode[j] = code << (16 - j); + code <<= 1; + } + h->maxcode[j] = 0xffffffff; + + // build non-spec acceleration table; 255 is flag for not-accelerated + memset(h->fast, 255, 1 << FAST_BITS); + for (i = 0; i < k; ++i) { + int s = h->size[i]; + if (s <= FAST_BITS) { + int c = h->code[i] << (FAST_BITS - s); + int m = 1 << (FAST_BITS - s); + for (j = 0; j < m; ++j) { + h->fast[c + j] = (jpeg_uc)i; + } + } + } + return 1; + } + + // build a table that decodes both magnitude and value of small ACs in + // one go. + static void jpeg__build_fast_ac(jpeg__int16* fast_ac, jpeg__huffman* h) + { + int i; + for (i = 0; i < (1 << FAST_BITS); ++i) { + jpeg_uc fast = h->fast[i]; + fast_ac[i] = 0; + if (fast < 255) { + int rs = h->values[fast]; + int run = (rs >> 4) & 15; + int magbits = rs & 15; + int len = h->size[fast]; + + if (magbits && len + magbits <= FAST_BITS) { + // magnitude code followed by receive_extend code + int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits); + int m = 1 << (magbits - 1); + if (k < m) k += (~0U << magbits) + 1; + // if the result is small enough, we can fit it in fast_ac table + if (k >= -128 && k <= 127) + fast_ac[i] = (jpeg__int16)((k * 256) + (run * 16) + (len + magbits)); + } + } + } + } + + static void jpeg__grow_buffer_unsafe(jpeg__jpeg* j) + { + do { + unsigned int b = j->nomore ? 
0 : jpeg__get8(j->s); + if (b == 0xff) { + int c = jpeg__get8(j->s); + while (c == 0xff) c = jpeg__get8(j->s); // consume fill bytes + if (c != 0) { + j->marker = (unsigned char)c; + j->nomore = 1; + return; + } + } + j->code_buffer |= b << (24 - j->code_bits); + j->code_bits += 8; + } while (j->code_bits <= 24); + } + + // (1 << n) - 1 + static const jpeg__uint32 jpeg__bmask[17] = { 0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535 }; + + // decode a jpeg huffman value from the bitstream + jpeg_inline static int jpeg__jpeg_huff_decode(jpeg__jpeg* j, jpeg__huffman* h) + { + unsigned int temp; + int c, k; + + if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j); + + // look at the top FAST_BITS and determine what symbol ID it is, + // if the code is <= FAST_BITS + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); + k = h->fast[c]; + if (k < 255) { + int s = h->size[k]; + if (s > j->code_bits) + return -1; + j->code_buffer <<= s; + j->code_bits -= s; + return h->values[k]; + } + + // naive test is to shift the code_buffer down so k bits are + // valid, then test against maxcode. To speed this up, we've + // preshifted maxcode left so that it has (16-k) 0s at the + // end; in other words, regardless of the number of bits, it + // wants to be compared against something shifted to have 16; + // that way we don't need to shift inside the loop. + temp = j->code_buffer >> 16; + for (k = FAST_BITS + 1; ; ++k) + if (temp < h->maxcode[k]) + break; + if (k == 17) { + // error! 
code not found + j->code_bits -= 16; + return -1; + } + + if (k > j->code_bits) + return -1; + + // convert the huffman code to the symbol id + c = ((j->code_buffer >> (32 - k)) & jpeg__bmask[k]) + h->delta[k]; + JPEG_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & jpeg__bmask[h->size[c]]) == h->code[c]); + + // convert the id to a symbol + j->code_bits -= k; + j->code_buffer <<= k; + return h->values[c]; + } + + // bias[n] = (-1<code_bits < n) jpeg__grow_buffer_unsafe(j); + + sgn = (jpeg__int32)j->code_buffer >> 31; // sign bit is always in MSB + k = jpeg_lrot(j->code_buffer, n); + if (n < 0 || n >= (int)(sizeof(jpeg__bmask) / sizeof(*jpeg__bmask))) return 0; + j->code_buffer = k & ~jpeg__bmask[n]; + k &= jpeg__bmask[n]; + j->code_bits -= n; + return k + (jpeg__jbias[n] & ~sgn); + } + + // get some unsigned bits + jpeg_inline static int jpeg__jpeg_get_bits(jpeg__jpeg* j, int n) + { + unsigned int k; + if (j->code_bits < n) jpeg__grow_buffer_unsafe(j); + k = jpeg_lrot(j->code_buffer, n); + j->code_buffer = k & ~jpeg__bmask[n]; + k &= jpeg__bmask[n]; + j->code_bits -= n; + return k; + } + + jpeg_inline static int jpeg__jpeg_get_bit(jpeg__jpeg* j) + { + unsigned int k; + if (j->code_bits < 1) jpeg__grow_buffer_unsafe(j); + k = j->code_buffer; + j->code_buffer <<= 1; + --j->code_bits; + return k & 0x80000000; + } + + // given a value that's at position X in the zigzag stream, + // where does it appear in the 8x8 matrix coded as row-major? 
+ static const jpeg_uc jpeg__jpeg_dezigzag[64 + 15] = + { + 0, 1, 8, 16, 9, 2, 3, 10, + 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, + 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, + 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, + 53, 60, 61, 54, 47, 55, 62, 63, + // let corrupt input sample past end + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63 + }; + + // decode one 64-entry block-- + static int jpeg__jpeg_decode_block(jpeg__jpeg* j, short data[64], jpeg__huffman* hdc, jpeg__huffman* hac, jpeg__int16* fac, int b, jpeg__uint16* dequant) + { + int diff, dc, k; + int t; + + if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j); + t = jpeg__jpeg_huff_decode(j, hdc); + if (t < 0) return jpeg__err("bad huffman code", "Corrupt JPEG"); + + // 0 all the ac values now so we can do it 32-bits at a time + memset(data, 0, 64 * sizeof(data[0])); + + diff = t ? jpeg__extend_receive(j, t) : 0; + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + data[0] = (short)(dc * dequant[0]); + + // decode AC components, see JPEG spec + k = 1; + do { + unsigned int zig; + int c, r, s; + if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + j->code_buffer <<= s; + j->code_bits -= s; + // decode into unzigzag'd location + zig = jpeg__jpeg_dezigzag[k++]; + data[zig] = (short)((r >> 8) * dequant[zig]); + } + else { + int rs = jpeg__jpeg_huff_decode(j, hac); + if (rs < 0) return jpeg__err("bad huffman code", "Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (rs != 0xf0) break; // end block + k += 16; + } + else { + k += r; + // decode into unzigzag'd location + zig = jpeg__jpeg_dezigzag[k++]; + data[zig] = (short)(jpeg__extend_receive(j, s) * dequant[zig]); + } + } + } while (k < 64); + return 1; + } + + static int 
jpeg__jpeg_decode_block_prog_dc(jpeg__jpeg* j, short data[64], jpeg__huffman* hdc, int b) + { + int diff, dc; + int t; + if (j->spec_end != 0) return jpeg__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j); + + if (j->succ_high == 0) { + // first scan for DC coefficient, must be first + memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now + t = jpeg__jpeg_huff_decode(j, hdc); + if (t == -1) return jpeg__err("can't merge dc and ac", "Corrupt JPEG"); + diff = t ? jpeg__extend_receive(j, t) : 0; + + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + data[0] = (short)(dc << j->succ_low); + } + else { + // refinement scan for DC coefficient + if (jpeg__jpeg_get_bit(j)) + data[0] += (short)(1 << j->succ_low); + } + return 1; + } + + // @OPTIMIZE: store non-zigzagged during the decode passes, + // and only de-zigzag when dequantizing + static int jpeg__jpeg_decode_block_prog_ac(jpeg__jpeg* j, short data[64], jpeg__huffman* hac, jpeg__int16* fac) + { + int k; + if (j->spec_start == 0) return jpeg__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->succ_high == 0) { + int shift = j->succ_low; + + if (j->eob_run) { + --j->eob_run; + return 1; + } + + k = j->spec_start; + do { + unsigned int zig; + int c, r, s; + if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + j->code_buffer <<= s; + j->code_bits -= s; + zig = jpeg__jpeg_dezigzag[k++]; + data[zig] = (short)((r >> 8) << shift); + } + else { + int rs = jpeg__jpeg_huff_decode(j, hac); + if (rs < 0) return jpeg__err("bad huffman code", "Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r); + if (r) + j->eob_run += jpeg__jpeg_get_bits(j, r); + --j->eob_run; + break; + } + k += 16; + } + else { + k += r; + zig = 
jpeg__jpeg_dezigzag[k++]; + data[zig] = (short)(jpeg__extend_receive(j, s) << shift); + } + } + } while (k <= j->spec_end); + } + else { + // refinement scan for these AC coefficients + + short bit = (short)(1 << j->succ_low); + + if (j->eob_run) { + --j->eob_run; + for (k = j->spec_start; k <= j->spec_end; ++k) { + short* p = &data[jpeg__jpeg_dezigzag[k]]; + if (*p != 0) + if (jpeg__jpeg_get_bit(j)) + if ((*p & bit) == 0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } + } + else { + k = j->spec_start; + do { + int r, s; + int rs = jpeg__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh + if (rs < 0) return jpeg__err("bad huffman code", "Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r) - 1; + if (r) + j->eob_run += jpeg__jpeg_get_bits(j, r); + r = 64; // force end of block + } + else { + // r=15 s=0 should write 16 0s, so we just do + // a run of 15 0s and then write s (which is 0), + // so we don't have to do anything special here + } + } + else { + if (s != 1) return jpeg__err("bad huffman code", "Corrupt JPEG"); + // sign bit + if (jpeg__jpeg_get_bit(j)) + s = bit; + else + s = -bit; + } + + // advance by r + while (k <= j->spec_end) { + short* p = &data[jpeg__jpeg_dezigzag[k++]]; + if (*p != 0) { + if (jpeg__jpeg_get_bit(j)) + if ((*p & bit) == 0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } + else { + if (r == 0) { + *p = (short)s; + break; + } + --r; + } + } + } while (k <= j->spec_end); + } + } + return 1; + } + + // take a -128..127 value and jpeg__clamp it and convert to 0..255 + jpeg_inline static jpeg_uc jpeg__clamp(int x) + { + // trick to use a single test to catch both cases + if ((unsigned int)x > 255) { + if (x < 0) return 0; + if (x > 255) return 255; + } + return (jpeg_uc)x; + } + +#define jpeg__f2f(x) ((int) (((x) * 4096 + 0.5))) +#define jpeg__fsh(x) ((x) * 4096) + + // derived from jidctint -- DCT_ISLOW +#define 
JPEG__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \ + int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \ + p2 = s2; \ + p3 = s6; \ + p1 = (p2+p3) * jpeg__f2f(0.5411961f); \ + t2 = p1 + p3*jpeg__f2f(-1.847759065f); \ + t3 = p1 + p2*jpeg__f2f( 0.765366865f); \ + p2 = s0; \ + p3 = s4; \ + t0 = jpeg__fsh(p2+p3); \ + t1 = jpeg__fsh(p2-p3); \ + x0 = t0+t3; \ + x3 = t0-t3; \ + x1 = t1+t2; \ + x2 = t1-t2; \ + t0 = s7; \ + t1 = s5; \ + t2 = s3; \ + t3 = s1; \ + p3 = t0+t2; \ + p4 = t1+t3; \ + p1 = t0+t3; \ + p2 = t1+t2; \ + p5 = (p3+p4)*jpeg__f2f( 1.175875602f); \ + t0 = t0*jpeg__f2f( 0.298631336f); \ + t1 = t1*jpeg__f2f( 2.053119869f); \ + t2 = t2*jpeg__f2f( 3.072711026f); \ + t3 = t3*jpeg__f2f( 1.501321110f); \ + p1 = p5 + p1*jpeg__f2f(-0.899976223f); \ + p2 = p5 + p2*jpeg__f2f(-2.562915447f); \ + p3 = p3*jpeg__f2f(-1.961570560f); \ + p4 = p4*jpeg__f2f(-0.390180644f); \ + t3 += p1+p4; \ + t2 += p2+p3; \ + t1 += p2+p4; \ + t0 += p1+p3; + + static void jpeg__idct_block(jpeg_uc* out, int out_stride, short data[64]) + { + int i, val[64], * v = val; + jpeg_uc* o; + short* d = data; + + // columns + for (i = 0; i < 8; ++i, ++d, ++v) { + // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing + if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 + && d[40] == 0 && d[48] == 0 && d[56] == 0) { + // no shortcut 0 seconds + // (1|2|3|4|5|6|7)==0 0 seconds + // all separate -0.047 seconds + // 1 && 2|3 && 4|5 && 6|7: -0.047 seconds + int dcterm = d[0] * 4; + v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm; + } + else { + JPEG__IDCT_1D(d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56]) + // constants scaled things up by 1<<12; let's bring them back + // down, but keep 2 extra bits of precision + x0 += 512; x1 += 512; x2 += 512; x3 += 512; + v[0] = (x0 + t3) >> 10; + v[56] = (x0 - t3) >> 10; + v[8] = (x1 + t2) >> 10; + v[48] = (x1 - t2) >> 10; + v[16] = (x2 + t1) >> 10; + v[40] = (x2 - t1) >> 10; + v[24] = (x3 + t0) >> 10; + v[32] = (x3 - t0) >> 10; + } + } + 
+ for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) { + // no fast case since the first 1D IDCT spread components out + JPEG__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]) + // constants scaled things up by 1<<12, plus we had 1<<2 from first + // loop, plus horizontal and vertical each scale by sqrt(8) so together + // we've got an extra 1<<3, so 1<<17 total we need to remove. + // so we want to round that, which means adding 0.5 * 1<<17, + // aka 65536. Also, we'll end up with -128 to 127 that we want + // to encode as 0..255 by adding 128, so we'll add that before the shift + x0 += 65536 + (128 << 17); + x1 += 65536 + (128 << 17); + x2 += 65536 + (128 << 17); + x3 += 65536 + (128 << 17); + // tried computing the shifts into temps, or'ing the temps to see + // if any were out of range, but that was slower + o[0] = jpeg__clamp((x0 + t3) >> 17); + o[7] = jpeg__clamp((x0 - t3) >> 17); + o[1] = jpeg__clamp((x1 + t2) >> 17); + o[6] = jpeg__clamp((x1 - t2) >> 17); + o[2] = jpeg__clamp((x2 + t1) >> 17); + o[5] = jpeg__clamp((x2 - t1) >> 17); + o[3] = jpeg__clamp((x3 + t0) >> 17); + o[4] = jpeg__clamp((x3 - t0) >> 17); + } + } + +#ifdef JPEG_SSE2 + // sse2 integer IDCT. not the fastest possible implementation but it + // produces bit-identical results to the generic C version so it's + // fully "transparent". + static void jpeg__idct_simd(jpeg_uc* out, int out_stride, short data[64]) + { + // This is constructed to match our regular (generic) integer IDCT exactly. 
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i tmp; + + // dot product constant: even elems=x, odd elems=y +#define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y)) + +// out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit) +// out(1) = c1[even]*x + c1[odd]*y +#define dct_rot(out0,out1, x,y,c0,c1) \ + __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \ + __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \ + __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \ + __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \ + __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \ + __m128i out1##_h = _mm_madd_epi16(c0##hi, c1) + + // out = in << 12 (in 16-bit, out 32-bit) +#define dct_widen(out, in) \ + __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \ + __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4) + + // wide add +#define dct_wadd(out, a, b) \ + __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_add_epi32(a##_h, b##_h) + + // wide sub +#define dct_wsub(out, a, b) \ + __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_sub_epi32(a##_h, b##_h) + + // butterfly a/b, add bias, then shift by "s" and pack +#define dct_bfly32o(out0, out1, a,b,bias,s) \ + { \ + __m128i abiased_l = _mm_add_epi32(a##_l, bias); \ + __m128i abiased_h = _mm_add_epi32(a##_h, bias); \ + dct_wadd(sum, abiased, b); \ + dct_wsub(dif, abiased, b); \ + out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \ + out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \ + } + + // 8-bit interleave step (for transposes) +#define dct_interleave8(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi8(a, b); \ + b = _mm_unpackhi_epi8(tmp, b) + + // 16-bit interleave step (for transposes) +#define dct_interleave16(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi16(a, b); \ + b = _mm_unpackhi_epi16(tmp, b) + +#define dct_pass(bias,shift) \ + { \ + /* even 
part */ \ + dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \ + __m128i sum04 = _mm_add_epi16(row0, row4); \ + __m128i dif04 = _mm_sub_epi16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \ + dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \ + __m128i sum17 = _mm_add_epi16(row1, row7); \ + __m128i sum35 = _mm_add_epi16(row3, row5); \ + dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \ + dct_wadd(x4, y0o, y4o); \ + dct_wadd(x5, y1o, y5o); \ + dct_wadd(x6, y2o, y5o); \ + dct_wadd(x7, y3o, y4o); \ + dct_bfly32o(row0,row7, x0,x7,bias,shift); \ + dct_bfly32o(row1,row6, x1,x6,bias,shift); \ + dct_bfly32o(row2,row5, x2,x5,bias,shift); \ + dct_bfly32o(row3,row4, x3,x4,bias,shift); \ + } + + __m128i rot0_0 = dct_const(jpeg__f2f(0.5411961f), jpeg__f2f(0.5411961f) + jpeg__f2f(-1.847759065f)); + __m128i rot0_1 = dct_const(jpeg__f2f(0.5411961f) + jpeg__f2f(0.765366865f), jpeg__f2f(0.5411961f)); + __m128i rot1_0 = dct_const(jpeg__f2f(1.175875602f) + jpeg__f2f(-0.899976223f), jpeg__f2f(1.175875602f)); + __m128i rot1_1 = dct_const(jpeg__f2f(1.175875602f), jpeg__f2f(1.175875602f) + jpeg__f2f(-2.562915447f)); + __m128i rot2_0 = dct_const(jpeg__f2f(-1.961570560f) + jpeg__f2f(0.298631336f), jpeg__f2f(-1.961570560f)); + __m128i rot2_1 = dct_const(jpeg__f2f(-1.961570560f), jpeg__f2f(-1.961570560f) + jpeg__f2f(3.072711026f)); + __m128i rot3_0 = dct_const(jpeg__f2f(-0.390180644f) + jpeg__f2f(2.053119869f), jpeg__f2f(-0.390180644f)); + __m128i rot3_1 = dct_const(jpeg__f2f(-0.390180644f), jpeg__f2f(-0.390180644f) + jpeg__f2f(1.501321110f)); + + // rounding biases in column/row passes, see jpeg__idct_block for explanation. 
+ __m128i bias_0 = _mm_set1_epi32(512); + __m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17)); + + // load + row0 = _mm_load_si128((const __m128i*) (data + 0 * 8)); + row1 = _mm_load_si128((const __m128i*) (data + 1 * 8)); + row2 = _mm_load_si128((const __m128i*) (data + 2 * 8)); + row3 = _mm_load_si128((const __m128i*) (data + 3 * 8)); + row4 = _mm_load_si128((const __m128i*) (data + 4 * 8)); + row5 = _mm_load_si128((const __m128i*) (data + 5 * 8)); + row6 = _mm_load_si128((const __m128i*) (data + 6 * 8)); + row7 = _mm_load_si128((const __m128i*) (data + 7 * 8)); + + // column pass + dct_pass(bias_0, 10); + + { + // 16bit 8x8 transpose pass 1 + dct_interleave16(row0, row4); + dct_interleave16(row1, row5); + dct_interleave16(row2, row6); + dct_interleave16(row3, row7); + + // transpose pass 2 + dct_interleave16(row0, row2); + dct_interleave16(row1, row3); + dct_interleave16(row4, row6); + dct_interleave16(row5, row7); + + // transpose pass 3 + dct_interleave16(row0, row1); + dct_interleave16(row2, row3); + dct_interleave16(row4, row5); + dct_interleave16(row6, row7); + } + + // row pass + dct_pass(bias_1, 17); + + { + // pack + __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 + __m128i p1 = _mm_packus_epi16(row2, row3); + __m128i p2 = _mm_packus_epi16(row4, row5); + __m128i p3 = _mm_packus_epi16(row6, row7); + + // 8bit 8x8 transpose pass 1 + dct_interleave8(p0, p2); // a0e0a1e1... + dct_interleave8(p1, p3); // c0g0c1g1... + + // transpose pass 2 + dct_interleave8(p0, p1); // a0c0e0g0... + dct_interleave8(p2, p3); // b0d0f0h0... + + // transpose pass 3 + dct_interleave8(p0, p2); // a0b0c0d0... + dct_interleave8(p1, p3); // a4b4c4d4... 
+ + // store + _mm_storel_epi64((__m128i*) out, p0); out += out_stride; + _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i*) out, p2); out += out_stride; + _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i*) out, p1); out += out_stride; + _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i*) out, p3); out += out_stride; + _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p3, 0x4e)); + } + +#undef dct_const +#undef dct_rot +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_interleave8 +#undef dct_interleave16 +#undef dct_pass + } + +#endif // JPEG_SSE2 + +#ifdef JPEG_NEON + + // NEON integer IDCT. should produce bit-identical + // results to the generic C version. + static void jpeg__idct_simd(jpeg_uc* out, int out_stride, short data[64]) + { + int16x8_t row0, row1, row2, row3, row4, row5, row6, row7; + + int16x4_t rot0_0 = vdup_n_s16(jpeg__f2f(0.5411961f)); + int16x4_t rot0_1 = vdup_n_s16(jpeg__f2f(-1.847759065f)); + int16x4_t rot0_2 = vdup_n_s16(jpeg__f2f(0.765366865f)); + int16x4_t rot1_0 = vdup_n_s16(jpeg__f2f(1.175875602f)); + int16x4_t rot1_1 = vdup_n_s16(jpeg__f2f(-0.899976223f)); + int16x4_t rot1_2 = vdup_n_s16(jpeg__f2f(-2.562915447f)); + int16x4_t rot2_0 = vdup_n_s16(jpeg__f2f(-1.961570560f)); + int16x4_t rot2_1 = vdup_n_s16(jpeg__f2f(-0.390180644f)); + int16x4_t rot3_0 = vdup_n_s16(jpeg__f2f(0.298631336f)); + int16x4_t rot3_1 = vdup_n_s16(jpeg__f2f(2.053119869f)); + int16x4_t rot3_2 = vdup_n_s16(jpeg__f2f(3.072711026f)); + int16x4_t rot3_3 = vdup_n_s16(jpeg__f2f(1.501321110f)); + +#define dct_long_mul(out, inq, coeff) \ + int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff) + +#define dct_long_mac(out, acc, inq, coeff) \ + int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), 
coeff); \ + int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff) + +#define dct_widen(out, inq) \ + int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \ + int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12) + + // wide add +#define dct_wadd(out, a, b) \ + int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vaddq_s32(a##_h, b##_h) + +// wide sub +#define dct_wsub(out, a, b) \ + int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vsubq_s32(a##_h, b##_h) + +// butterfly a/b, then shift using "shiftop" by "s" and pack +#define dct_bfly32o(out0,out1, a,b,shiftop,s) \ + { \ + dct_wadd(sum, a, b); \ + dct_wsub(dif, a, b); \ + out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \ + out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \ + } + +#define dct_pass(shiftop, shift) \ + { \ + /* even part */ \ + int16x8_t sum26 = vaddq_s16(row2, row6); \ + dct_long_mul(p1e, sum26, rot0_0); \ + dct_long_mac(t2e, p1e, row6, rot0_1); \ + dct_long_mac(t3e, p1e, row2, rot0_2); \ + int16x8_t sum04 = vaddq_s16(row0, row4); \ + int16x8_t dif04 = vsubq_s16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + int16x8_t sum15 = vaddq_s16(row1, row5); \ + int16x8_t sum17 = vaddq_s16(row1, row7); \ + int16x8_t sum35 = vaddq_s16(row3, row5); \ + int16x8_t sum37 = vaddq_s16(row3, row7); \ + int16x8_t sumodd = vaddq_s16(sum17, sum35); \ + dct_long_mul(p5o, sumodd, rot1_0); \ + dct_long_mac(p1o, p5o, sum17, rot1_1); \ + dct_long_mac(p2o, p5o, sum35, rot1_2); \ + dct_long_mul(p3o, sum37, rot2_0); \ + dct_long_mul(p4o, sum15, rot2_1); \ + dct_wadd(sump13o, p1o, p3o); \ + dct_wadd(sump24o, p2o, p4o); \ + dct_wadd(sump23o, p2o, p3o); \ + dct_wadd(sump14o, p1o, p4o); \ + dct_long_mac(x4, sump13o, row7, rot3_0); \ + dct_long_mac(x5, sump24o, row5, rot3_1); \ + dct_long_mac(x6, sump23o, 
row3, rot3_2); \ + dct_long_mac(x7, sump14o, row1, rot3_3); \ + dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \ + dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \ + dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \ + dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \ + } + + // load + row0 = vld1q_s16(data + 0 * 8); + row1 = vld1q_s16(data + 1 * 8); + row2 = vld1q_s16(data + 2 * 8); + row3 = vld1q_s16(data + 3 * 8); + row4 = vld1q_s16(data + 4 * 8); + row5 = vld1q_s16(data + 5 * 8); + row6 = vld1q_s16(data + 6 * 8); + row7 = vld1q_s16(data + 7 * 8); + + // add DC bias + row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0)); + + // column pass + dct_pass(vrshrn_n_s32, 10); + + // 16bit 8x8 transpose + { + // these three map to a single VTRN.16, VTRN.32, and VSWP, respectively. + // whether compilers actually get this is another story, sadly. +#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); } +#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); } + + // pass 1 + dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 + dct_trn16(row2, row3); + dct_trn16(row4, row5); + dct_trn16(row6, row7); + + // pass 2 + dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 + dct_trn32(row1, row3); + dct_trn32(row4, row6); + dct_trn32(row5, row7); + + // pass 3 + dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 + dct_trn64(row1, row5); + dct_trn64(row2, row6); + dct_trn64(row3, row7); + +#undef dct_trn16 +#undef dct_trn32 +#undef dct_trn64 + } + + // row pass + // vrshrn_n_s32 only supports shifts up to 16, we need + // 17. so do a non-rounding shift of 16 first then follow + // up with a rounding shift by 1. 
+ dct_pass(vshrn_n_s32, 16); + + { + // pack and round + uint8x8_t p0 = vqrshrun_n_s16(row0, 1); + uint8x8_t p1 = vqrshrun_n_s16(row1, 1); + uint8x8_t p2 = vqrshrun_n_s16(row2, 1); + uint8x8_t p3 = vqrshrun_n_s16(row3, 1); + uint8x8_t p4 = vqrshrun_n_s16(row4, 1); + uint8x8_t p5 = vqrshrun_n_s16(row5, 1); + uint8x8_t p6 = vqrshrun_n_s16(row6, 1); + uint8x8_t p7 = vqrshrun_n_s16(row7, 1); + + // again, these can translate into one instruction, but often don't. +#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); } +#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); } + + // sadly can't use interleaved stores here since we only write + // 8 bytes to each scan line! 
+ + // 8x8 8-bit transpose pass 1 + dct_trn8_8(p0, p1); + dct_trn8_8(p2, p3); + dct_trn8_8(p4, p5); + dct_trn8_8(p6, p7); + + // pass 2 + dct_trn8_16(p0, p2); + dct_trn8_16(p1, p3); + dct_trn8_16(p4, p6); + dct_trn8_16(p5, p7); + + // pass 3 + dct_trn8_32(p0, p4); + dct_trn8_32(p1, p5); + dct_trn8_32(p2, p6); + dct_trn8_32(p3, p7); + + // store + vst1_u8(out, p0); out += out_stride; + vst1_u8(out, p1); out += out_stride; + vst1_u8(out, p2); out += out_stride; + vst1_u8(out, p3); out += out_stride; + vst1_u8(out, p4); out += out_stride; + vst1_u8(out, p5); out += out_stride; + vst1_u8(out, p6); out += out_stride; + vst1_u8(out, p7); + +#undef dct_trn8_8 +#undef dct_trn8_16 +#undef dct_trn8_32 + } + +#undef dct_long_mul +#undef dct_long_mac +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_pass + } + +#endif // JPEG_NEON + +#define JPEG__MARKER_none 0xff + // if there's a pending marker from the entropy stream, return that + // otherwise, fetch from the stream and get a marker. if there's no + // marker, return 0xff, which is never a valid marker value + static jpeg_uc jpeg__get_marker(jpeg__jpeg* j) + { + jpeg_uc x; + if (j->marker != JPEG__MARKER_none) { x = j->marker; j->marker = JPEG__MARKER_none; return x; } + x = jpeg__get8(j->s); + if (x != 0xff) return JPEG__MARKER_none; + while (x == 0xff) + x = jpeg__get8(j->s); // consume repeated 0xff fill bytes + return x; + } + + // in each scan, we'll have scan_n components, and the order + // of the components is specified by order[] +#define JPEG__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7) + +// after a restart interval, jpeg__jpeg_reset the entropy decoder and +// the dc prediction + static void jpeg__jpeg_reset(jpeg__jpeg* j) + { + j->code_bits = 0; + j->code_buffer = 0; + j->nomore = 0; + j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0; + j->marker = JPEG__MARKER_none; + j->todo = j->restart_interval ? 
j->restart_interval : 0x7fffffff; + j->eob_run = 0; + // no more than 1<<31 MCUs if no restart_interal? that's plenty safe, + // since we don't even allow 1<<30 pixels + } + + static int jpeg__parse_entropy_coded_data(jpeg__jpeg* z) + { + jpeg__jpeg_reset(z); + if (!z->progressive) { + if (z->scan_n == 1) { + int i, j; + JPEG_SIMD_ALIGN(short, data[64]); + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x + 7) >> 3; + int h = (z->img_comp[n].y + 7) >> 3; + for (j = 0; j < h; ++j) { + for (i = 0; i < w; ++i) { + int ha = z->img_comp[n].ha; + if (!jpeg__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data); + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z); + // if it's NOT a restart, then just bail, so we get corrupt data + // rather than no data + if (!JPEG__RESTART(z->marker)) return 1; + jpeg__jpeg_reset(z); + } + } + } + return 1; + } + else { // interleaved + int i, j, k, x, y; + JPEG_SIMD_ALIGN(short, data[64]); + for (j = 0; j < z->img_mcu_y; ++j) { + for (i = 0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... 
process scan_n components in order + for (k = 0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y = 0; y < z->img_comp[n].v; ++y) { + for (x = 0; x < z->img_comp[n].h; ++x) { + int x2 = (i * z->img_comp[n].h + x) * 8; + int y2 = (j * z->img_comp[n].v + y) * 8; + int ha = z->img_comp[n].ha; + if (!jpeg__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2, z->img_comp[n].w2, data); + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z); + if (!JPEG__RESTART(z->marker)) return 1; + jpeg__jpeg_reset(z); + } + } + } + return 1; + } + } + else { + if (z->scan_n == 1) { + int i, j; + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x + 7) >> 3; + int h = (z->img_comp[n].y + 7) >> 3; + for (j = 0; j < h; ++j) { + for (i = 0; i < w; ++i) { + short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + if (z->spec_start == 0) { + if (!jpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } + else { + int ha = z->img_comp[n].ha; + if (!jpeg__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) + return 0; + } + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z); + if (!JPEG__RESTART(z->marker)) return 1; + jpeg__jpeg_reset(z); + } + } + } + return 1; + } 
+ else { // interleaved + int i, j, k, x, y; + for (j = 0; j < z->img_mcu_y; ++j) { + for (i = 0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... process scan_n components in order + for (k = 0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y = 0; y < z->img_comp[n].v; ++y) { + for (x = 0; x < z->img_comp[n].h; ++x) { + int x2 = (i * z->img_comp[n].h + x); + int y2 = (j * z->img_comp[n].v + y); + short* data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w); + if (!jpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z); + if (!JPEG__RESTART(z->marker)) return 1; + jpeg__jpeg_reset(z); + } + } + } + return 1; + } + } + } + + static void jpeg__jpeg_dequantize(short* data, jpeg__uint16* dequant) + { + int i; + for (i = 0; i < 64; ++i) + data[i] *= dequant[i]; + } + + static void jpeg__jpeg_finish(jpeg__jpeg* z) + { + if (z->progressive) { + // dequantize and idct the data + int i, j, n; + for (n = 0; n < z->s->img_n; ++n) { + int w = (z->img_comp[n].x + 7) >> 3; + int h = (z->img_comp[n].y + 7) >> 3; + for (j = 0; j < h; ++j) { + for (i = 0; i < w; ++i) { + short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + jpeg__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); + z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data); + } + } + } + } + } + + static int jpeg__process_marker(jpeg__jpeg* z, int m) + { + int L; + switch (m) { + case JPEG__MARKER_none: // no marker found + return jpeg__err("expected marker", "Corrupt JPEG"); + + case 0xDD: // DRI - specify restart interval + if (jpeg__get16be(z->s) != 4) return 
jpeg__err("bad DRI len", "Corrupt JPEG"); + z->restart_interval = jpeg__get16be(z->s); + return 1; + + case 0xDB: // DQT - define quantization table + L = jpeg__get16be(z->s) - 2; + while (L > 0) { + int q = jpeg__get8(z->s); + int p = q >> 4, sixteen = (p != 0); + int t = q & 15, i; + if (p != 0 && p != 1) return jpeg__err("bad DQT type", "Corrupt JPEG"); + if (t > 3) return jpeg__err("bad DQT table", "Corrupt JPEG"); + + for (i = 0; i < 64; ++i) + z->dequant[t][jpeg__jpeg_dezigzag[i]] = (jpeg__uint16)(sixteen ? jpeg__get16be(z->s) : jpeg__get8(z->s)); + L -= (sixteen ? 129 : 65); + } + return L == 0; + + case 0xC4: // DHT - define huffman table + L = jpeg__get16be(z->s) - 2; + while (L > 0) { + jpeg_uc* v; + int sizes[16], i, n = 0; + int q = jpeg__get8(z->s); + int tc = q >> 4; + int th = q & 15; + if (tc > 1 || th > 3) return jpeg__err("bad DHT header", "Corrupt JPEG"); + for (i = 0; i < 16; ++i) { + sizes[i] = jpeg__get8(z->s); + n += sizes[i]; + } + L -= 17; + if (tc == 0) { + if (!jpeg__build_huffman(z->huff_dc + th, sizes)) return 0; + v = z->huff_dc[th].values; + } + else { + if (!jpeg__build_huffman(z->huff_ac + th, sizes)) return 0; + v = z->huff_ac[th].values; + } + for (i = 0; i < n; ++i) + v[i] = jpeg__get8(z->s); + if (tc != 0) + jpeg__build_fast_ac(z->fast_ac[th], z->huff_ac + th); + L -= n; + } + return L == 0; + } + + // check for comment block or APP blocks + if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { + L = jpeg__get16be(z->s); + if (L < 2) { + if (m == 0xFE) + return jpeg__err("bad COM len", "Corrupt JPEG"); + else + return jpeg__err("bad APP len", "Corrupt JPEG"); + } + L -= 2; + + if (m == 0xE0 && L >= 5) { // JFIF APP0 segment + static const unsigned char tag[5] = { 'J','F','I','F','\0' }; + int ok = 1; + int i; + for (i = 0; i < 5; ++i) + if (jpeg__get8(z->s) != tag[i]) + ok = 0; + L -= 5; + if (ok) + z->jfif = 1; + } + else if (m == 0xEE && L >= 12) { // Adobe APP14 segment + static const unsigned char tag[6] = { 'A','d','o','b','e','\0' 
}; + int ok = 1; + int i; + for (i = 0; i < 6; ++i) + if (jpeg__get8(z->s) != tag[i]) + ok = 0; + L -= 6; + if (ok) { + jpeg__get8(z->s); // version + jpeg__get16be(z->s); // flags0 + jpeg__get16be(z->s); // flags1 + z->app14_color_transform = jpeg__get8(z->s); // color transform + L -= 6; + } + } + + jpeg__skip(z->s, L); + return 1; + } + + return jpeg__err("unknown marker", "Corrupt JPEG"); + } + + // after we see SOS + static int jpeg__process_scan_header(jpeg__jpeg* z) + { + int i; + int Ls = jpeg__get16be(z->s); + z->scan_n = jpeg__get8(z->s); + if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n) return jpeg__err("bad SOS component count", "Corrupt JPEG"); + if (Ls != 6 + 2 * z->scan_n) return jpeg__err("bad SOS len", "Corrupt JPEG"); + for (i = 0; i < z->scan_n; ++i) { + int id = jpeg__get8(z->s), which; + int q = jpeg__get8(z->s); + for (which = 0; which < z->s->img_n; ++which) + if (z->img_comp[which].id == id) + break; + if (which == z->s->img_n) return 0; // no match + z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3) return jpeg__err("bad DC huff", "Corrupt JPEG"); + z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3) return jpeg__err("bad AC huff", "Corrupt JPEG"); + z->order[i] = which; + } + + { + int aa; + z->spec_start = jpeg__get8(z->s); + z->spec_end = jpeg__get8(z->s); // should be 63, but might be 0 + aa = jpeg__get8(z->s); + z->succ_high = (aa >> 4); + z->succ_low = (aa & 15); + if (z->progressive) { + if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) + return jpeg__err("bad SOS", "Corrupt JPEG"); + } + else { + if (z->spec_start != 0) return jpeg__err("bad SOS", "Corrupt JPEG"); + if (z->succ_high != 0 || z->succ_low != 0) return jpeg__err("bad SOS", "Corrupt JPEG"); + z->spec_end = 63; + } + } + + return 1; + } + + static int jpeg__free_jpeg_components(jpeg__jpeg* z, int ncomp, int why) + { + int i; + for (i = 0; i < ncomp; ++i) { + if 
(z->img_comp[i].raw_data) { + JPEG_FREE(z->img_comp[i].raw_data); + z->img_comp[i].raw_data = NULL; + z->img_comp[i].data = NULL; + } + if (z->img_comp[i].raw_coeff) { + JPEG_FREE(z->img_comp[i].raw_coeff); + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].coeff = 0; + } + if (z->img_comp[i].linebuf) { + JPEG_FREE(z->img_comp[i].linebuf); + z->img_comp[i].linebuf = NULL; + } + } + return why; + } + + static int jpeg__process_frame_header(jpeg__jpeg* z, int scan) + { + jpeg__context* s = z->s; + int Lf, p, i, q, h_max = 1, v_max = 1, c; + Lf = jpeg__get16be(s); if (Lf < 11) return jpeg__err("bad SOF len", "Corrupt JPEG"); // JPEG + p = jpeg__get8(s); if (p != 8) return jpeg__err("only 8-bit", "JPEG format not supported: 8-bit only"); // JPEG baseline + s->img_y = jpeg__get16be(s); if (s->img_y == 0) return jpeg__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG + s->img_x = jpeg__get16be(s); if (s->img_x == 0) return jpeg__err("0 width", "Corrupt JPEG"); // JPEG requires + if (s->img_y > JPEG_MAX_DIMENSIONS) return jpeg__err("too large", "Very large image (corrupt?)"); + if (s->img_x > JPEG_MAX_DIMENSIONS) return jpeg__err("too large", "Very large image (corrupt?)"); + c = jpeg__get8(s); + if (c != 3 && c != 1 && c != 4) return jpeg__err("bad component count", "Corrupt JPEG"); + s->img_n = c; + for (i = 0; i < c; ++i) { + z->img_comp[i].data = NULL; + z->img_comp[i].linebuf = NULL; + } + + if (Lf != 8 + 3 * s->img_n) return jpeg__err("bad SOF len", "Corrupt JPEG"); + + z->rgb = 0; + for (i = 0; i < s->img_n; ++i) { + static const unsigned char rgb[3] = { 'R', 'G', 'B' }; + z->img_comp[i].id = jpeg__get8(s); + if (s->img_n == 3 && z->img_comp[i].id == rgb[i]) + ++z->rgb; + q = jpeg__get8(s); + z->img_comp[i].h = (q >> 4); if (!z->img_comp[i].h || z->img_comp[i].h > 4) return jpeg__err("bad H", "Corrupt JPEG"); + z->img_comp[i].v = q & 15; if (!z->img_comp[i].v || z->img_comp[i].v > 4) return 
jpeg__err("bad V", "Corrupt JPEG"); + z->img_comp[i].tq = jpeg__get8(s); if (z->img_comp[i].tq > 3) return jpeg__err("bad TQ", "Corrupt JPEG"); + } + + if (scan != JPEG__SCAN_load) return 1; + + if (!jpeg__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return jpeg__err("too large", "Image too large to decode"); + + for (i = 0; i < s->img_n; ++i) { + if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h; + if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v; + } + + // compute interleaved mcu info + z->img_h_max = h_max; + z->img_v_max = v_max; + z->img_mcu_w = h_max * 8; + z->img_mcu_h = v_max * 8; + // these sizes can't be more than 17 bits + z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w; + z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h; + + for (i = 0; i < s->img_n; ++i) { + // number of effective pixels (e.g. for non-interleaved MCU) + z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max; + z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max - 1) / v_max; + // to simplify generation, we'll allocate enough memory to decode + // the bogus oversized data from using interleaved MCUs and their + // big blocks (e.g. 
a 16x16 iMCU on an image of width 33); we won't + // discard the extra data until colorspace conversion + // + // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier) + // so these muls can't overflow with 32-bit ints (which we require) + z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8; + z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8; + z->img_comp[i].coeff = 0; + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].linebuf = NULL; + z->img_comp[i].raw_data = jpeg__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); + if (z->img_comp[i].raw_data == NULL) + return jpeg__free_jpeg_components(z, i + 1, jpeg__err("outofmem", "Out of memory")); + // align blocks for idct using mmx/sse + z->img_comp[i].data = (jpeg_uc*)(((size_t)z->img_comp[i].raw_data + 15) & ~15); + if (z->progressive) { + // w2, h2 are multiples of 8 (see above) + z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8; + z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8; + z->img_comp[i].raw_coeff = jpeg__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15); + if (z->img_comp[i].raw_coeff == NULL) + return jpeg__free_jpeg_components(z, i + 1, jpeg__err("outofmem", "Out of memory")); + z->img_comp[i].coeff = (short*)(((size_t)z->img_comp[i].raw_coeff + 15) & ~15); + } + } + + return 1; + } + + // use comparisons since in some cases we handle more than one case (e.g. 
SOF) +#define jpeg__DNL(x) ((x) == 0xdc) +#define jpeg__SOI(x) ((x) == 0xd8) +#define jpeg__EOI(x) ((x) == 0xd9) +#define jpeg__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) +#define jpeg__SOS(x) ((x) == 0xda) + +#define jpeg__SOF_progressive(x) ((x) == 0xc2) + + static int jpeg__decode_jpeg_header(jpeg__jpeg* z, int scan) + { + int m; + z->jfif = 0; + z->app14_color_transform = -1; // valid values are 0,1,2 + z->marker = JPEG__MARKER_none; // initialize cached marker to empty + m = jpeg__get_marker(z); + if (!jpeg__SOI(m)) return jpeg__err("no SOI", "Corrupt JPEG"); + if (scan == JPEG__SCAN_type) return 1; + m = jpeg__get_marker(z); + while (!jpeg__SOF(m)) { + if (!jpeg__process_marker(z, m)) return 0; + m = jpeg__get_marker(z); + while (m == JPEG__MARKER_none) { + // some files have extra padding after their blocks, so ok, we'll scan + if (jpeg__at_eof(z->s)) return jpeg__err("no SOF", "Corrupt JPEG"); + m = jpeg__get_marker(z); + } + } + z->progressive = jpeg__SOF_progressive(m); + if (!jpeg__process_frame_header(z, scan)) return 0; + return 1; + } + + // decode image to YCbCr format + static int jpeg__decode_jpeg_image(jpeg__jpeg* j) + { + int m; + for (m = 0; m < 4; m++) { + j->img_comp[m].raw_data = NULL; + j->img_comp[m].raw_coeff = NULL; + } + j->restart_interval = 0; + if (!jpeg__decode_jpeg_header(j, JPEG__SCAN_load)) return 0; + m = jpeg__get_marker(j); + while (!jpeg__EOI(m)) { + if (jpeg__SOS(m)) { + if (!jpeg__process_scan_header(j)) return 0; + if (!jpeg__parse_entropy_coded_data(j)) return 0; + if (j->marker == JPEG__MARKER_none) { + // handle 0s at the end of image data from IP Kamera 9060 + while (!jpeg__at_eof(j->s)) { + int x = jpeg__get8(j->s); + if (x == 255) { + j->marker = jpeg__get8(j->s); + break; + } + } + // if we reach eof without hitting a marker, jpeg__get_marker() below will fail and we'll eventually return 0 + } + } + else if (jpeg__DNL(m)) { + int Ld = jpeg__get16be(j->s); + jpeg__uint32 NL = jpeg__get16be(j->s); + if (Ld != 
4) return jpeg__err("bad DNL len", "Corrupt JPEG"); + if (NL != j->s->img_y) return jpeg__err("bad DNL height", "Corrupt JPEG"); + } + else { + if (!jpeg__process_marker(j, m)) return 0; + } + m = jpeg__get_marker(j); + } + if (j->progressive) + jpeg__jpeg_finish(j); + return 1; + } + + // static jfif-centered resampling (across block boundaries) + + typedef jpeg_uc* (*resample_row_func)(jpeg_uc* out, jpeg_uc* in0, jpeg_uc* in1, + int w, int hs); + +#define jpeg__div4(x) ((jpeg_uc) ((x) >> 2)) + + static jpeg_uc* resample_row_1(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs) + { + JPEG_NOTUSED(out); + JPEG_NOTUSED(in_far); + JPEG_NOTUSED(w); + JPEG_NOTUSED(hs); + return in_near; + } + + static jpeg_uc* jpeg__resample_row_v_2(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs) + { + // need to generate two samples vertically for every one in input + int i; + JPEG_NOTUSED(hs); + for (i = 0; i < w; ++i) + out[i] = jpeg__div4(3 * in_near[i] + in_far[i] + 2); + return out; + } + + static jpeg_uc* jpeg__resample_row_h_2(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs) + { + // need to generate two samples horizontally for every one in input + int i; + jpeg_uc* input = in_near; + + if (w == 1) { + // if only one sample, can't do any interpolation + out[0] = out[1] = input[0]; + return out; + } + + out[0] = input[0]; + out[1] = jpeg__div4(input[0] * 3 + input[1] + 2); + for (i = 1; i < w - 1; ++i) { + int n = 3 * input[i] + 2; + out[i * 2 + 0] = jpeg__div4(n + input[i - 1]); + out[i * 2 + 1] = jpeg__div4(n + input[i + 1]); + } + out[i * 2 + 0] = jpeg__div4(input[w - 2] * 3 + input[w - 1] + 2); + out[i * 2 + 1] = input[w - 1]; + + JPEG_NOTUSED(in_far); + JPEG_NOTUSED(hs); + + return out; + } + +#define jpeg__div16(x) ((jpeg_uc) ((x) >> 4)) + + static jpeg_uc* jpeg__resample_row_hv_2(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs) + { + // need to generate 2x2 samples for every one in input + int i, t0, t1; + if (w 
== 1) { + out[0] = out[1] = jpeg__div4(3 * in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3 * in_near[0] + in_far[0]; + out[0] = jpeg__div4(t1 + 2); + for (i = 1; i < w; ++i) { + t0 = t1; + t1 = 3 * in_near[i] + in_far[i]; + out[i * 2 - 1] = jpeg__div16(3 * t0 + t1 + 8); + out[i * 2] = jpeg__div16(3 * t1 + t0 + 8); + } + out[w * 2 - 1] = jpeg__div4(t1 + 2); + + JPEG_NOTUSED(hs); + + return out; + } + +#if defined(JPEG_SSE2) || defined(JPEG_NEON) + static jpeg_uc* jpeg__resample_row_hv_2_simd(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs) + { + // need to generate 2x2 samples for every one in input + int i = 0, t0, t1; + + if (w == 1) { + out[0] = out[1] = jpeg__div4(3 * in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3 * in_near[0] + in_far[0]; + // process groups of 8 pixels for as long as we can. + // note we can't handle the last pixel in a row in this loop + // because we need to handle the filter boundary conditions. + for (; i < ((w - 1) & ~7); i += 8) { +#if defined(JPEG_SSE2) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + __m128i zero = _mm_setzero_si128(); + __m128i farb = _mm_loadl_epi64((__m128i*) (in_far + i)); + __m128i nearb = _mm_loadl_epi64((__m128i*) (in_near + i)); + __m128i farw = _mm_unpacklo_epi8(farb, zero); + __m128i nearw = _mm_unpacklo_epi8(nearb, zero); + __m128i diff = _mm_sub_epi16(farw, nearw); + __m128i nears = _mm_slli_epi16(nearw, 2); + __m128i curr = _mm_add_epi16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. 
+ __m128i prv0 = _mm_slli_si128(curr, 2); + __m128i nxt0 = _mm_srli_si128(curr, 2); + __m128i prev = _mm_insert_epi16(prv0, t1, 0); + __m128i next = _mm_insert_epi16(nxt0, 3 * in_near[i + 8] + in_far[i + 8], 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. + __m128i bias = _mm_set1_epi16(8); + __m128i curs = _mm_slli_epi16(curr, 2); + __m128i prvd = _mm_sub_epi16(prev, curr); + __m128i nxtd = _mm_sub_epi16(next, curr); + __m128i curb = _mm_add_epi16(curs, bias); + __m128i even = _mm_add_epi16(prvd, curb); + __m128i odd = _mm_add_epi16(nxtd, curb); + + // interleave even and odd pixels, then undo scaling. + __m128i int0 = _mm_unpacklo_epi16(even, odd); + __m128i int1 = _mm_unpackhi_epi16(even, odd); + __m128i de0 = _mm_srli_epi16(int0, 4); + __m128i de1 = _mm_srli_epi16(int1, 4); + + // pack and write output + __m128i outv = _mm_packus_epi16(de0, de1); + _mm_storeu_si128((__m128i*) (out + i * 2), outv); +#elif defined(JPEG_NEON) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + uint8x8_t farb = vld1_u8(in_far + i); + uint8x8_t nearb = vld1_u8(in_near + i); + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); + int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); + int16x8_t curr = vaddq_s16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. 
+ int16x8_t prv0 = vextq_s16(curr, curr, 7); + int16x8_t nxt0 = vextq_s16(curr, curr, 1); + int16x8_t prev = vsetq_lane_s16(t1, prv0, 0); + int16x8_t next = vsetq_lane_s16(3 * in_near[i + 8] + in_far[i + 8], nxt0, 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. + int16x8_t curs = vshlq_n_s16(curr, 2); + int16x8_t prvd = vsubq_s16(prev, curr); + int16x8_t nxtd = vsubq_s16(next, curr); + int16x8_t even = vaddq_s16(curs, prvd); + int16x8_t odd = vaddq_s16(curs, nxtd); + + // undo scaling and round, then store with even/odd phases interleaved + uint8x8x2_t o; + o.val[0] = vqrshrun_n_s16(even, 4); + o.val[1] = vqrshrun_n_s16(odd, 4); + vst2_u8(out + i * 2, o); +#endif + + // "previous" value for next iter + t1 = 3 * in_near[i + 7] + in_far[i + 7]; + } + + t0 = t1; + t1 = 3 * in_near[i] + in_far[i]; + out[i * 2] = jpeg__div16(3 * t1 + t0 + 8); + + for (++i; i < w; ++i) { + t0 = t1; + t1 = 3 * in_near[i] + in_far[i]; + out[i * 2 - 1] = jpeg__div16(3 * t0 + t1 + 8); + out[i * 2] = jpeg__div16(3 * t1 + t0 + 8); + } + out[w * 2 - 1] = jpeg__div4(t1 + 2); + + JPEG_NOTUSED(hs); + + return out; + } +#endif + + static jpeg_uc* jpeg__resample_row_generic(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs) + { + // resample with nearest-neighbor + int i, j; + JPEG_NOTUSED(in_far); + for (i = 0; i < w; ++i) + for (j = 0; j < hs; ++j) + out[i * hs + j] = in_near[i]; + return out; + } + + // this is a reduced-precision calculation of YCbCr-to-RGB introduced + // to make sure the code produces the same results in both SIMD and scalar +#define jpeg__float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8) + static void jpeg__YCbCr_to_RGB_row(jpeg_uc* out, const jpeg_uc* y, const jpeg_uc* pcb, const jpeg_uc* pcr, int count, int step) + { + int i; + for (i = 0; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1 << 
19); // rounding + int r, g, b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr * jpeg__float2fixed(1.40200f); + g = y_fixed + (cr * -jpeg__float2fixed(0.71414f)) + ((cb * -jpeg__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb * jpeg__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned)r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned)g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned)b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (jpeg_uc)r; + out[1] = (jpeg_uc)g; + out[2] = (jpeg_uc)b; + out[3] = 255; + out += step; + } + } + +#if defined(JPEG_SSE2) || defined(JPEG_NEON) + static void jpeg__YCbCr_to_RGB_simd(jpeg_uc* out, jpeg_uc const* y, jpeg_uc const* pcb, jpeg_uc const* pcr, int count, int step) + { + int i = 0; + +#ifdef JPEG_SSE2 + // step == 3 is pretty ugly on the final interleave, and i'm not convinced + // it's useful in practice (you wouldn't use it for textures, for example). + // so just accelerate step == 4 case. + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. 
+ __m128i signflip = _mm_set1_epi8(-0x80); + __m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f)); + __m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f)); + __m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f)); + __m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f)); + __m128i y_bias = _mm_set1_epi8((char)(unsigned char)128); + __m128i xw = _mm_set1_epi16(255); // alpha channel + + for (; i + 7 < count; i += 8) { + // load + __m128i y_bytes = _mm_loadl_epi64((__m128i*) (y + i)); + __m128i cr_bytes = _mm_loadl_epi64((__m128i*) (pcr + i)); + __m128i cb_bytes = _mm_loadl_epi64((__m128i*) (pcb + i)); + __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 + __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 + + // unpack to short (and left-shift cr, cb by 8) + __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); + __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased); + __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased); + + // color transform + __m128i yws = _mm_srli_epi16(yw, 4); + __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw); + __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw); + __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1); + __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1); + __m128i rws = _mm_add_epi16(cr0, yws); + __m128i gwt = _mm_add_epi16(cb0, yws); + __m128i bws = _mm_add_epi16(yws, cb1); + __m128i gws = _mm_add_epi16(gwt, cr1); + + // descale + __m128i rw = _mm_srai_epi16(rws, 4); + __m128i bw = _mm_srai_epi16(bws, 4); + __m128i gw = _mm_srai_epi16(gws, 4); + + // back to byte, set up for transpose + __m128i brb = _mm_packus_epi16(rw, bw); + __m128i gxb = _mm_packus_epi16(gw, xw); + + // transpose to interleave channels + __m128i t0 = _mm_unpacklo_epi8(brb, gxb); + __m128i t1 = _mm_unpackhi_epi8(brb, gxb); + __m128i o0 = _mm_unpacklo_epi16(t0, t1); + __m128i o1 = _mm_unpackhi_epi16(t0, t1); + + // store + _mm_storeu_si128((__m128i*) (out + 0), o0); + 
_mm_storeu_si128((__m128i*) (out + 16), o1); + out += 32; + } + } +#endif + +#ifdef JPEG_NEON + // in this version, step=3 support would be easy to add. but is there demand? + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. + uint8x8_t signflip = vdup_n_u8(0x80); + int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f)); + int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f)); + int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f)); + int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f)); + + for (; i + 7 < count; i += 8) { + // load + uint8x8_t y_bytes = vld1_u8(y + i); + uint8x8_t cr_bytes = vld1_u8(pcr + i); + uint8x8_t cb_bytes = vld1_u8(pcb + i); + int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip)); + int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip)); + + // expand to s16 + int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4)); + int16x8_t crw = vshll_n_s8(cr_biased, 7); + int16x8_t cbw = vshll_n_s8(cb_biased, 7); + + // color transform + int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0); + int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0); + int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1); + int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1); + int16x8_t rws = vaddq_s16(yws, cr0); + int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1); + int16x8_t bws = vaddq_s16(yws, cb1); + + // undo scaling, round, convert to byte + uint8x8x4_t o; + o.val[0] = vqrshrun_n_s16(rws, 4); + o.val[1] = vqrshrun_n_s16(gws, 4); + o.val[2] = vqrshrun_n_s16(bws, 4); + o.val[3] = vdup_n_u8(255); + + // store, interleaving r/g/b/a + vst4_u8(out, o); + out += 8 * 4; + } + } +#endif + + for (; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1 << 19); // rounding + int r, g, b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr * jpeg__float2fixed(1.40200f); + g = y_fixed + cr * -jpeg__float2fixed(0.71414f) + ((cb * 
-jpeg__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb * jpeg__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned)r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned)g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned)b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (jpeg_uc)r; + out[1] = (jpeg_uc)g; + out[2] = (jpeg_uc)b; + out[3] = 255; + out += step; + } + } +#endif + + // set up the kernels + static void jpeg__setup_jpeg(jpeg__jpeg* j) + { + j->idct_block_kernel = jpeg__idct_block; + j->YCbCr_to_RGB_kernel = jpeg__YCbCr_to_RGB_row; + j->resample_row_hv_2_kernel = jpeg__resample_row_hv_2; + +#ifdef JPEG_SSE2 + if (jpeg__sse2_available()) { + j->idct_block_kernel = jpeg__idct_simd; + j->YCbCr_to_RGB_kernel = jpeg__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = jpeg__resample_row_hv_2_simd; + } +#endif + +#ifdef JPEG_NEON + j->idct_block_kernel = jpeg__idct_simd; + j->YCbCr_to_RGB_kernel = jpeg__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = jpeg__resample_row_hv_2_simd; +#endif + } + + // clean up the temporary component buffers + static void jpeg__cleanup_jpeg(jpeg__jpeg* j) + { + jpeg__free_jpeg_components(j, j->s->img_n, 0); + } + + typedef struct + { + resample_row_func resample; + jpeg_uc* line0, * line1; + int hs, vs; // expansion factor in each axis + int w_lores; // horizontal pixels pre-expansion + int ystep; // how far through vertical expansion we are + int ypos; // which pre-expansion row we're on + } jpeg__resample; + + // fast 0..255 * 0..255 => 0..255 rounded multiplication + static jpeg_uc jpeg__blinn_8x8(jpeg_uc x, jpeg_uc y) + { + unsigned int t = x * y + 128; + return (jpeg_uc)((t + (t >> 8)) >> 8); + } + + static jpeg_uc* load_jpeg_image(jpeg__jpeg* z, int* out_x, int* out_y, int* comp, int req_comp) + { + int n, decode_n, is_rgb; + z->s->img_n = 0; // make jpeg__cleanup_jpeg safe + + // validate req_comp + if (req_comp < 0 || req_comp > 4) return jpeg__errpuc("bad req_comp", 
"Internal error"); + + // load a jpeg image from whichever source, but leave in YCbCr format + if (!jpeg__decode_jpeg_image(z)) { jpeg__cleanup_jpeg(z); return NULL; } + + // determine actual number of components to generate + n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1; + + is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif)); + + if (z->s->img_n == 3 && n < 3 && !is_rgb) + decode_n = 1; + else + decode_n = z->s->img_n; + + // resample and color-convert + { + int k; + unsigned int i, j; + jpeg_uc* output; + jpeg_uc* coutput[4] = { NULL, NULL, NULL, NULL }; + + jpeg__resample res_comp[4]; + + for (k = 0; k < decode_n; ++k) { + jpeg__resample* r = &res_comp[k]; + + // allocate line buffer big enough for upsampling off the edges + // with upsample factor of 4 + z->img_comp[k].linebuf = (jpeg_uc*)jpeg__malloc(z->s->img_x + 3); + if (!z->img_comp[k].linebuf) { jpeg__cleanup_jpeg(z); return jpeg__errpuc("outofmem", "Out of memory"); } + + r->hs = z->img_h_max / z->img_comp[k].h; + r->vs = z->img_v_max / z->img_comp[k].v; + r->ystep = r->vs >> 1; + r->w_lores = (z->s->img_x + r->hs - 1) / r->hs; + r->ypos = 0; + r->line0 = r->line1 = z->img_comp[k].data; + + if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1; + else if (r->hs == 1 && r->vs == 2) r->resample = jpeg__resample_row_v_2; + else if (r->hs == 2 && r->vs == 1) r->resample = jpeg__resample_row_h_2; + else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel; + else r->resample = jpeg__resample_row_generic; + } + + // can't error after this so, this is safe + output = (jpeg_uc*)jpeg__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); + if (!output) { jpeg__cleanup_jpeg(z); return jpeg__errpuc("outofmem", "Out of memory"); } + + // now go ahead and resample + for (j = 0; j < z->s->img_y; ++j) { + jpeg_uc* out = output + n * z->s->img_x * j; + for (k = 0; k < decode_n; ++k) { + jpeg__resample* r = &res_comp[k]; + int y_bot = r->ystep >= (r->vs >> 1); + 
coutput[k] = r->resample(z->img_comp[k].linebuf, + y_bot ? r->line1 : r->line0, + y_bot ? r->line0 : r->line1, + r->w_lores, r->hs); + if (++r->ystep >= r->vs) { + r->ystep = 0; + r->line0 = r->line1; + if (++r->ypos < z->img_comp[k].y) + r->line1 += z->img_comp[k].w2; + } + } + if (n >= 3) { + jpeg_uc* y = coutput[0]; + if (z->s->img_n == 3) { + if (is_rgb) { + for (i = 0; i < z->s->img_x; ++i) { + out[0] = y[i]; + out[1] = coutput[1][i]; + out[2] = coutput[2][i]; + out[3] = 255; + out += n; + } + } + else { + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } + else if (z->s->img_n == 4) { + if (z->app14_color_transform == 0) { // CMYK + for (i = 0; i < z->s->img_x; ++i) { + jpeg_uc m = coutput[3][i]; + out[0] = jpeg__blinn_8x8(coutput[0][i], m); + out[1] = jpeg__blinn_8x8(coutput[1][i], m); + out[2] = jpeg__blinn_8x8(coutput[2][i], m); + out[3] = 255; + out += n; + } + } + else if (z->app14_color_transform == 2) { // YCCK + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + for (i = 0; i < z->s->img_x; ++i) { + jpeg_uc m = coutput[3][i]; + out[0] = jpeg__blinn_8x8(255 - out[0], m); + out[1] = jpeg__blinn_8x8(255 - out[1], m); + out[2] = jpeg__blinn_8x8(255 - out[2], m); + out += n; + } + } + else { // YCbCr + alpha? 
Ignore the fourth channel for now + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } + else + for (i = 0; i < z->s->img_x; ++i) { + out[0] = out[1] = out[2] = y[i]; + out[3] = 255; // not used if n==3 + out += n; + } + } + else { + if (is_rgb) { + if (n == 1) + for (i = 0; i < z->s->img_x; ++i) + *out++ = jpeg__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + else { + for (i = 0; i < z->s->img_x; ++i, out += 2) { + out[0] = jpeg__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + out[1] = 255; + } + } + } + else if (z->s->img_n == 4 && z->app14_color_transform == 0) { + for (i = 0; i < z->s->img_x; ++i) { + jpeg_uc m = coutput[3][i]; + jpeg_uc r = jpeg__blinn_8x8(coutput[0][i], m); + jpeg_uc g = jpeg__blinn_8x8(coutput[1][i], m); + jpeg_uc b = jpeg__blinn_8x8(coutput[2][i], m); + out[0] = jpeg__compute_y(r, g, b); + out[1] = 255; + out += n; + } + } + else if (z->s->img_n == 4 && z->app14_color_transform == 2) { + for (i = 0; i < z->s->img_x; ++i) { + out[0] = jpeg__blinn_8x8(255 - coutput[0][i], coutput[3][i]); + out[1] = 255; + out += n; + } + } + else { + jpeg_uc* y = coutput[0]; + if (n == 1) + for (i = 0; i < z->s->img_x; ++i) out[i] = y[i]; + else + for (i = 0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; } + } + } + } + jpeg__cleanup_jpeg(z); + *out_x = z->s->img_x; + *out_y = z->s->img_y; + if (comp) *comp = z->s->img_n >= 3 ? 
3 : 1; // report original components, not output + return output; + } + } + + // Decode a JPEG from `s` using a heap-allocated, temporary decoder state. + // Returns whatever load_jpeg_image returns (x / y / comp are filled by it); `ri` is unused. + static void* jpeg__jpeg_load(jpeg__context* s, int* x, int* y, int* comp, int req_comp, jpeg__result_info* ri) + { + unsigned char* result; + jpeg__jpeg* j = (jpeg__jpeg*)jpeg__malloc(sizeof(jpeg__jpeg)); + JPEG_NOTUSED(ri); + if (!j) return jpeg__errpuc("outofmem", "Out of memory"); // never dereference a failed allocation + j->s = s; + jpeg__setup_jpeg(j); + result = load_jpeg_image(j, x, y, comp, req_comp); + JPEG_FREE(j); + return result; + } + + // Decode only the header (JPEG__SCAN_type) and rewind the stream afterwards. + // Returns the header decoder's result, or 0 when the decoder state cannot be allocated. + static int jpeg__jpeg_test(jpeg__context* s) + { + int r; + jpeg__jpeg* j = (jpeg__jpeg*)jpeg__malloc(sizeof(jpeg__jpeg)); + if (!j) return 0; // out of memory: nothing has been read yet, so no rewind is needed + j->s = s; + jpeg__setup_jpeg(j); + r = jpeg__decode_jpeg_header(j, JPEG__SCAN_type); + jpeg__rewind(s); + JPEG_FREE(j); + return r; + } + + // Decode the header (JPEG__SCAN_header) with an existing decoder state. + // On failure rewinds the stream and returns 0; otherwise stores width, height and the + // reported component count (3 when img_n >= 3, else 1) through the non-NULL pointers and returns 1. + static int jpeg__jpeg_info_raw(jpeg__jpeg* j, int* x, int* y, int* comp) + { + if (!jpeg__decode_jpeg_header(j, JPEG__SCAN_header)) { + jpeg__rewind(j->s); + return 0; + } + if (x) *x = j->s->img_x; + if (y) *y = j->s->img_y; + if (comp) *comp = j->s->img_n >= 3 ? 3 : 1; + return 1; + } + + // Same as jpeg__jpeg_info_raw but allocates and frees the decoder state itself. + // Returns 0 when the decoder state cannot be allocated. + static int jpeg__jpeg_info(jpeg__context* s, int* x, int* y, int* comp) + { + int result; + jpeg__jpeg* j = (jpeg__jpeg*)(jpeg__malloc(sizeof(jpeg__jpeg))); + if (!j) return 0; // never dereference a failed allocation + j->s = s; + result = jpeg__jpeg_info_raw(j, x, y, comp); + JPEG_FREE(j); + return result; + } + + //------------------------------------------------------------------------ + + // I/O callbacks: `user` is cast to the InputMemoryStream the decoder reads from. + static int jpeg__stdio_read(void* user, char* data, int size) + { + InputMemoryStream* stream = (InputMemoryStream*)user; + return (int)stream->Read(size, data); + } + + static void jpeg__stdio_skip(void* user, int n) + { + InputMemoryStream* stream = (InputMemoryStream*)user; + stream->Skip(n); + } + + static int jpeg__stdio_eof(void* user) + { + InputMemoryStream* stream = (InputMemoryStream*)user; + return stream->Pos() == stream->Size() ? 
1 : 0; + } + + //--------------------------------------------------------------------- + + ImageJpegLoader::ImageJpegLoader(const ImageLoaderParam& param) + : ImageLoader(param) + { + if (_param.format == SimdPixelFormatNone) + _param.format = SimdPixelFormatRgb24; + } + + bool ImageJpegLoader::FromStream() + { + int x, y, comp; + jpeg__context s; + s.io.eof = jpeg__stdio_eof; + s.io.read = jpeg__stdio_read; + s.io.skip = jpeg__stdio_skip; + s.io_user_data = &_stream; + s.buflen = sizeof(s.buffer_start); + s.read_from_callbacks = 1; + s.callback_already_read = 0; + s.img_buffer = s.img_buffer_original = s.buffer_start; + jpeg__refill_buffer(&s); + s.img_buffer_original_end = s.img_buffer_end; + jpeg__result_info ri; + uint8_t * data = (uint8_t*)jpeg__jpeg_load(&s, &x, &y, &comp, 3, &ri); + if (data) + { + size_t stride = 3 * x; + _image.Recreate(x, y, (Image::Format)_param.format); + switch (_param.format) + { + case SimdPixelFormatGray8: + Base::RgbToGray(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgr24: + Base::BgrToRgb(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgra32: + Base::RgbToBgra(data, x, y, stride, _image.data, _image.stride, 0xFF); + break; + case SimdPixelFormatRgb24: + Base::Copy(data, stride, x, y, 3, _image.data, _image.stride); + break; + case SimdPixelFormatRgba32: + Base::BgrToBgra(data, x, y, stride, _image.data, _image.stride, 0xFF); + break; + default: + break; + } + JPEG_FREE(data); + return true; + } + return false; + } + } +} diff --git a/3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp b/3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp new file mode 100644 index 0000000000..03ae0fab6f --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp @@ -0,0 +1,1317 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdImageLoad.h" +#include "Simd/SimdImageSavePng.h" +#include "Simd/SimdArray.h" +#include "Simd/SimdCpu.h" +#include "Simd/SimdBase.h" + +namespace Simd +{ + namespace Base + { +#define PNG_MALLOC(sz) malloc(sz) +#define PNG_REALLOC(p,newsz) realloc(p,newsz) +#define PNG_FREE(p) free(p) + +#define PNG__BYTECAST(x) ((uint8_t) ((x) & 255)) // truncate int to byte without warnings + + SIMD_INLINE int PngError(const char* str, const char* stub) + { + std::cout << "PNG load error: " << str << ", " << stub << "!" << std::endl; + return 0; + } + + SIMD_INLINE uint8_t * PngErrorPtr(const char* str, const char* stub) + { + return (uint8_t*)(size_t)(PngError(str, stub) ? 
NULL : NULL); + } + + static void* png__malloc(size_t size) + { + return PNG_MALLOC(size); + } + + struct PngContext + { + uint32_t img_x, img_y; + int img_n, img_out_n; + }; + + static int png__addsizes_valid(int a, int b) + { + if (b < 0) return 0; + // now 0 <= b <= INT_MAX, hence also + // 0 <= INT_MAX - b <= INTMAX. + // And "a + b <= INT_MAX" (which might overflow) is the + // same as a <= INT_MAX - b (no overflow) + return a <= INT_MAX - b; + } + + // returns 1 if the product is valid, 0 on overflow. + // negative factors are considered invalid. + static int png__mul2sizes_valid(int a, int b) + { + if (a < 0 || b < 0) return 0; + if (b == 0) return 1; // mul-by-0 is always safe + // portable way to check for no overflows in a*b + return a <= INT_MAX / b; + } + + // returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow + static int png__mad2sizes_valid(int a, int b, int add) + { + return png__mul2sizes_valid(a, b) && png__addsizes_valid(a * b, add); + } + + // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow + static int png__mad3sizes_valid(int a, int b, int c, int add) + { + return png__mul2sizes_valid(a, b) && png__mul2sizes_valid(a * b, c) && + png__addsizes_valid(a * b * c, add); + } + + // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow + static int png__mad4sizes_valid(int a, int b, int c, int d, int add) + { + return png__mul2sizes_valid(a, b) && png__mul2sizes_valid(a * b, c) && + png__mul2sizes_valid(a * b * c, d) && png__addsizes_valid(a * b * c * d, add); + } + + // mallocs with size overflow checking + static void* png__malloc_mad2(int a, int b, int add) + { + if (!png__mad2sizes_valid(a, b, add)) return NULL; + return png__malloc(a * b + add); + } + + static void* png__malloc_mad3(int a, int b, int c, int add) + { + if (!png__mad3sizes_valid(a, b, c, add)) return NULL; + return png__malloc(a * b * c + add); + } + + static void* png__malloc_mad4(int a, int b, int 
c, int d, int add) + { + if (!png__mad4sizes_valid(a, b, c, d, add)) return NULL; + return png__malloc(a * b * c * d + add); + } + + static uint8_t png__compute_y(int r, int g, int b) + { + return (uint8_t)(((r * 77) + (g * 150) + (29 * b)) >> 8); + } + + static uint8_t* png__convert_format(uint8_t* data, int img_n, int req_comp, unsigned int x, unsigned int y) + { + int i, j; + uint8_t* good; + + if (req_comp == img_n) + return data; + assert(req_comp >= 1 && req_comp <= 4); + + good = (uint8_t*)png__malloc_mad3(req_comp, x, y, 0); + if (good == NULL) + { + PNG_FREE(data); + return PngErrorPtr("outofmem", "Out of memory"); + } + + for (j = 0; j < (int)y; ++j) + { + uint8_t* src = data + j * x * img_n; + uint8_t* dest = good + j * x * req_comp; + +#define PNG__COMBO(a,b) ((a)*8+(b)) +#define PNG__CASE(a,b) case PNG__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (PNG__COMBO(img_n, req_comp)) + { + PNG__CASE(1, 2) { dest[0] = src[0]; dest[1] = 255; } break; + PNG__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break; + PNG__CASE(1, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = 255; } break; + PNG__CASE(2, 1) { dest[0] = src[0]; } break; + PNG__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break; + PNG__CASE(2, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = src[1]; } break; + PNG__CASE(3, 4) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; dest[3] = 255; } break; + PNG__CASE(3, 1) { dest[0] = png__compute_y(src[0], src[1], src[2]); } break; + PNG__CASE(3, 2) { dest[0] = png__compute_y(src[0], src[1], src[2]); dest[1] = 255; } break; + PNG__CASE(4, 1) { dest[0] = png__compute_y(src[0], src[1], src[2]); } break; + PNG__CASE(4, 2) { dest[0] = png__compute_y(src[0], src[1], src[2]); dest[1] = src[3]; } break; + PNG__CASE(4, 3) { dest[0] = src[0]; dest[1] = src[1]; 
dest[2] = src[2]; } break; + default: assert(0); PNG_FREE(data); PNG_FREE(good); return PngErrorPtr("unsupported", "Unsupported format conversion"); + } +#undef PNG__CASE + } + + PNG_FREE(data); + return good; + } + + static uint16_t png__compute_y_16(int r, int g, int b) + { + return (uint16_t)(((r * 77) + (g * 150) + (29 * b)) >> 8); + } + + static uint16_t* png__convert_format16(uint16_t* data, int img_n, int req_comp, unsigned int x, unsigned int y) + { + int i, j; + uint16_t* good; + + if (req_comp == img_n) + return data; + assert(req_comp >= 1 && req_comp <= 4); + + good = (uint16_t*)png__malloc(req_comp * x * y * 2); + if (good == NULL) + { + PNG_FREE(data); + return (uint16_t*)PngErrorPtr("outofmem", "Out of memory"); + } + + for (j = 0; j < (int)y; ++j) + { + uint16_t* src = data + j * x * img_n; + uint16_t* dest = good + j * x * req_comp; + +#define PNG__COMBO(a,b) ((a)*8+(b)) +#define PNG__CASE(a,b) case PNG__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (PNG__COMBO(img_n, req_comp)) { + PNG__CASE(1, 2) { dest[0] = src[0]; dest[1] = 0xffff; } break; + PNG__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break; + PNG__CASE(1, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = 0xffff; } break; + PNG__CASE(2, 1) { dest[0] = src[0]; } break; + PNG__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break; + PNG__CASE(2, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = src[1]; } break; + PNG__CASE(3, 4) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; dest[3] = 0xffff; } break; + PNG__CASE(3, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break; + PNG__CASE(3, 2) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); dest[1] = 0xffff; } break; + PNG__CASE(4, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break; + PNG__CASE(4, 2) { dest[0] = 
png__compute_y_16(src[0], src[1], src[2]); dest[1] = src[3]; } break; + PNG__CASE(4, 3) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; } break; + default: assert(0); PNG_FREE(data); PNG_FREE(good); return (uint16_t*)PngErrorPtr("unsupported", "Unsupported format conversion"); + } +#undef PNG__CASE + } + + PNG_FREE(data); + return good; + } + + namespace Zlib + { + const size_t ZFAST_BITS = 9; + const size_t ZFAST_SIZE = 1 << ZFAST_BITS; + const size_t ZFAST_MASK = ZFAST_SIZE - 1; + + struct Zhuffman + { + uint16_t fast[ZFAST_SIZE]; + uint16_t firstCode[16]; + int maxCode[17]; + uint16_t firstSymbol[16]; + uint8_t size[288]; + uint16_t value[288]; + + bool Build(const uint8_t* sizelist, int num) + { + int i, k = 0; + int code, nextCode[16], sizes[17]; + + memset(sizes, 0, sizeof(sizes)); + memset(fast, 0, sizeof(fast)); + for (i = 0; i < num; ++i) + ++sizes[sizelist[i]]; + sizes[0] = 0; + for (i = 1; i < 16; ++i) + if (sizes[i] > (1 << i)) + return PngError("bad sizes", "Corrupt PNG"); + code = 0; + for (i = 1; i < 16; ++i) + { + nextCode[i] = code; + firstCode[i] = (uint16_t)code; + firstSymbol[i] = (uint16_t)k; + code = (code + sizes[i]); + if (sizes[i] && code - 1 >= (1 << i)) + return PngError("bad codelengths", "Corrupt PNG"); + maxCode[i] = code << (16 - i); // preshift for inner loop + code <<= 1; + k += sizes[i]; + } + maxCode[16] = 0x10000; // sentinel + for (i = 0; i < num; ++i) + { + int s = sizelist[i]; + if (s) + { + int c = nextCode[s] - firstCode[s] + firstSymbol[s]; + uint16_t fastv = (uint16_t)((s << 9) | i); + size[c] = (uint8_t)s; + value[c] = (uint16_t)i; + if (s <= (int)ZFAST_BITS) + { + int j = ZlibBitRev(nextCode[s], s); + while (j < (1 << ZFAST_BITS)) + { + fast[j] = fastv; + j += (1 << s); + } + } + ++nextCode[s]; + } + } + return 1; + } + }; + + SIMD_INLINE static int BitRev16(int n) + { + n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1); + n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2); + n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4); 
+ n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8); + return n; + } + + static int ZhuffmanDecode(InputMemoryStream& is, const Zhuffman& z) + { + int b, s; + if (is.BitCount() < 16) + { + if (is.Eof()) + return -1; + is.FillBits(); + } + b = z.fast[is.BitBuffer() & ZFAST_MASK]; + if (b) + { + s = b >> 9; + is.BitBuffer() >>= s; + is.BitCount() -= s; + return b & 511; + } + else + { + int k; + k = BitRev16(is.BitBuffer()); + for (s = ZFAST_BITS + 1; k >= z.maxCode[s]; ++s); + if (s >= 16) + return -1; + b = (k >> (16 - s)) - z.firstCode[s] + z.firstSymbol[s]; + if (b >= sizeof(z.size) || z.size[b] != s) + return -1; + is.BitBuffer() >>= s; + is.BitCount() -= s; + return z.value[b]; + } + } + + static int ParseHuffmanBlock(InputMemoryStream& is, const Zhuffman& zLength, const Zhuffman& zDistance, OutputMemoryStream& os) + { + static const int zlengthBase[31] = { 3,4,5,6,7,8,9,10,11,13, 15,17,19,23,27,31,35,43,51,59, 67,83,99,115,131,163,195,227,258,0,0 }; + static const int zlengthExtra[31] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 }; + static const int zdistBase[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, 257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0 }; + static const int zdistExtra[32] = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 }; + + uint8_t* beg = os.Data(), * dst = os.Current(), * end = beg + os.Capacity(); + for (;;) + { + ptrdiff_t z = ZhuffmanDecode(is, zLength); + if (z < 256) + { + if (z < 0) + return PngError("bad huffman code", "Corrupt PNG"); + if (dst >= end) + { + os.Reserve(end - beg + 1); + beg = os.Data(); + dst = os.Current(); + end = beg + os.Capacity(); + } + *dst++ = (uint8_t)z; + } + else + { + uint8_t* p; + ptrdiff_t len, dist; + if (z == 256) + { + os.Seek(dst - beg); + return 1; + } + z -= 257; + len = zlengthBase[z]; + if (zlengthExtra[z]) + len += is.ReadBits(zlengthExtra[z]); + z = ZhuffmanDecode(is, zDistance); + if (z < 0) + return PngError("bad huffman 
code", "Corrupt PNG"); + dist = zdistBase[z]; + if (zdistExtra[z]) + dist += is.ReadBits(zdistExtra[z]); + if (dst - beg < dist) + return PngError("bad dist", "Corrupt PNG"); + if (dst + len > end) + { + os.Reserve(end - beg + 1); + beg = os.Data(); + dst = os.Current(); + end = beg + os.Capacity(); + } + uint8_t* src = dst - dist; + if (dist == 1) + { + memset(dst, *src, len); + dst += len; + } + else if (dist < len || len < 16) + { + for (; len; len--) + *dst++ = *src++; + } + else + { + memcpy(dst, src, len); + dst += len; + } + } + } + } + + static int ComputeHuffmanCodes(InputMemoryStream& is, Zhuffman& zLength, Zhuffman& zDistance) + { + static const uint8_t length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 }; + Zhuffman z_codelength; + uint8_t lencodes[286 + 32 + 137]; + uint8_t codelength_sizes[19]; + int i, n; + + int hlit = is.ReadBits(5) + 257; + int hdist = is.ReadBits(5) + 1; + int hclen = is.ReadBits(4) + 4; + int ntot = hlit + hdist; + + memset(codelength_sizes, 0, sizeof(codelength_sizes)); + for (i = 0; i < hclen; ++i) + { + int s = is.ReadBits(3); + codelength_sizes[length_dezigzag[i]] = (uint8_t)s; + } + if (!z_codelength.Build(codelength_sizes, 19)) + return 0; + n = 0; + while (n < ntot) + { + int c = ZhuffmanDecode(is, z_codelength); + if (c < 0 || c >= 19) + return PngError("bad codelengths", "Corrupt PNG"); + if (c < 16) + lencodes[n++] = (uint8_t)c; + else + { + uint8_t fill = 0; + if (c == 16) + { + c = is.ReadBits(2) + 3; + if (n == 0) return PngError("bad codelengths", "Corrupt PNG"); + fill = lencodes[n - 1]; + } + else if (c == 17) + c = is.ReadBits(3) + 3; + else if (c == 18) + c = is.ReadBits(7) + 11; + else + return PngError("bad codelengths", "Corrupt PNG"); + if (ntot - n < c) + return PngError("bad codelengths", "Corrupt PNG"); + memset(lencodes + n, fill, c); + n += c; + } + } + if (n != ntot) + return PngError("bad codelengths", "Corrupt PNG"); + if (!zLength.Build(lencodes, hlit)) + return 0; + if 
(!zDistance.Build(lencodes + hlit, hdist)) + return 0; + return 1; + } + + static int ParseUncompressedBlock(InputMemoryStream& is, OutputMemoryStream& os) + { + is.ClearBits(); + uint16_t len, nlen; + if (!is.Read16u(len) || !is.Read16u(nlen) || nlen != (len ^ 0xffff)) + return PngError("zlib corrupt", "Corrupt PNG"); + if (!os.Write(is, len)) + return PngError("read past buffer", "Corrupt PNG"); + return 1; + } + + static int ParseHeader(InputMemoryStream& is) + { + uint8_t cmf, flg; + if (!(is.Read8u(cmf) && is.Read8u(flg))) + return PngError("bad zlib header", "Corrupt PNG"); + if ((int(cmf) * 256 + flg) % 31 != 0) + return PngError("bad zlib header", "Corrupt PNG"); + if (flg & 32) + return PngError("no preset dict", "Corrupt PNG"); + if ((cmf & 15) != 8) + return PngError("bad compression", "Corrupt PNG"); + return 1; + } + + bool Decode(InputMemoryStream& is, OutputMemoryStream& os, bool parseHeader) + { + static const uint8_t ZdefaultLength[288] = { + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8 + }; + static const uint8_t ZdefaultDistance[32] = { + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 + }; + + Zhuffman zLength, zDistance; + int final, type; + if (parseHeader) + { + if (!ParseHeader(is)) + return false; + } + do + { + final = is.ReadBits(1); + type = is.ReadBits(2); + if (type == 0) + { + if (!ParseUncompressedBlock(is, os)) + return false; + } + else if (type == 3) + return 
false; + else + { + if (type == 1) + { + if (!zLength.Build(ZdefaultLength, 288)) + return false; + if (!zDistance.Build(ZdefaultDistance, 32)) + return false; + } + else + { + if (!ComputeHuffmanCodes(is, zLength, zDistance)) + return false; + } + if (!ParseHuffmanBlock(is, zLength, zDistance, os)) + return false; + } + } while (!final); + return true; + } + } + + typedef struct + { + PngContext* s; + uint8_t * out; + uint8_t depth; + } png__png; + + enum + { + PNG__F_none = 0, + PNG__F_sub = 1, + PNG__F_up = 2, + PNG__F_avg = 3, + PNG__F_paeth = 4, + // synthetic filters used for first scanline to avoid needing a dummy row of 0s + PNG__F_avg_first, + PNG__F_paeth_first + }; + + static uint8_t first_row_filter[5] = + { + PNG__F_none, + PNG__F_sub, + PNG__F_none, + PNG__F_avg_first, + PNG__F_paeth_first + }; + + static int png__paeth(int a, int b, int c) + { + int p = a + b - c; + int pa = abs(p - a); + int pb = abs(p - b); + int pc = abs(p - c); + if (pa <= pb && pa <= pc) return a; + if (pb <= pc) return b; + return c; + } + + static const uint8_t png__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; + + // create the png data from post-deflated data + static int png__create_png_image_raw(png__png* a, uint8_t* raw, uint32_t raw_len, int out_n, uint32_t x, uint32_t y, int depth, int color) + { + int bytes = (depth == 16 ? 
2 : 1); + PngContext* s = a->s; + uint32_t i, j, stride = x * out_n * bytes; + uint32_t img_len, img_width_bytes; + int k; + int img_n = s->img_n; // copy it into a local for later + + int output_bytes = out_n * bytes; + int filter_bytes = img_n * bytes; + int width = x; + + assert(out_n == s->img_n || out_n == s->img_n + 1); + a->out = (uint8_t*)png__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into + if (!a->out) return PngError("outofmem", "Out of memory"); + + if (!png__mad3sizes_valid(img_n, x, depth, 7)) return PngError("too large", "Corrupt PNG"); + img_width_bytes = (((img_n * x * depth) + 7) >> 3); + img_len = (img_width_bytes + 1) * y; + + // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, + // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros), + // so just check for raw_len < img_len always. + if (raw_len < img_len) + return PngError("not enough pixels", "Corrupt PNG"); + + for (j = 0; j < y; ++j) + { + uint8_t* cur = a->out + stride * j; + uint8_t* prior; + int filter = *raw++; + + if (filter > 4) + return PngError("invalid filter", "Corrupt PNG"); + + if (depth < 8) + { + if (img_width_bytes > x) + return PngError("invalid width", "Corrupt PNG"); + cur += x * out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place + filter_bytes = 1; + width = img_width_bytes; + } + prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above + + // if first row, use special filter that doesn't sample previous row + if (j == 0) filter = first_row_filter[filter]; + + // handle first byte explicitly + for (k = 0; k < filter_bytes; ++k) + { + switch (filter) { + case PNG__F_none: cur[k] = raw[k]; break; + case PNG__F_sub: cur[k] = raw[k]; break; + case PNG__F_up: cur[k] = PNG__BYTECAST(raw[k] + prior[k]); break; + case PNG__F_avg: cur[k] = PNG__BYTECAST(raw[k] + (prior[k] >> 1)); break; + case PNG__F_paeth: 
cur[k] = PNG__BYTECAST(raw[k] + png__paeth(0, prior[k], 0)); break; + case PNG__F_avg_first: cur[k] = raw[k]; break; + case PNG__F_paeth_first: cur[k] = raw[k]; break; + } + } + + if (depth == 8) + { + if (img_n != out_n) + cur[img_n] = 255; // first pixel + raw += img_n; + cur += out_n; + prior += out_n; + } + else if (depth == 16) + { + if (img_n != out_n) + { + cur[filter_bytes] = 255; // first pixel top byte + cur[filter_bytes + 1] = 255; // first pixel bottom byte + } + raw += filter_bytes; + cur += output_bytes; + prior += output_bytes; + } + else + { + raw += 1; + cur += 1; + prior += 1; + } + + // this is a little gross, so that we don't switch per-pixel or per-component + if (depth < 8 || img_n == out_n) + { + int nk = (width - 1) * filter_bytes; +#define PNG__CASE(f) \ + case f: \ + for (k=0; k < nk; ++k) + switch (filter) { + // "none" filter turns into a memcpy here; make that explicit. + case PNG__F_none: memcpy(cur, raw, nk); break; + PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - filter_bytes]); } break; + PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break; + PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1)); } break; + PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes])); } break; + PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); } break; + PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], 0, 0)); } break; + } +#undef PNG__CASE + raw += nk; + } + else + { + assert(img_n + 1 == out_n); +#define PNG__CASE(f) \ + case f: \ + for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \ + for (k=0; k < filter_bytes; ++k) + switch (filter) { + PNG__CASE(PNG__F_none) { cur[k] = raw[k]; } break; + PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - 
output_bytes]); } break; + PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break; + PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - output_bytes]) >> 1)); } break; + PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes])); } break; + PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - output_bytes] >> 1)); } break; + PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], 0, 0)); } break; + } +#undef PNG__CASE + + // the loop above sets the high byte of the pixels' alpha, but for + // 16 bit png files we also need the low byte set. we'll do that here. + if (depth == 16) + { + cur = a->out + stride * j; // start at the beginning of the row again + for (i = 0; i < x; ++i, cur += output_bytes) + cur[filter_bytes + 1] = 255; + } + } + } + + // we make a separate pass to expand bits to pixels; for performance, + // this could run two scanlines behind the above code, so it won't + // interfere with filtering but will still be in the cache. + if (depth < 8) + { + for (j = 0; j < y; ++j) + { + uint8_t* cur = a->out + stride * j; + uint8_t* in = a->out + stride * j + x * out_n - img_width_bytes; + // unpack 1/2/4-bit into an 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit + // png guarantees byte alignment; if width is not a multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop + uint8_t scale = (color == 0) ? png__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range + + // note that the final byte might overshoot and write more data than desired. + // we can allocate enough data that this never writes out of memory, but it + // could also overwrite the next scanline. can it overwrite non-empty data + // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel. 
+ // so we need to explicitly clamp the final ones + + if (depth == 4) + { + for (k = x * img_n; k >= 2; k -= 2, ++in) + { + *cur++ = scale * ((*in >> 4)); + *cur++ = scale * ((*in) & 0x0f); + } + if (k > 0) + *cur++ = scale * ((*in >> 4)); + } + else if (depth == 2) + { + for (k = x * img_n; k >= 4; k -= 4, ++in) + { + *cur++ = scale * ((*in >> 6)); + *cur++ = scale * ((*in >> 4) & 0x03); + *cur++ = scale * ((*in >> 2) & 0x03); + *cur++ = scale * ((*in) & 0x03); + } + if (k > 0) + *cur++ = scale * ((*in >> 6)); + if (k > 1) + *cur++ = scale * ((*in >> 4) & 0x03); + if (k > 2) + *cur++ = scale * ((*in >> 2) & 0x03); + } + else if (depth == 1) + { + for (k = x * img_n; k >= 8; k -= 8, ++in) + { + *cur++ = scale * ((*in >> 7)); + *cur++ = scale * ((*in >> 6) & 0x01); + *cur++ = scale * ((*in >> 5) & 0x01); + *cur++ = scale * ((*in >> 4) & 0x01); + *cur++ = scale * ((*in >> 3) & 0x01); + *cur++ = scale * ((*in >> 2) & 0x01); + *cur++ = scale * ((*in >> 1) & 0x01); + *cur++ = scale * ((*in) & 0x01); + } + if (k > 0) *cur++ = scale * ((*in >> 7)); + if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01); + if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01); + if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01); + if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01); + if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01); + if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01); + } + if (img_n != out_n) + { + int q; + // insert alpha = 255 + cur = a->out + stride * j; + if (img_n == 1) + { + for (q = x - 1; q >= 0; --q) + { + cur[q * 2 + 1] = 255; + cur[q * 2 + 0] = cur[q]; + } + } + else + { + assert(img_n == 3); + for (q = x - 1; q >= 0; --q) + { + cur[q * 4 + 3] = 255; + cur[q * 4 + 2] = cur[q * 3 + 2]; + cur[q * 4 + 1] = cur[q * 3 + 1]; + cur[q * 4 + 0] = cur[q * 3 + 0]; + } + } + } + } + } + else if (depth == 16) + { + // force the image data from big-endian to platform-native. 
+ // this is done in a separate pass due to the decoding relying + // on the data being untouched, but could probably be done + // per-line during decode if care is taken. + uint8_t* cur = a->out; + uint16_t* cur16 = (uint16_t*)cur; + + for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) + *cur16 = (cur[0] << 8) | cur[1]; + } + + return 1; + } + + static int png__create_png_image(png__png* a, uint8_t* image_data, uint32_t image_data_len, int out_n, int depth, int color, int interlaced) + { + int bytes = (depth == 16 ? 2 : 1); + int out_bytes = out_n * bytes; + uint8_t* final; + int p; + if (!interlaced) + return png__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); + + // de-interlacing + final = (uint8_t*)png__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + for (p = 0; p < 7; ++p) + { + int xorig[] = { 0,4,0,2,0,1,0 }; + int yorig[] = { 0,0,4,0,2,0,1 }; + int xspc[] = { 8,8,4,4,2,2,1 }; + int yspc[] = { 8,8,8,4,4,2,2 }; + int i, j, x, y; + // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 + x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p]; + y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p]; + if (x && y) + { + uint32_t img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; + if (!png__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) + { + PNG_FREE(final); + return 0; + } + for (j = 0; j < y; ++j) + { + for (i = 0; i < x; ++i) + { + int out_y = j * yspc[p] + yorig[p]; + int out_x = i * xspc[p] + xorig[p]; + memcpy(final + out_y * a->s->img_x * out_bytes + out_x * out_bytes, + a->out + (j * x + i) * out_bytes, out_bytes); + } + } + PNG_FREE(a->out); + image_data += img_len; + image_data_len -= img_len; + } + } + a->out = final; + + return 1; + } + + static int png__compute_transparency(png__png* z, uint8_t tc[3], int out_n) + { + PngContext* s = z->s; + uint32_t i, pixel_count = s->img_x * s->img_y; + uint8_t* p = z->out; + + // compute color-based transparency, 
assuming we've + // already got 255 as the alpha value in the output + assert(out_n == 2 || out_n == 4); + + if (out_n == 2) + { + for (i = 0; i < pixel_count; ++i) + { + p[1] = (p[0] == tc[0] ? 0 : 255); + p += 2; + } + } + else + { + for (i = 0; i < pixel_count; ++i) + { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; + } + + static int png__compute_transparency16(png__png* z, uint16_t tc[3], int out_n) + { + PngContext* s = z->s; + uint32_t i, pixel_count = s->img_x * s->img_y; + uint16_t* p = (uint16_t*)z->out; + + // compute color-based transparency, assuming we've + // already got 65535 as the alpha value in the output + assert(out_n == 2 || out_n == 4); + + if (out_n == 2) + { + for (i = 0; i < pixel_count; ++i) + { + p[1] = (p[0] == tc[0] ? 0 : 65535); + p += 2; + } + } + else + { + for (i = 0; i < pixel_count; ++i) + { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; + } + + static int png__expand_png_palette(png__png* a, uint8_t* palette, int len, int pal_img_n) + { + uint32_t i, pixel_count = a->s->img_x * a->s->img_y; + uint8_t* p, * temp_out, * orig = a->out; + + p = (uint8_t*)png__malloc_mad2(pixel_count, pal_img_n, 0); + if (p == NULL) + return PngError("outofmem", "Out of memory"); + + // between here and free(out) below, exitting would leak + temp_out = p; + + if (pal_img_n == 3) + { + for (i = 0; i < pixel_count; ++i) + { + int n = orig[i] * 4; + p[0] = palette[n]; + p[1] = palette[n + 1]; + p[2] = palette[n + 2]; + p += 3; + } + } + else + { + for (i = 0; i < pixel_count; ++i) + { + int n = orig[i] * 4; + p[0] = palette[n]; + p[1] = palette[n + 1]; + p[2] = palette[n + 2]; + p[3] = palette[n + 3]; + p += 4; + } + } + PNG_FREE(a->out); + a->out = temp_out; + + return 1; + } + + //--------------------------------------------------------------------- + + ImagePngLoader::ImagePngLoader(const ImageLoaderParam& param) + : ImageLoader(param) + , 
_toAny8(NULL) + , _toBgra8(NULL) + , _toAny16(NULL) + , _toBgra16(NULL) + { + if (_param.format == SimdPixelFormatNone) + _param.format = SimdPixelFormatRgba32; + } + + void ImagePngLoader::SetConverters() + { + _bgrToBgra = Base::BgrToBgra; + } + + SIMD_INLINE constexpr uint32_t ChunkType(char a, char b, char c, char d) + { + return ((uint32_t(a) << 24) + (uint32_t(b) << 16) + (uint32_t(c) << 8) + uint32_t(d)); + } + + bool ImagePngLoader::FromStream() + { + const int req_comp = 4; + PngContext context; + png__png p; + p.s = &context; + png__png* z = &p; + + PngContext* s = z->s; + + z->out = NULL; + + if (!ParseFile()) + return false; + + s->img_x = _width; + s->img_y = _height; + z->depth = _depth; + s->img_n = _channels; + + InputMemoryStream zSrc = MergedDataStream(); + OutputMemoryStream zDst(AlignHi(size_t(_width) * _depth, 8) * _height * _channels + _height); + if(!Zlib::Decode(zSrc, zDst, !_iPhone)) + return false; + + if ((req_comp == s->img_n + 1 && req_comp != 3 && !_paletteChannels) || _hasTrans) + s->img_out_n = s->img_n + 1; + else + s->img_out_n = s->img_n; + if (!png__create_png_image(z, zDst.Data(), zDst.Size(), s->img_out_n, z->depth, _color, _interlace)) + return 0; + if (_hasTrans) + { + if (z->depth == 16) + { + if (!png__compute_transparency16(z, _tc16, s->img_out_n)) + return false; + } + else + { + if (!png__compute_transparency(z, _tc, s->img_out_n)) + return false; + } + } + if (_paletteChannels) + { + s->img_n = _paletteChannels; // record the actual colors we had + s->img_out_n = _paletteChannels; + if (req_comp >= 3) + s->img_out_n = req_comp; + if (!png__expand_png_palette(z, _palette.data, (int)_palette.size, s->img_out_n)) + return false; + } + else if (_hasTrans) + ++s->img_n; + + if (!(p.depth <= 8 || p.depth == 16)) + return false; + uint8_t* data = p.out; + p.out = NULL; + if (req_comp && req_comp != p.s->img_out_n) + { + if (p.depth <= 8) + data = png__convert_format((uint8_t*)data, p.s->img_out_n, req_comp, _width, _height); + 
else + data = (uint8_t*)png__convert_format16((uint16_t*)data, p.s->img_out_n, req_comp, _width, _height); + p.s->img_out_n = req_comp; + if (data == NULL) + return false; + } + if (p.depth == 16) + { + size_t size = context.img_x * context.img_y * req_comp; + const uint16_t* src = (uint16_t*)data; + uint8_t* dst = (uint8_t*)PNG_MALLOC(size); + for (size_t i = 0; i < size; ++i) + dst[i] = uint8_t(src[i] >> 8); + PNG_FREE(data); + data = dst; + } + PNG_FREE(p.out); + if (data) + { + size_t stride = 4 * context.img_x; + _image.Recreate(context.img_x, context.img_y, (Image::Format)_param.format); + switch (_param.format) + { + case SimdPixelFormatGray8: + Base::RgbaToGray(data, context.img_x, context.img_y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgr24: + Base::BgraToRgb(data, context.img_x, context.img_y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgra32: + Base::BgraToRgba(data, context.img_x, context.img_y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatRgb24: + Base::BgraToBgr(data, context.img_x, context.img_y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatRgba32: + Base::Copy(data, stride, context.img_x, context.img_y, 4, _image.data, _image.stride); + break; + default: + break; + } + PNG_FREE(data); + return true; + } + return false; + } + + bool ImagePngLoader::ParseFile() + { + _first = true, _iPhone = false, _hasTrans = false; + if (!CheckHeader()) + return false; + for (bool run = true; run;) + { + Chunk chunk; + if (!ReadChunk(chunk)) + return 0; + if (chunk.type == ChunkType('C', 'g', 'B', 'I')) + { + _iPhone = true; + _stream.Skip(chunk.size); + } + else if (chunk.type == ChunkType('I', 'H', 'D', 'R')) + { + if (!ReadHeader(chunk)) + return false; + SetConverters(); + } + else if (chunk.type == ChunkType('P', 'L', 'T', 'E')) + { + if (!ReadPalette(chunk)) + return false; + } + else if (chunk.type == ChunkType('t', 'R', 'N', 'S')) + { + if 
(!ReadTransparency(chunk)) + return false; + } + else if (chunk.type == ChunkType('I', 'D', 'A', 'T')) + { + if (!ReadData(chunk)) + return false; + } + else if (chunk.type == ChunkType('I', 'E', 'N', 'D')) + { + if (_first) + return false; + run = false; + } + else + { + if (_first || (chunk.type & (1 << 29)) == 0) + return false; + _stream.Skip(chunk.size); + } + uint32_t crc32; + if (!_stream.ReadBe32u(crc32)) + return false; + } + return _idats.size() != 0; + } + + bool ImagePngLoader::CheckHeader() + { + const size_t size = 8; + const uint8_t control[size] = { 137, 80, 78, 71, 13, 10, 26, 10 }; + uint8_t buffer[size]; + return _stream.Read(size, buffer) == size && memcmp(buffer, control, size) == 0; + } + + SIMD_INLINE bool ImagePngLoader::ReadChunk(Chunk& chunk) + { + if (_stream.ReadBe32u(chunk.size) && _stream.ReadBe32u(chunk.type)) + { + chunk.offs = (uint32_t)_stream.Pos(); + return true; + } + return false; + } + + bool ImagePngLoader::ReadHeader(const Chunk& chunk) + { + const int MAX_SIZE = 1 << 24; + if (!_first) + return false; + _first = false; + if (!(chunk.size == 13 && _stream.CanRead(13))) + return false; + uint8_t comp, filter; + if (!(_stream.ReadBe32u(_width) && _stream.ReadBe32u(_height) && + _stream.Read8u(_depth) && _stream.Read8u(_color) && _stream.Read8u(comp) && + _stream.Read8u(filter) && _stream.Read8u(_interlace))) + return false; + if (_width == 0 || _width > MAX_SIZE || _height == 0 || _height > MAX_SIZE) + return false; + if (_depth != 1 && _depth != 2 && _depth != 4 && _depth != 8 && _depth != 16) + return false; + if (_color > 6 || (_color == 3 && _depth == 16)) + return false; + _paletteChannels = 0; + if (_color == 3) + _paletteChannels = 3; + else if (_color & 1) + return false; + if (comp != 0 || filter != 0 || _interlace > 1) + return false; + if (!_paletteChannels) + { + _channels = (_color & 2 ? 3 : 1) + (_color & 4 ? 
1 : 0); + if ((1 << 30) / _width / _channels < _height) + return false; + } + else + { + _channels = 1; + if ((1 << 30) / _width / 4 < _height) + return false; + } + return true; + } + + bool ImagePngLoader::ReadPalette(const Chunk& chunk) + { + if (_first || chunk.size > 256 * 3) + return false; + size_t length = chunk.size / 3; + if (length * 3 != chunk.size) + return false; + if (_stream.CanRead(chunk.size)) + { + _palette.Resize(length * 4); + _bgrToBgra(_stream.Current(), length, 1, length, _palette.data, _palette.size, 0xFF); + _stream.Skip(chunk.size); + return true; + } + else + return false; + } + + bool ImagePngLoader::ReadTransparency(const Chunk& chunk) + { + if (_first) + return false; + if (_idats.size()) + return false; + if (_paletteChannels) + { + if (_palette.size == 0 || chunk.size > _palette.size || !_stream.CanRead(chunk.size)) + return false; + _paletteChannels = 4; + for (size_t i = 0; i < chunk.size; ++i) + _palette.data[i * 4 + 3] = _stream.Current()[i]; + _stream.Skip(chunk.size); + } + else + { + if (!(_channels & 1) || chunk.size != _channels * 2) + return false; + _hasTrans = true; + for (size_t k = 0; k < _channels; ++k) + if (!_stream.ReadBe16u(_tc16[k])) + return false; + if (_depth != 16) + { + for (size_t k = 0; k < _channels; ++k) + _tc[k] = uint8_t(_tc16[k]) * png__depth_scale_table[_depth]; + } + } + return true; + } + + bool ImagePngLoader::ReadData(const Chunk& chunk) + { + if (_first) + return false; + if (_paletteChannels && !_palette.size) + return false; + if (!_stream.CanRead(chunk.size)) + return false; + _idats.push_back(chunk); + _stream.Skip(chunk.size); + return true; + } + + InputMemoryStream ImagePngLoader::MergedDataStream() + { + if (_idats.size() == 1) + return InputMemoryStream((uint8_t*)_stream.Data() + _idats[0].offs, _idats[0].size); + else + { + size_t size = 0; + for (size_t i = 0; i < _idats.size(); ++i) + size += _idats[i].size; + _idat.Resize(size); + for (size_t i = 0, offset = 0; i < _idats.size(); 
++i) + { + memcpy(_idat.data + offset, _stream.Data() + _idats[i].offs, _idats[i].size); + offset += _idats[i].size; + } + return InputMemoryStream(_idat.data, _idat.size); + } + } + } +} diff --git a/3rdparty/simdlib/Simd/SimdBaseImageSave.cpp b/3rdparty/simdlib/Simd/SimdBaseImageSave.cpp new file mode 100644 index 0000000000..fb5a8eacef --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdBaseImageSave.cpp @@ -0,0 +1,340 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdCpu.h"
+#include "Simd/SimdBase.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include <memory>
+#include <sstream>
+
+#if defined(_MSC_VER)
+#pragma warning (push)
+#pragma warning (disable: 4996)
+#endif
+
+namespace Simd
+{
+    // Encodes an image with 'saver' and writes the encoded bytes to 'path'.
+    //
+    // saver   - encoder callback; returns a buffer that is released here with Simd::Free,
+    //           or NULL on failure, and stores the encoded length through its last argument.
+    // src, stride, width, height, format - description of the source image, forwarded to 'saver'.
+    // file, quality - output file type and quality, forwarded to 'saver'.
+    // path    - destination file name; the file is opened in "wb" mode.
+    //
+    // Returns SimdTrue only if every encoded byte was written and the file was closed without error.
+    SimdBool ImageSaveToFile(const ImageSaveToMemoryPtr saver, const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char* path)
+    {
+        SimdBool result = SimdFalse;
+        size_t size = 0;
+        uint8_t* data = saver(src, stride, width, height, format, file, quality, &size);
+        if (data)
+        {
+            // Named 'output' so that it does not shadow the 'file' parameter.
+            ::FILE* output = ::fopen(path, "wb");
+            if (output)
+            {
+                const bool written = ::fwrite(data, 1, size, output) == size;
+                // stdio is buffered: a write error may only be reported when the stream is
+                // flushed by fclose, so its result is part of the success condition.
+                const bool closed = ::fclose(output) == 0;
+                if (written && closed)
+                    result = SimdTrue;
+            }
+            Simd::Free(data);
+        }
+        return result;
+    }
+
+    //-------------------------------------------------------------------------
+
+    namespace Base
+    {
+        // Common set-up for the PGM/PPM savers.
+        // _size is the number of bytes in one output row (1 byte per pixel for PGM, 3 for PPM).
+        // When the source format differs from the file's native one (Gray8 for PGM, Rgb24 for PPM)
+        // rows are converted through _buffer in blocks of _block rows; the block height is derived
+        // from Base::AlgCacheL1() and limited to the range [1, height].
+        ImagePxmSaver::ImagePxmSaver(const ImageSaverParam& param)
+            : ImageSaver(param)
+            , _convert(NULL)
+        {
+            _block = _param.height;
+            if (_param.file == SimdImageFilePgmTxt || _param.file == SimdImageFilePgmBin)
+            {
+                _size = _param.width * 1;
+                if (_param.format != SimdPixelFormatGray8)
+                {
+                    _block = Simd::RestrictRange<size_t>(Base::AlgCacheL1() / _size, 1, _param.height);
+                    _buffer.Resize(_block * _size);
+                }
+            }
+            else if (_param.file == SimdImageFilePpmTxt || _param.file == SimdImageFilePpmBin)
+            {
+                _size = _param.width * 3;
+                if (_param.format != SimdPixelFormatRgb24)
+                {
+                    _block = Simd::RestrictRange<size_t>(Base::AlgCacheL1() / _size, 1, _param.height);
+                    _buffer.Resize(_block * _size);
+                }
+            }
+            else
+            {
+                assert(0);
+                // Keep the member defined when the assertion is compiled out.
+                _size = 0;
+            }
+        }
+
+        // Writes the Netpbm header "P<version>\n<width> <height>\n255\n" to the output stream.
+        void ImagePxmSaver::WriteHeader(size_t version)
+        {
+            std::stringstream header;
+            header << "P" << version << "\n" << _param.width << " " << _param.height << "\n255\n";
+            // str() returns a copy, so take it once and use it for both pointer and length.
+            const std::string text = header.str();
+            _stream.Write(text.c_str(), text.size());
+        }
+
+        // Lookup table used by the text savers: entry i holds the decimal form of i as
+        // four characters, right-aligned in the first three with leading blanks, followed by a blank.
+        uint8_t g_pxmPrint[256][4];
+
+        // Fills g_pxmPrint; always returns true so it can initialize g_pxmPrintInited below.
+        bool PxmPrintInit()
+        {
+            for (int i = 0; i < 256; ++i)
+            {
+                int d0 = i / 100;
+                int d1 = (i / 10) % 10;
+                int d2 = i % 10;
+                g_pxmPrint[i][0] = d0 ? '0' + d0 : ' ';
+                g_pxmPrint[i][1] = (d1 || d0) ? '0' + d1 : ' ';
+                g_pxmPrint[i][2] = '0' + d2;
+                g_pxmPrint[i][3] = ' ';
+            }
+            return true;
+        }
+        // Runs PxmPrintInit() during static initialization of this translation unit.
+        bool g_pxmPrintInited = PxmPrintInit();
+
+        //---------------------------------------------------------------------
+
+        // Text PGM (P2) saver: selects the conversion to 8-bit gray for non-gray input.
+        ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param)
+            : ImagePxmSaver(param)
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24: _convert = Base::BgrToGray; break;
+            case SimdPixelFormatBgra32: _convert = Base::BgraToGray; break;
+            case SimdPixelFormatRgb24: _convert = Base::RgbToGray; break;
+            case SimdPixelFormatRgba32: _convert = Base::RgbaToGray; break;
+            default: break;
+            }
+        }
+
+        // Writes the image as text PGM. Each value takes 4 characters and a text line is
+        // flushed after 17 values (68 characters plus '\n') or at the end of an image row.
+        bool ImagePgmTxtSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            size_t grayStride = _param.format == SimdPixelFormatGray8 ? stride : _size;
+            _stream.Reserve(32 + _param.height * (_param.width * 4 + DivHi(_param.width, 17)));
+            WriteHeader(2);
+            for (size_t row = 0; row < _param.height;)
+            {
+                size_t block = Simd::Min(row + _block, _param.height) - row;
+                const uint8_t* gray = src;
+                if (_param.format != SimdPixelFormatGray8)
+                {
+                    _convert(src, _param.width, block, stride, _buffer.data, grayStride);
+                    gray = _buffer.data;
+                }
+                for (size_t b = 0; b < block; ++b)
+                {
+                    uint8_t string[70];
+                    for (size_t col = 0, offset = 0; col < _param.width; ++col)
+                    {
+                        // memcpy instead of a uint32_t-punned store: neither buffer is
+                        // guaranteed to be 4-byte aligned.
+                        ::memcpy(string + offset, g_pxmPrint[gray[col]], 4);
+                        offset += 4;
+                        if (offset >= 68 || col == _param.width - 1)
+                        {
+                            string[offset++] = '\n';
+                            _stream.Write(string, offset);
+                            offset = 0;
+                        }
+                    }
+                    gray += grayStride;
+                }
+                src += stride * block;
+                row += block;
+            }
+            return true;
+        }
+
+        //---------------------------------------------------------------------
+
+        // Binary PGM (P5) saver: selects the conversion to 8-bit gray for non-gray input.
+        ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param)
+            : ImagePxmSaver(param)
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24: _convert = Base::BgrToGray; break;
+            case SimdPixelFormatBgra32: _convert = Base::BgraToGray; break;
+            case SimdPixelFormatRgb24: _convert = Base::RgbToGray; break;
+            case SimdPixelFormatRgba32: _convert = Base::RgbaToGray; break;
+            default: break;
+            }
+        }
+
+        // Writes the image as binary PGM: the header followed by _size raw bytes per row.
+        bool ImagePgmBinSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            size_t grayStride = _param.format == SimdPixelFormatGray8 ? stride : _size;
+            _stream.Reserve(32 + _param.height * _size);
+            WriteHeader(5);
+            for (size_t row = 0; row < _param.height;)
+            {
+                size_t block = Simd::Min(row + _block, _param.height) - row;
+                const uint8_t* gray = src;
+                if (_param.format != SimdPixelFormatGray8)
+                {
+                    _convert(src, _param.width, block, stride, _buffer.data, grayStride);
+                    gray = _buffer.data;
+                }
+                for (size_t b = 0; b < block; ++b)
+                {
+                    _stream.Write(gray, _size);
+                    gray += grayStride;
+                }
+                src += stride * block;
+                row += block;
+            }
+            return true;
+        }
+
+        //---------------------------------------------------------------------
+
+        // Text PPM (P3) saver: selects the conversion to 24-bit RGB for non-RGB input.
+        // Rgba32 uses BgraToBgr, which drops the 4th channel and keeps the channel order.
+        ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param)
+            : ImagePxmSaver(param)
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatGray8: _convert = Base::GrayToBgr; break;
+            case SimdPixelFormatBgr24: _convert = Base::BgrToRgb; break;
+            case SimdPixelFormatBgra32: _convert = Base::BgraToRgb; break;
+            case SimdPixelFormatRgba32: _convert = Base::BgraToBgr; break;
+            default: break;
+            }
+        }
+
+        // Writes the image as text PPM. Each pixel takes 12 characters (3 values of 4),
+        // pixels on one text line are separated by two blanks, and a line is flushed after
+        // 5 pixels (68 characters plus '\n') or at the end of an image row.
+        bool ImagePpmTxtSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            size_t rgbStride = _param.format == SimdPixelFormatRgb24 ? stride : _size;
+            _stream.Reserve(32 + _param.height * (_param.width * 13 + DivHi(_param.width, 5)));
+            WriteHeader(3);
+            for (size_t row = 0; row < _param.height;)
+            {
+                size_t block = Simd::Min(row + _block, _param.height) - row;
+                const uint8_t* rgb = src;
+                if (_param.format != SimdPixelFormatRgb24)
+                {
+                    _convert(src, _param.width, block, stride, _buffer.data, rgbStride);
+                    rgb = _buffer.data;
+                }
+                for (size_t b = 0; b < block; ++b)
+                {
+                    uint8_t string[70];
+                    for (size_t col = 0, offset = 0; col < _size; col += 3)
+                    {
+                        // 'offset' advances by 14 per pixel, so it is not always a multiple
+                        // of 4: copy bytes instead of storing through a uint32_t pointer.
+                        ::memcpy(string + offset + 0, g_pxmPrint[rgb[col + 0]], 4);
+                        ::memcpy(string + offset + 4, g_pxmPrint[rgb[col + 1]], 4);
+                        ::memcpy(string + offset + 8, g_pxmPrint[rgb[col + 2]], 4);
+                        offset += 12;
+                        if (offset >= 68 || col == _size - 3)
+                        {
+                            string[offset++] = '\n';
+                            _stream.Write(string, offset);
+                            offset = 0;
+                        }
+                        else
+                        {
+                            string[offset++] = ' ';
+                            string[offset++] = ' ';
+                        }
+                    }
+                    rgb += rgbStride;
+                }
+                src += stride * block;
+                row += block;
+            }
+            return true;
+        }
+
+        //---------------------------------------------------------------------
+
+        // Binary PPM (P6) saver: selects the conversion to 24-bit RGB for non-RGB input.
+        ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param)
+            : ImagePxmSaver(param)
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatGray8: _convert = Base::GrayToBgr; break;
+            case SimdPixelFormatBgr24: _convert = Base::BgrToRgb; break;
+            case SimdPixelFormatBgra32: _convert = Base::BgraToRgb; break;
+            case SimdPixelFormatRgba32: _convert = Base::BgraToBgr; break;
+            default: break;
+            }
+        }
+
+        // Writes the image as binary PPM: the header followed by _size raw bytes per row.
+        bool ImagePpmBinSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            size_t rgbStride = _param.format == SimdPixelFormatRgb24 ? stride : _size;
+            _stream.Reserve(32 + _param.height * _size);
+            WriteHeader(6);
+            for (size_t row = 0; row < _param.height;)
+            {
+                size_t block = Simd::Min(row + _block, _param.height) - row;
+                const uint8_t* rgb = src;
+                if (_param.format != SimdPixelFormatRgb24)
+                {
+                    _convert(src, _param.width, block, stride, _buffer.data, rgbStride);
+                    rgb = _buffer.data;
+                }
+                for (size_t b = 0; b < block; ++b)
+                {
+                    _stream.Write(rgb, _size);
+                    rgb += rgbStride;
+                }
+                src += stride * block;
+                row += block;
+            }
+            return true;
+        }
+
+        //---------------------------------------------------------------------
+
+        // Creates the saver that matches param.file; returns NULL for an unknown file type.
+        // The caller owns the returned object.
+        ImageSaver* CreateImageSaver(const ImageSaverParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinSaver(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinSaver(param);
+            case SimdImageFilePng: return new ImagePngSaver(param);
+            case SimdImageFileJpeg: return new ImageJpegSaver(param);
+            default:
+                return NULL;
+            }
+        }
+
+        // Encodes the image into a memory buffer.
+        // Returns the buffer handed out by the saver's Release() and stores its length in *size,
+        // or returns NULL if the parameters do not validate, no saver exists for 'file',
+        // or encoding fails.
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size)
+        {
+            ImageSaverParam param(width, height, format, file, quality);
+            if (param.Validate())
+            {
+                Holder<ImageSaver> saver(CreateImageSaver(param));
+                if (saver)
+                {
+                    if (saver->ToStream(src, stride))
+                        return saver->Release(size);
+                }
+            }
+            return NULL;
+        }
+    }
+}
+
+#if defined(_MSC_VER)
+#pragma warning (pop)
+#endif
diff --git a/3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp b/3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp
new file mode 100644
index 0000000000..f7ba583247
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp
@@ -0,0 +1,451 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdImageSaveJpeg.h" +#include "Simd/SimdBase.h" + +namespace Simd +{ + namespace Base + { + const uint8_t JpegZigZagD[64] = { + 0, 1, 5, 6, 14, 15, 27, 28, + 2, 4, 7, 13, 16, 26, 29, 42, + 3, 8, 12, 17, 25, 30, 41, 43, + 9, 11, 18, 24, 31, 40, 44, 53, + 10, 19, 23, 32, 39, 45, 52, 54, + 20, 22, 33, 38, 46, 51, 55, 60, + 21, 34, 37, 47, 50, 56, 59, 61, + 35, 36, 48, 49, 57, 58, 62, 63 }; + + const uint8_t JpegZigZagT[64] = { + 0, 2, 3, 9, 10, 20, 21, 35, + 1, 4, 8, 11, 19, 22, 34, 36, + 5, 7, 12, 18, 23, 33, 37, 48, + 6, 13, 17, 24, 32, 38, 47, 49, + 14, 16, 25, 31, 39, 46, 50, 57, + 15, 26, 30, 40, 45, 51, 56, 58, + 27, 29, 41, 44, 52, 55, 59, 62, + 28, 42, 43, 53, 54, 60, 61, 63 }; + + const uint16_t HuffmanYdc[256][2] = { {0, 2}, {2, 3}, {3, 3}, {4, 3}, {5, 3}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9} }; + const uint16_t HuffmanUVdc[256][2] = { {0, 2}, {1, 2}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}, {1022, 10}, {2046, 11} }; + const uint16_t HuffmanYac[256][2] = { + {10, 4}, {0, 2}, {1, 2}, {4, 3}, {11, 4}, {26, 5}, {120, 7}, {248, 8}, {1014, 10}, {65410, 16}, {65411, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {12, 4}, {27, 5}, {121, 7}, {502, 9}, {2038, 11}, {65412, 16}, {65413, 16}, {65414, 16}, {65415, 16}, {65416, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {28, 5}, {249, 8}, {1015, 10}, {4084, 12}, {65417, 16}, {65418, 16}, {65419, 16}, {65420, 16}, {65421, 16}, {65422, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {58, 6}, {503, 9}, {4085, 12}, {65423, 16}, {65424, 16}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {59, 6}, {1016, 10}, {65430, 16}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {122, 7}, {2039, 11}, {65438, 16}, 
{65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {123, 7}, {4086, 12}, {65446, 16}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {250, 8}, {4087, 12}, {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {504, 9}, {32704, 15}, {65462, 16}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {505, 9}, {65470, 16}, {65471, 16}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {506, 9}, {65479, 16}, {65480, 16}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {1017, 10}, {65488, 16}, {65489, 16}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {1018, 10}, {65497, 16}, {65498, 16}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {2040, 11}, {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {2041, 11}, {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0} + }; + const uint16_t HuffmanUVac[256][2] = { + {0, 2}, {1, 2}, {4, 3}, {10, 4}, {24, 
5}, {25, 5}, {56, 6}, {120, 7}, {500, 9}, {1014, 10}, {4084, 12}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {11, 4}, {57, 6}, {246, 8}, {501, 9}, {2038, 11}, {4085, 12}, {65416, 16}, {65417, 16}, {65418, 16}, {65419, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {26, 5}, {247, 8}, {1015, 10}, {4086, 12}, {32706, 15}, {65420, 16}, {65421, 16}, {65422, 16}, {65423, 16}, {65424, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {27, 5}, {248, 8}, {1016, 10}, {4087, 12}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {65430, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {58, 6}, {502, 9}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {65438, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {59, 6}, {1017, 10}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {65446, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {121, 7}, {2039, 11}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {65454, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {122, 7}, {2040, 11}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {65462, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {249, 8}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {65470, 16}, {65471, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {503, 9}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {65479, 16}, {65480, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {504, 9}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {65488, 16}, {65489, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {505, 9}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, {65497, 16}, {65498, 16}, {0, 0}, {0, 0}, {0, 
0}, {0, 0}, {0, 0}, {0, 0}, + {506, 9}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {65506, 16}, {65507, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {2041, 11}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {65515, 16}, {65516, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {16352, 14}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {65525, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, + {1018, 10}, {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0} + }; + +#if defined(SIMD_JPEG_CALC_BITS_TABLE) + uint16_t JpegCalcBitsTable[JpegCalcBitsRange * 2][2]; + bool JpegCalcBitsTableInit() + { + for (int i = 0, n = JpegCalcBitsRange * 2; i < n; ++i) + { + int val = i - JpegCalcBitsRange; + int tmp = val < 0 ? -val : val; + val = val < 0 ? 
val - 1 : val; + int cnt = 1; + while (tmp >>= 1) + ++cnt; + JpegCalcBitsTable[i][0] = val & ((1 << cnt) - 1); + JpegCalcBitsTable[i][1] = cnt; + } + return true; + } + bool JpegCalcBitsTableInited = JpegCalcBitsTableInit(); +#endif + + SIMD_INLINE void JpegDct(float* d0p, float* d1p, float* d2p, float* d3p, float* d4p, float* d5p, float* d6p, float* d7p) + { + float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p; + float z1, z2, z3, z4, z5, z11, z13; + float tmp0 = d0 + d7; + float tmp7 = d0 - d7; + float tmp1 = d1 + d6; + float tmp6 = d1 - d6; + float tmp2 = d2 + d5; + float tmp5 = d2 - d5; + float tmp3 = d3 + d4; + float tmp4 = d3 - d4; + + float tmp10 = tmp0 + tmp3; + float tmp13 = tmp0 - tmp3; + float tmp11 = tmp1 + tmp2; + float tmp12 = tmp1 - tmp2; + + d0 = tmp10 + tmp11; + d4 = tmp10 - tmp11; + + z1 = (tmp12 + tmp13) * 0.707106781f; + d2 = tmp13 + z1; + d6 = tmp13 - z1; + + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + z5 = (tmp10 - tmp12) * 0.382683433f; + z2 = tmp10 * 0.541196100f + z5; + z4 = tmp12 * 1.306562965f + z5; + z3 = tmp11 * 0.707106781f; + + z11 = tmp7 + z3; + z13 = tmp7 - z3; + + *d5p = z13 + z2; + *d3p = z13 - z2; + *d1p = z11 + z4; + *d7p = z11 - z4; + + *d0p = d0; *d2p = d2; *d4p = d4; *d6p = d6; + } + + static int JpegProcessDu(Base::BitBuf& bitBuf, float* CDU, int stride, const float* fdtbl, int DC, const uint16_t HTDC[256][2], const uint16_t HTAC[256][2]) + { + int offs, i, j, n, diff, end0pos, x, y; + for (offs = 0; offs < 8; ++offs) + JpegDct(&CDU[offs], &CDU[offs + stride], &CDU[offs + stride * 2], &CDU[offs + stride * 3], &CDU[offs + stride * 4], + &CDU[offs + stride * 5], &CDU[offs + stride * 6], &CDU[offs + stride * 7]); + for (offs = 0, n = stride * 8; offs < n; offs += stride) + JpegDct(&CDU[offs], &CDU[offs + 1], &CDU[offs + 2], &CDU[offs + 3], &CDU[offs + 4], &CDU[offs + 5], &CDU[offs + 6], &CDU[offs + 7]); + int DU[64]; + for (y = 0, j = 0; y < 8; ++y) + { + for 
(x = 0; x < 8; ++x, ++j) + { + i = y * stride + x; + float v = CDU[i] * fdtbl[j]; + DU[JpegZigZagD[j]] = Round(v); + } + } + diff = DU[0] - DC; + if (diff == 0) + bitBuf.Push(HTDC[0]); + else + { + uint16_t bits[2]; + JpegCalcBits(diff, bits); + bitBuf.Push(HTDC[bits[1]]); + bitBuf.Push(bits); + } + end0pos = 63; + for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos); + if (end0pos == 0) + { + bitBuf.Push(HTAC[0x00]); + return DU[0]; + } + for (i = 1; i <= end0pos; ++i) + { + int startpos = i; + int nrzeroes; + uint16_t bits[2]; + for (; DU[i] == 0 && i <= end0pos; ++i); + nrzeroes = i - startpos; + if (nrzeroes >= 16) + { + int lng = nrzeroes >> 4; + int nrmarker; + for (nrmarker = 1; nrmarker <= lng; ++nrmarker) + bitBuf.Push(HTAC[0xF0]); + nrzeroes &= 15; + } + JpegCalcBits(DU[i], bits); + bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]); + bitBuf.Push(bits); + } + if (end0pos != 63) + bitBuf.Push(HTAC[0x00]); + return DU[0]; + } + + void JpegWriteBlockSubs(OutputMemoryStream & stream, int width, int height, const uint8_t * red, + const uint8_t* green, const uint8_t* blue, int stride, const float * fY, const float* fUv, int dc[3]) + { + int & DCY = dc[0], & DCU = dc[1], & DCV = dc[2]; + float Y[256], U[256], V[256]; + float subU[64], subV[64]; + bool gray = red == green && red == blue; + Base::BitBuf bitBuf; + for (int y = 0; y < height; y += 16) + { + for (int x = 0; x < width; x += 16) + { + if (gray) + Base::GrayToY(red + x, stride, height - y, width - x, Y, 16); + else + Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 16); + DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + if (gray) + Base::JpegProcessDuGrayUv(bitBuf); + else 
+ { + for (int yy = 0, pos = 0; yy < 8; ++yy) + { + for (int xx = 0; xx < 8; ++xx, ++pos) + { + int j = yy * 32 + xx * 2; + subU[pos] = (U[j + 0] + U[j + 1] + U[j + 16] + U[j + 17]) * 0.25f; + subV[pos] = (V[j + 0] + V[j + 1] + V[j + 16] + V[j + 17]) * 0.25f; + } + } + DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac); + DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac); + } + if (bitBuf.Full()) + { + Base::WriteBits(stream, bitBuf.data, bitBuf.size); + bitBuf.Clear(); + } + } + } + Base::WriteBits(stream, bitBuf.data, bitBuf.size); + bitBuf.Clear(); + } + + void JpegWriteBlockFull(OutputMemoryStream& stream, int width, int height, const uint8_t* red, + const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3]) + { + int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2]; + float Y[64], U[64], V[64]; + bool gray = red == green && red == blue; + Base::BitBuf bitBuf; + for (int y = 0; y < height; y += 8) + { + for (int x = 0; x < width; x += 8) + { + if (gray) + Base::GrayToY(red + x, stride, height - y, width - x, Y, 8); + else + Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 8); + DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + if (gray) + Base::JpegProcessDuGrayUv(bitBuf); + else + { + DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac); + DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac); + } + if (bitBuf.Full()) + { + Base::WriteBits(stream, bitBuf.data, bitBuf.size); + bitBuf.Clear(); + } + } + } + Base::WriteBits(stream, bitBuf.data, bitBuf.size); + bitBuf.Clear(); + } + + //--------------------------------------------------------------------- + + ImageJpegSaver::ImageJpegSaver(const ImageSaverParam& param) + : ImageSaver(param) + , _deintBgra(NULL) + , _deintBgr(NULL) + { + } + + void ImageJpegSaver::Init() + { + 
InitParams(false); + switch (_param.format) + { + case SimdPixelFormatBgr24: + case SimdPixelFormatRgb24: + _deintBgr = Base::DeinterleaveBgr; + break; + case SimdPixelFormatBgra32: + case SimdPixelFormatRgba32: + _deintBgra = Base::DeinterleaveBgra; + break; + default: + break; + } + _writeBlock = _subSample ? JpegWriteBlockSubs : JpegWriteBlockFull; + } + + void ImageJpegSaver::InitParams(bool trans) + { + static const int YQT[] = { 16, 11, 10, 16, 24, 40, 51, 61, 12, 12, 14, 19, 26, 58, 60, 55, 14, 13, + 16, 24, 40, 57, 69, 56, 14, 17, 22, 29, 51, 87, 80, 62, 18, 22, 37, 56, 68, 109, 103, 77, 24, + 35, 55, 64, 81, 104, 113, 92, 49, 64, 78, 87, 103, 121, 120, 101, 72, 92, 95, 98, 112, 100, 103, 99 }; + static const int UVQT[] = { 17, 18, 24, 47, 99, 99, 99, 99, 18, 21, 26, 66, 99, 99, 99, 99, 24, + 26, 56, 99, 99, 99, 99, 99, 47, 66, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99 }; + static const float AASF[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, + 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f, 1.0f * 2.828427125f, + 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f }; + _quality = _param.quality; + _quality = _quality ? _quality : 90; + _subSample = _quality <= 90 ? 1 : 0; + _quality = _quality < 1 ? 1 : _quality > 100 ? 100 : _quality; + _quality = _quality < 50 ? 5000 / _quality : 200 - _quality * 2; + for (size_t i = 0; i < 64; ++i) + { + int uvti, yti = (YQT[i] * _quality + 50) / 100; + _uY[Base::JpegZigZagD[i]] = uint8_t(yti < 1 ? 1 : yti > 255 ? 255 : yti); + uvti = (UVQT[i] * _quality + 50) / 100; + _uUv[Base::JpegZigZagD[i]] = uint8_t(uvti < 1 ? 1 : uvti > 255 ? 255 : uvti); + } + const uint8_t *ZigZag = trans ? 
Base::JpegZigZagT : Base::JpegZigZagD; + for (size_t y = 0, i = 0; y < 8; ++y) + { + for (size_t x = 0; x < 8; ++x, ++i) + { + _fY[i] = 1.0f / (_uY[ZigZag[i]] * AASF[y] * AASF[x]); + _fUv[i] = 1.0f / (_uUv[ZigZag[i]] * AASF[y] * AASF[x]); + } + } + _block = _subSample ? 16 : 8; + _width = (int)AlignHi(_param.width, _block); + if (_param.format != SimdPixelFormatGray8) + _buffer.Resize(_width * _block * 3); + } + + void ImageJpegSaver::WriteHeader() + { + static const uint8_t DC_LUM_COD[] = { 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 }; + static const uint8_t DC_LUM_VAL[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; + static const uint8_t AC_LUM_COD[] = { 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d }; + static const uint8_t AC_LUM_VAL[] = { + 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08, + 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0, 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28, + 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, + 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, + 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2, + 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa + }; + static const uint8_t DC_CHR_COD[] = { 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 }; + static const uint8_t DC_CHR_VAL[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; + static const uint8_t AC_CHR_COD[] = { 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 
5, 4, 4, 0, 1, 2, 0x77 }; + static const uint8_t AC_CHR_VAL[] = { + 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71, 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, + 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0, 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34, 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26, + 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, + 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, + 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, + 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa + }; + static const uint8_t head0[] = { 0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F', 'I', 'F', 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0xFF, 0xDB, 0, 0x84, 0 }; + static const uint8_t head2[] = { 0xFF, 0xDA, 0, 0xC, 3, 1, 0, 2, 0x11, 3, 0x11, 0, 0x3F, 0 }; + const uint8_t head1[] = { 0xFF, 0xC0, 0, 0x11, 8, uint8_t(_param.height >> 8), uint8_t(_param.height), uint8_t(_param.width >> 8), + uint8_t(_param.width), 3, 1, uint8_t(_subSample ? 
0x22 : 0x11), 0, 2, 0x11, 1, 3, 0x11, 1, 0xFF, 0xC4, 0x01, 0xA2, 0 }; + _stream.Write(head0, sizeof(head0)); + _stream.Write(_uY, 64); + _stream.Write8u(1); + _stream.Write(_uUv, 64); + _stream.Write(head1, sizeof(head1)); + _stream.Write(DC_LUM_COD + 1, sizeof(DC_LUM_COD) - 1); + _stream.Write(DC_LUM_VAL, sizeof(DC_LUM_VAL)); + _stream.Write8u(0x10); // HTYACinfo + _stream.Write(AC_LUM_COD + 1, sizeof(AC_LUM_COD) - 1); + _stream.Write(AC_LUM_VAL, sizeof(AC_LUM_VAL)); + _stream.Write8u(1); // HTUDCinfo + _stream.Write(DC_CHR_COD + 1, sizeof(DC_CHR_COD) - 1); + _stream.Write(DC_CHR_VAL, sizeof(DC_CHR_VAL)); + _stream.Write8u(0x11); // HTUACinfo + _stream.Write(AC_CHR_COD + 1, sizeof(AC_CHR_COD) - 1); + _stream.Write(AC_CHR_VAL, sizeof(AC_CHR_VAL)); + _stream.Write(head2, sizeof(head2)); + } + + bool ImageJpegSaver::ToStream(const uint8_t* src, size_t stride) + { + Init(); + WriteHeader(); + uint8_t* r = _buffer.data, * g = r + _width * _block,* b = g + _width * _block; + int dc[3] = { 0, 0, 0 }; + for (int row = 0; row < (int)_param.height; row += _block) + { + int block = Simd::Min(row + _block, (int)_param.height) - row; + switch (_param.format) + { + case SimdPixelFormatBgr24: + _deintBgr(src, stride, _param.width, block, b, _width, g, _width, r, _width); + break; + case SimdPixelFormatBgra32: + _deintBgra(src, stride, _param.width, block, b, _width, g, _width, r, _width, NULL, 0); + break; + case SimdPixelFormatRgb24: + _deintBgr(src, stride, _param.width, block, r, _width, g, _width, b, _width); + break; + case SimdPixelFormatRgba32: + _deintBgra(src, stride, _param.width, block, r, _width, g, _width, b, _width, NULL, 0); + break; + default: + break; + } + if(_param.format == SimdPixelFormatGray8) + _writeBlock(_stream, (int)_param.width, block, src, src, src, (int)stride, _fY, _fUv, dc); + else + _writeBlock(_stream, (int)_param.width, block, r, g, b, _width, _fY, _fUv, dc); + src += block * stride; + } + static const uint16_t FILL_BITS[] = { 0x7F, 7 }; + 
Base::WriteBits(_stream, FILL_BITS); + _stream.Write8u(0xFF); + _stream.Write8u(0xD9); + return true; + } + } +} diff --git a/3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp b/3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp new file mode 100644 index 0000000000..dcb8f2efbb --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp @@ -0,0 +1,379 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdImageSavePng.h" +#include "Simd/SimdBase.h" +#include "Simd/SimdCpu.h" + +namespace Simd +{ + namespace Base + { + const uint16_t ZlibLenC[30] = { 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 259 }; + const uint8_t ZlibLenEb[29] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0 }; + const uint16_t ZlibDistC[31] = { 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 32768 }; + const uint8_t ZlibDistEb[30] = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13 }; + +#if defined(SIMD_PNG_ZLIB_BIT_REV_TABLE) + int ZlibBitRevTable[512]; + static bool ZlibBitRevTableInit() + { + for (int i = 0; i < 512; i++) + { + int rev = 0, val = i; + for (size_t b = 0; b < 9; b++) + { + rev = (rev << 1) | (val & 1); + val >>= 1; + } + ZlibBitRevTable[i] = rev; + } + return true; + } + bool ZlibBitRevTableInited = ZlibBitRevTableInit(); + +#endif + + uint32_t ZlibAdler32(uint8_t* data, int size) + { + uint32_t lo = 1, hi = 0; + for (int b = 0, n = (int)(size % 5552); b < size;) + { + for (int i = 0; i < n; ++i) + { + lo += data[b + i]; + hi += lo; + } + lo %= 65521; + hi %= 65521; + b += n; + n = 5552; + } + return (hi << 16) | lo; + } + + void ZlibCompress(uint8_t* data, int size, int quality, OutputMemoryStream& stream) + { + const int ZHASH = 16384; + if (quality < 5) + quality = 5; + const int basket = quality * 2; + Array32i hashTable(ZHASH * basket); + memset(hashTable.data, -1, hashTable.RawSize()); + + stream.Write(uint8_t(0x78)); + stream.Write(uint8_t(0x5e)); + stream.WriteBits(1, 1); + stream.WriteBits(1, 2); + + int i = 0, j; + while (i < size - 3) + { + int h = ZlibHash(data + i) & (ZHASH - 1), best = 3; + uint8_t* bestLoc = 0; + int* hList = 
hashTable.data + h * basket; + for (j = 0; hList[j] != -1 && j < basket; ++j) + { + if (hList[j] > i - 32768) + { + int d = ZlibCount(data + hList[j], data + i, size - i); + if (d >= best) + { + best = d; + bestLoc = data + hList[j]; + } + } + } + if (j == basket) + { + memcpy(hList, hList + quality, quality * sizeof(int)); + memset(hList + quality, -1, quality * sizeof(int)); + j = quality; + } + hList[j] = i; + + if (bestLoc) + { + h = ZlibHash(data + i + 1) & (ZHASH - 1); + int* hList = hashTable.data + h * basket; + for (j = 0; hList[j] != -1 && j < basket; ++j) + { + if (hList[j] > i - 32767) + { + int e = ZlibCount(data + hList[j], data + i + 1, size - i - 1); + if (e > best) + { + bestLoc = NULL; + break; + } + } + } + } + + if (bestLoc) + { + int d = (int)(data + i - bestLoc); + assert(d <= 32767 && best <= 258); + for (j = 0; best > Base::ZlibLenC[j + 1] - 1; ++j); + Base::ZlibHuff(j + 257, stream); + if (Base::ZlibLenEb[j]) + stream.WriteBits(best - Base::ZlibLenC[j], Base::ZlibLenEb[j]); + for (j = 0; d > Base::ZlibDistC[j + 1] - 1; ++j); + stream.WriteBits(Base::ZlibBitRev(j, 5), 5); + if (Base::ZlibDistEb[j]) + stream.WriteBits(d - Base::ZlibDistC[j], Base::ZlibDistEb[j]); + i += best; + } + else + { + ZlibHuffB(data[i], stream); + ++i; + } + } + for (; i < size; ++i) + ZlibHuffB(data[i], stream); + ZlibHuff(256, stream); + stream.FlushBits(); + stream.WriteBe32u(ZlibAdler32(data, size)); + } + + uint32_t EncodeLine0(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + uint32_t sum = 0; + for (size_t i = 0; i < size; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine1(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + uint32_t sum = 0; + for (size_t i = 0; i < n; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + for (size_t i = n; i < size; ++i) + { + dst[i] = src[i] - src[i - n]; + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t 
EncodeLine2(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + uint32_t sum = 0; + for (size_t i = 0; i < n; ++i) + { + dst[i] = src[i] - src[i - stride]; + sum += ::abs(dst[i]); + } + for (size_t i = n; i < size; ++i) + { + dst[i] = src[i] - src[i - stride]; + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine3(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + uint32_t sum = 0; + for (size_t i = 0; i < n; ++i) + { + dst[i] = src[i] - (src[i - stride] >> 1); + sum += ::abs(dst[i]); + } + for (size_t i = n; i < size; ++i) + { + dst[i] = src[i] - ((src[i - n] + src[i - stride]) >> 1); + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine4(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + uint32_t sum = 0; + for (size_t i = 0; i < n; ++i) + { + dst[i] = (int8_t)(src[i] - src[i - stride]); + sum += ::abs(dst[i]); + } + for (size_t i = n; i < size; ++i) + { + dst[i] = src[i] - Paeth(src[i - n], src[i - stride], src[i - stride - n]); + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine5(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + uint32_t sum = 0; + for (size_t i = 0; i < n; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + for (size_t i = n; i < size; ++i) + { + dst[i] = src[i] - (src[i - n] >> 1); + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine6(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + uint32_t sum = 0; + for (size_t i = 0; i < n; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + for (size_t i = n; i < size; ++i) + { + dst[i] = src[i] - src[i - n]; + sum += ::abs(dst[i]); + } + return sum; + } + + ImagePngSaver::ImagePngSaver(const ImageSaverParam& param) + : ImageSaver(param) + , _channels(0) + , _size(0) + , _convert(NULL) + { + switch (_param.format) + { + case SimdPixelFormatGray8: + _channels = 1; + break; + case 
SimdPixelFormatBgr24: + _channels = 3; + break; + case SimdPixelFormatBgra32: + _channels = 4; + break; + case SimdPixelFormatRgb24: + _channels = 3; + break; + case SimdPixelFormatRgba32: + _channels = 4; + break; + default: + break; + } + _size = _param.width * _channels; + if (_param.format == SimdPixelFormatBgr24) + { + _convert = Base::BgrToRgb; + _buff.Resize(_param.height * _size); + } + else if (_param.format == SimdPixelFormatBgra32) + { + _convert = Base::BgraToRgba; + _buff.Resize(_param.height * _size); + } + _filt.Resize((_size + 1) * _param.height); + _line.Resize(_size * FILTERS); + _encode[0] = Base::EncodeLine0; + _encode[1] = Base::EncodeLine1; + _encode[2] = Base::EncodeLine2; + _encode[3] = Base::EncodeLine3; + _encode[4] = Base::EncodeLine4; + _encode[5] = Base::EncodeLine5; + _encode[6] = Base::EncodeLine6; + _compress = Base::ZlibCompress; + } + + bool ImagePngSaver::ToStream(const uint8_t* src, size_t stride) + { + if (_convert) + { + _convert(src, _param.width, _param.height, stride, _buff.data, _size); + src = _buff.data; + stride = _size; + } + for (size_t row = 0; row < _param.height; ++row) + { + int bestFilter = 0, bestSum = INT_MAX; + for (int filter = 0; filter < FILTERS; filter++) + { + static const int TYPES[] = { 0, 1, 0, 5, 6, 0, 1, 2, 3, 4 }; + int type = TYPES[filter + (row ? 
1 : 0) * FILTERS]; + int sum = _encode[type](src + stride * row, stride, _channels, _size, _line.data + _size * filter); + if (sum < bestSum) + { + bestSum = sum; + bestFilter = filter; + } + } + _filt[row * (_size + 1)] = (uint8_t)bestFilter; + memcpy(_filt.data + row * (_size + 1) + 1, _line.data + _size * bestFilter, _size); + } + OutputMemoryStream zlib(Min(_param.width * _param.height, Base::AlgCacheL1())); + _compress(_filt.data, (int)_filt.size, COMPRESSION, zlib); + WriteToStream(zlib.Data(), zlib.Size()); + return true; + } + + SIMD_INLINE void WriteCrc32(OutputMemoryStream& stream, size_t size) + { + stream.WriteBe32u(Base::Crc32(stream.Current() - size - 4, size + 4)); + } + + void ImagePngSaver::WriteToStream(const uint8_t* zlib, size_t zlen) + { + const uint8_t SIGNATURE[8] = { 137, 80, 78, 71, 13, 10, 26, 10 }; + const int8_t CTYPE[5] = { -1, 0, 4, 2, 6 }; + _stream.Reserve(8 + 12 + 13 + 12 + zlen + 12); + _stream.Write(SIGNATURE, 8); + _stream.WriteBe32u(13); + _stream.Write("IHDR", 4); + _stream.WriteBe32u((uint32_t)_param.width); + _stream.WriteBe32u((uint32_t)_param.height); + _stream.Write8u(8); + _stream.Write8u(CTYPE[_channels]); + _stream.Write8u(0); + _stream.Write8u(0); + _stream.Write8u(0); + WriteCrc32(_stream, 13); + _stream.WriteBe32u((uint32_t)zlen); + _stream.Write("IDAT", 4); + _stream.Write(zlib, zlen); + WriteCrc32(_stream, zlen); + _stream.WriteBe32u(0); + _stream.Write("IEND", 4); + WriteCrc32(_stream, 0); + } + } +} diff --git a/3rdparty/simdlib/Simd/SimdImageLoad.h b/3rdparty/simdlib/Simd/SimdImageLoad.h new file mode 100644 index 0000000000..43e44961e6 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdImageLoad.h @@ -0,0 +1,396 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdImageLoad_h__ +#define __SimdImageLoad_h__ + +#include "Simd/SimdMemoryStream.h" +#include "Simd/SimdArray.h" +#include "Simd/SimdAlignment.h" + +#include "Simd/SimdView.hpp" + +#include <vector> + +namespace Simd +{ + typedef uint8_t* (*ImageLoadFromMemoryPtr)(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format); + + uint8_t* ImageLoadFromFile(const ImageLoadFromMemoryPtr loader, const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format); + + //------------------------------------------------------------------------- + + struct ImageLoaderParam + { + const uint8_t* data; + size_t size; + SimdImageFileType file; + SimdPixelFormatType format; + + ImageLoaderParam(const uint8_t* d, size_t s, SimdPixelFormatType f); + + bool Validate(); + }; + + class ImageLoader + { + protected: + /* NOTE(review): the template argument was missing from this text; Simd::Allocator is restored on the assumption that View is the allocator-parameterised image view - confirm against SimdView.hpp. */ + typedef Simd::View<Simd::Allocator> Image; + + ImageLoaderParam _param; + InputMemoryStream _stream; + Image _image; + + public: + ImageLoader(const ImageLoaderParam& param) + : _param(param) + , _stream(_param.data, _param.size) + { + } + + virtual ~ImageLoader() + { + } + + virtual bool FromStream() = 0; + + /* Copies the decoded image's stride, width, height and format to the out-parameters and returns whatever _image.Release() returns. */ + SIMD_INLINE uint8_t* Release(size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format) + { + *stride = _image.stride; + *width = _image.width; + *height = _image.height; + *format = (SimdPixelFormatType)_image.format; + return _image.Release(); + } + }; + + namespace Base + { + class ImagePxmLoader : public ImageLoader + { + public: + ImagePxmLoader(const ImageLoaderParam& param); + + protected: + typedef void (*ToAnyPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride); + typedef void (*ToBgraPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + ToAnyPtr _toAny; + ToBgraPtr _toBgra; + Array8u _buffer; + size_t _block, _size; + + bool ReadHeader(size_t version); + 
virtual void SetConverters() = 0; + }; + + class ImagePgmTxtLoader : public ImagePxmLoader + { + public: + ImagePgmTxtLoader(const ImageLoaderParam& param); + + virtual bool FromStream(); + + protected: + virtual void SetConverters(); + }; + + class ImagePgmBinLoader : public ImagePxmLoader + { + public: + ImagePgmBinLoader(const ImageLoaderParam& param); + + virtual bool FromStream(); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmTxtLoader : public ImagePxmLoader + { + public: + ImagePpmTxtLoader(const ImageLoaderParam& param); + + virtual bool FromStream(); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmBinLoader : public ImagePxmLoader + { + public: + ImagePpmBinLoader(const ImageLoaderParam& param); + + virtual bool FromStream(); + + protected: + virtual void SetConverters(); + }; + + class ImagePngLoader : public ImageLoader + { + public: + ImagePngLoader(const ImageLoaderParam& param); + + virtual bool FromStream(); + + protected: + typedef void (*ToAny8Ptr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride); + typedef void (*ToBgra8Ptr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + typedef void (*ToAny16Ptr)(const uint16_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride); + typedef void (*ToBgra16Ptr)(const uint16_t* src, size_t width, size_t height, size_t srcStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha); + ToAny8Ptr _toAny8; + ToBgra8Ptr _toBgra8, _bgrToBgra; + ToAny16Ptr _toAny16; + ToBgra16Ptr _toBgra16; + + virtual void SetConverters(); + private: + bool _first, _hasTrans, _iPhone; + uint32_t _width, _height, _channels; + uint16_t _tc16[3]; + uint8_t _depth, _color, _interlace, _paletteChannels, _tc[3]; + Array8u _palette, _idat; + + /* Size, type and offset of one chunk. */ + struct Chunk + { + uint32_t size; + uint32_t type; + uint32_t offs; + }; + /* Element type restored: a list of Chunk records. */ + typedef std::vector<Chunk> Chunks; + 
Chunks _idats; + + bool ParseFile(); + bool CheckHeader(); + bool ReadChunk(Chunk& chunk); + bool ReadHeader(const Chunk & chunk); + bool ReadPalette(const Chunk& chunk); + bool ReadTransparency(const Chunk& chunk); + bool ReadData(const Chunk& chunk); + InputMemoryStream MergedDataStream(); + }; + + class ImageJpegLoader : public ImageLoader + { + public: + ImageJpegLoader(const ImageLoaderParam& param); + + virtual bool FromStream(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format); + } + +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + class ImagePgmTxtLoader : public Base::ImagePgmTxtLoader + { + public: + ImagePgmTxtLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePgmBinLoader : public Base::ImagePgmBinLoader + { + public: + ImagePgmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmTxtLoader : public Base::ImagePpmTxtLoader + { + public: + ImagePpmTxtLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmBinLoader : public Base::ImagePpmBinLoader + { + public: + ImagePpmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePngLoader : public Base::ImagePngLoader + { + public: + ImagePngLoader(const ImageLoaderParam& param); + + virtual bool FromStream(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format); + } +#endif// SIMD_SSE41_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + class ImagePgmTxtLoader : public Sse41::ImagePgmTxtLoader + { + public: + ImagePgmTxtLoader(const 
ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePgmBinLoader : public Sse41::ImagePgmBinLoader + { + public: + ImagePgmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmTxtLoader : public Sse41::ImagePpmTxtLoader + { + public: + ImagePpmTxtLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmBinLoader : public Sse41::ImagePpmBinLoader + { + public: + ImagePpmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format); + } +#endif// SIMD_AVX2_ENABLE + +#ifdef SIMD_AVX512BW_ENABLE + namespace Avx512bw + { + class ImagePgmTxtLoader : public Avx2::ImagePgmTxtLoader + { + public: + ImagePgmTxtLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePgmBinLoader : public Avx2::ImagePgmBinLoader + { + public: + ImagePgmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmTxtLoader : public Avx2::ImagePpmTxtLoader + { + public: + ImagePpmTxtLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmBinLoader : public Avx2::ImagePpmBinLoader + { + public: + ImagePpmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format); + } +#endif// SIMD_AVX512BW_ENABLE + +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + class ImagePgmTxtLoader : public 
Base::ImagePgmTxtLoader + { + public: + ImagePgmTxtLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePgmBinLoader : public Base::ImagePgmBinLoader + { + public: + ImagePgmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmTxtLoader : public Base::ImagePpmTxtLoader + { + public: + ImagePpmTxtLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + class ImagePpmBinLoader : public Base::ImagePpmBinLoader + { + public: + ImagePpmBinLoader(const ImageLoaderParam& param); + + protected: + virtual void SetConverters(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format); + } +#endif// SIMD_NEON_ENABLE +} + +#endif//__SimdImageLoad_h__ diff --git a/3rdparty/simdlib/Simd/SimdImageSave.h b/3rdparty/simdlib/Simd/SimdImageSave.h new file mode 100644 index 0000000000..4e1945c077 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdImageSave.h @@ -0,0 +1,386 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#ifndef __SimdImageSave_h__ +#define __SimdImageSave_h__ + +#include "Simd/SimdMemoryStream.h" +#include "Simd/SimdArray.h" +#include "Simd/SimdPerformance.h" + +namespace Simd +{ + typedef uint8_t* (*ImageSaveToMemoryPtr)(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size); + + SimdBool ImageSaveToFile(const ImageSaveToMemoryPtr saver, const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char* path); + + //--------------------------------------------------------------------- + + struct ImageSaverParam + { + size_t width, height; + SimdPixelFormatType format; + SimdImageFileType file; + int quality; + + SIMD_INLINE ImageSaverParam(size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality) + { + this->width = width; + this->height = height; + this->format = format; + this->file = file; + this->quality = quality; + } + + bool Validate() + { + if (file == SimdImageFileUndefined) + { + if (format == SimdPixelFormatGray8) + file = SimdImageFilePgmBin; + else + file = SimdImageFilePpmBin; + } + if (format < SimdPixelFormatGray8 || format > SimdPixelFormatRgba32) + return false; + if (width == 0 || height == 0) + return false; + if (file <= SimdImageFileUndefined || file > SimdImageFileJpeg) + return false; + return true; + } + }; + + class ImageSaver + { + protected: + ImageSaverParam 
_param; + OutputMemoryStream _stream; + public: + ImageSaver(const ImageSaverParam& param) + : _param(param) + { + } + + virtual ~ImageSaver() + { + } + + virtual bool ToStream(const uint8_t* src, size_t stride) = 0; + + SIMD_INLINE uint8_t* Release(size_t* size) + { + return _stream.Release(size); + } + }; + + namespace Base + { + class ImagePxmSaver : public ImageSaver + { + public: + ImagePxmSaver(const ImageSaverParam& param); + + protected: + typedef void (*ConvertPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride); + ConvertPtr _convert; + Array8u _buffer; + size_t _block, _size; + + void WriteHeader(size_t version); + }; + + class ImagePgmTxtSaver : public ImagePxmSaver + { + public: + ImagePgmTxtSaver(const ImageSaverParam& param); + + virtual bool ToStream(const uint8_t* src, size_t stride); + }; + + class ImagePgmBinSaver : public ImagePxmSaver + { + public: + ImagePgmBinSaver(const ImageSaverParam& param); + + virtual bool ToStream(const uint8_t* src, size_t stride); + }; + + class ImagePpmTxtSaver : public ImagePxmSaver + { + public: + ImagePpmTxtSaver(const ImageSaverParam& param); + + virtual bool ToStream(const uint8_t* src, size_t stride); + }; + + class ImagePpmBinSaver : public ImagePxmSaver + { + public: + ImagePpmBinSaver(const ImageSaverParam& param); + + virtual bool ToStream(const uint8_t* src, size_t stride); + }; + + class ImagePngSaver : public ImageSaver + { + public: + ImagePngSaver(const ImageSaverParam& param); + + virtual bool ToStream(const uint8_t* src, size_t stride); + protected: + static const int COMPRESSION = 8; + static const int FILTERS = 5; + static const int TYPES = 7; + typedef void (*ConvertPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride); + typedef uint32_t (*EncodePtr)(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst); + typedef void (*CompressPtr)(uint8_t* data, int size, int quality, 
OutputMemoryStream& stream); + ConvertPtr _convert; + EncodePtr _encode[TYPES]; + CompressPtr _compress; + size_t _channels, _size; + Array8u _filt, _buff; + Array8i _line; + + void WriteToStream(const uint8_t* zlib, size_t zlen); + }; + + class ImageJpegSaver : public ImageSaver + { + public: + ImageJpegSaver(const ImageSaverParam& param); + + virtual bool ToStream(const uint8_t* src, size_t stride); + protected: + typedef void (*DeintBgrPtr)(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, + uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride); + typedef void (*DeintBgraPtr)(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height, + uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride); + typedef void (*WriteBlockPtr)(OutputMemoryStream& stream, int width, int height, const uint8_t* red, + const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3]); + + Array8u _buffer; + DeintBgrPtr _deintBgr; + DeintBgraPtr _deintBgra; + WriteBlockPtr _writeBlock; + bool _subSample; + int _quality, _block, _width; + float _fY[64], _fUv[64]; + uint8_t _uY[64], _uUv[64]; + + virtual void Init(); + + void InitParams(bool trans); + void WriteHeader(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size); + } + +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + class ImagePgmTxtSaver : public Base::ImagePgmTxtSaver + { + public: + ImagePgmTxtSaver(const ImageSaverParam& param); + }; + + class ImagePgmBinSaver : public Base::ImagePgmBinSaver + { + public: + ImagePgmBinSaver(const ImageSaverParam& param); + }; + + class ImagePpmTxtSaver : public Base::ImagePpmTxtSaver + { + public: + ImagePpmTxtSaver(const ImageSaverParam& param); 
+ }; + + class ImagePpmBinSaver : public Base::ImagePpmBinSaver + { + public: + ImagePpmBinSaver(const ImageSaverParam& param); + }; + + class ImagePngSaver : public Base::ImagePngSaver + { + public: + ImagePngSaver(const ImageSaverParam& param); + }; + + class ImageJpegSaver : public Base::ImageJpegSaver + { + public: + ImageJpegSaver(const ImageSaverParam& param); + + protected: + virtual void Init(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size); + } +#endif// SIMD_SSE41_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + class ImagePgmTxtSaver : public Sse41::ImagePgmTxtSaver + { + public: + ImagePgmTxtSaver(const ImageSaverParam& param); + }; + + class ImagePgmBinSaver : public Sse41::ImagePgmBinSaver + { + public: + ImagePgmBinSaver(const ImageSaverParam& param); + }; + + class ImagePpmTxtSaver : public Sse41::ImagePpmTxtSaver + { + public: + ImagePpmTxtSaver(const ImageSaverParam& param); + }; + + class ImagePpmBinSaver : public Sse41::ImagePpmBinSaver + { + public: + ImagePpmBinSaver(const ImageSaverParam& param); + }; + + class ImagePngSaver : public Sse41::ImagePngSaver + { + public: + ImagePngSaver(const ImageSaverParam& param); + }; + + class ImageJpegSaver : public Sse41::ImageJpegSaver + { + public: + ImageJpegSaver(const ImageSaverParam& param); + + protected: + virtual void Init(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size); + } +#endif// SIMD_AVX2_ENABLE + +#ifdef SIMD_AVX512BW_ENABLE + namespace Avx512bw + { + class ImagePgmTxtSaver : public Avx2::ImagePgmTxtSaver + { + public: + ImagePgmTxtSaver(const ImageSaverParam& param); + 
}; + + class ImagePgmBinSaver : public Avx2::ImagePgmBinSaver + { + public: + ImagePgmBinSaver(const ImageSaverParam& param); + }; + + class ImagePpmTxtSaver : public Avx2::ImagePpmTxtSaver + { + public: + ImagePpmTxtSaver(const ImageSaverParam& param); + }; + + class ImagePpmBinSaver : public Avx2::ImagePpmBinSaver + { + public: + ImagePpmBinSaver(const ImageSaverParam& param); + }; + + class ImagePngSaver : public Avx2::ImagePngSaver + { + public: + ImagePngSaver(const ImageSaverParam& param); + }; + + class ImageJpegSaver : public Avx2::ImageJpegSaver + { + public: + ImageJpegSaver(const ImageSaverParam& param); + + protected: + virtual void Init(); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size); + } +#endif// SIMD_AVX512BW_ENABLE + +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + class ImagePgmTxtSaver : public Base::ImagePgmTxtSaver + { + public: + ImagePgmTxtSaver(const ImageSaverParam& param); + }; + + class ImagePgmBinSaver : public Base::ImagePgmBinSaver + { + public: + ImagePgmBinSaver(const ImageSaverParam& param); + }; + + class ImagePpmTxtSaver : public Base::ImagePpmTxtSaver + { + public: + ImagePpmTxtSaver(const ImageSaverParam& param); + }; + + class ImagePpmBinSaver : public Base::ImagePpmBinSaver + { + public: + ImagePpmBinSaver(const ImageSaverParam& param); + }; + + class ImagePngSaver : public Base::ImagePngSaver + { + public: + ImagePngSaver(const ImageSaverParam& param); + }; + + //--------------------------------------------------------------------- + + uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size); + } +#endif// SIMD_NEON_ENABLE +} + +#endif//__SimdImageSave_h__ diff --git a/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h 
b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h new file mode 100644 index 0000000000..d54164f7d4 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h @@ -0,0 +1,649 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdImageSaveJpeg_h__ +#define __SimdImageSaveJpeg_h__ + +#include "Simd/SimdImageSave.h" +#include "Simd/SimdMath.h" + +#define SIMD_JPEG_CALC_BITS_TABLE + +namespace Simd +{ + namespace Base + { + struct BitBuf + { + static const uint32_t capacity = 1024; + uint32_t size; + uint16_t data[1024][2]; + + SIMD_INLINE BitBuf() + : size(0) + { + } + + SIMD_INLINE void Push(const uint16_t* bits) + { + ((uint32_t*)data)[size++] = ((uint32_t*)bits)[0]; + } + + SIMD_INLINE bool Full(uint32_t tail = capacity / 2) const + { + return size + tail >= capacity; + } + + SIMD_INLINE uint32_t Capacity() const + { + return capacity; + } + + SIMD_INLINE void Clear() + { + size = 0; + } + }; + + extern const uint8_t JpegZigZagD[64]; + extern const uint8_t JpegZigZagT[64]; + + extern const uint16_t HuffmanYdc[256][2]; + extern const uint16_t HuffmanUVdc[256][2]; + extern const uint16_t HuffmanYac[256][2]; + extern const uint16_t HuffmanUVac[256][2]; + +#if defined(SIMD_JPEG_CALC_BITS_TABLE) + const int JpegCalcBitsRange = 2048; + extern uint16_t JpegCalcBitsTable[JpegCalcBitsRange * 2][2]; + SIMD_INLINE void JpegCalcBits(int val, uint16_t bits[2]) + { + assert(val >= -JpegCalcBitsRange && val < JpegCalcBitsRange); + ((uint32_t*)bits)[0] = ((uint32_t*)JpegCalcBitsTable)[val + JpegCalcBitsRange]; + } +#else + SIMD_INLINE void JpegCalcBits(int val, uint16_t bits[2]) + { + int tmp = val < 0 ? -val : val; + val = val < 0 ? val - 1 : val; + bits[1] = 1; + while (tmp >>= 1) + ++bits[1]; + bits[0] = val & ((1 << bits[1]) - 1); + } +#endif + + SIMD_INLINE void RgbToYuv(const uint8_t* r, const uint8_t* g, const uint8_t* b, int stride, int height, int width, float* y, float* u, float* v, int size) + { + for (int row = 0; row < size;) + { + for (int col = 0; col < size; col += 1) + { + int offs = (col < width ? 
col : width - 1); + float _r = r[offs], _g = g[offs], _b = b[offs]; + y[col] = +0.29900f * _r + 0.58700f * _g + 0.11400f * _b - 128.000f; + u[col] = -0.16874f * _r - 0.33126f * _g + 0.50000f * _b; + v[col] = +0.50000f * _r - 0.41869f * _g - 0.08131f * _b; + } + if (++row < height) + r += stride, g += stride, b += stride; + y += size, u += size, v += size; + } + } + + SIMD_INLINE void GrayToY(const uint8_t* g, int stride, int height, int width, float* y, int size) + { + for (int row = 0; row < size;) + { + for (int col = 0; col < size; col += 1) + { + int offs = (col < width ? col : width - 1); + y[col] = g[offs] - 128.000f; + } + if (++row < height) + g += stride; + y += size; + } + } + + SIMD_INLINE void JpegProcessDuGrayUv(BitBuf & bitBuf) + { + bitBuf.Push(Base::HuffmanUVdc[0]); + bitBuf.Push(Base::HuffmanUVac[0]); + bitBuf.Push(Base::HuffmanUVdc[0]); + bitBuf.Push(Base::HuffmanUVac[0]); + } + + SIMD_INLINE void WriteBits(OutputMemoryStream & stream, const uint16_t bits[2]) + { + stream.BitCount() += bits[1]; +#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) + stream.BitBuffer() |= uint64_t(bits[0]) << (64 - stream.BitCount()); + while (stream.BitCount() >= 8) + { + uint8_t byte = stream.BitBuffer() >> 56; + stream.Write8u(byte); + if (byte == 255) + stream.Write8u(0); + stream.BitBuffer() <<= 8; + stream.BitCount() -= 8; + } +#else + stream.BitBuffer() |= uint32_t(bits[0]) << (32 - stream.BitCount()); + while (stream.BitCount() >= 8) + { + uint8_t byte = stream.BitBuffer() >> 24; + stream.Write8u(byte); + if (byte == 255) + stream.Write8u(0); + stream.BitBuffer() <<= 8; + stream.BitCount() -= 8; + } +#endif + } + + SIMD_INLINE void WriteBits(OutputMemoryStream& stream, const uint16_t bits[][2], size_t size) + { + size_t pos = stream.Pos(); + stream.Reserve(pos + size * 2); + uint8_t* data = stream.Data(); + size_t & bitCount = stream.BitCount(); + size_t i = 0; +#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) + uint64_t &bitBuffer = 
stream.BitBuffer(); + for (size_t size3 = AlignLoAny(size, 3); i < size3; i += 3, bits += 3) + { + bitCount += bits[0][1]; + bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount); + bitCount += bits[1][1]; + bitBuffer |= uint64_t(bits[1][0]) << (64 - bitCount); + bitCount += bits[2][1]; + bitBuffer |= uint64_t(bits[2][0]) << (64 - bitCount); + assert(bitCount <= 64); + while (bitCount >= 16) + { + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + byte = uint8_t(bitBuffer >> 48); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 16; + bitCount -= 16; + } + } + if(bitCount >= 8) + { + assert(bitCount < 16); + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + for (; i < size; ++i, ++bits) + { + bitCount += bits[0][1]; + bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount); + while (bitCount >= 8) + { + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + } +#else + uint32_t &bitBuffer = stream.BitBuffer(); + for (; i < size; ++i, ++bits) + { + bitCount += bits[0][1]; + bitBuffer |= uint32_t(bits[0][0]) << (32 - bitCount); + while (bitCount >= 8) + { + uint8_t byte = uint8_t(bitBuffer >> 24); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + } +#endif + stream.Seek(pos); + } + } + +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + } +#endif// SIMD_SSE41_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + extern const uint32_t JpegZigZagTi32[64]; + + SIMD_INLINE void JpegDctV(const float* src, size_t srcStride, float* dst, size_t dstStride) + { + static const __m256 _0_707106781 = _mm256_set1_ps(0.707106781f); + static const __m256 _0_382683433 = _mm256_set1_ps(0.382683433f); + static const __m256 _0_541196100 = _mm256_set1_ps(0.541196100f); + static const 
__m256 _1_306562965 = _mm256_set1_ps(1.306562965f); + + __m256 d0 = _mm256_loadu_ps(src + 0 * srcStride); + __m256 d1 = _mm256_loadu_ps(src + 1 * srcStride); + __m256 d2 = _mm256_loadu_ps(src + 2 * srcStride); + __m256 d3 = _mm256_loadu_ps(src + 3 * srcStride); + __m256 d4 = _mm256_loadu_ps(src + 4 * srcStride); + __m256 d5 = _mm256_loadu_ps(src + 5 * srcStride); + __m256 d6 = _mm256_loadu_ps(src + 6 * srcStride); + __m256 d7 = _mm256_loadu_ps(src + 7 * srcStride); + + __m256 tmp0 = _mm256_add_ps(d0, d7); + __m256 tmp7 = _mm256_sub_ps(d0, d7); + __m256 tmp1 = _mm256_add_ps(d1, d6); + __m256 tmp6 = _mm256_sub_ps(d1, d6); + __m256 tmp2 = _mm256_add_ps(d2, d5); + __m256 tmp5 = _mm256_sub_ps(d2, d5); + __m256 tmp3 = _mm256_add_ps(d3, d4); + __m256 tmp4 = _mm256_sub_ps(d3, d4); + + __m256 tmp10 = _mm256_add_ps(tmp0, tmp3); + __m256 tmp13 = _mm256_sub_ps(tmp0, tmp3); + __m256 tmp11 = _mm256_add_ps(tmp1, tmp2); + __m256 tmp12 = _mm256_sub_ps(tmp1, tmp2); + + d0 = _mm256_add_ps(tmp10, tmp11); + d4 = _mm256_sub_ps(tmp10, tmp11); + + __m256 z1 = _mm256_mul_ps(_mm256_add_ps(tmp12, tmp13), _0_707106781); + d2 = _mm256_add_ps(tmp13, z1); + d6 = _mm256_sub_ps(tmp13, z1); + + tmp10 = _mm256_add_ps(tmp4, tmp5); + tmp11 = _mm256_add_ps(tmp5, tmp6); + tmp12 = _mm256_add_ps(tmp6, tmp7); + + __m256 z5 = _mm256_mul_ps(_mm256_sub_ps(tmp10, tmp12), _0_382683433); + __m256 z2 = _mm256_add_ps(_mm256_mul_ps(tmp10, _0_541196100), z5); + __m256 z4 = _mm256_add_ps(_mm256_mul_ps(tmp12, _1_306562965), z5); + __m256 z3 = _mm256_mul_ps(tmp11, _0_707106781); + + __m256 z11 = _mm256_add_ps(tmp7, z3); + __m256 z13 = _mm256_sub_ps(tmp7, z3); + + _mm256_storeu_ps(dst + 0 * dstStride, d0); + _mm256_storeu_ps(dst + 1 * dstStride, _mm256_add_ps(z11, z4)); + _mm256_storeu_ps(dst + 2 * dstStride, d2); + _mm256_storeu_ps(dst + 3 * dstStride, _mm256_sub_ps(z13, z2)); + _mm256_storeu_ps(dst + 4 * dstStride, d4); + _mm256_storeu_ps(dst + 5 * dstStride, _mm256_add_ps(z13, z2)); + _mm256_storeu_ps(dst + 6 * 
dstStride, d6); + _mm256_storeu_ps(dst + 7 * dstStride, _mm256_sub_ps(z11, z4)); + } + + SIMD_INLINE void JpegDct(const float* src, size_t stride, const float* fdt, int* dst) + { + static const __m256 _0_707106781 = _mm256_set1_ps(0.707106781f); + static const __m256 _0_382683433 = _mm256_set1_ps(0.382683433f); + static const __m256 _0_541196100 = _mm256_set1_ps(0.541196100f); + static const __m256 _1_306562965 = _mm256_set1_ps(1.306562965f); + + __m256 d0 = _mm256_loadu_ps(src + 0 * stride); + __m256 d1 = _mm256_loadu_ps(src + 1 * stride); + __m256 d2 = _mm256_loadu_ps(src + 2 * stride); + __m256 d3 = _mm256_loadu_ps(src + 3 * stride); + __m256 d4 = _mm256_loadu_ps(src + 4 * stride); + __m256 d5 = _mm256_loadu_ps(src + 5 * stride); + __m256 d6 = _mm256_loadu_ps(src + 6 * stride); + __m256 d7 = _mm256_loadu_ps(src + 7 * stride); + + __m256 tmp0 = _mm256_add_ps(d0, d7); + __m256 tmp7 = _mm256_sub_ps(d0, d7); + __m256 tmp1 = _mm256_add_ps(d1, d6); + __m256 tmp6 = _mm256_sub_ps(d1, d6); + __m256 tmp2 = _mm256_add_ps(d2, d5); + __m256 tmp5 = _mm256_sub_ps(d2, d5); + __m256 tmp3 = _mm256_add_ps(d3, d4); + __m256 tmp4 = _mm256_sub_ps(d3, d4); + + __m256 tmp10 = _mm256_add_ps(tmp0, tmp3); + __m256 tmp13 = _mm256_sub_ps(tmp0, tmp3); + __m256 tmp11 = _mm256_add_ps(tmp1, tmp2); + __m256 tmp12 = _mm256_sub_ps(tmp1, tmp2); + + d0 = _mm256_add_ps(tmp10, tmp11); + d4 = _mm256_sub_ps(tmp10, tmp11); + + __m256 z1 = _mm256_mul_ps(_mm256_add_ps(tmp12, tmp13), _0_707106781); + d2 = _mm256_add_ps(tmp13, z1); + d6 = _mm256_sub_ps(tmp13, z1); + + tmp10 = _mm256_add_ps(tmp4, tmp5); + tmp11 = _mm256_add_ps(tmp5, tmp6); + tmp12 = _mm256_add_ps(tmp6, tmp7); + + __m256 z5 = _mm256_mul_ps(_mm256_sub_ps(tmp10, tmp12), _0_382683433); + __m256 z2 = _mm256_add_ps(_mm256_mul_ps(tmp10, _0_541196100), z5); + __m256 z4 = _mm256_add_ps(_mm256_mul_ps(tmp12, _1_306562965), z5); + __m256 z3 = _mm256_mul_ps(tmp11, _0_707106781); + + __m256 z11 = _mm256_add_ps(tmp7, z3); + __m256 z13 = _mm256_sub_ps(tmp7, 
z3); + + d1 = _mm256_add_ps(z11, z4); + d3 = _mm256_sub_ps(z13, z2); + d5 = _mm256_add_ps(z13, z2); + d7 = _mm256_sub_ps(z11, z4); + + tmp10 = _mm256_permute2f128_ps(d0, d4, 0x20); + tmp11 = _mm256_permute2f128_ps(d1, d5, 0x20); + tmp12 = _mm256_permute2f128_ps(d2, d6, 0x20); + tmp13 = _mm256_permute2f128_ps(d3, d7, 0x20); + d4 = _mm256_permute2f128_ps(d0, d4, 0x31); + d5 = _mm256_permute2f128_ps(d1, d5, 0x31); + d6 = _mm256_permute2f128_ps(d2, d6, 0x31); + d7 = _mm256_permute2f128_ps(d3, d7, 0x31); + + tmp0 = _mm256_unpacklo_ps(tmp10, tmp12); + tmp1 = _mm256_unpackhi_ps(tmp10, tmp12); + tmp2 = _mm256_unpacklo_ps(tmp11, tmp13); + tmp3 = _mm256_unpackhi_ps(tmp11, tmp13); + d0 = _mm256_unpacklo_ps(tmp0, tmp2); + d1 = _mm256_unpackhi_ps(tmp0, tmp2); + d2 = _mm256_unpacklo_ps(tmp1, tmp3); + d3 = _mm256_unpackhi_ps(tmp1, tmp3); + + tmp0 = _mm256_unpacklo_ps(d4, d6); + tmp1 = _mm256_unpackhi_ps(d4, d6); + tmp2 = _mm256_unpacklo_ps(d5, d7); + tmp3 = _mm256_unpackhi_ps(d5, d7); + d4 = _mm256_unpacklo_ps(tmp0, tmp2); + d5 = _mm256_unpackhi_ps(tmp0, tmp2); + d6 = _mm256_unpacklo_ps(tmp1, tmp3); + d7 = _mm256_unpackhi_ps(tmp1, tmp3); + + tmp0 = _mm256_add_ps(d0, d7); + tmp1 = _mm256_add_ps(d1, d6); + tmp2 = _mm256_add_ps(d2, d5); + tmp3 = _mm256_add_ps(d3, d4); + tmp7 = _mm256_sub_ps(d0, d7); + tmp6 = _mm256_sub_ps(d1, d6); + tmp5 = _mm256_sub_ps(d2, d5); + tmp4 = _mm256_sub_ps(d3, d4); + + tmp10 = _mm256_add_ps(tmp0, tmp3); + tmp13 = _mm256_sub_ps(tmp0, tmp3); + tmp11 = _mm256_add_ps(tmp1, tmp2); + tmp12 = _mm256_sub_ps(tmp1, tmp2); + + d0 = _mm256_add_ps(tmp10, tmp11); + d4 = _mm256_sub_ps(tmp10, tmp11); + + z1 = _mm256_mul_ps(_mm256_add_ps(tmp12, tmp13), _0_707106781); + d2 = _mm256_add_ps(tmp13, z1); + d6 = _mm256_sub_ps(tmp13, z1); + + tmp10 = _mm256_add_ps(tmp4, tmp5); + tmp11 = _mm256_add_ps(tmp5, tmp6); + tmp12 = _mm256_add_ps(tmp6, tmp7); + + z5 = _mm256_mul_ps(_mm256_sub_ps(tmp10, tmp12), _0_382683433); + z2 = _mm256_add_ps(_mm256_mul_ps(tmp10, _0_541196100), z5); + 
z4 = _mm256_add_ps(_mm256_mul_ps(tmp12, _1_306562965), z5); + z3 = _mm256_mul_ps(tmp11, _0_707106781); + + z11 = _mm256_add_ps(tmp7, z3); + z13 = _mm256_sub_ps(tmp7, z3); + + d1 = _mm256_add_ps(z11, z4); + d3 = _mm256_sub_ps(z13, z2); + d5 = _mm256_add_ps(z13, z2); + d7 = _mm256_sub_ps(z11, z4); + + _mm256_storeu_si256((__m256i*)dst + 0, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 0), d0))); + _mm256_storeu_si256((__m256i*)dst + 1, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 1), d1))); + _mm256_storeu_si256((__m256i*)dst + 2, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 2), d2))); + _mm256_storeu_si256((__m256i*)dst + 3, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 3), d3))); + _mm256_storeu_si256((__m256i*)dst + 4, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 4), d4))); + _mm256_storeu_si256((__m256i*)dst + 5, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 5), d5))); + _mm256_storeu_si256((__m256i*)dst + 6, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 6), d6))); + _mm256_storeu_si256((__m256i*)dst + 7, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 7), d7))); + } + + const __m256i K32_PERM_LD = SIMD_MM256_SETR_EPI32(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1); + + const __m256i K8_SHFL_VS = SIMD_MM256_SETR_EPI8( + 0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1, + 0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1); + + const __m256i K8_SHFL_SH = SIMD_MM256_SETR_EPI8( + 0x2, 0x3, -1, -1, 0x6, 0x7, -1, -1, 0xA, 0xB, -1, -1, -1, -1, -1, -1, + 0x2, 0x3, -1, -1, 0x6, 0x7, -1, -1, 0xA, 0xB, -1, -1, -1, -1, -1, -1); + + const __m256i K32_32 = SIMD_MM256_SET1_EPI32(32); + +#if defined(SIMD_X64_ENABLE) + SIMD_INLINE void WriteBits(uint8_t* data, size_t & pos, uint64_t & bitBuffer, size_t &bitCount, uint64_t shift, uint64_t value, uint64_t mask) + { + bitCount += shift; + assert(bitCount <= 64); + bitBuffer |= 
_pext_u64(value, mask) << (64 - bitCount); + while (bitCount >= 16) + { + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + byte = uint8_t(bitBuffer >> 48); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 16; + bitCount -= 16; + } + } +#endif + + SIMD_INLINE void WriteBits(OutputMemoryStream& stream, const uint16_t bits[][2], size_t size) + { + size_t pos = stream.Pos(); + stream.Reserve(pos + size * 2); + uint8_t* data = stream.Data(); + size_t& bitCount = stream.BitCount(); + size_t i = 0; +#if defined(SIMD_X64_ENABLE) + uint64_t &bitBuffer = stream.BitBuffer(); + size_t size12 = AlignLoAny(size, 12); + for (; i < size12; i += 12, bits += 12) + { + __m256i b0 = _mm256_permutevar8x32_epi32(_mm256_loadu_si256((__m256i*)(bits + 0)), K32_PERM_LD); + __m256i b1 = _mm256_permutevar8x32_epi32(_mm256_loadu_si256((__m256i*)(bits + 6)), K32_PERM_LD); + __m256i vs0 = _mm256_shuffle_epi8(b0, K8_SHFL_VS); + __m256i vs1 = _mm256_shuffle_epi8(b1, K8_SHFL_VS); + __m256i vv = Shuffle64i<0x0>(vs0, vs1); + __m256i ss = Shuffle64i<0xF>(vs0, vs1); + SIMD_ALIGNED(32) uint64_t value[4], mask[4], shift[4]; + _mm256_storeu_si256((__m256i*)value, vv); + _mm256_storeu_si256((__m256i*)shift, _mm256_sad_epu8(ss, K_ZERO)); + __m256i s0 = _mm256_sub_epi32(K32_32, _mm256_shuffle_epi8(b0, K8_SHFL_SH)); + __m256i m0 = _mm256_srlv_epi32(K_INV_ZERO, s0); + __m256i s1 = _mm256_sub_epi32(K32_32, _mm256_shuffle_epi8(b1, K8_SHFL_SH)); + __m256i m1 = _mm256_srlv_epi32(K_INV_ZERO, s1); + __m256i ms0 = _mm256_shuffle_epi8(m0, K8_SHFL_VS); + __m256i ms1 = _mm256_shuffle_epi8(m1, K8_SHFL_VS); + _mm256_storeu_si256((__m256i*)mask, Shuffle64i<0x0>(ms0, ms1)); + WriteBits(data, pos, bitBuffer, bitCount, shift[0], value[0], mask[0]); + WriteBits(data, pos, bitBuffer, bitCount, shift[2], value[2], mask[2]); + WriteBits(data, pos, bitBuffer, bitCount, shift[1], value[1], mask[1]); + WriteBits(data, pos, bitBuffer, bitCount, shift[3], 
value[3], mask[3]); + } + if (bitCount >= 8) + { + assert(bitCount < 16); + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + for (; i < size; ++i, ++bits) + { + bitCount += bits[0][1]; + bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount); + while (bitCount >= 8) + { + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + } +#else + uint32_t& bitBuffer = stream.BitBuffer(); + for (; i < size; ++i, ++bits) + { + bitCount += bits[0][1]; + bitBuffer |= uint32_t(bits[0][0]) << (32 - bitCount); + while (bitCount >= 8) + { + uint8_t byte = uint8_t(bitBuffer >> 24); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + } +#endif + stream.Seek(pos); + } + } +#endif// SIMD_AVX2_ENABLE + +#ifdef SIMD_AVX512BW_ENABLE + namespace Avx512bw + { + const __m512i K32_PERM_LD = SIMD_MM512_SETR_EPI32(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1); + + const __m512i K8_SHFL_VS = SIMD_MM512_SETR_EPI8( + 0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1, + 0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1, + 0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1, + 0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1); + + SIMD_INLINE void WriteBits(OutputMemoryStream& stream, const uint16_t bits[][2], size_t size) + { + size_t pos = stream.Pos(); + stream.Reserve(pos + size * 2); + uint8_t* data = stream.Data(); + size_t& bitCount = stream.BitCount(); + size_t i = 0; +#if defined(SIMD_X64_ENABLE) + uint64_t &bitBuffer = stream.BitBuffer(); + size_t size24 = AlignLoAny(size, 24); + for (; i < size24; i += 24, bits += 24) + { + __m512i b0 = _mm512_permutexvar_epi32(K32_PERM_LD, _mm512_loadu_si512((__m512i*)(bits + 00))); + __m512i b1 = 
_mm512_permutexvar_epi32(K32_PERM_LD, _mm512_loadu_si512((__m512i*)(bits + 12))); + __m512i vs0 = _mm512_shuffle_epi8(b0, K8_SHFL_VS); + __m512i vs1 = _mm512_shuffle_epi8(b1, K8_SHFL_VS); + __m512i vv = Shuffle64i<0x00>(vs0, vs1); + __m512i ss = Shuffle64i<0xFF>(vs0, vs1); + SIMD_ALIGNED(64) uint64_t value[8], mask[8], shift[8]; + _mm512_storeu_si512((__m512i*)value, vv); + _mm512_storeu_si512((__m512i*)shift, _mm512_sad_epu8(ss, K_ZERO)); + _mm512_storeu_si512((__m512i*)mask, _mm512_srlv_epi16(K_INV_ZERO, _mm512_sub_epi16(K16_0010, ss))); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[0], value[0], mask[0]); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[2], value[2], mask[2]); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[4], value[4], mask[4]); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[6], value[6], mask[6]); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[1], value[1], mask[1]); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[3], value[3], mask[3]); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[5], value[5], mask[5]); + Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[7], value[7], mask[7]); + } + if (bitCount >= 8) + { + assert(bitCount < 16); + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + for (; i < size; ++i, ++bits) + { + bitCount += bits[0][1]; + bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount); + while (bitCount >= 8) + { + uint8_t byte = uint8_t(bitBuffer >> 56); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + bitCount -= 8; + } + } +#else + uint32_t& bitBuffer = stream.BitBuffer(); + for (; i < size; ++i, ++bits) + { + bitCount += bits[0][1]; + bitBuffer |= uint32_t(bits[0][0]) << (32 - bitCount); + while (bitCount >= 8) + { + uint8_t byte = uint8_t(bitBuffer >> 24); + data[pos++] = byte; + if (byte == 255) + data[pos++] = 0; + bitBuffer <<= 8; + 
bitCount -= 8; + } + } +#endif + stream.Seek(pos); + } + } +#endif// SIMD_AVX512BW_ENABLE + +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + } +#endif// SIMD_NEON_ENABLE +} + +#endif//__SimdImageSaveJpeg_h__ diff --git a/3rdparty/simdlib/Simd/SimdImageSavePng.h b/3rdparty/simdlib/Simd/SimdImageSavePng.h new file mode 100644 index 0000000000..71efd1ca60 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdImageSavePng.h @@ -0,0 +1,235 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdImageSavePng_h__ +#define __SimdImageSavePng_h__ + +#include "Simd/SimdImageSave.h" +#include "Simd/SimdLoad.h" + +#define SIMD_PNG_ZLIB_BIT_REV_TABLE + +namespace Simd +{ + namespace Base + { + extern const uint16_t ZlibLenC[30]; + extern const uint8_t ZlibLenEb[29]; + extern const uint16_t ZlibDistC[31]; + extern const uint8_t ZlibDistEb[30]; + +#if defined(SIMD_PNG_ZLIB_BIT_REV_TABLE) + const int ZlibBitRevShift = 9; + const int ZlibBitRevSize = 1 << ZlibBitRevShift; + extern int ZlibBitRevTable[ZlibBitRevSize]; + SIMD_INLINE int ZlibBitRev(int bits, int count) /* table variant: reverses the low count bits of bits */ + { + assert(bits >= 0 && bits < ZlibBitRevSize && count >= 0 && count <= ZlibBitRevShift); /* bits indexes ZlibBitRevTable, so negative values must be rejected as well */ + return ZlibBitRevTable[bits] >> (ZlibBitRevShift - count); + } +#else + SIMD_INLINE int ZlibBitRev(int bits, int count) /* loop variant: same contract as the table variant */ + { + int rev = 0; + for (int b = 0; b < count; b++) /* int index: no signed/unsigned comparison with count, and a negative count runs zero iterations */ + { + rev = (rev << 1) | (bits & 1); + bits >>= 1; + } + return rev; + } +#endif + + SIMD_INLINE uint32_t ZlibHash(const uint8_t* data) + { + uint32_t hash = data[0] + (data[1] << 8) + (data[2] << 16); + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + return hash; + } + + SIMD_INLINE void ZlibHuffA(int bits, int count, OutputMemoryStream& stream) + { + stream.WriteBits(ZlibBitRev(bits, count), count); + } + + SIMD_INLINE void ZlibHuff1(int bits, OutputMemoryStream& stream) + { + ZlibHuffA(0x30 + bits, 8, stream); + } + + SIMD_INLINE void ZlibHuff2(int bits, OutputMemoryStream& stream) + { + ZlibHuffA(0x190 + bits - 144, 9, stream); + } + + SIMD_INLINE void ZlibHuff3(int bits, OutputMemoryStream& stream) + { + ZlibHuffA(0 + bits - 256, 7, stream); + } + + SIMD_INLINE void ZlibHuff4(int bits, OutputMemoryStream& stream) + { + ZlibHuffA(0xc0 + bits - 280, 8, stream); + } + + SIMD_INLINE void ZlibHuff(int bits, OutputMemoryStream& stream) + { + if (bits <= 143) + ZlibHuff1(bits, stream); + else if (bits <= 255) + ZlibHuff2(bits, stream); + else if (bits <= 279) + 
ZlibHuff3(bits, stream); + else + ZlibHuff4(bits, stream); + } + + SIMD_INLINE void ZlibHuffB(int bits, OutputMemoryStream& stream) + { + if (bits <= 143) + ZlibHuff1(bits, stream); + else + ZlibHuff2(bits, stream); + } + + SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit) + { + limit = Min(limit, 258); + int i = 0; +#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) + int limit8 = limit & (~7); + for (; i < limit8; i += 8) + if (*(uint64_t*)(a + i) != *(uint64_t*)(b + i)) + break; +#else + int limit4 = limit & (~3); + for (; i < limit4; i += 4) + if (*(uint32_t*)(a + i) != *(uint32_t*)(b + i)) + break; +#endif + for (; i < limit; i += 1) + if (a[i] != b[i]) + break; + return i; + } + + SIMD_INLINE uint8_t Paeth(int a, int b, int c) + { + int p = a + b - c, pa = abs(p - a), pb = abs(p - b), pc = abs(p - c); + if (pa <= pb && pa <= pc) + return uint8_t(a); + if (pb <= pc) + return uint8_t(b); + return uint8_t(c); + } + } + +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit) + { + limit = Min(limit, 258); + int i = 0; + int limit16 = limit & (~15); + for (; i < limit16; i += 16) + if (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128((__m128i*)(a + i)), _mm_loadu_si128((__m128i*)(b + i)))) != 0xFFFF) + break; +#if defined(SIMD_X64_ENABLE) + int limit8 = limit & (~7); + for (; i < limit8; i += 8) + if (*(uint64_t*)(a + i) != *(uint64_t*)(b + i)) + break; +#else + int limit4 = limit & (~3); + for (; i < limit4; i += 4) + if (*(uint32_t*)(a + i) != *(uint32_t*)(b + i)) + break; +#endif + for (; i < limit; i += 1) + if (a[i] != b[i]) + break; + return i; + } + } +#endif// SIMD_SSE41_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit) + { + limit = Min(limit, 258); + int i = 0; + for (; i < limit; i += 32) + { + __m256i _a = _mm256_loadu_si256((__m256i*)(a + i)); + __m256i _b = 
_mm256_loadu_si256((__m256i*)(b + i)); + uint32_t mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(_a, _b)); + if (mask != 0xFFFFFFFF) + { + i += _tzcnt_u32(~mask); + break; + } + } + return Min(i, limit); + } + } +#endif// SIMD_AVX2_ENABLE + +#ifdef SIMD_AVX512BW_ENABLE + namespace Avx512bw + { + SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit) + { + limit = Min(limit, 258); + int i = 0; + for (; i < limit; i += 64) + { + __m512i _a = _mm512_loadu_si512(a + i); + __m512i _b = _mm512_loadu_si512(b + i); + uint64_t mask = _mm512_cmp_epi8_mask(_a, _b, _MM_CMPINT_NE); + if (mask != 0) + { + i += (int)FirstNotZero64(mask); + break; + } + } + return Min(i, limit); + } + } +#endif// SIMD_AVX512BW_ENABLE + +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + } +#endif// SIMD_NEON_ENABLE +} + +#endif//__SimdImageSavePng_h__ diff --git a/3rdparty/simdlib/Simd/SimdLib.cpp b/3rdparty/simdlib/Simd/SimdLib.cpp index 89718bb80e..c168701413 100755 --- a/3rdparty/simdlib/Simd/SimdLib.cpp +++ b/3rdparty/simdlib/Simd/SimdLib.cpp @@ -61,8 +61,10 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved) #include "Simd/SimdConst.h" #include "Simd/SimdLog.h" -#include "Simd/SimdResizer.h" #include "Simd/SimdGaussianBlur.h" +#include "Simd/SimdImageLoad.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdResizer.h" #include "Simd/SimdBase.h" #include "Simd/SimdSse2.h" @@ -451,6 +453,34 @@ SIMD_API void SimdGrayToBgra(const uint8_t * gray, size_t width, size_t height, Base::GrayToBgra(gray, width, height, grayStride, bgra, bgraStride, alpha); } +SIMD_API uint8_t* SimdImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size) +{ + const static Simd::ImageSaveToMemoryPtr imageSaveToMemory = SIMD_FUNC3(ImageSaveToMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC); + + return imageSaveToMemory(src, stride, width, height, format, file, 
quality, size); +} + +SIMD_API SimdBool SimdImageSaveToFile(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char* path) +{ + const static Simd::ImageSaveToMemoryPtr imageSaveToMemory = SIMD_FUNC3(ImageSaveToMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC); + + return ImageSaveToFile(imageSaveToMemory, src, stride, width, height, format, file, quality, path); +} + +SIMD_API uint8_t* SimdImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format) +{ + const static Simd::ImageLoadFromMemoryPtr imageLoadFromMemory = SIMD_FUNC3(ImageLoadFromMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC); + + return imageLoadFromMemory(data, size, stride, width, height, format); +} + +SIMD_API uint8_t* SimdImageLoadFromFile(const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format) +{ + const static Simd::ImageLoadFromMemoryPtr imageLoadFromMemory = SIMD_FUNC3(ImageLoadFromMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC); + + return ImageLoadFromFile(imageLoadFromMemory, path, stride, width, height, format); +} + SIMD_API void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride) { diff --git a/3rdparty/simdlib/Simd/SimdLib.h b/3rdparty/simdlib/Simd/SimdLib.h index 4838b82261..5441805969 100755 --- a/3rdparty/simdlib/Simd/SimdLib.h +++ b/3rdparty/simdlib/Simd/SimdLib.h @@ -116,6 +116,27 @@ typedef enum SimdCpuInfoNeon, /*!< Availability of NEON (ARM). */ } SimdCpuInfoType; +/*! @ingroup c_types + Describes formats of image file. It is used in functions ::SimdImageSaveToMemory and ::SimdImageSaveToFile. +*/ +typedef enum +{ + /*! An undefined image file format (format auto choice). */ + SimdImageFileUndefined = 0, + /*! 
A PGM (Portable Gray Map) text (P2) image file format. */ + SimdImageFilePgmTxt, + /*! A PGM (Portable Gray Map) binary (P5) image file format. */ + SimdImageFilePgmBin, + /*! A PPM (Portable Pixel Map) text (P3) image file format. */ + SimdImageFilePpmTxt, + /*! A PPM (Portable Pixel Map) binary (P6) image file format. */ + SimdImageFilePpmBin, + /*! A PNG (Portable Network Graphics) image file format. */ + SimdImageFilePng, + /*! A JPEG (Joint Photographic Experts Group) image file format. */ + SimdImageFileJpeg, +} SimdImageFileType; + /*! @ingroup c_types Describes types of binary operation between two images performed by function ::SimdOperationBinary8u. Images must have the same format (unsigned 8-bit integer for every channel). @@ -167,18 +188,6 @@ typedef enum SimdPixelFormatFloat, /*! A single channel 64-bit float point pixel format. */ SimdPixelFormatDouble, - /*! A 8-bit Bayer pixel format (GRBG). */ - SimdPixelFormatBayerGrbg, - /*! A 8-bit Bayer pixel format (GBRG). */ - SimdPixelFormatBayerGbrg, - /*! A 8-bit Bayer pixel format (RGGB). */ - SimdPixelFormatBayerRggb, - /*! A 8-bit Bayer pixel format (BGGR). */ - SimdPixelFormatBayerBggr, - /*! A 24-bit (3 8-bit channels) HSV (Hue, Saturation, Value) pixel format. */ - SimdPixelFormatHsv24, - /*! A 24-bit (3 8-bit channels) HSL (Hue, Saturation, Lightness) pixel format. */ - SimdPixelFormatHsl24, /*! A 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */ SimdPixelFormatRgb24, /*! A 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. */ @@ -753,6 +762,82 @@ extern "C" SIMD_API void SimdGrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha); + /*! 
@ingroup image_io + + \fn uint8_t* SimdImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t * size); + + \short Saves an image to memory in given image file format. + + \param [in] src - a pointer to pixels data of input image. + \param [in] stride - a row size of input image in bytes. + \param [in] width - a width of input image. + \param [in] height - a height of input image. + \param [in] format - a pixel format of input image. + Supported pixel formats: ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32. + \param [in] file - a format of output image file. To choose the output file format automatically, set this parameter to ::SimdImageFileUndefined. + \param [in] quality - a parameter of compression quality (if file format supports it). + \param [out] size - a pointer to the size of output image file in bytes. + \return a pointer to memory buffer with output image file. + It has to be deleted after use by function ::SimdFree. On error it returns NULL. + */ + SIMD_API uint8_t* SimdImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t * size); + + /*! @ingroup image_io + + \fn SimdBool SimdImageSaveToFile(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char * path); + + \short Saves an image to file in given image file format. + + \param [in] src - a pointer to pixels data of input image. + \param [in] stride - a row size of input image in bytes. + \param [in] width - a width of input image. + \param [in] height - a height of input image. + \param [in] format - a pixel format of input image. 
+ Supported pixel formats: ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32. + \param [in] file - a format of output image file. To choose the output file format automatically, set this parameter to ::SimdImageFileUndefined. + \param [in] quality - a parameter of compression quality (if file format supports it). + \param [in] path - a path to output image file. + \return result of the operation. + */ + SIMD_API SimdBool SimdImageSaveToFile(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char * path); + + /*! @ingroup image_io + + \fn uint8_t* SimdImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format); + + \short Loads an image from memory buffer. + + \param [in] data - a pointer to memory buffer with input image file. + \param [in] size - a size of input image file in bytes. + \param [out] stride - a pointer to row size of output image in bytes. + \param [out] width - a pointer to width of output image. + \param [out] height - a pointer to height of output image. + \param [in, out] format - a pointer to pixel format of output image. + Here you can set desired pixel format (it can be ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32). + Or set ::SimdPixelFormatNone and use pixel format of input image file. + \return a pointer to pixels data of output image. + It has to be deleted after use by function ::SimdFree. On error it returns NULL. + */ + SIMD_API uint8_t* SimdImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format); + + /*! 
@ingroup image_io + + \fn uint8_t* SimdImageLoadFromFile(const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format); + + \short Loads an image from file. + + \param [in] path - a path to input image file. + \param [out] stride - a pointer to row size of output image in bytes. + \param [out] width - a pointer to width of output image. + \param [out] height - a pointer to height of output image. + \param [in, out] format - a pointer to pixel format of output image. + Here you can set desired pixel format (it can be ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32). + Or set ::SimdPixelFormatNone and use pixel format of input image file. + \return a pointer to pixels data of output image. + It has to be deleted after use by function ::SimdFree. On error it returns NULL. + */ + SIMD_API uint8_t* SimdImageLoadFromFile(const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format); + /*! 
@ingroup other_conversion \fn void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride); diff --git a/3rdparty/simdlib/Simd/SimdMath.h b/3rdparty/simdlib/Simd/SimdMath.h index 0f7425f76e..f8c192a189 100755 --- a/3rdparty/simdlib/Simd/SimdMath.h +++ b/3rdparty/simdlib/Simd/SimdMath.h @@ -750,6 +750,11 @@ namespace Simd return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(lo), _mm256_castsi256_ps(hi), imm)); } + template <int imm> SIMD_INLINE __m256i Shuffle64i(__m256i lo, __m256i hi) /* imm is the compile-time selector forwarded to _mm256_shuffle_pd */ + { + return _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(lo), _mm256_castsi256_pd(hi), imm)); + } + template <int imm> SIMD_INLINE __m256 Permute4x64(__m256 a) { return _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(a), imm)); diff --git a/3rdparty/simdlib/Simd/SimdMemory.h b/3rdparty/simdlib/Simd/SimdMemory.h index d7772ffa3c..f0fca8840a 100755 --- a/3rdparty/simdlib/Simd/SimdMemory.h +++ b/3rdparty/simdlib/Simd/SimdMemory.h @@ -35,6 +35,18 @@ namespace Simd { + SIMD_INLINE size_t DivHi(size_t value, size_t divider) /* integer division rounded up */ + { + return (value + divider - 1) / divider; + } + + SIMD_INLINE size_t Pow2Hi(size_t value) /* smallest power of two that is not less than value */ + { + size_t pow2 = 1; + for (; pow2 < value; pow2 *= 2); + return pow2; + } + SIMD_INLINE size_t AlignHiAny(size_t size, size_t align) { return (size + align - 1) / align * align; @@ -108,6 +120,13 @@ namespace Simd return ptr; } + template<class T> T* Allocate(uint8_t*& buffer, size_t size, size_t align = SIMD_ALIGN) /* returns buffer viewed as T* and advances buffer past size elements, rounded up to align; allocates nothing itself */ + { + T* ptr = (T*)buffer; + buffer = buffer + AlignHi(size * sizeof(T), align); + return ptr; + } + SIMD_INLINE void Free(void * ptr) { #ifdef SIMD_NO_MANS_LAND diff --git a/3rdparty/simdlib/Simd/SimdMemoryStream.h b/3rdparty/simdlib/Simd/SimdMemoryStream.h new file mode 100644 index 0000000000..9665f33d63 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdMemoryStream.h @@ -0,0 +1,510 @@ +/* +* Simd Library 
(http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdMemoryStream_h__ +#define __SimdMemoryStream_h__ + +#include "Simd/SimdMemory.h" +#include "Simd/SimdPerformance.h" + +namespace Simd +{ + class InputMemoryStream /* read cursor over a caller-provided byte range; stores the pointer and never frees it */ + { + const uint8_t* _data; + size_t _pos, _size, _bitCount; +#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) + uint64_t _bitBuffer; +#else + uint32_t _bitBuffer; +#endif + + public: + SIMD_INLINE InputMemoryStream(const uint8_t* data = NULL, size_t size = 0) + { + Init(data, size); + } + + SIMD_INLINE void Init(const uint8_t* data, size_t size) + { + _pos = 0; + _data = data; + _size = size; + _bitBuffer = 0; + _bitCount = 0; + } + + SIMD_INLINE bool Seek(size_t pos) /* pos equal to the stream size is accepted */ + { + if (pos <= _size) + { + _pos = pos; + return true; + } + return false; + } + + SIMD_INLINE size_t Size() const + { + return _size; + } + + SIMD_INLINE const uint8_t* Data() const + { + return _data; + } + + SIMD_INLINE size_t Pos() const + { + return _pos; + } + + SIMD_INLINE const uint8_t* Current() const + { + return _data + _pos; + } + + SIMD_INLINE bool Eof() const + { + return _pos >= _size; + } + + SIMD_INLINE bool CanRead(size_t size) const + { + return _pos + size <= _size; + } + + SIMD_INLINE size_t Read(size_t size, void* data) /* copies at most size bytes and returns the number actually copied */ + { + size = Min(_size - _pos, size); + memcpy(data, _data + _pos, size); + _pos += size; + return size; + } + + template <class Value> SIMD_INLINE bool Read(Value & value) + { + return Read(sizeof(Value), &value) == sizeof(Value); + } + + SIMD_INLINE bool Read8u(uint8_t & value) + { + if (_pos < _size) + { + value = _data[_pos++]; + return true; + } + else + return false; + } + + SIMD_INLINE bool Read16u(uint16_t& value) /* native byte order */ + { + if (_pos + 2 <= _size) + { + memcpy(&value, _data + _pos, sizeof(value)); /* byte copy: the read position may be unaligned, so no load through a cast uint16_t pointer */ + _pos += 2; + return true; + } + else + return false; + } + + SIMD_INLINE bool Read32u(uint32_t& value) /* native byte order */ + { + if (_pos + 4 <= _size) + { + memcpy(&value, _data + _pos, sizeof(value)); /* byte copy: the read position may be unaligned, so no load through a cast uint32_t pointer */ + _pos += 4; + return true; + } + else + return false; + } + + SIMD_INLINE bool ReadBe16u(uint16_t& value) + { + if 
(Read16u(value)) + { +#if !defined(SIMD_BIG_ENDIAN) + value = + (value & 0x00FF) << 8 | + (value & 0xFF00) >> 8; +#endif + return true; + } + else + return false; + } + + SIMD_INLINE bool ReadBe32u(uint32_t& value) + { + if (Read32u(value)) + { +#if !defined(SIMD_BIG_ENDIAN) + value = + (value & 0x000000FF) << 24 | + (value & 0x0000FF00) << 8 | + (value & 0x00FF0000) >> 8 | + (value & 0xFF000000) >> 24; +#endif + return true; + } + else + return false; + } + + template <class Unsigned> SIMD_INLINE bool ReadUnsigned(Unsigned& value) /* skips leading gaps, then parses decimal digits up to the next gap or end of stream; false on a non-digit */ + { + if (!SkipGap()) + return false; + value = 0; + while (_pos < _size && !IsGap(_data[_pos])) /* bounds test first so the byte one past the end is never read */ + { + if (_data[_pos] >= '0' && _data[_pos] <= '9') + value = value * 10 + Unsigned(_data[_pos] - '0'); + else + return false; + _pos++; + } + return true; + } + + SIMD_INLINE bool Skip(size_t size) /* NOTE(review): the strict comparison rejects a skip that lands exactly on end of stream, unlike Seek() and CanRead() - confirm this is intended */ + { + if (_pos + size < _size) + { + _pos += size; + return true; + } + return false; + } + + SIMD_INLINE bool SkipValue(uint8_t value) /* returns false when the end of stream is reached */ + { + while (_pos < _size && _data[_pos] == value) /* bounds test first so the byte one past the end is never read */ + _pos++; + return _pos < _size; + } + + SIMD_INLINE bool SkipNotGap() /* returns false when the end of stream is reached */ + { + while (_pos < _size && !IsGap(_data[_pos])) /* bounds test first so the byte one past the end is never read */ + _pos++; + return _pos < _size; + } + + SIMD_INLINE bool SkipGap() /* returns false when the end of stream is reached */ + { + while (_pos < _size && IsGap(_data[_pos])) /* bounds test first so the byte one past the end is never read */ + _pos++; + return _pos < _size; + } + + static SIMD_INLINE bool IsGap(uint8_t value) + { + return value == ' ' || value == '\t' || value == '\n' || value == '\r'; + } + +#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) + SIMD_INLINE uint64_t& BitBuffer() + { + return _bitBuffer; + } +#else + SIMD_INLINE uint32_t& BitBuffer() + { + return _bitBuffer; + } +#endif + + SIMD_INLINE size_t& BitCount() + { + return _bitCount; + } + + SIMD_INLINE void FillBits() + { + static const size_t canReadByte = (sizeof(_bitBuffer) - 1) * 8; + while (_bitCount <= canReadByte && _pos < _size) + { + _bitBuffer |= (size_t)_data[_pos++] << _bitCount; + _bitCount += 8; + } + } + + SIMD_INLINE void ClearBits() + { + _pos -= _bitCount / 8; + _bitBuffer = 0; + 
_bitCount = 0; + } + + SIMD_INLINE bool ReadBits(size_t & bits, size_t count) + { + if (_bitCount < count) + FillBits(); + if (_bitCount < count) + return false; + bits = _bitBuffer & ((size_t(1) << count) - 1); + _bitBuffer >>= count; + _bitCount -= count; + return true; + } + + SIMD_INLINE size_t ReadBits(size_t count) + { + if (_bitCount < count) + FillBits(); + size_t bits = _bitBuffer & ((size_t(1) << count) - 1); + _bitBuffer >>= count; + _bitCount -= count; + return bits; + } + }; + + //------------------------------------------------------------------------- + + class OutputMemoryStream /* growable byte buffer with a write cursor; the buffer is freed in the destructor unless Release() handed it out */ + { + const size_t CAPACITY_MIN = 64; + + uint8_t * _data; + size_t _pos, _size, _capacity, _bitCount; +#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) + uint64_t _bitBuffer; +#else + uint32_t _bitBuffer; +#endif + + SIMD_INLINE void Reset(bool owner) + { + if (_data && owner) + Free(_data); + _data = NULL; + _pos = 0; + _size = 0; + _capacity = 0; + _bitBuffer = 0; + _bitCount = 0; + } + + public: + SIMD_INLINE OutputMemoryStream(size_t capacity = 0) + { + Reset(false); + if (capacity) + Reserve(capacity); + } + + SIMD_INLINE ~OutputMemoryStream() + { + Reset(true); + } + + SIMD_INLINE void Seek(size_t pos) /* moves the cursor to pos, extending size and capacity when pos lies beyond them */ + { + Reserve(pos); /* grow first: Reserve() copies _size bytes out of the old buffer, so _size must not exceed the old capacity yet */ + _pos = pos; + _size = Max(_size, _pos); + } + + SIMD_INLINE size_t Pos() const + { + return _pos; + } + + SIMD_INLINE size_t Size() const + { + return _size; + } + + SIMD_INLINE size_t Capacity() const + { + return _capacity; + } + + SIMD_INLINE uint8_t* Data() + { + return _data; + } + + SIMD_INLINE const uint8_t * Data() const + { + return _data; + } + + SIMD_INLINE uint8_t* Current() + { + return _data + _pos; + } + + SIMD_INLINE const uint8_t* Current() const + { + return _data + _pos; + } + + SIMD_INLINE void Write(const void * data, size_t size) + { + Reserve(_pos + size); + memcpy(_data + _pos, data, size); + _pos += size; + _size = Max(_size, _pos); + } + + SIMD_INLINE bool Write(InputMemoryStream & input, size_t size) + { + if 
(input.CanRead(size)) + { + Write(input.Current(), size); + input.Seek(input.Pos() + size); /* CanRead() above keeps this in range; Skip() would refuse to land exactly on end of stream and leave the input position unchanged */ + return true; + } + return false; + } + + SIMD_INLINE bool WriteSelf(ptrdiff_t offset, size_t size) /* appends size bytes taken from this stream starting at offset; the source range may overlap the bytes being written */ + { + if (offset < 0) + return false; + Reserve(_pos + size); + if (offset + size > _pos) + { + for (size_t i = 0; i < size; ++i) + _data[_pos++] = _data[offset++]; + } + else + { + memcpy(_data + _pos, _data + offset, size); + _pos += size; + } + _size = Max(_size, _pos); + return true; + } + + template <class Value> SIMD_INLINE void Write(const Value& value) + { + Write(&value, sizeof(Value)); + } + + SIMD_INLINE void Write8u(uint8_t value) + { + Reserve(_pos + 1); + _data[_pos++] = value; + _size = Max(_size, _pos); + } + + SIMD_INLINE void Write8u(uint8_t value, size_t count) + { + Reserve(_pos + count); + memset(_data + _pos, value, count); + _pos += count; + _size = Max(_size, _pos); + } + + SIMD_INLINE void WriteBe32u(const uint32_t & value) + { +#if defined(SIMD_BIG_ENDIAN) + Write(value); +#else + Write( + (value & 0x000000FF) << 24 | + (value & 0x0000FF00) << 8 | + (value & 0x00FF0000) >> 8 | + (value & 0xFF000000) >> 24); +#endif + } + + SIMD_INLINE uint8_t* Release(size_t* size = NULL) /* hands the buffer to the caller and resets the stream without freeing it */ + { + uint8_t* data = _data; + if(size) + *size = _size; + Reset(false); + return data; + } + + SIMD_INLINE void Reserve(size_t size) /* grows capacity to at least size, keeping the first _size bytes */ + { + if (size > _capacity) + { + size_t capacity = Max(CAPACITY_MIN, Max(_capacity * 2, size)); + uint8_t* data = (uint8_t*)Allocate(capacity, SIMD_ALIGN); + if (_data) + { + memcpy(data, _data, _size); + Free(_data); + } + _data = data; + _capacity = capacity; + } + } + + SIMD_INLINE void WriteBits(const size_t bits, size_t count) + { + _bitBuffer |= (bits) << _bitCount; + _bitCount += count; + while (_bitCount >= 8) + { + Write8u((uint8_t)_bitBuffer); + _bitBuffer >>= 8; + _bitCount -= 8; + } + } + + SIMD_INLINE void FlushBits() + { + while (_bitCount >= 8) + { + Write8u((uint8_t)_bitBuffer); + _bitBuffer >>= 8; + _bitCount -= 8; + } + if (_bitCount) + { + 
Write8u((uint8_t)_bitBuffer); + _bitBuffer = 0; + _bitCount = 0; + } + } + +#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE) + SIMD_INLINE uint64_t & BitBuffer() + { + return _bitBuffer; + } +#else + SIMD_INLINE uint32_t& BitBuffer() + { + return _bitBuffer; + } +#endif + + SIMD_INLINE size_t& BitCount() + { + return _bitCount; + } + }; +} + +#endif//__SimdMemoryStream_h__ diff --git a/3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp b/3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp new file mode 100644 index 0000000000..61c5d90359 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp @@ -0,0 +1,154 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageLoad.h" +#include "Simd/SimdNeon.h" + +#include <memory> + +namespace Simd +{ +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param) + : Base::ImagePgmTxtLoader(param) + { + } + + void ImagePgmTxtLoader::SetConverters() /* starts from the Base converters and installs Neon ones only when the image is at least A pixels wide */ + { + Base::ImagePgmTxtLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _toAny = Neon::GrayToBgr; break; + case SimdPixelFormatBgra32: _toBgra = Neon::GrayToBgra; break; + case SimdPixelFormatRgb24: _toAny = Neon::GrayToBgr; break; + case SimdPixelFormatRgba32: _toBgra = Neon::GrayToBgra; break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param) + : Base::ImagePgmBinLoader(param) + { + } + + void ImagePgmBinLoader::SetConverters() /* same converter selection as the text PGM loader */ + { + Base::ImagePgmBinLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _toAny = Neon::GrayToBgr; break; + case SimdPixelFormatBgra32: _toBgra = Neon::GrayToBgra; break; + case SimdPixelFormatRgb24: _toAny = Neon::GrayToBgr; break; + case SimdPixelFormatRgba32: _toBgra = Neon::GrayToBgra; break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param) + : Base::ImagePpmTxtLoader(param) + { + } + + void ImagePpmTxtLoader::SetConverters() /* starts from the Base converters and installs Neon ones only when the image is at least A pixels wide */ + { + Base::ImagePpmTxtLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _toAny = Neon::RgbToGray; break; + case SimdPixelFormatBgr24: _toAny = Neon::BgrToRgb; break; + case SimdPixelFormatBgra32: _toBgra = Neon::RgbToBgra; break; + case SimdPixelFormatRgba32: _toBgra = Neon::BgrToBgra; break; + } + } + } + + //--------------------------------------------------------------------- + + 
ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param) + : Base::ImagePpmBinLoader(param) + { + } + + void ImagePpmBinLoader::SetConverters() /* starts from the Base converters and installs Neon ones only when the image is at least A pixels wide */ + { + Base::ImagePpmBinLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _toAny = Neon::RgbToGray; break; + case SimdPixelFormatBgr24: _toAny = Neon::BgrToRgb; break; + case SimdPixelFormatBgra32: _toBgra = Neon::RgbToBgra; break; + case SimdPixelFormatRgba32: _toBgra = Neon::BgrToBgra; break; + } + } + } + + //--------------------------------------------------------------------- + + ImageLoader* CreateImageLoader(const ImageLoaderParam& param) /* returns a new loader for param.file, or NULL for an unsupported file type; the caller owns the result */ + { + switch (param.file) + { + case SimdImageFilePgmTxt: return new ImagePgmTxtLoader(param); + case SimdImageFilePgmBin: return new ImagePgmBinLoader(param); + case SimdImageFilePpmTxt: return new ImagePpmTxtLoader(param); + case SimdImageFilePpmBin: return new ImagePpmBinLoader(param); + case SimdImageFilePng: return new Base::ImagePngLoader(param); + case SimdImageFileJpeg: return new Base::ImageJpegLoader(param); + default: + return NULL; + } + } + + uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format) /* format is dereferenced unconditionally and must not be NULL; returns NULL on any failure */ + { + ImageLoaderParam param(data, size, *format); + if (param.Validate()) + { + Holder<ImageLoader> loader(CreateImageLoader(param)); + if (loader) + { + if (loader->FromStream()) + return loader->Release(stride, width, height, format); + } + } + return NULL; + } + } +#endif// SIMD_NEON_ENABLE +} diff --git a/3rdparty/simdlib/Simd/SimdNeonImageSave.cpp b/3rdparty/simdlib/Simd/SimdNeonImageSave.cpp new file mode 100644 index 0000000000..a0fbbd071a --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdNeonImageSave.cpp @@ -0,0 +1,134 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdNeon.h" + +#include + +namespace Simd +{ +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param) + : Base::ImagePgmTxtSaver(param) + { + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _convert = Neon::BgrToGray; break; + case SimdPixelFormatBgra32: _convert = Neon::BgraToGray; break; + case SimdPixelFormatRgb24: _convert = Neon::RgbToGray; break; + case SimdPixelFormatRgba32: _convert = Neon::RgbaToGray; break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param) + : Base::ImagePgmBinSaver(param) + { + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _convert = Neon::BgrToGray; break; + case SimdPixelFormatBgra32: _convert = Neon::BgraToGray; break; + case SimdPixelFormatRgb24: _convert = Neon::RgbToGray; break; + case SimdPixelFormatRgba32: _convert = Neon::RgbaToGray; break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param) + : Base::ImagePpmTxtSaver(param) + { + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _convert = Neon::GrayToBgr; break; + case SimdPixelFormatBgr24: _convert = Neon::BgrToRgb; break; + case SimdPixelFormatBgra32: _convert = Neon::BgraToRgb; break; + case SimdPixelFormatRgba32: _convert = Neon::BgraToBgr; break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param) + : Base::ImagePpmBinSaver(param) + { + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _convert = Neon::GrayToBgr; break; + case SimdPixelFormatBgr24: _convert = Neon::BgrToRgb; break; 
+ case SimdPixelFormatBgra32: _convert = Neon::BgraToRgb; break; + case SimdPixelFormatRgba32: _convert = Neon::BgraToBgr; break; + } + } + } + + //--------------------------------------------------------------------- + + ImageSaver* CreateImageSaver(const ImageSaverParam& param) + { + switch (param.file) + { + case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param); + case SimdImageFilePgmBin: return new ImagePgmBinSaver(param); + case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param); + case SimdImageFilePpmBin: return new ImagePpmBinSaver(param); + case SimdImageFilePng: return new ImagePngSaver(param); + case SimdImageFileJpeg: return new Base::ImageJpegSaver(param); + default: + return NULL; + } + } + + uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size) + { + ImageSaverParam param(width, height, format, file, quality); + if (param.Validate()) + { + Holder saver(CreateImageSaver(param)); + if (saver) + { + if (saver->ToStream(src, stride)) + return saver->Release(size); + } + } + return NULL; + } + } +#endif// SIMD_NEON_ENABLE +} diff --git a/3rdparty/simdlib/Simd/SimdPerformance.h b/3rdparty/simdlib/Simd/SimdPerformance.h new file mode 100644 index 0000000000..e695326a69 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdPerformance.h @@ -0,0 +1,197 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdPerformance_h__ +#define __SimdPerformance_h__ + +#include "Simd/SimdDefs.h" + +#include +#include + +namespace Simd +{ + typedef std::string String; + + template SIMD_INLINE String ToStr(const T & value) + { + std::stringstream ss; + ss << value; + return ss.str(); + } +} + +#if defined(SIMD_PERFORMANCE_STATISTIC) && (defined(NDEBUG) || defined(SIMD_PERF_STAT_IN_DEBUG)) + +#include "Simd/SimdTime.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Simd +{ + namespace Base + { + class PerformanceMeasurer + { + String _name; + int64_t _start, _current, _total, _min, _max; + int64_t _count, _flop; + bool _entered, _paused; + + public: + PerformanceMeasurer(const String& name = "Unknown", int64_t flop = 0); + + PerformanceMeasurer(const PerformanceMeasurer& pm); + + void Enter(); + + void Leave(bool pause = false); + + String Statistic() const; + + void Combine(const PerformanceMeasurer& other); + + private: + double Average() const; + double GFlops() const; + }; + + class PerformanceMeasurerHolder + { + PerformanceMeasurer * _pm; + + public: + SIMD_INLINE PerformanceMeasurerHolder(PerformanceMeasurer * pm, bool enter = true) + : _pm(pm) + { + if (_pm && enter) + _pm->Enter(); + } + + SIMD_INLINE void Enter() + { + if (_pm) + _pm->Enter(); + } + + SIMD_INLINE void Leave(bool pause) + { + if (_pm) + _pm->Leave(pause); + } + + SIMD_INLINE ~PerformanceMeasurerHolder() + { + if (_pm) + _pm->Leave(); + } + }; + + class PerformanceMeasurerStorage + { + typedef PerformanceMeasurer Pm; + typedef std::shared_ptr PmPtr; + typedef std::map FunctionMap; + typedef std::map ThreadMap; + + ThreadMap _map; + mutable std::mutex _mutex; + String _report; + + SIMD_INLINE FunctionMap & ThisThread() + { + static thread_local FunctionMap * thread = NULL; + if (thread == NULL) + { + std::lock_guard lock(_mutex); + thread = &_map[std::this_thread::get_id()]; + } + return *thread; + } + + public: + static 
PerformanceMeasurerStorage s_storage; + + PerformanceMeasurerStorage() + { + } + + SIMD_INLINE PerformanceMeasurer * Get(const String & name, int64_t flop = 0) + { + FunctionMap & thread = ThisThread(); + PerformanceMeasurer * pm = NULL; + FunctionMap::iterator it = thread.find(name); + if (it == thread.end()) + { + pm = new PerformanceMeasurer(name, flop); + thread[name].reset(pm); + } + else + pm = it->second.get(); + return pm; + } + + SIMD_INLINE PerformanceMeasurer * Get(const String func, const String & desc, int64_t flop = 0) + { + return Get(func + "{ " + desc + " }", flop); + } + + const char* PerformanceStatistic(); + }; + } +} +#define SIMD_PERF_FUNCF(flop) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)(Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, (int64_t)(flop))) +#define SIMD_PERF_FUNC() SIMD_PERF_FUNCF(0) +#define SIMD_PERF_BEGF(desc, flop) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)(Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc, (int64_t)(flop))) +#define SIMD_PERF_BEG(desc) SIMD_PERF_BEGF(desc, 0) +#define SIMD_PERF_IFF(cond, desc, flop) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)((cond) ? 
Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc, (int64_t)(flop)) : NULL) +#define SIMD_PERF_IF(cond, desc) SIMD_PERF_IFF(cond, desc, 0) +#define SIMD_PERF_END(desc) Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc)->Leave(); +#define SIMD_PERF_INITF(name, desc, flop) Simd::Base::PerformanceMeasurerHolder name(Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc, (int64_t)(flop)), false); +#define SIMD_PERF_INIT(name, desc) SIMD_PERF_INITF(name, desc, 0); +#define SIMD_PERF_START(name) name.Enter(); +#define SIMD_PERF_PAUSE(name) name.Leave(true); +#define SIMD_PERF_EXT(ext) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)((ext)->Perf(SIMD_FUNCTION)) +#else//SIMD_PERFORMANCE_STATISTIC +#define SIMD_PERF_FUNCF(flop) +#define SIMD_PERF_FUNC() +#define SIMD_PERF_BEGF(desc, flop) +#define SIMD_PERF_BEG(desc) +#define SIMD_PERF_IFF(cond, desc, flop) +#define SIMD_PERF_IF(cond, desc) +#define SIMD_PERF_END(desc) +#define SIMD_PERF_INITF(name, desc, flop) +#define SIMD_PERF_INIT(name, desc) +#define SIMD_PERF_START(name) +#define SIMD_PERF_PAUSE(name) +#define SIMD_PERF_EXT(ext) +#endif//SIMD_PERFORMANCE_STATISTIC + +#endif//__SimdPerformance_h__ diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp new file mode 100644 index 0000000000..eca83c63ed --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp @@ -0,0 +1,159 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageLoad.h" +#include "Simd/SimdSse2.h" +#include "Simd/SimdSse41.h" + +#include + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param) + : Base::ImagePgmTxtLoader(param) + { + } + + void ImagePgmTxtLoader::SetConverters() + { + Base::ImagePgmTxtLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _toAny = Sse41::GrayToBgr; break; + case SimdPixelFormatBgra32: _toBgra = Sse2::GrayToBgra; break; + case SimdPixelFormatRgb24: _toAny = Sse41::GrayToBgr; break; + case SimdPixelFormatRgba32: _toBgra = Sse41::GrayToBgra; break; + default: break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param) + : Base::ImagePgmBinLoader(param) + { + } + + void ImagePgmBinLoader::SetConverters() + { + Base::ImagePgmBinLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _toAny = Sse41::GrayToBgr; break; + case SimdPixelFormatBgra32: _toBgra = Sse2::GrayToBgra; break; + case SimdPixelFormatRgb24: _toAny = Sse41::GrayToBgr; break; + case SimdPixelFormatRgba32: _toBgra = Sse41::GrayToBgra; break; + default: break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param) + : Base::ImagePpmTxtLoader(param) + { + } + + void ImagePpmTxtLoader::SetConverters() + { + Base::ImagePpmTxtLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _toAny = Sse41::RgbToGray; break; + case SimdPixelFormatBgr24: _toAny = Sse41::BgrToRgb; break; + case SimdPixelFormatBgra32: _toBgra = Sse41::RgbToBgra; break; + case SimdPixelFormatRgba32: _toBgra = Sse41::BgrToBgra; break; + default: break; + 
} + } + } + + //--------------------------------------------------------------------- + + ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param) + : Base::ImagePpmBinLoader(param) + { + } + + void ImagePpmBinLoader::SetConverters() + { + Base::ImagePpmBinLoader::SetConverters(); + if (_image.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _toAny = Sse41::RgbToGray; break; + case SimdPixelFormatBgr24: _toAny = Sse41::BgrToRgb; break; + case SimdPixelFormatBgra32: _toBgra = Sse41::RgbToBgra; break; + case SimdPixelFormatRgba32: _toBgra = Sse41::BgrToBgra; break; + default: break; + } + } + } + + //--------------------------------------------------------------------- + + ImageLoader* CreateImageLoader(const ImageLoaderParam& param) + { + switch (param.file) + { + case SimdImageFilePgmTxt: return new ImagePgmTxtLoader(param); + case SimdImageFilePgmBin: return new ImagePgmBinLoader(param); + case SimdImageFilePpmTxt: return new ImagePpmTxtLoader(param); + case SimdImageFilePpmBin: return new ImagePpmBinLoader(param); + case SimdImageFilePng: return new ImagePngLoader(param); + case SimdImageFileJpeg: return new Base::ImageJpegLoader(param); + default: + return NULL; + } + } + + uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format) + { + ImageLoaderParam param(data, size, *format); + if (param.Validate()) + { + std::unique_ptr loader(CreateImageLoader(param)); + if (loader) + { + if (loader->FromStream()) + return loader->Release(stride, width, height, format); + } + } + return NULL; + } + } +#endif// SIMD_SSE41_ENABLE +} diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp new file mode 100644 index 0000000000..1ec6ca0118 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp @@ -0,0 +1,1805 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). 
+* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdImageLoad.h" +#include "Simd/SimdArray.h" +#include "Simd/SimdCpu.h" +#include "Simd/SimdBase.h" +#include "Simd/SimdSse2.h" +#include "Simd/SimdSse41.h" + +namespace Simd +{ +#if defined(SIMD_SSE41_ENABLE) + namespace Sse41 + { + typedef unsigned char png_uc; + typedef unsigned short png_us; + + typedef uint16_t png__uint16; + typedef uint32_t png__uint32; + +#define png_inline SIMD_INLINE +#define PNG_ASSERT assert +#define PNG_MALLOC(sz) malloc(sz) +#define PNG_REALLOC(p,newsz) realloc(p,newsz) +#define PNG_FREE(p) free(p) +#define PNG_REALLOC_SIZED(p,oldsz,newsz) PNG_REALLOC(p,newsz) +#define STBIDEF static + +#ifdef _MSC_VER +#define PNG_NOTUSED(v) (void)(v) +#else +#define PNG_NOTUSED(v) (void)sizeof(v) +#endif + +#define PNG__BYTECAST(x) ((png_uc) ((x) & 255)) // truncate int to byte without warnings +#define PNG_MAX_DIMENSIONS (1 << 24) + + static int png__err(const char* str, const char* stub) + { + return 0; + } + +#define png__errpuc(x,y) ((unsigned char *)(size_t) (png__err(x,y)?NULL:NULL)) + + static void* png__malloc(size_t size) + { + return PNG_MALLOC(size); + } + + typedef struct + { + int (*read) (void* user, char* data, int size); // fill 'data' with 'size' bytes. 
return number of bytes actually read + void (*skip) (void* user, int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative + int (*eof) (void* user); // returns nonzero if we are at end of file/data + } png_io_callbacks; + + typedef struct + { + png__uint32 img_x, img_y; + int img_n, img_out_n; + + png_io_callbacks io; + void* io_user_data; + + int read_from_callbacks; + int buflen; + png_uc buffer_start[128]; + int callback_already_read; + + png_uc* img_buffer, * img_buffer_end; + png_uc* img_buffer_original, * img_buffer_original_end; + } png__context; + + typedef struct + { + int bits_per_channel; + int num_channels; + int channel_order; + } png__result_info; + + enum + { + PNG__SCAN_load = 0, + PNG__SCAN_type, + PNG__SCAN_header + }; + + enum + { + PNG_ORDER_RGB, + PNG_ORDER_BGR + }; + + static void png__rewind(png__context* s) + { + // conceptually rewind SHOULD rewind to the beginning of the stream, + // but we just rewind to the beginning of the initial buffer, because + // we only use it after doing 'test', which only ever looks at at most 92 bytes + s->img_buffer = s->img_buffer_original; + s->img_buffer_end = s->img_buffer_original_end; + } + + static void png__refill_buffer(png__context* s) + { + int n = (s->io.read)(s->io_user_data, (char*)s->buffer_start, s->buflen); + s->callback_already_read += (int)(s->img_buffer - s->img_buffer_original); + if (n == 0) { + // at end of file, treat same as if from memory, but need to handle case + // where s->img_buffer isn't pointing to safe memory, e.g. 
0-byte file + s->read_from_callbacks = 0; + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + 1; + *s->img_buffer = 0; + } + else { + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + n; + } + } + + png_inline static png_uc png__get8(png__context* s) + { + if (s->img_buffer < s->img_buffer_end) + return *s->img_buffer++; + if (s->read_from_callbacks) { + png__refill_buffer(s); + return *s->img_buffer++; + } + return 0; + } + + static int png__get16be(png__context* s) + { + int z = png__get8(s); + return (z << 8) + png__get8(s); + } + + static png__uint32 png__get32be(png__context* s) + { + png__uint32 z = png__get16be(s); + return (z << 16) + png__get16be(s); + } + + png_inline static int png__at_eof(png__context* s) + { + if (s->io.read) { + if (!(s->io.eof)(s->io_user_data)) return 0; + // if feof() is true, check if buffer = end + // special case: we've only got the special 0 character at the end + if (s->read_from_callbacks == 0) return 1; + } + + return s->img_buffer >= s->img_buffer_end; + } + + static void png__skip(png__context* s, int n) + { + if (n == 0) return; // already there! 
+ if (n < 0) { + s->img_buffer = s->img_buffer_end; + return; + } + if (s->io.read) { + int blen = (int)(s->img_buffer_end - s->img_buffer); + if (blen < n) { + s->img_buffer = s->img_buffer_end; + (s->io.skip)(s->io_user_data, n - blen); + return; + } + } + s->img_buffer += n; + } + + static int png__getn(png__context* s, png_uc* buffer, int n) + { + if (s->io.read) { + int blen = (int)(s->img_buffer_end - s->img_buffer); + if (blen < n) { + int res, count; + + memcpy(buffer, s->img_buffer, blen); + + count = (s->io.read)(s->io_user_data, (char*)buffer + blen, n - blen); + res = (count == (n - blen)); + s->img_buffer = s->img_buffer_end; + return res; + } + } + + if (s->img_buffer + n <= s->img_buffer_end) { + memcpy(buffer, s->img_buffer, n); + s->img_buffer += n; + return 1; + } + else + return 0; + } + + static int png__addsizes_valid(int a, int b) + { + if (b < 0) return 0; + // now 0 <= b <= INT_MAX, hence also + // 0 <= INT_MAX - b <= INTMAX. + // And "a + b <= INT_MAX" (which might overflow) is the + // same as a <= INT_MAX - b (no overflow) + return a <= INT_MAX - b; + } + + // returns 1 if the product is valid, 0 on overflow. + // negative factors are considered invalid. 
+ static int png__mul2sizes_valid(int a, int b) + { + if (a < 0 || b < 0) return 0; + if (b == 0) return 1; // mul-by-0 is always safe + // portable way to check for no overflows in a*b + return a <= INT_MAX / b; + } + + // returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow + static int png__mad2sizes_valid(int a, int b, int add) + { + return png__mul2sizes_valid(a, b) && png__addsizes_valid(a * b, add); + } + + // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow + static int png__mad3sizes_valid(int a, int b, int c, int add) + { + return png__mul2sizes_valid(a, b) && png__mul2sizes_valid(a * b, c) && + png__addsizes_valid(a * b * c, add); + } + + // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow + static int png__mad4sizes_valid(int a, int b, int c, int d, int add) + { + return png__mul2sizes_valid(a, b) && png__mul2sizes_valid(a * b, c) && + png__mul2sizes_valid(a * b * c, d) && png__addsizes_valid(a * b * c * d, add); + } + + // mallocs with size overflow checking + static void* png__malloc_mad2(int a, int b, int add) + { + if (!png__mad2sizes_valid(a, b, add)) return NULL; + return png__malloc(a * b + add); + } + + static void* png__malloc_mad3(int a, int b, int c, int add) + { + if (!png__mad3sizes_valid(a, b, c, add)) return NULL; + return png__malloc(a * b * c + add); + } + + static void* png__malloc_mad4(int a, int b, int c, int d, int add) + { + if (!png__mad4sizes_valid(a, b, c, d, add)) return NULL; + return png__malloc(a * b * c * d + add); + } + + static png_uc png__compute_y(int r, int g, int b) + { + return (png_uc)(((r * 77) + (g * 150) + (29 * b)) >> 8); + } + + static unsigned char* png__convert_format(unsigned char* data, int img_n, int req_comp, unsigned int x, unsigned int y) + { + int i, j; + unsigned char* good; + + if (req_comp == img_n) return data; + PNG_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (unsigned char*)png__malloc_mad3(req_comp, x, y, 
0); + if (good == NULL) { + PNG_FREE(data); + return png__errpuc("outofmem", "Out of memory"); + } + + for (j = 0; j < (int)y; ++j) { + unsigned char* src = data + j * x * img_n; + unsigned char* dest = good + j * x * req_comp; + +#define PNG__COMBO(a,b) ((a)*8+(b)) +#define PNG__CASE(a,b) case PNG__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (PNG__COMBO(img_n, req_comp)) { + PNG__CASE(1, 2) { dest[0] = src[0]; dest[1] = 255; } break; + PNG__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break; + PNG__CASE(1, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = 255; } break; + PNG__CASE(2, 1) { dest[0] = src[0]; } break; + PNG__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break; + PNG__CASE(2, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = src[1]; } break; + PNG__CASE(3, 4) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; dest[3] = 255; } break; + PNG__CASE(3, 1) { dest[0] = png__compute_y(src[0], src[1], src[2]); } break; + PNG__CASE(3, 2) { dest[0] = png__compute_y(src[0], src[1], src[2]); dest[1] = 255; } break; + PNG__CASE(4, 1) { dest[0] = png__compute_y(src[0], src[1], src[2]); } break; + PNG__CASE(4, 2) { dest[0] = png__compute_y(src[0], src[1], src[2]); dest[1] = src[3]; } break; + PNG__CASE(4, 3) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; } break; + default: PNG_ASSERT(0); PNG_FREE(data); PNG_FREE(good); return png__errpuc("unsupported", "Unsupported format conversion"); + } +#undef PNG__CASE + } + + PNG_FREE(data); + return good; + } + + static png__uint16 png__compute_y_16(int r, int g, int b) + { + return (png__uint16)(((r * 77) + (g * 150) + (29 * b)) >> 8); + } + + static png__uint16* png__convert_format16(png__uint16* data, int img_n, int req_comp, unsigned int x, unsigned int y) + { + int i, j; + png__uint16* good; + + if (req_comp == 
img_n) return data; + PNG_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (png__uint16*)png__malloc(req_comp * x * y * 2); + if (good == NULL) { + PNG_FREE(data); + return (png__uint16*)png__errpuc("outofmem", "Out of memory"); + } + + for (j = 0; j < (int)y; ++j) { + png__uint16* src = data + j * x * img_n; + png__uint16* dest = good + j * x * req_comp; + +#define PNG__COMBO(a,b) ((a)*8+(b)) +#define PNG__CASE(a,b) case PNG__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (PNG__COMBO(img_n, req_comp)) { + PNG__CASE(1, 2) { dest[0] = src[0]; dest[1] = 0xffff; } break; + PNG__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break; + PNG__CASE(1, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = 0xffff; } break; + PNG__CASE(2, 1) { dest[0] = src[0]; } break; + PNG__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break; + PNG__CASE(2, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = src[1]; } break; + PNG__CASE(3, 4) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; dest[3] = 0xffff; } break; + PNG__CASE(3, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break; + PNG__CASE(3, 2) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); dest[1] = 0xffff; } break; + PNG__CASE(4, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break; + PNG__CASE(4, 2) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); dest[1] = src[3]; } break; + PNG__CASE(4, 3) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; } break; + default: PNG_ASSERT(0); PNG_FREE(data); PNG_FREE(good); return (png__uint16*)png__errpuc("unsupported", "Unsupported format conversion"); + } +#undef PNG__CASE + } + + PNG_FREE(data); + return good; + } + + // fast-way is faster to check than jpeg huffman, but slow way is slower +#define PNG__ZFAST_BITS 9 // accelerate all cases in default 
tables +#define PNG__ZFAST_MASK ((1 << PNG__ZFAST_BITS) - 1) + +// zlib-style huffman encoding +// (jpegs packs from left, zlib from right, so can't share code) + typedef struct + { + png__uint16 fast[1 << PNG__ZFAST_BITS]; + png__uint16 firstcode[16]; + int maxcode[17]; + png__uint16 firstsymbol[16]; + png_uc size[288]; + png__uint16 value[288]; + } png__zhuffman; + + png_inline static int png__bitreverse16(int n) + { + n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1); + n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2); + n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4); + n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8); + return n; + } + + png_inline static int png__bit_reverse(int v, int bits) + { + PNG_ASSERT(bits <= 16); + // to bit reverse n bits, reverse 16 and shift + // e.g. 11 bits, bit reverse and shift away 5 + return png__bitreverse16(v) >> (16 - bits); + } + + static int png__zbuild_huffman(png__zhuffman* z, const png_uc* sizelist, int num) + { + int i, k = 0; + int code, next_code[16], sizes[17]; + + // DEFLATE spec for generating codes + memset(sizes, 0, sizeof(sizes)); + memset(z->fast, 0, sizeof(z->fast)); + for (i = 0; i < num; ++i) + ++sizes[sizelist[i]]; + sizes[0] = 0; + for (i = 1; i < 16; ++i) + if (sizes[i] > (1 << i)) + return png__err("bad sizes", "Corrupt PNG"); + code = 0; + for (i = 1; i < 16; ++i) { + next_code[i] = code; + z->firstcode[i] = (png__uint16)code; + z->firstsymbol[i] = (png__uint16)k; + code = (code + sizes[i]); + if (sizes[i]) + if (code - 1 >= (1 << i)) return png__err("bad codelengths", "Corrupt PNG"); + z->maxcode[i] = code << (16 - i); // preshift for inner loop + code <<= 1; + k += sizes[i]; + } + z->maxcode[16] = 0x10000; // sentinel + for (i = 0; i < num; ++i) { + int s = sizelist[i]; + if (s) { + int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; + png__uint16 fastv = (png__uint16)((s << 9) | i); + z->size[c] = (png_uc)s; + z->value[c] = (png__uint16)i; + if (s <= PNG__ZFAST_BITS) { + int j = 
png__bit_reverse(next_code[s], s); + while (j < (1 << PNG__ZFAST_BITS)) { + z->fast[j] = fastv; + j += (1 << s); + } + } + ++next_code[s]; + } + } + return 1; + } + + // zlib-from-memory implementation for PNG reading + // because PNG allows splitting the zlib stream arbitrarily, + // and it's annoying structurally to have PNG call ZLIB call PNG, + // we require PNG read all the IDATs and combine them into a single + // memory buffer + + typedef struct + { + png_uc* zbuffer, * zbuffer_end; + int num_bits; + png__uint32 code_buffer; + + char* zout; + char* zout_start; + char* zout_end; + int z_expandable; + + png__zhuffman z_length, z_distance; + } png__zbuf; + + png_inline static int png__zeof(png__zbuf* z) + { + return (z->zbuffer >= z->zbuffer_end); + } + + png_inline static png_uc png__zget8(png__zbuf* z) + { + return png__zeof(z) ? 0 : *z->zbuffer++; + } + + static void png__fill_bits(png__zbuf* z) + { + do { + if (z->code_buffer >= (1U << z->num_bits)) { + z->zbuffer = z->zbuffer_end; /* treat this as EOF so we fail. */ + return; + } + z->code_buffer |= (unsigned int)png__zget8(z) << z->num_bits; + z->num_bits += 8; + } while (z->num_bits <= 24); + } + + png_inline static unsigned int png__zreceive(png__zbuf* z, int n) + { + unsigned int k; + if (z->num_bits < n) png__fill_bits(z); + k = z->code_buffer & ((1 << n) - 1); + z->code_buffer >>= n; + z->num_bits -= n; + return k; + } + + static int png__zhuffman_decode_slowpath(png__zbuf* a, png__zhuffman* z) + { + int b, s, k; + // not resolved by fast table, so compute it the slow way + // use jpeg approach, which requires MSbits at top + k = png__bit_reverse(a->code_buffer, 16); + for (s = PNG__ZFAST_BITS + 1; ; ++s) + if (k < z->maxcode[s]) + break; + if (s >= 16) return -1; // invalid code! + // code size is s, so: + b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s]; + if (b >= sizeof(z->size)) return -1; // some data was corrupt somewhere! 
+ if (z->size[b] != s) return -1; // was originally an assert, but report failure instead. + a->code_buffer >>= s; + a->num_bits -= s; + return z->value[b]; + } + + png_inline static int png__zhuffman_decode(png__zbuf* a, png__zhuffman* z) + { + int b, s; + if (a->num_bits < 16) { + if (png__zeof(a)) { + return -1; /* report error for unexpected end of data. */ + } + png__fill_bits(a); + } + b = z->fast[a->code_buffer & PNG__ZFAST_MASK]; + if (b) { + s = b >> 9; + a->code_buffer >>= s; + a->num_bits -= s; + return b & 511; + } + return png__zhuffman_decode_slowpath(a, z); + } + + static int png__zexpand(png__zbuf* z, char* zout, int n) // need to make room for n bytes + { + char* q; + unsigned int cur, limit, old_limit; + z->zout = zout; + if (!z->z_expandable) return png__err("output buffer limit", "Corrupt PNG"); + cur = (unsigned int)(z->zout - z->zout_start); + limit = old_limit = (unsigned)(z->zout_end - z->zout_start); + if (UINT_MAX - cur < (unsigned)n) return png__err("outofmem", "Out of memory"); + while (cur + n > limit) { + if (limit > UINT_MAX / 2) return png__err("outofmem", "Out of memory"); + limit *= 2; + } + q = (char*)PNG_REALLOC_SIZED(z->zout_start, old_limit, limit); + PNG_NOTUSED(old_limit); + if (q == NULL) return png__err("outofmem", "Out of memory"); + z->zout_start = q; + z->zout = q + cur; + z->zout_end = q + limit; + return 1; + } + + static const int png__zlength_base[31] = { + 3,4,5,6,7,8,9,10,11,13, + 15,17,19,23,27,31,35,43,51,59, + 67,83,99,115,131,163,195,227,258,0,0 }; + + static const int png__zlength_extra[31] = + { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 }; + + static const int png__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, + 257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0 }; + + static const int png__zdist_extra[32] = + { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 }; + + static int png__parse_huffman_block(png__zbuf* a) + { + char* zout 
= a->zout; + for (;;) { + int z = png__zhuffman_decode(a, &a->z_length); + if (z < 256) { + if (z < 0) return png__err("bad huffman code", "Corrupt PNG"); // error in huffman codes + if (zout >= a->zout_end) { + if (!png__zexpand(a, zout, 1)) return 0; + zout = a->zout; + } + *zout++ = (char)z; + } + else { + png_uc* p; + int len, dist; + if (z == 256) { + a->zout = zout; + return 1; + } + z -= 257; + len = png__zlength_base[z]; + if (png__zlength_extra[z]) len += png__zreceive(a, png__zlength_extra[z]); + z = png__zhuffman_decode(a, &a->z_distance); + if (z < 0) return png__err("bad huffman code", "Corrupt PNG"); + dist = png__zdist_base[z]; + if (png__zdist_extra[z]) dist += png__zreceive(a, png__zdist_extra[z]); + if (zout - a->zout_start < dist) return png__err("bad dist", "Corrupt PNG"); + if (zout + len > a->zout_end) { + if (!png__zexpand(a, zout, len)) return 0; + zout = a->zout; + } + p = (png_uc*)(zout - dist); + if (dist == 1) { // run of one byte; common in images. + png_uc v = *p; + if (len) { do *zout++ = v; while (--len); } + } + else { + if (len) { do *zout++ = *p++; while (--len); } + } + } + } + } + + static int png__compute_huffman_codes(png__zbuf* a) + { + static const png_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 }; + png__zhuffman z_codelength; + png_uc lencodes[286 + 32 + 137];//padding for maximum single op + png_uc codelength_sizes[19]; + int i, n; + + int hlit = png__zreceive(a, 5) + 257; + int hdist = png__zreceive(a, 5) + 1; + int hclen = png__zreceive(a, 4) + 4; + int ntot = hlit + hdist; + + memset(codelength_sizes, 0, sizeof(codelength_sizes)); + for (i = 0; i < hclen; ++i) { + int s = png__zreceive(a, 3); + codelength_sizes[length_dezigzag[i]] = (png_uc)s; + } + if (!png__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0; + + n = 0; + while (n < ntot) { + int c = png__zhuffman_decode(a, &z_codelength); + if (c < 0 || c >= 19) return png__err("bad codelengths", "Corrupt PNG"); + if (c < 16) 
+ lencodes[n++] = (png_uc)c; + else { + png_uc fill = 0; + if (c == 16) { + c = png__zreceive(a, 2) + 3; + if (n == 0) return png__err("bad codelengths", "Corrupt PNG"); + fill = lencodes[n - 1]; + } + else if (c == 17) { + c = png__zreceive(a, 3) + 3; + } + else if (c == 18) { + c = png__zreceive(a, 7) + 11; + } + else { + return png__err("bad codelengths", "Corrupt PNG"); + } + if (ntot - n < c) return png__err("bad codelengths", "Corrupt PNG"); + memset(lencodes + n, fill, c); + n += c; + } + } + if (n != ntot) return png__err("bad codelengths", "Corrupt PNG"); + if (!png__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0; + if (!png__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist)) return 0; + return 1; + } + + static int png__parse_uncompressed_block(png__zbuf* a) + { + png_uc header[4]; + int len, nlen, k; + if (a->num_bits & 7) + png__zreceive(a, a->num_bits & 7); // discard + // drain the bit-packed data into header + k = 0; + while (a->num_bits > 0) { + header[k++] = (png_uc)(a->code_buffer & 255); // suppress MSVC run-time check + a->code_buffer >>= 8; + a->num_bits -= 8; + } + if (a->num_bits < 0) return png__err("zlib corrupt", "Corrupt PNG"); + // now fill header the normal way + while (k < 4) + header[k++] = png__zget8(a); + len = header[1] * 256 + header[0]; + nlen = header[3] * 256 + header[2]; + if (nlen != (len ^ 0xffff)) return png__err("zlib corrupt", "Corrupt PNG"); + if (a->zbuffer + len > a->zbuffer_end) return png__err("read past buffer", "Corrupt PNG"); + if (a->zout + len > a->zout_end) + if (!png__zexpand(a, a->zout, len)) return 0; + memcpy(a->zout, a->zbuffer, len); + a->zbuffer += len; + a->zout += len; + return 1; + } + + static int png__parse_zlib_header(png__zbuf* a) + { + int cmf = png__zget8(a); + int cm = cmf & 15; + /* int cinfo = cmf >> 4; */ + int flg = png__zget8(a); + if (png__zeof(a)) return png__err("bad zlib header", "Corrupt PNG"); // zlib spec + if ((cmf * 256 + flg) % 31 != 0) return png__err("bad zlib 
header", "Corrupt PNG"); // zlib spec + if (flg & 32) return png__err("no preset dict", "Corrupt PNG"); // preset dictionary not allowed in png + if (cm != 8) return png__err("bad compression", "Corrupt PNG"); // DEFLATE required for png + // window = 1 << (8 + cinfo)... but who cares, we fully buffer output + return 1; + } + + static const png_uc png__zdefault_length[288] = + { + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8 + }; + static const png_uc png__zdefault_distance[32] = + { + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 + }; + /* + Init algorithm: + { + int i; // use <= to match clearly with spec + for (i=0; i <= 143; ++i) png__zdefault_length[i] = 8; + for ( ; i <= 255; ++i) png__zdefault_length[i] = 9; + for ( ; i <= 279; ++i) png__zdefault_length[i] = 7; + for ( ; i <= 287; ++i) png__zdefault_length[i] = 8; + + for (i=0; i <= 31; ++i) png__zdefault_distance[i] = 5; + } + */ + + static int png__parse_zlib(png__zbuf* a, int parse_header) + { + int final, type; + if (parse_header) + if (!png__parse_zlib_header(a)) return 0; + a->num_bits = 0; + a->code_buffer = 0; + do { + final = png__zreceive(a, 1); + type = png__zreceive(a, 2); + if (type == 0) { + if (!png__parse_uncompressed_block(a)) return 0; + } + else if (type == 3) { + return 0; + } + else { + if (type == 1) { + // use fixed code lengths + if (!png__zbuild_huffman(&a->z_length, png__zdefault_length, 288)) return 0; + if 
(!png__zbuild_huffman(&a->z_distance, png__zdefault_distance, 32)) return 0; + } + else { + if (!png__compute_huffman_codes(a)) return 0; + } + if (!png__parse_huffman_block(a)) return 0; + } + } while (!final); + return 1; + } + + static int png__do_zlib(png__zbuf* a, char* obuf, int olen, int exp, int parse_header) + { + a->zout_start = obuf; + a->zout = obuf; + a->zout_end = obuf + olen; + a->z_expandable = exp; + + return png__parse_zlib(a, parse_header); + } + + STBIDEF char* png_zlib_decode_malloc_guesssize(const char* buffer, int len, int initial_size, int* outlen) + { + png__zbuf a; + char* p = (char*)png__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (png_uc*)buffer; + a.zbuffer_end = (png_uc*)buffer + len; + if (png__do_zlib(&a, p, initial_size, 1, 1)) { + if (outlen) *outlen = (int)(a.zout - a.zout_start); + return a.zout_start; + } + else { + PNG_FREE(a.zout_start); + return NULL; + } + } + + STBIDEF char* png_zlib_decode_malloc(char const* buffer, int len, int* outlen) + { + return png_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen); + } + + STBIDEF char* png_zlib_decode_malloc_guesssize_headerflag(const char* buffer, int len, int initial_size, int* outlen, int parse_header) + { + png__zbuf a; + char* p = (char*)png__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (png_uc*)buffer; + a.zbuffer_end = (png_uc*)buffer + len; + if (png__do_zlib(&a, p, initial_size, 1, parse_header)) { + if (outlen) *outlen = (int)(a.zout - a.zout_start); + return a.zout_start; + } + else { + PNG_FREE(a.zout_start); + return NULL; + } + } + + STBIDEF int png_zlib_decode_buffer(char* obuffer, int olen, char const* ibuffer, int ilen) + { + png__zbuf a; + a.zbuffer = (png_uc*)ibuffer; + a.zbuffer_end = (png_uc*)ibuffer + ilen; + if (png__do_zlib(&a, obuffer, olen, 0, 1)) + return (int)(a.zout - a.zout_start); + else + return -1; + } + + STBIDEF char* png_zlib_decode_noheader_malloc(char const* buffer, int len, int* outlen) + { + 
png__zbuf a; + char* p = (char*)png__malloc(16384); + if (p == NULL) return NULL; + a.zbuffer = (png_uc*)buffer; + a.zbuffer_end = (png_uc*)buffer + len; + if (png__do_zlib(&a, p, 16384, 1, 0)) { + if (outlen) *outlen = (int)(a.zout - a.zout_start); + return a.zout_start; + } + else { + PNG_FREE(a.zout_start); + return NULL; + } + } + + STBIDEF int png_zlib_decode_noheader_buffer(char* obuffer, int olen, const char* ibuffer, int ilen) + { + png__zbuf a; + a.zbuffer = (png_uc*)ibuffer; + a.zbuffer_end = (png_uc*)ibuffer + ilen; + if (png__do_zlib(&a, obuffer, olen, 0, 0)) + return (int)(a.zout - a.zout_start); + else + return -1; + } + + + // public domain "baseline" PNG decoder v0.10 Sean Barrett 2006-11-18 + // simple implementation + // - only 8-bit samples + // - no CRC checking + // - allocates lots of intermediate memory + // - avoids problem of streaming data between subsystems + // - avoids explicit window management + // performance + // - uses stb_zlib, a PD zlib implementation with fast huffman decoding + + typedef struct + { + png__uint32 length; + png__uint32 type; + } png__pngchunk; + + static png__pngchunk png__get_chunk_header(png__context* s) + { + png__pngchunk c; + c.length = png__get32be(s); + c.type = png__get32be(s); + return c; + } + + static int png__check_png_header(png__context* s) + { + static const png_uc png_sig[8] = { 137,80,78,71,13,10,26,10 }; + int i; + for (i = 0; i < 8; ++i) + if (png__get8(s) != png_sig[i]) return png__err("bad png sig", "Not a PNG"); + return 1; + } + + typedef struct + { + png__context* s; + png_uc* idata, * expanded, * out; + int depth; + } png__png; + + + enum { + PNG__F_none = 0, + PNG__F_sub = 1, + PNG__F_up = 2, + PNG__F_avg = 3, + PNG__F_paeth = 4, + // synthetic filters used for first scanline to avoid needing a dummy row of 0s + PNG__F_avg_first, + PNG__F_paeth_first + }; + + static png_uc first_row_filter[5] = + { + PNG__F_none, + PNG__F_sub, + PNG__F_none, + PNG__F_avg_first, + PNG__F_paeth_first + }; 
+ + static int png__paeth(int a, int b, int c) + { + int p = a + b - c; + int pa = abs(p - a); + int pb = abs(p - b); + int pc = abs(p - c); + if (pa <= pb && pa <= pc) return a; + if (pb <= pc) return b; + return c; + } + + static const png_uc png__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; + + // create the png data from post-deflated data + static int png__create_png_image_raw(png__png* a, png_uc* raw, png__uint32 raw_len, int out_n, png__uint32 x, png__uint32 y, int depth, int color) + { + int bytes = (depth == 16 ? 2 : 1); + png__context* s = a->s; + png__uint32 i, j, stride = x * out_n * bytes; + png__uint32 img_len, img_width_bytes; + int k; + int img_n = s->img_n; // copy it into a local for later + + int output_bytes = out_n * bytes; + int filter_bytes = img_n * bytes; + int width = x; + + PNG_ASSERT(out_n == s->img_n || out_n == s->img_n + 1); + a->out = (png_uc*)png__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into + if (!a->out) return png__err("outofmem", "Out of memory"); + + if (!png__mad3sizes_valid(img_n, x, depth, 7)) return png__err("too large", "Corrupt PNG"); + img_width_bytes = (((img_n * x * depth) + 7) >> 3); + img_len = (img_width_bytes + 1) * y; + + // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, + // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros), + // so just check for raw_len < img_len always. 
+ if (raw_len < img_len) return png__err("not enough pixels", "Corrupt PNG"); + + for (j = 0; j < y; ++j) { + png_uc* cur = a->out + stride * j; + png_uc* prior; + int filter = *raw++; + + if (filter > 4) + return png__err("invalid filter", "Corrupt PNG"); + + if (depth < 8) { + if (img_width_bytes > x) return png__err("invalid width", "Corrupt PNG"); + cur += x * out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place + filter_bytes = 1; + width = img_width_bytes; + } + prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above + + // if first row, use special filter that doesn't sample previous row + if (j == 0) filter = first_row_filter[filter]; + + // handle first byte explicitly + for (k = 0; k < filter_bytes; ++k) { + switch (filter) { + case PNG__F_none: cur[k] = raw[k]; break; + case PNG__F_sub: cur[k] = raw[k]; break; + case PNG__F_up: cur[k] = PNG__BYTECAST(raw[k] + prior[k]); break; + case PNG__F_avg: cur[k] = PNG__BYTECAST(raw[k] + (prior[k] >> 1)); break; + case PNG__F_paeth: cur[k] = PNG__BYTECAST(raw[k] + png__paeth(0, prior[k], 0)); break; + case PNG__F_avg_first: cur[k] = raw[k]; break; + case PNG__F_paeth_first: cur[k] = raw[k]; break; + } + } + + if (depth == 8) { + if (img_n != out_n) + cur[img_n] = 255; // first pixel + raw += img_n; + cur += out_n; + prior += out_n; + } + else if (depth == 16) { + if (img_n != out_n) { + cur[filter_bytes] = 255; // first pixel top byte + cur[filter_bytes + 1] = 255; // first pixel bottom byte + } + raw += filter_bytes; + cur += output_bytes; + prior += output_bytes; + } + else { + raw += 1; + cur += 1; + prior += 1; + } + + // this is a little gross, so that we don't switch per-pixel or per-component + if (depth < 8 || img_n == out_n) { + int nk = (width - 1) * filter_bytes; +#define PNG__CASE(f) \ + case f: \ + for (k=0; k < nk; ++k) + switch (filter) { + // "none" filter turns into a memcpy here; make that explicit. 
+ case PNG__F_none: memcpy(cur, raw, nk); break; + PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - filter_bytes]); } break; + PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break; + PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1)); } break; + PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes])); } break; + PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); } break; + PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], 0, 0)); } break; + } +#undef PNG__CASE + raw += nk; + } + else { + PNG_ASSERT(img_n + 1 == out_n); +#define PNG__CASE(f) \ + case f: \ + for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \ + for (k=0; k < filter_bytes; ++k) + switch (filter) { + PNG__CASE(PNG__F_none) { cur[k] = raw[k]; } break; + PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - output_bytes]); } break; + PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break; + PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - output_bytes]) >> 1)); } break; + PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes])); } break; + PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - output_bytes] >> 1)); } break; + PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], 0, 0)); } break; + } +#undef PNG__CASE + + // the loop above sets the high byte of the pixels' alpha, but for + // 16 bit png files we also need the low byte set. we'll do that here. 
+ if (depth == 16) { + cur = a->out + stride * j; // start at the beginning of the row again + for (i = 0; i < x; ++i, cur += output_bytes) { + cur[filter_bytes + 1] = 255; + } + } + } + } + + // we make a separate pass to expand bits to pixels; for performance, + // this could run two scanlines behind the above code, so it won't + // intefere with filtering but will still be in the cache. + if (depth < 8) { + for (j = 0; j < y; ++j) { + png_uc* cur = a->out + stride * j; + png_uc* in = a->out + stride * j + x * out_n - img_width_bytes; + // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit + // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop + png_uc scale = (color == 0) ? png__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range + + // note that the final byte might overshoot and write more data than desired. + // we can allocate enough data that this never writes out of memory, but it + // could also overwrite the next scanline. can it overwrite non-empty data + // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel. 
+ // so we need to explicitly clamp the final ones + + if (depth == 4) { + for (k = x * img_n; k >= 2; k -= 2, ++in) { + *cur++ = scale * ((*in >> 4)); + *cur++ = scale * ((*in) & 0x0f); + } + if (k > 0) *cur++ = scale * ((*in >> 4)); + } + else if (depth == 2) { + for (k = x * img_n; k >= 4; k -= 4, ++in) { + *cur++ = scale * ((*in >> 6)); + *cur++ = scale * ((*in >> 4) & 0x03); + *cur++ = scale * ((*in >> 2) & 0x03); + *cur++ = scale * ((*in) & 0x03); + } + if (k > 0) *cur++ = scale * ((*in >> 6)); + if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03); + if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03); + } + else if (depth == 1) { + for (k = x * img_n; k >= 8; k -= 8, ++in) { + *cur++ = scale * ((*in >> 7)); + *cur++ = scale * ((*in >> 6) & 0x01); + *cur++ = scale * ((*in >> 5) & 0x01); + *cur++ = scale * ((*in >> 4) & 0x01); + *cur++ = scale * ((*in >> 3) & 0x01); + *cur++ = scale * ((*in >> 2) & 0x01); + *cur++ = scale * ((*in >> 1) & 0x01); + *cur++ = scale * ((*in) & 0x01); + } + if (k > 0) *cur++ = scale * ((*in >> 7)); + if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01); + if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01); + if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01); + if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01); + if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01); + if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01); + } + if (img_n != out_n) { + int q; + // insert alpha = 255 + cur = a->out + stride * j; + if (img_n == 1) { + for (q = x - 1; q >= 0; --q) { + cur[q * 2 + 1] = 255; + cur[q * 2 + 0] = cur[q]; + } + } + else { + PNG_ASSERT(img_n == 3); + for (q = x - 1; q >= 0; --q) { + cur[q * 4 + 3] = 255; + cur[q * 4 + 2] = cur[q * 3 + 2]; + cur[q * 4 + 1] = cur[q * 3 + 1]; + cur[q * 4 + 0] = cur[q * 3 + 0]; + } + } + } + } + } + else if (depth == 16) { + // force the image data from big-endian to platform-native. 
+ // this is done in a separate pass due to the decoding relying + // on the data being untouched, but could probably be done + // per-line during decode if care is taken. + png_uc* cur = a->out; + png__uint16* cur16 = (png__uint16*)cur; + + for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) { + *cur16 = (cur[0] << 8) | cur[1]; + } + } + + return 1; + } + + static int png__create_png_image(png__png* a, png_uc* image_data, png__uint32 image_data_len, int out_n, int depth, int color, int interlaced) + { + int bytes = (depth == 16 ? 2 : 1); + int out_bytes = out_n * bytes; + png_uc* final; + int p; + if (!interlaced) + return png__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); + + // de-interlacing + final = (png_uc*)png__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + for (p = 0; p < 7; ++p) { + int xorig[] = { 0,4,0,2,0,1,0 }; + int yorig[] = { 0,0,4,0,2,0,1 }; + int xspc[] = { 8,8,4,4,2,2,1 }; + int yspc[] = { 8,8,8,4,4,2,2 }; + int i, j, x, y; + // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 + x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p]; + y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p]; + if (x && y) { + png__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; + if (!png__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) { + PNG_FREE(final); + return 0; + } + for (j = 0; j < y; ++j) { + for (i = 0; i < x; ++i) { + int out_y = j * yspc[p] + yorig[p]; + int out_x = i * xspc[p] + xorig[p]; + memcpy(final + out_y * a->s->img_x * out_bytes + out_x * out_bytes, + a->out + (j * x + i) * out_bytes, out_bytes); + } + } + PNG_FREE(a->out); + image_data += img_len; + image_data_len -= img_len; + } + } + a->out = final; + + return 1; + } + + static int png__compute_transparency(png__png* z, png_uc tc[3], int out_n) + { + png__context* s = z->s; + png__uint32 i, pixel_count = s->img_x * s->img_y; + png_uc* p = z->out; + + // compute color-based 
transparency, assuming we've + // already got 255 as the alpha value in the output + PNG_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i = 0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 0 : 255); + p += 2; + } + } + else { + for (i = 0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; + } + + static int png__compute_transparency16(png__png* z, png__uint16 tc[3], int out_n) + { + png__context* s = z->s; + png__uint32 i, pixel_count = s->img_x * s->img_y; + png__uint16* p = (png__uint16*)z->out; + + // compute color-based transparency, assuming we've + // already got 65535 as the alpha value in the output + PNG_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i = 0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 0 : 65535); + p += 2; + } + } + else { + for (i = 0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; + } + + static int png__expand_png_palette(png__png* a, png_uc* palette, int len, int pal_img_n) + { + png__uint32 i, pixel_count = a->s->img_x * a->s->img_y; + png_uc* p, * temp_out, * orig = a->out; + + p = (png_uc*)png__malloc_mad2(pixel_count, pal_img_n, 0); + if (p == NULL) return png__err("outofmem", "Out of memory"); + + // between here and free(out) below, exitting would leak + temp_out = p; + + if (pal_img_n == 3) { + for (i = 0; i < pixel_count; ++i) { + int n = orig[i] * 4; + p[0] = palette[n]; + p[1] = palette[n + 1]; + p[2] = palette[n + 2]; + p += 3; + } + } + else { + for (i = 0; i < pixel_count; ++i) { + int n = orig[i] * 4; + p[0] = palette[n]; + p[1] = palette[n + 1]; + p[2] = palette[n + 2]; + p[3] = palette[n + 3]; + p += 4; + } + } + PNG_FREE(a->out); + a->out = temp_out; + + PNG_NOTUSED(len); + + return 1; + } + + static int png__unpremultiply_on_load = 0; + static int png__de_iphone_flag = 0; + + STBIDEF void png_set_unpremultiply_on_load(int 
flag_true_if_should_unpremultiply) + { + png__unpremultiply_on_load = flag_true_if_should_unpremultiply; + } + + STBIDEF void png_convert_iphone_png_to_rgb(int flag_true_if_should_convert) + { + png__de_iphone_flag = flag_true_if_should_convert; + } + + static void png__de_iphone(png__png* z) + { + png__context* s = z->s; + png__uint32 i, pixel_count = s->img_x * s->img_y; + png_uc* p = z->out; + + if (s->img_out_n == 3) { // convert bgr to rgb + for (i = 0; i < pixel_count; ++i) { + png_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 3; + } + } + else { + PNG_ASSERT(s->img_out_n == 4); + if (png__unpremultiply_on_load) { + // convert bgr to rgb and unpremultiply + for (i = 0; i < pixel_count; ++i) { + png_uc a = p[3]; + png_uc t = p[0]; + if (a) { + png_uc half = a / 2; + p[0] = (p[2] * 255 + half) / a; + p[1] = (p[1] * 255 + half) / a; + p[2] = (t * 255 + half) / a; + } + else { + p[0] = p[2]; + p[2] = t; + } + p += 4; + } + } + else { + // convert bgr to rgb + for (i = 0; i < pixel_count; ++i) { + png_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 4; + } + } + } + } + +#define PNG__PNG_TYPE(a,b,c,d) (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d)) + + static int png__parse_png_file(png__png* z, int scan, int req_comp) + { + png_uc palette[1024], pal_img_n = 0; + png_uc has_trans = 0, tc[3] = { 0 }; + png__uint16 tc16[3]; + png__uint32 ioff = 0, idata_limit = 0, i, pal_len = 0; + int first = 1, k, interlace = 0, color = 0, is_iphone = 0; + png__context* s = z->s; + + z->expanded = NULL; + z->idata = NULL; + z->out = NULL; + + if (!png__check_png_header(s)) return 0; + + if (scan == PNG__SCAN_type) return 1; + + for (;;) { + png__pngchunk c = png__get_chunk_header(s); + switch (c.type) { + case PNG__PNG_TYPE('C', 'g', 'B', 'I'): + is_iphone = 1; + png__skip(s, c.length); + break; + case PNG__PNG_TYPE('I', 'H', 'D', 'R'): { + int comp, filter; + if (!first) return png__err("multiple IHDR", "Corrupt PNG"); + first = 0; + if 
(c.length != 13) return png__err("bad IHDR len", "Corrupt PNG"); + s->img_x = png__get32be(s); + s->img_y = png__get32be(s); + if (s->img_y > PNG_MAX_DIMENSIONS) return png__err("too large", "Very large image (corrupt?)"); + if (s->img_x > PNG_MAX_DIMENSIONS) return png__err("too large", "Very large image (corrupt?)"); + z->depth = png__get8(s); if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) return png__err("1/2/4/8/16-bit only", "PNG not supported: 1/2/4/8/16-bit only"); + color = png__get8(s); if (color > 6) return png__err("bad ctype", "Corrupt PNG"); + if (color == 3 && z->depth == 16) return png__err("bad ctype", "Corrupt PNG"); + if (color == 3) pal_img_n = 3; else if (color & 1) return png__err("bad ctype", "Corrupt PNG"); + comp = png__get8(s); if (comp) return png__err("bad comp method", "Corrupt PNG"); + filter = png__get8(s); if (filter) return png__err("bad filter method", "Corrupt PNG"); + interlace = png__get8(s); if (interlace > 1) return png__err("bad interlace method", "Corrupt PNG"); + if (!s->img_x || !s->img_y) return png__err("0-pixel image", "Corrupt PNG"); + if (!pal_img_n) { + s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); + if ((1 << 30) / s->img_x / s->img_n < s->img_y) return png__err("too large", "Image too large to decode"); + if (scan == PNG__SCAN_header) return 1; + } + else { + // if paletted, then pal_n is our final components, and + // img_n is # components to decompress/filter. 
+ s->img_n = 1; + if ((1 << 30) / s->img_x / 4 < s->img_y) return png__err("too large", "Corrupt PNG"); + // if SCAN_header, have to scan to see if we have a tRNS + } + break; + } + + case PNG__PNG_TYPE('P', 'L', 'T', 'E'): { + if (first) return png__err("first not IHDR", "Corrupt PNG"); + if (c.length > 256 * 3) return png__err("invalid PLTE", "Corrupt PNG"); + pal_len = c.length / 3; + if (pal_len * 3 != c.length) return png__err("invalid PLTE", "Corrupt PNG"); + for (i = 0; i < pal_len; ++i) { + palette[i * 4 + 0] = png__get8(s); + palette[i * 4 + 1] = png__get8(s); + palette[i * 4 + 2] = png__get8(s); + palette[i * 4 + 3] = 255; + } + break; + } + + case PNG__PNG_TYPE('t', 'R', 'N', 'S'): { + if (first) return png__err("first not IHDR", "Corrupt PNG"); + if (z->idata) return png__err("tRNS after IDAT", "Corrupt PNG"); + if (pal_img_n) { + if (scan == PNG__SCAN_header) { s->img_n = 4; return 1; } + if (pal_len == 0) return png__err("tRNS before PLTE", "Corrupt PNG"); + if (c.length > pal_len) return png__err("bad tRNS len", "Corrupt PNG"); + pal_img_n = 4; + for (i = 0; i < c.length; ++i) + palette[i * 4 + 3] = png__get8(s); + } + else { + if (!(s->img_n & 1)) return png__err("tRNS with alpha", "Corrupt PNG"); + if (c.length != (png__uint32)s->img_n * 2) return png__err("bad tRNS len", "Corrupt PNG"); + has_trans = 1; + if (z->depth == 16) { + for (k = 0; k < s->img_n; ++k) tc16[k] = (png__uint16)png__get16be(s); // copy the values as-is + } + else { + for (k = 0; k < s->img_n; ++k) tc[k] = (png_uc)(png__get16be(s) & 255) * png__depth_scale_table[z->depth]; // non 8-bit images will be larger + } + } + break; + } + + case PNG__PNG_TYPE('I', 'D', 'A', 'T'): { + if (first) return png__err("first not IHDR", "Corrupt PNG"); + if (pal_img_n && !pal_len) return png__err("no PLTE", "Corrupt PNG"); + if (scan == PNG__SCAN_header) { s->img_n = pal_img_n; return 1; } + if ((int)(ioff + c.length) < (int)ioff) return 0; + if (ioff + c.length > idata_limit) { + png__uint32 
idata_limit_old = idata_limit; + png_uc* p; + if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096; + while (ioff + c.length > idata_limit) + idata_limit *= 2; + PNG_NOTUSED(idata_limit_old); + p = (png_uc*)PNG_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return png__err("outofmem", "Out of memory"); + z->idata = p; + } + if (!png__getn(s, z->idata + ioff, c.length)) return png__err("outofdata", "Corrupt PNG"); + ioff += c.length; + break; + } + + case PNG__PNG_TYPE('I', 'E', 'N', 'D'): { + png__uint32 raw_len, bpl; + if (first) return png__err("first not IHDR", "Corrupt PNG"); + if (scan != PNG__SCAN_load) return 1; + if (z->idata == NULL) return png__err("no IDAT", "Corrupt PNG"); + // initial guess for decoded data size to avoid unnecessary reallocs + bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component + raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */; + z->expanded = (png_uc*)png_zlib_decode_malloc_guesssize_headerflag((char*)z->idata, ioff, raw_len, (int*)&raw_len, !is_iphone); + if (z->expanded == NULL) return 0; // zlib should set error + PNG_FREE(z->idata); z->idata = NULL; + if ((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || has_trans) + s->img_out_n = s->img_n + 1; + else + s->img_out_n = s->img_n; + if (!png__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0; + if (has_trans) { + if (z->depth == 16) { + if (!png__compute_transparency16(z, tc16, s->img_out_n)) return 0; + } + else { + if (!png__compute_transparency(z, tc, s->img_out_n)) return 0; + } + } + if (is_iphone && png__de_iphone_flag && s->img_out_n > 2) + png__de_iphone(z); + if (pal_img_n) { + // pal_img_n == 3 or 4 + s->img_n = pal_img_n; // record the actual colors we had + s->img_out_n = pal_img_n; + if (req_comp >= 3) s->img_out_n = req_comp; + if (!png__expand_png_palette(z, palette, pal_len, s->img_out_n)) + return 0; + } + else if 
(has_trans) { + // non-paletted image with tRNS -> source image has (constant) alpha + ++s->img_n; + } + PNG_FREE(z->expanded); z->expanded = NULL; + // end of PNG chunk, read and skip CRC + png__get32be(s); + return 1; + } + + default: + // if critical, fail + if (first) return png__err("first not IHDR", "Corrupt PNG"); + if ((c.type & (1 << 29)) == 0) { +#ifndef PNG_NO_FAILURE_STRINGS + // not threadsafe + static char invalid_chunk[] = "XXXX PNG chunk not known"; + invalid_chunk[0] = PNG__BYTECAST(c.type >> 24); + invalid_chunk[1] = PNG__BYTECAST(c.type >> 16); + invalid_chunk[2] = PNG__BYTECAST(c.type >> 8); + invalid_chunk[3] = PNG__BYTECAST(c.type >> 0); +#endif + return png__err(invalid_chunk, "PNG not supported: unknown PNG chunk type"); + } + png__skip(s, c.length); + break; + } + // end of PNG chunk, read and skip CRC + png__get32be(s); + } + } + + static void* png__do_png(png__png* p, int* x, int* y, int* n, int req_comp, png__result_info* ri) + { + void* result = NULL; + if (req_comp < 0 || req_comp > 4) return png__errpuc("bad req_comp", "Internal error"); + if (png__parse_png_file(p, PNG__SCAN_load, req_comp)) { + if (p->depth <= 8) + ri->bits_per_channel = 8; + else if (p->depth == 16) + ri->bits_per_channel = 16; + else + return png__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth"); + result = p->out; + p->out = NULL; + if (req_comp && req_comp != p->s->img_out_n) { + if (ri->bits_per_channel == 8) + result = png__convert_format((unsigned char*)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + else + result = png__convert_format16((png__uint16*)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + p->s->img_out_n = req_comp; + if (result == NULL) return result; + } + *x = p->s->img_x; + *y = p->s->img_y; + if (n) *n = p->s->img_n; + } + PNG_FREE(p->out); p->out = NULL; + PNG_FREE(p->expanded); p->expanded = NULL; + PNG_FREE(p->idata); p->idata = NULL; + + return result; + } + + static void* 
png__png_load(png__context* s, int* x, int* y, int* comp, int req_comp, png__result_info* ri) + { + png__png p; + p.s = s; + return png__do_png(&p, x, y, comp, req_comp, ri); + } + + static int png__png_test(png__context* s) + { + int r; + r = png__check_png_header(s); + png__rewind(s); + return r; + } + + static int png__png_info_raw(png__png* p, int* x, int* y, int* comp) + { + if (!png__parse_png_file(p, PNG__SCAN_header, 0)) { + png__rewind(p->s); + return 0; + } + if (x) *x = p->s->img_x; + if (y) *y = p->s->img_y; + if (comp) *comp = p->s->img_n; + return 1; + } + + static int png__png_info(png__context* s, int* x, int* y, int* comp) + { + png__png p; + p.s = s; + return png__png_info_raw(&p, x, y, comp); + } + + static int png__png_is16(png__context* s) + { + png__png p; + p.s = s; + if (!png__png_info_raw(&p, NULL, NULL, NULL)) + return 0; + if (p.depth != 16) { + png__rewind(p.s); + return 0; + } + return 1; + } + + static void* png__load_main(png__context* s, int* x, int* y, int* comp, int req_comp, png__result_info* ri, int bpc) + { + memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields + ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed + ri->channel_order = PNG_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order + ri->num_channels = 0; + + if (png__png_test(s)) return png__png_load(s, x, y, comp, req_comp, ri); + + return png__errpuc("unknown image type", "Image not of any known type, or corrupt"); + } + + static png_uc* png__convert_16_to_8(png__uint16* orig, int w, int h, int channels) + { + int i; + int img_len = w * h * channels; + png_uc* reduced; + + reduced = (png_uc*)png__malloc(img_len); + if (reduced == NULL) return png__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + reduced[i] = (png_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling + + PNG_FREE(orig); + return reduced; + } + 
+ static unsigned char* png__load_and_postprocess_8bit(png__context* s, int* x, int* y, int* comp, int req_comp) /* Decodes through png__load_main and, when the loader reports a depth other than 8, narrows the pixels with png__convert_16_to_8. Returns NULL on failure. */ + { + png__result_info ri; + void* result = png__load_main(s, x, y, comp, req_comp, &ri, 8); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + PNG_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 8) { + result = png__convert_16_to_8((png__uint16*)result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 8; + } + + // @TODO: move png__convert_format to here + + //if (png__vertically_flip_on_load) { + // int channels = req_comp ? req_comp : *comp; + // png__vertical_flip(result, *x, *y, channels * sizeof(png_uc)); + //} + + return (unsigned char*)result; + } + + static void png__start_mem(png__context* s, png_uc const* buffer, int len) /* Points the context at an in-memory buffer of len bytes and disables callback-driven reads. */ + { + s->io.read = NULL; + s->read_from_callbacks = 0; + s->callback_already_read = 0; + s->img_buffer = s->img_buffer_original = (png_uc*)buffer; + s->img_buffer_end = s->img_buffer_original_end = (png_uc*)buffer + len; + } + + STBIDEF png_uc* png_load_from_memory(png_uc const* buffer, int len, int* x, int* y, int* comp, int req_comp) /* Public entry point: decodes a PNG held in memory to 8-bit samples. */ + { + png__context s; + png__start_mem(&s, buffer, len); + return png__load_and_postprocess_8bit(&s, x, y, comp, req_comp); + } + + //------------------------------------------------------------------------ + + static int png__stdio_read(void* user, char* data, int size) /* Read callback: user is an InputMemoryStream*; forwards to Read(size, data) and returns its result as int. */ + { + InputMemoryStream* stream = (InputMemoryStream*)user; + return (int)stream->Read(size, data); + } + + static void png__stdio_skip(void* user, int n) /* Skip callback: forwards to InputMemoryStream::Skip(n). */ + { + InputMemoryStream* stream = (InputMemoryStream*)user; + stream->Skip(n); + } + + static int png__stdio_eof(void* user) /* EOF callback: 1 when the stream position equals its size, else 0. */ + { + InputMemoryStream* stream = (InputMemoryStream*)user; + return stream->Pos() == stream->Size() ? 
1 : 0; + } + + + //--------------------------------------------------------------------- + + ImagePngLoader::ImagePngLoader(const ImageLoaderParam& param) /* Defaults the requested output format to RGB24 when the caller left it unspecified. */ + : Base::ImagePngLoader(param) + { + if (_param.format == SimdPixelFormatNone) + _param.format = SimdPixelFormatRgb24; + } + + bool ImagePngLoader::FromStream() /* Decodes the PNG in _stream to 4-channel pixels (req_comp = 4), then converts them into _image in the format requested by _param.format. Returns false when decoding fails. */ + { + const int req_comp = 4; + int x, y, comp; + png__context s; + s.io.eof = png__stdio_eof; + s.io.read = png__stdio_read; + s.io.skip = png__stdio_skip; + s.io_user_data = &_stream; + s.buflen = sizeof(s.buffer_start); + s.read_from_callbacks = 1; + s.callback_already_read = 0; + s.img_buffer = s.img_buffer_original = s.buffer_start; + png__refill_buffer(&s); + s.img_buffer_original_end = s.img_buffer_end; + png__result_info ri; + uint8_t* data = (uint8_t*)png__png_load(&s, &x, &y, &comp, req_comp, &ri); + if (data) + { + if (ri.bits_per_channel == 16) + { + /* Narrow the 16-bit samples to 8 bits in place: sample i is read from + bytes 2*i and 2*i+1 before byte i is written, so nothing unread is lost + and no second buffer (whose allocation was never checked) is needed. */ + const uint16_t* src = (uint16_t*)data; + const size_t size = size_t(x) * y * req_comp; + for (size_t i = 0; i < size; ++i) + data[i] = uint8_t(src[i] >> 8); + } + size_t stride = 4 * x; + _image.Recreate(x, y, (Image::Format)_param.format); + if (x < A) /* rows narrower than one SIMD register use the scalar Base converters */ + { + switch (_param.format) + { + case SimdPixelFormatGray8: + Base::RgbaToGray(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgr24: + Base::BgraToRgb(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgra32: + Base::BgraToRgba(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatRgb24: + Base::BgraToBgr(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatRgba32: + Base::Copy(data, stride, x, y, 4, _image.data, _image.stride); + break; + default: + break; + } + } + else + { + switch (_param.format) + { + case SimdPixelFormatGray8: + Sse2::RgbaToGray(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgr24: + Sse41::BgraToRgb(data, x, y, 
stride, _image.data, _image.stride); + break; + case SimdPixelFormatBgra32: + Sse41::BgraToRgba(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatRgb24: + Sse41::BgraToBgr(data, x, y, stride, _image.data, _image.stride); + break; + case SimdPixelFormatRgba32: + Base::Copy(data, stride, x, y, 4, _image.data, _image.stride); + break; + default: + break; + } + } + PNG_FREE(data); + return true; + } + return false; + } + } +#endif +} diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp new file mode 100644 index 0000000000..da20b395c0 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp @@ -0,0 +1,139 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdSse2.h" +#include "Simd/SimdSse41.h" + +#include <memory> + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param) /* Replaces the inherited to-gray converter with a SIMD one when a row holds at least A pixels. */ + : Base::ImagePgmTxtSaver(param) + { + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _convert = Sse41::BgrToGray; break; + case SimdPixelFormatBgra32: _convert = Sse2::BgraToGray; break; + case SimdPixelFormatRgb24: _convert = Sse41::RgbToGray; break; + case SimdPixelFormatRgba32: _convert = Sse41::RgbaToGray; break; + default: break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param) /* Same converter selection as ImagePgmTxtSaver. */ + : Base::ImagePgmBinSaver(param) + { + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatBgr24: _convert = Sse41::BgrToGray; break; + case SimdPixelFormatBgra32: _convert = Sse2::BgraToGray; break; + case SimdPixelFormatRgb24: _convert = Sse41::RgbToGray; break; + case SimdPixelFormatRgba32: _convert = Sse41::RgbaToGray; break; + default: break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param) /* Replaces the inherited to-RGB converter with a SIMD one when a row holds at least A pixels; RGBA input reuses BgraToBgr, which drops alpha without reordering channels. */ + : Base::ImagePpmTxtSaver(param) + { + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: _convert = Sse41::GrayToBgr; break; + case SimdPixelFormatBgr24: _convert = Sse41::BgrToRgb; break; + case SimdPixelFormatBgra32: _convert = Sse41::BgraToRgb; break; + case SimdPixelFormatRgba32: _convert = Sse41::BgraToBgr; break; + default: break; + } + } + } + + //--------------------------------------------------------------------- + + ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param) /* Same converter selection as ImagePpmTxtSaver. */ + : Base::ImagePpmBinSaver(param) + { + if (_param.width >= A) + { + switch (_param.format) + { + case SimdPixelFormatGray8: 
_convert = Sse41::GrayToBgr; break; + case SimdPixelFormatBgr24: _convert = Sse41::BgrToRgb; break; + case SimdPixelFormatBgra32: _convert = Sse41::BgraToRgb; break; + case SimdPixelFormatRgba32: _convert = Sse41::BgraToBgr; break; + default: break; + } + } + } + + //--------------------------------------------------------------------- + + ImageSaver* CreateImageSaver(const ImageSaverParam& param) /* Factory: returns a heap-allocated saver for param.file, or NULL for an unsupported file type. The caller owns the result. */ + { + switch (param.file) + { + case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param); + case SimdImageFilePgmBin: return new ImagePgmBinSaver(param); + case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param); + case SimdImageFilePpmBin: return new ImagePpmBinSaver(param); + case SimdImageFilePng: return new ImagePngSaver(param); + case SimdImageFileJpeg: return new ImageJpegSaver(param); + default: + return NULL; + } + } + + uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size) /* Encodes the image into a memory buffer obtained from the saver's Release(size); returns NULL when the parameters fail validation, no saver exists, or encoding fails. */ + { + ImageSaverParam param(width, height, format, file, quality); + if (param.Validate()) + { + std::unique_ptr<ImageSaver> saver(CreateImageSaver(param)); /* element type must be explicit: unique_ptr cannot deduce it from a raw pointer */ + if (saver) + { + if (saver->ToStream(src, stride)) + return saver->Release(size); + } + } + return NULL; + } + } +#endif// SIMD_SSE41_ENABLE +} diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp new file mode 100644 index 0000000000..3a0a2079c1 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp @@ -0,0 +1,431 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdImageSaveJpeg.h" +#include "Simd/SimdSse41.h" +#include "Simd/SimdBase.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + SIMD_INLINE void JpegDctV(const float* src, size_t srcStride, float *dst, size_t dstStride) /* Column pass over an 8x8 float block: each of the two iterations loads 4 adjacent columns from rows 0..7 (row pitch srcStride), applies the 8-point butterfly down the rows and stores 8 result rows (row pitch dstStride). All of a column group is loaded before anything is stored, so src and dst may be the same block. */ + { + for (int i = 0; i < 2; i++, src += 4, dst += 4) + { + __m128 d0 = _mm_loadu_ps(src + 0 * srcStride); + __m128 d1 = _mm_loadu_ps(src + 1 * srcStride); + __m128 d2 = _mm_loadu_ps(src + 2 * srcStride); + __m128 d3 = _mm_loadu_ps(src + 3 * srcStride); + __m128 d4 = _mm_loadu_ps(src + 4 * srcStride); + __m128 d5 = _mm_loadu_ps(src + 5 * srcStride); + __m128 d6 = _mm_loadu_ps(src + 6 * srcStride); + __m128 d7 = _mm_loadu_ps(src + 7 * srcStride); + + __m128 tmp0 = _mm_add_ps(d0, d7); + __m128 tmp7 = _mm_sub_ps(d0, d7); + __m128 tmp1 = _mm_add_ps(d1, d6); + __m128 tmp6 = _mm_sub_ps(d1, d6); + __m128 tmp2 = _mm_add_ps(d2, d5); + __m128 tmp5 = _mm_sub_ps(d2, d5); + __m128 tmp3 = _mm_add_ps(d3, d4); + __m128 tmp4 = _mm_sub_ps(d3, d4); + + __m128 tmp10 = _mm_add_ps(tmp0, tmp3); + __m128 tmp13 = _mm_sub_ps(tmp0, tmp3); + __m128 tmp11 = _mm_add_ps(tmp1, tmp2); + __m128 tmp12 = _mm_sub_ps(tmp1, tmp2); + + d0 = _mm_add_ps(tmp10, tmp11); + d4 = _mm_sub_ps(tmp10, tmp11); + + __m128 z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), _mm_set1_ps(0.707106781f)); + d2 = _mm_add_ps(tmp13, z1); + d6 = _mm_sub_ps(tmp13, z1); + + tmp10 = _mm_add_ps(tmp4, tmp5); + tmp11 = _mm_add_ps(tmp5, tmp6); + tmp12 = _mm_add_ps(tmp6, tmp7); + + __m128 z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12), _mm_set1_ps(0.382683433f)); + __m128 z2 = _mm_add_ps(_mm_mul_ps(tmp10, _mm_set1_ps(0.541196100f)), z5); + __m128 z4 = _mm_add_ps(_mm_mul_ps(tmp12, _mm_set1_ps(1.306562965f)), z5); + __m128 z3 = _mm_mul_ps(tmp11, _mm_set1_ps(0.707106781f)); + + __m128 z11 = _mm_add_ps(tmp7, z3); + __m128 z13 = _mm_sub_ps(tmp7, z3); + + _mm_storeu_ps(dst + 0 * dstStride, d0); + _mm_storeu_ps(dst + 1 * dstStride, 
_mm_add_ps(z11, z4)); + _mm_storeu_ps(dst + 2 * dstStride, d2); + _mm_storeu_ps(dst + 3 * dstStride, _mm_sub_ps(z13, z2)); + _mm_storeu_ps(dst + 4 * dstStride, d4); + _mm_storeu_ps(dst + 5 * dstStride, _mm_add_ps(z13, z2)); + _mm_storeu_ps(dst + 6 * dstStride, d6); + _mm_storeu_ps(dst + 7 * dstStride, _mm_sub_ps(z11, z4)); + } + } + + SIMD_INLINE void JpegDctH(const float* src, size_t srcStride, const float * fdt, int* dst) /* Row pass plus quantisation: each of the two iterations takes 4 input rows, transposes their left and right 4x4 halves so that d0..d7 hold columns 0..7, applies the same 8-point butterfly, multiplies by the fdt table and converts to int32. Output for coefficient k of the 4 rows goes to dst + 8*k, i.e. the int block is written transposed relative to the input. */ + { + for (int i = 0; i < 2; i++, src += 4 * srcStride, fdt += 4, dst += 4) + { + __m128 tmp0, tmp1, tmp2, tmp3; + __m128 d0 = _mm_loadu_ps(src + 0 * srcStride); + __m128 d1 = _mm_loadu_ps(src + 1 * srcStride); + __m128 d2 = _mm_loadu_ps(src + 2 * srcStride); + __m128 d3 = _mm_loadu_ps(src + 3 * srcStride); + tmp0 = _mm_unpacklo_ps(d0, d2); + tmp1 = _mm_unpackhi_ps(d0, d2); + tmp2 = _mm_unpacklo_ps(d1, d3); + tmp3 = _mm_unpackhi_ps(d1, d3); + d0 = _mm_unpacklo_ps(tmp0, tmp2); + d1 = _mm_unpackhi_ps(tmp0, tmp2); + d2 = _mm_unpacklo_ps(tmp1, tmp3); + d3 = _mm_unpackhi_ps(tmp1, tmp3); + + __m128 d4 = _mm_loadu_ps(src + 0 * srcStride + 4); + __m128 d5 = _mm_loadu_ps(src + 1 * srcStride + 4); + __m128 d6 = _mm_loadu_ps(src + 2 * srcStride + 4); + __m128 d7 = _mm_loadu_ps(src + 3 * srcStride + 4); + tmp0 = _mm_unpacklo_ps(d4, d6); + tmp1 = _mm_unpackhi_ps(d4, d6); + tmp2 = _mm_unpacklo_ps(d5, d7); + tmp3 = _mm_unpackhi_ps(d5, d7); + d4 = _mm_unpacklo_ps(tmp0, tmp2); + d5 = _mm_unpackhi_ps(tmp0, tmp2); + d6 = _mm_unpacklo_ps(tmp1, tmp3); + d7 = _mm_unpackhi_ps(tmp1, tmp3); + + tmp0 = _mm_add_ps(d0, d7); + tmp1 = _mm_add_ps(d1, d6); + tmp2 = _mm_add_ps(d2, d5); + tmp3 = _mm_add_ps(d3, d4); + __m128 tmp7 = _mm_sub_ps(d0, d7); + __m128 tmp6 = _mm_sub_ps(d1, d6); + __m128 tmp5 = _mm_sub_ps(d2, d5); + __m128 tmp4 = _mm_sub_ps(d3, d4); + + __m128 tmp10 = _mm_add_ps(tmp0, tmp3); + __m128 tmp13 = _mm_sub_ps(tmp0, tmp3); + __m128 tmp11 = _mm_add_ps(tmp1, tmp2); + __m128 tmp12 = _mm_sub_ps(tmp1, tmp2); + + d0 = _mm_add_ps(tmp10, tmp11); + d4 = _mm_sub_ps(tmp10, 
tmp11); + + __m128 z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), _mm_set1_ps(0.707106781f)); + d2 = _mm_add_ps(tmp13, z1); + d6 = _mm_sub_ps(tmp13, z1); + + tmp10 = _mm_add_ps(tmp4, tmp5); + tmp11 = _mm_add_ps(tmp5, tmp6); + tmp12 = _mm_add_ps(tmp6, tmp7); + + __m128 z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12), _mm_set1_ps(0.382683433f)); + __m128 z2 = _mm_add_ps(_mm_mul_ps(tmp10, _mm_set1_ps(0.541196100f)), z5); + __m128 z4 = _mm_add_ps(_mm_mul_ps(tmp12, _mm_set1_ps(1.306562965f)), z5); + __m128 z3 = _mm_mul_ps(tmp11, _mm_set1_ps(0.707106781f)); + + __m128 z11 = _mm_add_ps(tmp7, z3); + __m128 z13 = _mm_sub_ps(tmp7, z3); + + d1 = _mm_add_ps(z11, z4); + d3 = _mm_sub_ps(z13, z2); + d5 = _mm_add_ps(z13, z2); + d7 = _mm_sub_ps(z11, z4); + + /* dst is advanced as __m128i*, so each step of 2 below moves 8 ints (one 8-wide output row) */ _mm_storeu_si128((__m128i*)dst + 0x0, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 0), d0))); + _mm_storeu_si128((__m128i*)dst + 0x2, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 1), d1))); + _mm_storeu_si128((__m128i*)dst + 0x4, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 2), d2))); + _mm_storeu_si128((__m128i*)dst + 0x6, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 3), d3))); + _mm_storeu_si128((__m128i*)dst + 0x8, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 4), d4))); + _mm_storeu_si128((__m128i*)dst + 0xA, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 5), d5))); + _mm_storeu_si128((__m128i*)dst + 0xC, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 6), d6))); + _mm_storeu_si128((__m128i*)dst + 0xE, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 7), d7))); + } + } + + static int JpegProcessDu(Base::BitBuf& bitBuf, float* CDU, int stride, const float* fdtbl, int DC, const uint16_t HTDC[256][2], const uint16_t HTAC[256][2]) /* Encodes one 8x8 data unit: transforms CDU in place (column pass, then row pass with quantisation), reorders through Base::JpegZigZagT, pushes the DC difference against the previous DC and the run-length coded AC coefficients into bitBuf, and returns the new DC value. */ + { + JpegDctV(CDU, stride, CDU, stride); + SIMD_ALIGNED(16) int DUO[64], DU[64]; + JpegDctH(CDU, stride, fdtbl, DUO); + for (int i = 0; i < 64; ++i) + DU[Base::JpegZigZagT[i]] = DUO[i]; + int diff = DU[0] - DC; + if (diff == 0) + bitBuf.Push(HTDC[0]); + else + { + uint16_t 
bits[2]; + Base::JpegCalcBits(diff, bits); + bitBuf.Push(HTDC[bits[1]]); + bitBuf.Push(bits); + } + int end0pos4 = 60; /* first skip trailing groups of four zero coefficients with one vector test each, then refine per element */ + for (; end0pos4 > 0 && _mm_testz_si128(_mm_loadu_si128((__m128i*)(DU + end0pos4)), Sse2::K_INV_ZERO); end0pos4 -= 4); + int end0pos = end0pos4 + 3; + for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos); + if (end0pos == 0) + { + bitBuf.Push(HTAC[0x00]); + return DU[0]; + } + for (int i = 1; i <= end0pos; ++i) + { + int startpos = i; + for (; DU[i] == 0 && i <= end0pos; ++i); /* DU[end0pos] is non-zero here, so the scan stops at or before end0pos */ + int nrzeroes = i - startpos; + if (nrzeroes >= 16) + { + int lng = nrzeroes >> 4; + int nrmarker; + for (nrmarker = 1; nrmarker <= lng; ++nrmarker) + bitBuf.Push(HTAC[0xF0]); + nrzeroes &= 15; + } + uint16_t bits[2]; + Base::JpegCalcBits(DU[i], bits); + bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]); + bitBuf.Push(bits); + } + if (end0pos != 63) + bitBuf.Push(HTAC[0x00]); + return DU[0]; + } + + SIMD_INLINE void RgbToYuvInit(__m128 k[10]) /* Broadcasts the colour-conversion constants used by RgbToYuv and GrayToY: k[0..2] are the Y weights for R, G, B, k[3] is the -128 offset added to Y, k[4..6] the U weights and k[7..9] the V weights. */ + { + k[0] = _mm_set1_ps(+0.29900f); + k[1] = _mm_set1_ps(+0.58700f); + k[2] = _mm_set1_ps(+0.11400f); + k[3] = _mm_set1_ps(-128.000f); + k[4] = _mm_set1_ps(-0.16874f); + k[5] = _mm_set1_ps(-0.33126f); + k[6] = _mm_set1_ps(+0.50000f); + k[7] = _mm_set1_ps(+0.50000f); + k[8] = _mm_set1_ps(-0.41869f); + k[9] = _mm_set1_ps(-0.08131f); + } + + SIMD_INLINE void RgbToYuv(const uint8_t* r, const uint8_t* g, const uint8_t* b, int stride, int height, + const __m128 k[10], float* y, float* u, float* v, int size) /* Converts a size x size tile of planar 8-bit R, G, B into float Y, U, V planes with row pitch size, 4 pixels per step. height is the number of source rows still available: once it is exhausted the source pointers stop advancing, so the last row is repeated. Always reads size bytes per row. */ + { + for (int row = 0; row < size;) + { + for (int col = 0; col < size; col += 4) + { + __m128 _r = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(r + col)))); + __m128 _g = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(g + col)))); + __m128 _b = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(b + col)))); + _mm_storeu_ps(y + col, _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, k[0]), _mm_mul_ps(_g, k[1])), _mm_mul_ps(_b, k[2])), k[3])); + //_mm_storeu_ps(y + col, 
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, _yr), _mm_mul_ps(_g, _yg)), _mm_add_ps(_mm_mul_ps(_b, _yb), _yt))); + _mm_storeu_ps(u + col, _mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, k[4]), _mm_mul_ps(_g, k[5])), _mm_mul_ps(_b, k[6]))); + _mm_storeu_ps(v + col, _mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, k[7]), _mm_mul_ps(_g, k[8])), _mm_mul_ps(_b, k[9]))); + } + if(++row < height) + r += stride, g += stride, b += stride; + y += size, u += size, v += size; + } + } + + SIMD_INLINE void GrayToY(const uint8_t* g, int stride, int height, const __m128 k[10], float* y, int size) /* Gray variant of RgbToYuv: widens a size x size tile of 8-bit gray to float and adds k[3]; the last available source row is repeated once height is exhausted. */ + { + for (int row = 0; row < size;) + { + for (int col = 0; col < size; col += 4) + { + __m128 _g = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(g + col)))); + _mm_storeu_ps(y + col, _mm_add_ps(_g, k[3])); + } + if (++row < height) + g += stride; + y += size; + } + } + + SIMD_INLINE void SubUv(const float * src, float * dst) /* 2x2 box downsample of a 16x16 float plane (row pitch 16) into an 8x8 plane: adds vertically adjacent rows, adds horizontal pairs with hadd, then scales by 0.25. */ + { + __m128 _0_25 = _mm_set1_ps(0.25f), s0, s1; + for (int yy = 0; yy < 8; yy += 1) + { + s0 = _mm_add_ps(_mm_loadu_ps(src + 0), _mm_loadu_ps(src + 16)); + s1 = _mm_add_ps(_mm_loadu_ps(src + 4), _mm_loadu_ps(src + 20)); + _mm_storeu_ps(dst + 0, _mm_mul_ps(_mm_hadd_ps(s0, s1), _0_25)); + s0 = _mm_add_ps(_mm_loadu_ps(src + 8), _mm_loadu_ps(src + 24)); + s1 = _mm_add_ps(_mm_loadu_ps(src + 12), _mm_loadu_ps(src + 28)); + _mm_storeu_ps(dst + 4, _mm_mul_ps(_mm_hadd_ps(s0, s1), _0_25)); + src += 32; + dst += 8; + } + } + + void JpegWriteBlockSubs(OutputMemoryStream& stream, int width, int height, const uint8_t* red, + const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3]) /* Writes the scan with chroma subsampled: walks the image in 16x16 tiles, emitting four 8x8 Y units plus one downsampled U and one V unit per tile. Full-width tiles use the SIMD converters; the right-edge tile uses the Base converters, which take the remaining width. dc[] carries the running DC predictors. Identical plane pointers select the gray path. */ + { + __m128 k[10]; + RgbToYuvInit(k); + int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2]; + int width16 = width& (~15); + bool gray = red == green && red == blue; + Base::BitBuf bitBuf; + for (int y = 0; y < height; y += 16) + { + int x = 0; + SIMD_ALIGNED(16) float Y[256], U[256], V[256]; + SIMD_ALIGNED(16) float subU[64], subV[64]; + for (; x < width16; x += 16) + { + if 
(gray) + GrayToY(red + x, stride, height - y, k, Y, 16); + else + RgbToYuv(red + x, green + x, blue + x, stride, height - y, k, Y, U, V, 16); + DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + if (gray) + Base::JpegProcessDuGrayUv(bitBuf); + else + { + SubUv(U, subU); + SubUv(V, subV); + DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac); + DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac); + } + if (bitBuf.Full()) /* drain the bit buffer to the stream whenever it reports full */ + { + Base::WriteBits(stream, bitBuf.data, bitBuf.size); + bitBuf.Clear(); + } + } + for (; x < width; x += 16) /* right-edge tile; NOTE(review): no Full() check follows it, presumably BitBuf keeps enough headroom for one extra tile - confirm */ + { + if (gray) + Base::GrayToY(red + x, stride, height - y, width - x, Y, 16); + else + Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 16); + DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + if (gray) + Base::JpegProcessDuGrayUv(bitBuf); + else + { + SubUv(U, subU); + SubUv(V, subV); + DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac); + DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac); + } + } + } + Base::WriteBits(stream, bitBuf.data, bitBuf.size); + bitBuf.Clear(); + } + + void JpegWriteBlockFull(OutputMemoryStream& stream, int width, int height, const uint8_t* red, + const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3]) /* Writes the scan without chroma subsampling: walks the image in 8x8 tiles and emits one Y, one U and one V unit per tile (gray input emits Y plus the fixed gray chroma units). Full-width tiles use the SIMD converters, the right-edge tile the Base converters. */ + { 
+ __m128 k[10]; + RgbToYuvInit(k); + int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2]; + int width8 = width & (~7); + bool gray = red == green && red == blue; + Base::BitBuf bitBuf; + for (int y = 0; y < height; y += 8) + { + int x = 0; + SIMD_ALIGNED(16) float Y[64], U[64], V[64]; + for (; x < width8; x += 8) + { + if (gray) + GrayToY(red + x, stride, height - y, k, Y, 8); + else + RgbToYuv(red + x, green + x, blue + x, stride, height - y, k, Y, U, V, 8); + DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + if (gray) + Base::JpegProcessDuGrayUv(bitBuf); + else + { + DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac); + DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac); + } + if (bitBuf.Full()) + { + Base::WriteBits(stream, bitBuf.data, bitBuf.size); + bitBuf.Clear(); + } + } + for (; x < width; x += 8) + { + if (gray) + Base::GrayToY(red + x, stride, height - y, width - x, Y, 8); + else + Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 8); + DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac); + if (gray) + Base::JpegProcessDuGrayUv(bitBuf); + else + { + DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac); + DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac); + } + } + } + Base::WriteBits(stream, bitBuf.data, bitBuf.size); + bitBuf.Clear(); + } + + //--------------------------------------------------------------------- + + ImageJpegSaver::ImageJpegSaver(const ImageSaverParam& param) /* Forwards to the Base saver; the SSE4.1 specifics are installed later by Init(). */ + : Base::ImageJpegSaver(param) + { + } + + void ImageJpegSaver::Init() /* Runs InitParams(true), then selects the deinterleave routine (Base for images narrower than 16 pixels, SSE4.1 otherwise) and the block writer matching _subSample. */ + { + InitParams(true); + switch (_param.format) + { + case SimdPixelFormatBgr24: + case SimdPixelFormatRgb24: + _deintBgr = _param.width < 16 ? Base::DeinterleaveBgr : Sse41::DeinterleaveBgr; + break; + case SimdPixelFormatBgra32: + case SimdPixelFormatRgba32: + _deintBgra = _param.width < 16 ? 
Base::DeinterleaveBgra : Sse41::DeinterleaveBgra; + break; + default: + break; + } + _writeBlock = _subSample ? JpegWriteBlockSubs : JpegWriteBlockFull; + } + } +#endif// SIMD_SSE41_ENABLE +} diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp new file mode 100644 index 0000000000..0e1c76b710 --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp @@ -0,0 +1,370 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdImageSavePng.h" +#include "Simd/SimdBase.h" +#include "Simd/SimdSse41.h" +#include "Simd/SimdExtract.h" + +namespace Simd +{ +#ifdef SIMD_SSE41_ENABLE + namespace Sse41 + { + uint32_t ZlibAdler32(uint8_t* data, int size) /* Adler-32 of data[0..size). Works in chunks of at most 5552 bytes (the first chunk is size % 5552) so the unsigned 32-bit accumulators cannot wrap before the modulo 65521 reduction; within a chunk, byte i is weighted by (n - i) for the high sum. */ + { + __m128i _i0 = _mm_setr_epi32(0, -1, -2, -3), _4 = _mm_set1_epi32(4); + uint32_t lo = 1, hi = 0; + for (int b = 0, n = (int)(size % 5552); b < size;) + { + int n4 = n & (~3), i = 0; + __m128i _i = _mm_add_epi32(_i0, _mm_set1_epi32(n)); + __m128i _l = _mm_setzero_si128(), _h = _mm_setzero_si128(); + for (; i < n4; i += 4) + { + __m128i d = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(data + b + i))); + _l = _mm_add_epi32(_l, d); + _h = _mm_add_epi32(_h, _mm_mullo_epi32(d, _i)); + _i = _mm_sub_epi32(_i, _4); + } + int l = Sse2::ExtractInt32Sum(_l), h = Sse2::ExtractInt32Sum(_h); + for (; i < n; ++i) + { + l += data[b + i]; + h += data[b + i]*(n - i); + } + hi = (hi + h + lo*n) % 65521; + lo = (lo + l) % 65521; + b += n; + n = 5552; + } + return (hi << 16) | lo; + } + + void ZlibCompress(uint8_t* data, int size, int quality, OutputMemoryStream& stream) /* Writes data as a zlib stream: 2-byte header, one fixed-Huffman DEFLATE block with hash-bucket LZ matching, then the Adler-32 trailer. quality (clamped to at least 5) sets the bucket size to 2*quality candidate positions per hash. */ + { + const int ZHASH = 16384; + if (quality < 5) + quality = 5; + const int basket = quality * 2; + Array32i hashTable(ZHASH * basket); + memset(hashTable.data, -1, hashTable.RawSize()); + + stream.Write(uint8_t(0x78)); + stream.Write(uint8_t(0x5e)); + stream.WriteBits(1, 1); + stream.WriteBits(1, 2); + + int i = 0, j; + while (i < size - 3) + { + int h = Base::ZlibHash(data + i) & (ZHASH - 1), best = 3; + uint8_t* bestLoc = 0; + int* hList = hashTable.data + h * basket; + for (j = 0; j < basket && hList[j] != -1; ++j) /* bound first: a full bucket must not read hList[basket], which lies past the table for the last hash */ + { + if (hList[j] > i - 32768) + { + int d = ZlibCount(data + hList[j], data + i, size - i); + if (d >= best) + { + best = d; + bestLoc = data + hList[j]; + } + } + } + if (j == basket) /* bucket full: keep the newer half and clear the rest */ + { + memcpy(hList, hList + quality, quality * sizeof(int)); + memset(hList + quality, -1, quality * 
sizeof(int)); + j = quality; + } + hList[j] = i; + + if (bestLoc) /* lazy matching: drop this match if the next position offers a longer one */ + { + h = Base::ZlibHash(data + i + 1) & (ZHASH - 1); + int* hList = hashTable.data + h * basket; + for (j = 0; j < basket && hList[j] != -1; ++j) /* same bound-first ordering as above */ + { + if (hList[j] > i - 32767) + { + int e = ZlibCount(data + hList[j], data + i + 1, size - i - 1); + if (e > best) + { + bestLoc = NULL; + break; + } + } + } + } + + if (bestLoc) + { + int d = (int)(data + i - bestLoc); + assert(d <= 32767 && best <= 258); + for (j = 0; best > Base::ZlibLenC[j + 1] - 1; ++j); + Base::ZlibHuff(j + 257, stream); + if (Base::ZlibLenEb[j]) + stream.WriteBits(best - Base::ZlibLenC[j], Base::ZlibLenEb[j]); + for (j = 0; d > Base::ZlibDistC[j + 1] - 1; ++j); + stream.WriteBits(Base::ZlibBitRev(j, 5), 5); + if (Base::ZlibDistEb[j]) + stream.WriteBits(d - Base::ZlibDistC[j], Base::ZlibDistEb[j]); + i += best; + } + else + { + Base::ZlibHuffB(data[i], stream); + ++i; + } + } + for (; i < size; ++i) + Base::ZlibHuffB(data[i], stream); + Base::ZlibHuff(256, stream); + stream.FlushBits(); + stream.WriteBe32u(ZlibAdler32(data, size)); + } + + uint32_t EncodeLine0(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) /* Filter 0: copies the row unchanged and returns the sum of absolute values of the bytes viewed as signed; stride and n are unused. */ + { + size_t i = 0, sizeA = AlignLo(size, A); + __m128i _sum = _mm_setzero_si128(); + for (; i < sizeA; i += A) + { + __m128i _src = _mm_loadu_si128((__m128i*)(src + i)); + _mm_storeu_si128((__m128i*)(dst + i), _src); + _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_src))); + } + uint32_t sum = Sse2::ExtractInt32Sum(_sum); + for (; i < size; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine1(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) /* Filter 1: each byte minus the byte n positions to its left (the first n bytes are copied); returns the sum of absolute filtered values. */ + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + __m128i _sum = _mm_setzero_si128(); + for (; i < sizeA; i += A) + { + __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i)); + 
__m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n)); + __m128i _dst = _mm_sub_epi8(_src0, _src1); + _mm_storeu_si128((__m128i*)(dst + i), _dst); + _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst))); + } + sum += Sse2::ExtractInt32Sum(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - src[i - n]; + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine2(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) /* Filter 2: each byte minus the byte at src[i - stride] (the neighbouring row - presumably the previous scanline; confirm the caller's row order); returns the sum of absolute filtered values. The row at src - stride must be readable. */ + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = src[i] - src[i - stride]; + sum += ::abs(dst[i]); + } + __m128i _sum = _mm_setzero_si128(); + for (; i < sizeA; i += A) + { + __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i)); + __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - stride)); + __m128i _dst = _mm_sub_epi8(_src0, _src1); + _mm_storeu_si128((__m128i*)(dst + i), _dst); + _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst))); + } + sum += Sse2::ExtractInt32Sum(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - src[i - stride]; + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine3(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) /* Filter 3: each byte minus the floor of the average of its left (i - n) and row (i - stride) neighbours; the first n bytes have no left neighbour and use half of the row neighbour. Averages are formed in 16-bit lanes so the sum cannot overflow. */ + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = src[i] - (src[i - stride] >> 1); + sum += ::abs(dst[i]); + } + __m128i _sum = _mm_setzero_si128(); + for (; i < sizeA; i += A) + { + __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i)); + __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n)); + __m128i _src2 = _mm_loadu_si128((__m128i*)(src + i - stride)); + __m128i lo = _mm_srli_epi16(_mm_add_epi16(UnpackU8<0>(_src1), UnpackU8<0>(_src2)), 1); + __m128i hi = _mm_srli_epi16(_mm_add_epi16(UnpackU8<1>(_src1), UnpackU8<1>(_src2)), 1); + __m128i _dst = _mm_sub_epi8(_src0, _mm_packus_epi16(lo, hi)); + _mm_storeu_si128((__m128i*)(dst + i), _dst); + _sum = _mm_add_epi32(_sum, 
_mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst))); + } + sum += Sse2::ExtractInt32Sum(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - ((src[i - n] + src[i - stride]) >> 1); + sum += ::abs(dst[i]); + } + return sum; + } + + SIMD_INLINE __m128i Paeth(__m128i a, __m128i b, __m128i c) + { + __m128i p = _mm_sub_epi16(_mm_add_epi16(a, b), c); + __m128i pa = _mm_abs_epi16(_mm_sub_epi16(p, a)); + __m128i pb = _mm_abs_epi16(_mm_sub_epi16(p, b)); + __m128i pc = _mm_abs_epi16(_mm_sub_epi16(p, c)); + __m128i mbc = _mm_or_si128(_mm_cmpgt_epi16(pa, pb), _mm_cmpgt_epi16(pa, pc)); + __m128i mc = _mm_cmpgt_epi16(pb, pc); + return _mm_blendv_epi8(a, _mm_blendv_epi8(b, c, mc), mbc); + } + + uint32_t EncodeLine4(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = (int8_t)(src[i] - src[i - stride]); + sum += ::abs(dst[i]); + } + __m128i _sum = _mm_setzero_si128(); + for (; i < sizeA; i += A) + { + __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i)); + __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n)); + __m128i _src2 = _mm_loadu_si128((__m128i*)(src + i - stride)); + __m128i _src3 = _mm_loadu_si128((__m128i*)(src + i - stride - n)); + __m128i lo = Paeth(UnpackU8<0>(_src1), UnpackU8<0>(_src2), UnpackU8<0>(_src3)); + __m128i hi = Paeth(UnpackU8<1>(_src1), UnpackU8<1>(_src2), UnpackU8<1>(_src3)); + __m128i _dst = _mm_sub_epi8(_src0, _mm_packus_epi16(lo, hi)); + _mm_storeu_si128((__m128i*)(dst + i), _dst); + _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst))); + } + sum += Sse2::ExtractInt32Sum(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - Base::Paeth(src[i - n], src[i - stride], src[i - stride - n]); + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine5(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n; + uint32_t 
sum = 0; + for (; i < n; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + __m128i _sum = _mm_setzero_si128(); + for (; i < sizeA; i += A) + { + __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i)); + __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n)); + __m128i lo = _mm_srli_epi16(UnpackU8<0>(_src1), 1); + __m128i hi = _mm_srli_epi16(UnpackU8<1>(_src1), 1); + __m128i _dst = _mm_sub_epi8(_src0, _mm_packus_epi16(lo, hi)); + _mm_storeu_si128((__m128i*)(dst + i), _dst); + _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst))); + } + sum += Sse2::ExtractInt32Sum(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - (src[i - n] >> 1); + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine6(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + __m128i _sum = _mm_setzero_si128(); + for (; i < sizeA; i += A) + { + __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i)); + __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n)); + __m128i _dst = _mm_sub_epi8(_src0, _src1); + _mm_storeu_si128((__m128i*)(dst + i), _dst); + _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst))); + } + sum += Sse2::ExtractInt32Sum(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - src[i - n]; + sum += ::abs(dst[i]); + } + return sum; + } + + ImagePngSaver::ImagePngSaver(const ImageSaverParam& param) + : Base::ImagePngSaver(param) + { + if (_param.format == SimdPixelFormatBgr24) + _convert = Sse41::BgrToRgb; + else if (_param.format == SimdPixelFormatBgra32) + _convert = Sse41::BgraToRgba; + _encode[0] = Sse41::EncodeLine0; + _encode[1] = Sse41::EncodeLine1; + _encode[2] = Sse41::EncodeLine2; + _encode[3] = Sse41::EncodeLine3; + _encode[4] = Sse41::EncodeLine4; + _encode[5] = Sse41::EncodeLine5; + _encode[6] = Sse41::EncodeLine6; + _compress = 
Sse41::ZlibCompress; + } + } +#endif// SIMD_SSE41_ENABLE +} diff --git a/3rdparty/simdlib/Simd/SimdView.hpp b/3rdparty/simdlib/Simd/SimdView.hpp index 0c61a0e6e8..33629be94f 100755 --- a/3rdparty/simdlib/Simd/SimdView.hpp +++ b/3rdparty/simdlib/Simd/SimdView.hpp @@ -27,7 +27,6 @@ #ifndef __SimdView_hpp__ #define __SimdView_hpp__ -#include "Simd/SimdDefs.h" #include "Simd/SimdRectangle.hpp" #include "Simd/SimdAllocator.hpp" @@ -493,34 +492,57 @@ namespace Simd /*! Loads image from file. - Supported formats: - - PGM(Portable Gray Map) text(P2) or binary(P5) (the file is loaded as 8-bit gray image). - - PPM(Portable Pixel Map) text(P3) or binary(P6) (the file is loaded as 32-bit BGRA image). + Supported formats are described by ::SimdImageFileType enumeration. \note PGM and PPM files with comments are not supported. - \param [in] path - a path to file with PGM or PPM image. + \param [in] path - a path to image file. + \param [in] format - a desired format of loaded image. + Supported values are View::Gray8, View::Bgr24, View::Bgra32, View::Rgb24, View::Rgba32 and View::None. + Default value is View::None (loads image in native pixel format of image file). \return - a result of loading. */ - bool Load(const std::string & path); + bool Load(const std::string & path, Format format = None); + + /*! + Loads image from memory buffer. + + Supported formats are described by ::SimdImageFileType enumeration. + + \note PGM and PPM files with comments are not supported. + + \param [in] src - a pointer to memory buffer. + \param [in] size - a buffer size. + \param [in] format - a desired format of loaded image. + Supported values are View::Gray8, View::Bgr24, View::Bgra32, View::Rgb24, View::Rgba32 and View::None. + Default value is View::None (loads image in native pixel format of image file). + \return - a result of loading. + */ + bool Load(const uint8_t * src, size_t size, Format format = None); /*! Saves image to file. 
- Supported formats: - - PGM(Portable Gray Map) binary(P5) (this format is used in order to save 8-bit gray images). - - PPM(Portable Pixel Map) binary(P6) (this format is used in order to save 24-bit BGR and 32-bit BGRA images). - \param [in] path - a path to file. + \param [in] type - a image file format. By default is equal to ::SimdImageFileUndefined (format auto choice). + \param [in] quality - a parameter of compression quality (if file format supports it). \return - a result of saving. */ - bool Save(const std::string & path) const; + bool Save(const std::string & path, SimdImageFileType type = SimdImageFileUndefined, int quality = 100) const; /*! - Clear View structure (reset all fields) and free memory if it's owner + Clears View structure (reset all fields) and free memory if it's owner. */ void Clear(); + /*! + Releases pixel data and resets all fields. + + \param [out] size - a pointer to the size of released pixel data. Can be NULL. + \return - a released pointer to pixel data. It must be deleted by function ::SimdFree. 
+ */ + uint8_t* Release(size_t* size = NULL); + private: bool _owner; }; @@ -1027,6 +1049,7 @@ namespace Simd case Float: return 4; case Double: return 8; case Rgb24: return 3; + case Rgba32: return 4; default: assert(0); return 0; } } @@ -1050,6 +1073,7 @@ namespace Simd case Float: return 4; case Double: return 8; case Rgb24: return 1; + case Rgba32: return 1; default: assert(0); return 0; } } @@ -1073,6 +1097,7 @@ namespace Simd case Float: return 1; case Double: return 1; case Rgb24: return 3; + case Rgba32: return 4; default: assert(0); return 0; } } @@ -1124,139 +1149,33 @@ namespace Simd std::swap((bool&)_owner, (bool&)other._owner); } - template class A> SIMD_INLINE bool View::Load(const std::string & path) + template class A> SIMD_INLINE bool View::Load(const std::string & path, Format format_) { - std::ifstream ifs(path.c_str(), std::ifstream::binary); - if (ifs.is_open()) - { - std::string type; - ifs >> type; - if (type == "P2" || type == "P5") - { - size_t w, h, d; - ifs >> w >> h >> d; - if (d != 255) - return false; - ifs.get(); - Recreate(w, h, View::Gray8); - if (type == "P2") - { - for (size_t row = 0; row < height; ++row) - { - for (size_t col = 0; col < width; ++col) - { - int gray; - ifs >> gray; - data[row * stride + col] = (uint8_t)gray; - } - } - } - else - { - for (size_t row = 0; row < height; ++row) - ifs.read((char*)(data + row*stride), width); - } - return true; - } - if (type == "P3" || type == "P6") - { - size_t w, h, d; - ifs >> w >> h >> d; - if (d != 255) - return false; - ifs.get(); - Recreate(w, h, View::Bgra32); - if (type == "P3") - { - for (size_t row = 0; row < height; ++row) - { - uint8_t * bgra = data + row * stride; - for (size_t col = 0; col < width; ++col, bgra += 4) - { - int blue, green, red; - ifs >> red >> green >> blue; - bgra[0] = (uint8_t)blue; - bgra[1] = (uint8_t)green; - bgra[2] = (uint8_t)red; - bgra[3] = 0xFF; - } - } - } - else - { - View buffer(width, 1, Bgr24); - for (size_t row = 0; row < height; ++row) - 
{ - ifs.read((char*)buffer.data, width*3); - const uint8_t * rgb = buffer.data; - uint8_t * bgra = data + row*stride; - for (size_t col = 0; col < width; ++col, rgb += 3, bgra += 4) - { - bgra[0] = rgb[2]; - bgra[1] = rgb[1]; - bgra[2] = rgb[0]; - bgra[3] = 0xFF; - } - } - } - return true; - } - } - return false; + Clear(); + (Format&)format = format_; + *(uint8_t**)&data = SimdImageLoadFromFile(path.c_str(), (size_t*)&stride, (size_t*)&width, (size_t*)&height, (SimdPixelFormatType*)&format); + if (data) + _owner = true; + else + (Format&)format = None; + return _owner; } - template class A> SIMD_INLINE bool View::Save(const std::string & path) const + template class A> SIMD_INLINE bool View::Load(const uint8_t * src, size_t size, Format format_) { - if (!(format == View::Gray8 || format == View::Bgr24 || format == View::Bgra32)) - return false; - - std::ofstream ofs(path.c_str(), std::ofstream::binary); - if (ofs.is_open()) - { - if (format == View::Gray8) - { - ofs << "P5\n" << width << " " << height << "\n255\n"; - for (size_t row = 0; row < height; ++row) - ofs.write((const char*)(data + row*stride), width); - } - else if (format == View::Bgr24) - { - ofs << "P6\n" << width << " " << height << "\n255\n"; - View buffer(width, 1, Bgr24); - for (size_t row = 0; row < height; ++row) - { - const uint8_t * bgr = data + row*stride; - uint8_t * rgb = buffer.data; - for (size_t col = 0; col < width; ++col, bgr += 3, rgb += 3) - { - rgb[0] = bgr[2]; - rgb[1] = bgr[1]; - rgb[2] = bgr[0]; - } - ofs.write((const char*)(buffer.data), width*3); - } - } - else if (format == View::Bgra32) - { - ofs << "P6\n" << width << " " << height << "\n255\n"; - View buffer(width, 1, Bgr24); - for (size_t row = 0; row < height; ++row) - { - const uint8_t * bgra = data + row*stride; - uint8_t * rgb = buffer.data; - for (size_t col = 0; col < width; ++col, bgra += 4, rgb += 3) - { - rgb[0] = bgra[2]; - rgb[1] = bgra[1]; - rgb[2] = bgra[0]; - } - ofs.write((const char*)buffer.data, width * 3); 
- } - } - return true; - } + Clear(); + (Format&)format = format_; + *(uint8_t**)&data = SimdImageLoadFromMemory(src, size, (size_t*)&stride, (size_t*)&width, (size_t*)&height, (SimdPixelFormatType*)&format); + if (data) + _owner = true; else - return false; + (Format&)format = None; + return _owner; + } + + template class A> SIMD_INLINE bool View::Save(const std::string & path, SimdImageFileType type, int quality) const + { + return SimdImageSaveToFile(data, stride, width, height, (SimdPixelFormatType)format, type, quality, path.c_str()) == SimdTrue; } template class A> SIMD_INLINE void View::Clear() @@ -1279,6 +1198,16 @@ namespace Simd #endif } + template class A> SIMD_INLINE uint8_t* View::Release(size_t* size) + { + uint8_t* released = data; + if (size) + *size = DataSize(); + _owner = false; + Clear(); + return released; + } + // View utilities implementation: template class A, class T> const T & At(const View & view, size_t x, size_t y) diff --git a/CMakeLists.txt b/CMakeLists.txt index e61019f297..32b89aae0e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -679,6 +679,8 @@ if(NOT USE_OPENCV AND (NOT USE_PNG OR NOT USE_JPEG)) else() set(WITH_STBIMAGE OFF) endif() +# TODO: +set(WITH_STBIMAGE ON) VP_OPTION(WITH_CATCH2 "" "" "Use catch2" "" ON IF (VISP_CXX_STANDARD GREATER VISP_CXX_STANDARD_98)) diff --git a/modules/io/CMakeLists.txt b/modules/io/CMakeLists.txt index 959ee1c9b6..949ec58aef 100644 --- a/modules/io/CMakeLists.txt +++ b/modules/io/CMakeLists.txt @@ -57,11 +57,21 @@ if(USE_PNG) add_definitions(${PNG_DEFINITIONS}) endif() -if(WITH_STBIMAGE) +# TODO: +#if(WITH_STBIMAGE) # stb_image is private include_directories(${STBIMAGE_INCLUDE_DIRS}) +#endif() + +if(WITH_CATCH2) + # catch2 is private + include_directories(${CATCH2_INCLUDE_DIRS}) endif() +# simdlib is always enabled since it contains fallback code to plain C++ code +# Simd lib is private +include_directories(${SIMDLIB_INCLUDE_DIRS}) + # OpenCV if(USE_OPENCV) # On win32 since OpenCV 2.4.7 and on 
OSX with OpenCV 2.4.10 we cannot use OpenCV_LIBS to set ViSP 3rd party libraries. @@ -178,7 +188,7 @@ endif() vp_glob_module_sources() vp_module_include_directories(${opt_incs}) vp_create_module(${opt_libs}) -vp_add_tests(DEPENDS_ON visp_features) +vp_add_tests() vp_set_source_file_compile_flag(src/tools/vpParseArgv.cpp -Wno-strict-overflow) diff --git a/modules/io/include/visp3/io/vpImageIo.h b/modules/io/include/visp3/io/vpImageIo.h index d37cad48e3..11bd9aa766 100644 --- a/modules/io/include/visp3/io/vpImageIo.h +++ b/modules/io/include/visp3/io/vpImageIo.h @@ -144,6 +144,10 @@ class VISP_EXPORT vpImageIo static void readPNG(vpImage &I, const std::string &filename); static void readPNG(vpImage &I, const std::string &filename); + //TODO: + static void readSimdlib(vpImage &I, const std::string &filename); + static void readStb(vpImage &I, const std::string &filename); + static void writePFM(const vpImage &I, const std::string &filename); static void writePGM(const vpImage &I, const std::string &filename); @@ -158,5 +162,9 @@ class VISP_EXPORT vpImageIo static void writePNG(const vpImage &I, const std::string &filename); static void writePNG(const vpImage &I, const std::string &filename); + + //TODO: + static void writeSimdlib(vpImage &I, const std::string &filename); + static void writeStb(vpImage &I, const std::string &filename); }; #endif diff --git a/modules/io/src/image/vpImageIo.cpp b/modules/io/src/image/vpImageIo.cpp index ab290fa5f7..cc7799d158 100644 --- a/modules/io/src/image/vpImageIo.cpp +++ b/modules/io/src/image/vpImageIo.cpp @@ -62,6 +62,15 @@ #include #endif +//TODO: +#include +//TODO: +#define STB_IMAGE_IMPLEMENTATION +#include + +#define STB_IMAGE_WRITE_IMPLEMENTATION +#include + #if !defined(VISP_HAVE_OPENCV) #if !defined(VISP_HAVE_JPEG) || !defined(VISP_HAVE_PNG) @@ -2059,6 +2068,60 @@ void vpImageIo::readPNG(vpImage &I, const std::string &filename) fclose(file); } +//TODO: +void vpImageIo::readSimdlib(vpImage &I, const std::string &filename) 
+{ + size_t stride = 0, width = 0, height = 0; + SimdPixelFormatType format = SimdPixelFormatRgba32; + uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format); + const bool copyData = false; + I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData); +} + +void vpImageIo::readStb(vpImage &I, const std::string &filename) +{ + int width = 0, height = 0, channels = 0; + unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha); + if (image == NULL) { + throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); + } + I.init(reinterpret_cast(image), static_cast(height), static_cast(width), true); + stbi_image_free(image); +} + +inline bool ends_with(std::string const & value, std::string const & ending) +{ + if (ending.size() > value.size()) return false; + return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); +} + +void vpImageIo::writeSimdlib(vpImage &I, const std::string &filename) +{ + if (ends_with(filename, ".png")) { + SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFilePng, 90, filename.c_str()); + } else { + SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, 90, filename.c_str()); + } +} + +void vpImageIo::writeStb(vpImage &I, const std::string &filename) +{ + if (ends_with(filename, ".png")) { + const int stride_in_bytes = static_cast(4 * I.getWidth()); + int res = stbi_write_png(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, + reinterpret_cast(I.bitmap), stride_in_bytes); + if (res == 0) { + throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str())); + } + } else { + int res = stbi_write_jpg(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, + 
reinterpret_cast(I.bitmap), 90); + if (res == 0) { + throw(vpImageException(vpImageException::ioError, "JEPG write error")); + } + } +} + #elif defined(VISP_HAVE_OPENCV) /*! diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp new file mode 100644 index 0000000000..ce0d416b70 --- /dev/null +++ b/modules/io/test/perfImageLoadSave.cpp @@ -0,0 +1,461 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Benchmark color image conversion. 
+ * + *****************************************************************************/ + +#include + +#ifdef VISP_HAVE_CATCH2 +#define CATCH_CONFIG_ENABLE_BENCHMARKING +#define CATCH_CONFIG_RUNNER +#include + +#include +#include +#include + +static std::string ipath = vpIoTools::getViSPImagesDataPath(); +static std::string imagePathJpeg = vpIoTools::createFilePath(ipath, "Klimt/Klimt.jpeg"); +static std::string imagePathPng = vpIoTools::createFilePath(ipath, "Klimt/Klimt.png"); +static std::string imagePathPngBig = vpIoTools::createFilePath(ipath, "Klimt/test_image_resize.png"); +static int nThreads = 0; + +TEST_CASE("Benchmark Jpeg image loading", "[benchmark]") { + { + vpImage I; + + BENCHMARK("vpImageIo::read()") { + vpImageIo::read(I, imagePathJpeg); + return I; + }; + } + + { + vpImage I; + + BENCHMARK("vpImageIo::readSimdlib()") { + vpImageIo::readSimdlib(I, imagePathJpeg); + return I; + }; + } + + { + vpImage I; + + BENCHMARK("vpImageIo::readStb()") { + vpImageIo::readStb(I, imagePathJpeg); + return I; + }; + } +} + +TEST_CASE("Benchmark Png image loading", "[benchmark]") { + { + vpImage I; + + BENCHMARK("vpImageIo::read()") { + vpImageIo::read(I, imagePathPng); + return I; + }; + } + + { + vpImage I; + + BENCHMARK("vpImageIo::readSimdlib()") { + vpImageIo::readSimdlib(I, imagePathPng); + return I; + }; + } + + { + vpImage I; + + BENCHMARK("vpImageIo::readStb()") { + vpImageIo::readStb(I, imagePathPng); + return I; + }; + } +} + +TEST_CASE("Benchmark big Png image loading", "[benchmark]") { + { + vpImage I; + + BENCHMARK("vpImageIo::read()") { + vpImageIo::read(I, imagePathPngBig); + return I; + }; + } + + { + vpImage I; + + BENCHMARK("vpImageIo::readSimdlib()") { + vpImageIo::readSimdlib(I, imagePathPngBig); + return I; + }; + } + + { + vpImage I; + + BENCHMARK("vpImageIo::readStb()") { + vpImageIo::readStb(I, imagePathPngBig); + return I; + }; + } +} + +TEST_CASE("Benchmark Jpeg image saving", "[benchmark]") { + vpImage I; + vpImageIo::read(I, 
imagePathJpeg); + { + const std::string filename = "/tmp/Klimt_ViSP.jpg"; + + BENCHMARK("vpImageIo::write()") { + vpImageIo::write(I, filename); + return I; + }; + } + + { + const std::string filename = "/tmp/Klimt_Simd.jpg"; + + BENCHMARK("vpImageIo::writeSimdlib()") { + vpImageIo::writeSimdlib(I, filename); + return I; + }; + } + + { + const std::string filename = "/tmp/Klimt_stb.jpg"; + + BENCHMARK("vpImageIo::writeStb()") { + vpImageIo::writeStb(I, filename); + return I; + }; + } +} + +TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") { + vpImage I; + vpImageIo::read(I, imagePathPngBig); + { + const std::string filename = "/tmp/Big_images_ViSP.jpg"; + + BENCHMARK("vpImageIo::write()") { + vpImageIo::write(I, filename); + return I; + }; + } + +// { +// const std::string filename = "/tmp/Big_images_Simd.jpg"; + +// BENCHMARK("vpImageIo::writeSimdlib()") { +// vpImageIo::writeSimdlib(I, filename); +// return I; +// }; +// } + + { + const std::string filename = "/tmp/Big_images_stb.jpg"; + + BENCHMARK("vpImageIo::writeStb()") { + vpImageIo::writeStb(I, filename); + return I; + }; + } +} + +TEST_CASE("Benchmark Png image saving", "[benchmark]") { + vpImage I; + vpImageIo::read(I, imagePathPng); + { + const std::string filename = "/tmp/Klimt_ViSP.png"; + + BENCHMARK("vpImageIo::write()") { + vpImageIo::write(I, filename); + return I; + }; + } + + { + const std::string filename = "/tmp/Klimt_Simd.png"; + + BENCHMARK("vpImageIo::writeSimdlib()") { + vpImageIo::writeSimdlib(I, filename); + return I; + }; + } + + { + const std::string filename = "/tmp/Klimt_stb.png"; + + BENCHMARK("vpImageIo::writeStb()") { + vpImageIo::writeStb(I, filename); + return I; + }; + } +} + +TEST_CASE("Benchmark big Png image saving", "[benchmark]") { + vpImage I; + vpImageIo::read(I, imagePathPngBig); + { + const std::string filename = "/tmp/Big_images_ViSP.png"; + + BENCHMARK("vpImageIo::write()") { + vpImageIo::write(I, filename); + return I; + }; + } + + { + const std::string 
filename = "/tmp/Big_images_Simd.png"; + + BENCHMARK("vpImageIo::writeSimdlib()") { + vpImageIo::writeSimdlib(I, filename); + return I; + }; + } + + { + const std::string filename = "/tmp/Big_images_stb.png"; + + BENCHMARK("vpImageIo::writeStb()") { + vpImageIo::writeStb(I, filename); + return I; + }; + } +} + +//TEST_CASE("Benchmark bgr to grayscale (ViSP)", "[benchmark]") { +// vpImage I; +// vpImageIo::read(I, imagePathColor); + +// std::vector bgr; +// common_tools::RGBaToBGR(I, bgr); + +// vpImage I_gray(I.getHeight(), I.getWidth()); + +// BENCHMARK("Benchmark bgr to grayscale (ViSP)") { +// vpImageConvert::BGRToGrey(bgr.data(), +// I_gray.bitmap, +// I.getWidth(), I.getHeight(), +// false, nThreads); +// return I_gray; +// }; + +//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101) +// SECTION("OpenCV Mat type") +// { +// cv::Mat img; +// vpImageConvert::convert(I, img); + +// BENCHMARK("Benchmark bgr to grayscale (ViSP + OpenCV Mat type)") { +// vpImageConvert::convert(img, I_gray, false, nThreads); +// return I_gray; +// }; +// } +//#endif +//} +//#endif + +//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101) +//TEST_CASE("Benchmark bgr to grayscale (OpenCV)", "[benchmark]") { +// cv::Mat img = cv::imread(imagePathColor); +// cv::Mat img_gray(img.size(), CV_8UC1); + +// BENCHMARK("Benchmark bgr to grayscale (OpenCV)") { +// cv::cvtColor(img, img_gray, cv::COLOR_BGR2GRAY); +// return img_gray; +// }; +//} +//#endif + +//// C++11 to be able to do bgr.data() +//#if VISP_CXX_STANDARD >= VISP_CXX_STANDARD_11 +//TEST_CASE("Benchmark bgr to rgba (naive code)", "[benchmark]") { +// vpImage I; +// vpImageIo::read(I, imagePathColor); + +// std::vector bgr; +// common_tools::RGBaToBGR(I, bgr); + +// vpImage I_bench(I.getHeight(), I.getWidth()); +// BENCHMARK("Benchmark bgr to rgba (naive code)") { +// common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast(I_bench.bitmap), +// I.getWidth(), I.getHeight(), false); +// return I_bench; +// }; +//} + +//TEST_CASE("Benchmark bgr to rgba 
(ViSP)", "[benchmark]") { +// vpImage I; +// vpImageIo::read(I, imagePathColor); + +// std::vector bgr; +// common_tools::RGBaToBGR(I, bgr); + +// SECTION("Check BGR to RGBa conversion") +// { +// vpImage ref(I.getHeight(), I.getWidth()); +// common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast(ref.bitmap), +// I.getWidth(), I.getHeight(), false); +// vpImage rgba(I.getHeight(), I.getWidth()); +// vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast(rgba.bitmap), +// I.getWidth(), I.getHeight(), false); + +// CHECK((rgba == ref)); +// } + +// vpImage I_rgba(I.getHeight(), I.getWidth()); +// BENCHMARK("Benchmark bgr to rgba (ViSP)") { +// vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast(I_rgba.bitmap), +// I.getWidth(), I.getHeight(), false); +// return I_rgba; +// }; + +//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101) +// SECTION("OpenCV Mat type") +// { +// cv::Mat img; +// vpImageConvert::convert(I, img); + +// BENCHMARK("Benchmark bgr to rgba (ViSP + OpenCV Mat type)") { +// vpImageConvert::convert(img, I_rgba); +// return I_rgba; +// }; +// } +//#endif +//} + +//TEST_CASE("Benchmark bgra to rgba (naive code)", "[benchmark]") { +// vpImage I; +// vpImageIo::read(I, imagePathColor); + +// std::vector bgra; +// common_tools::RGBaToBGRa(I, bgra); + +// vpImage I_bench(I.getHeight(), I.getWidth()); +// BENCHMARK("Benchmark bgra to rgba (naive code)") { +// common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast(I_bench.bitmap), +// I.getWidth(), I.getHeight(), false); +// return I_bench; +// }; +//} + +//TEST_CASE("Benchmark bgra to rgba (ViSP)", "[benchmark]") { +// vpImage I; +// vpImageIo::read(I, imagePathColor); + +// std::vector bgra; +// common_tools::RGBaToBGRa(I, bgra); + +// SECTION("Check BGRa to RGBa conversion") +// { +// vpImage ref(I.getHeight(), I.getWidth()); +// common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast(ref.bitmap), +// I.getWidth(), I.getHeight(), false); +// vpImage rgba(I.getHeight(), I.getWidth()); +// 
vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast(rgba.bitmap), +// I.getWidth(), I.getHeight(), false); + +// CHECK((rgba == ref)); +// } +// vpImage I_rgba(I.getHeight(), I.getWidth()); +// BENCHMARK("Benchmark bgra to rgba (ViSP)") { +// vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast(I_rgba.bitmap), +// I.getWidth(), I.getHeight(), false); +// return I_rgba; +// }; +//} +//#endif + +int main(int argc, char *argv[]) +{ + Catch::Session session; // There must be exactly one instance + + bool runBenchmark = false; + // Build a new parser on top of Catch's + using namespace Catch::clara; + auto cli = session.cli() // Get Catch's composite command line parser + | Opt(runBenchmark) // bind variable to a new option, with a hint string + ["--benchmark"] // the option names it will respond to + ("run benchmark?") // description string for the help output + | Opt(imagePathJpeg, "imagePathColor") + ["--imagePathColor"] + ("Path to color image") + | Opt(imagePathPng, "imagePathColor") + ["--imagePathGray"] + ("Path to gray image") + | Opt(nThreads, "nThreads") + ["--nThreads"] + ("Number of threads"); + + // Now pass the new composite back to Catch so it uses that + session.cli(cli); + + // Let Catch (using Clara) parse the command line + session.applyCommandLine(argc, argv); + + if (runBenchmark) { +// vpImage I_color; +// vpImageIo::read(I_color, imagePathColor); +// std::cout << "imagePathColor:\n\t" << imagePathColor << "\n\t" << I_color.getWidth() << "x" << I_color.getHeight() << std::endl; + +// vpImage I_gray; +// vpImageIo::read(I_gray, imagePathGray); +// std::cout << "imagePathGray:\n\t" << imagePathGray << "\n\t" << I_gray.getWidth() << "x" << I_gray.getHeight() << std::endl; + std::cout << "nThreads: " << nThreads << " / available threads: " << std::thread::hardware_concurrency() << std::endl; + + int numFailed = session.run(); + + // numFailed is clamped to 255 as some unices only use the lower 8 bits. 
+ // This clamping has already been applied, so just return it here + // You can also do any post run clean-up here + return numFailed; + } + + return EXIT_SUCCESS; +} +#else +#include + +int main() +{ + return 0; +} +#endif From 9d2183b339c0bac50d2a1a3636b434b97ca4a959 Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Thu, 4 Nov 2021 14:06:19 +0100 Subject: [PATCH 11/18] Fix issue when writing big Jpeg images. --- 3rdparty/simdlib/Simd/SimdImageSaveJpeg.h | 5 +++-- modules/io/test/perfImageLoadSave.cpp | 14 +++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h index d54164f7d4..f3d5f4a96c 100644 --- a/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h +++ b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h @@ -35,9 +35,9 @@ namespace Simd { struct BitBuf { - static const uint32_t capacity = 1024; + static const uint32_t capacity = 2048; uint32_t size; - uint16_t data[1024][2]; + uint16_t data[capacity][2]; SIMD_INLINE BitBuf() : size(0) @@ -51,6 +51,7 @@ namespace Simd SIMD_INLINE bool Full(uint32_t tail = capacity / 2) const { + assert(size <= capacity); return size + tail >= capacity; } diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp index ce0d416b70..8efe2c759e 100644 --- a/modules/io/test/perfImageLoadSave.cpp +++ b/modules/io/test/perfImageLoadSave.cpp @@ -180,14 +180,14 @@ TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") { }; } -// { -// const std::string filename = "/tmp/Big_images_Simd.jpg"; + { + const std::string filename = "/tmp/Big_images_Simd.jpg"; -// BENCHMARK("vpImageIo::writeSimdlib()") { -// vpImageIo::writeSimdlib(I, filename); -// return I; -// }; -// } + BENCHMARK("vpImageIo::writeSimdlib()") { + vpImageIo::writeSimdlib(I, filename); + return I; + }; + } { const std::string filename = "/tmp/Big_images_stb.jpg"; From 28c034ed87d847da2aa2be2f930d7c5d0d923c46 Mon Sep 17 00:00:00 2001 From: Souriya 
Trinh Date: Wed, 17 Nov 2021 00:51:26 +0100 Subject: [PATCH 12/18] Experimental: wip code to try adding a backend system for image I/O. --- .../core/include/visp3/core/vpImageTools.h | 8 +- modules/io/include/visp3/io/vpImageIo.h | 34 +- .../io/src/image/private/vpImageIoBackend.h | 104 + .../io/src/image/private/vpImageIoLibjpeg.cpp | 345 +++ .../io/src/image/private/vpImageIoLibpng.cpp | 615 +++++ .../io/src/image/private/vpImageIoOpenCV.cpp | 205 ++ .../src/image/private/vpImageIoPortable.cpp | 569 +++++ .../io/src/image/private/vpImageIoSimd.cpp | 87 + modules/io/src/image/private/vpImageIoStb.cpp | 121 + modules/io/src/image/vpImageIo.cpp | 2112 ++--------------- modules/io/test/perfImageLoadSave.cpp | 171 +- 11 files changed, 2286 insertions(+), 2085 deletions(-) create mode 100644 modules/io/src/image/private/vpImageIoBackend.h create mode 100644 modules/io/src/image/private/vpImageIoLibjpeg.cpp create mode 100644 modules/io/src/image/private/vpImageIoLibpng.cpp create mode 100644 modules/io/src/image/private/vpImageIoOpenCV.cpp create mode 100644 modules/io/src/image/private/vpImageIoPortable.cpp create mode 100644 modules/io/src/image/private/vpImageIoSimd.cpp create mode 100644 modules/io/src/image/private/vpImageIoStb.cpp diff --git a/modules/core/include/visp3/core/vpImageTools.h b/modules/core/include/visp3/core/vpImageTools.h index 4dbf1a809a..d367aa5290 100644 --- a/modules/core/include/visp3/core/vpImageTools.h +++ b/modules/core/include/visp3/core/vpImageTools.h @@ -1489,19 +1489,19 @@ void vpImageTools::warpLinear(const vpImage &src, const vpMatrix &T, vpIma const Type val01 = src[y_][x_ + 1]; const Type val10 = src[y_ + 1][x_]; const Type val11 = src[y_ + 1][x_ + 1]; - const float col0 = lerp(val00, val01, s); - const float col1 = lerp(val10, val11, s); + const float col0 = lerp(static_cast(val00), static_cast(val01), s); + const float col1 = lerp(static_cast(val10), static_cast(val11), s); const float interp = lerp(col0, col1, t); dst[i][j] = 
vpMath::saturate(interp); } else if (y_ < static_cast(src.getHeight()) - 1) { const Type val00 = src[y_][x_]; const Type val10 = src[y_ + 1][x_]; - const float interp = lerp(val00, val10, t); + const float interp = lerp(static_cast(val00), static_cast(val10), t); dst[i][j] = vpMath::saturate(interp); } else if (x_ < static_cast(src.getWidth()) - 1) { const Type val00 = src[y_][x_]; const Type val01 = src[y_][x_ + 1]; - const float interp = lerp(val00, val01, s); + const float interp = lerp(static_cast(val00), static_cast(val01), s); dst[i][j] = vpMath::saturate(interp); } else { dst[i][j] = src[y_][x_]; diff --git a/modules/io/include/visp3/io/vpImageIo.h b/modules/io/include/visp3/io/vpImageIo.h index 11bd9aa766..fa395e3882 100644 --- a/modules/io/include/visp3/io/vpImageIo.h +++ b/modules/io/include/visp3/io/vpImageIo.h @@ -124,6 +124,16 @@ class VISP_EXPORT vpImageIo static std::string getExtension(const std::string &filename); public: + //TODO: + // Image IO backend for only jpeg and png formats + enum vpImageIoBackendType { + IO_DEFAULT_BACKEND, + IO_LIB_BACKEND, + IO_OPENCV_BACKEND, + IO_SIMDLIB_BACKEND, + IO_STB_IMAGE_BACKEND + }; + static void read(vpImage &I, const std::string &filename); static void read(vpImage &I, const std::string &filename); @@ -138,15 +148,11 @@ class VISP_EXPORT vpImageIo static void readPPM(vpImage &I, const std::string &filename); static void readPPM(vpImage &I, const std::string &filename); - static void readJPEG(vpImage &I, const std::string &filename); - static void readJPEG(vpImage &I, const std::string &filename); - - static void readPNG(vpImage &I, const std::string &filename); - static void readPNG(vpImage &I, const std::string &filename); + static void readJPEG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); + static void readJPEG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); - //TODO: - static void readSimdlib(vpImage &I, 
const std::string &filename); - static void readStb(vpImage &I, const std::string &filename); + static void readPNG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); + static void readPNG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); static void writePFM(const vpImage &I, const std::string &filename); @@ -157,14 +163,10 @@ class VISP_EXPORT vpImageIo static void writePPM(const vpImage &I, const std::string &filename); static void writePPM(const vpImage &I, const std::string &filename); - static void writeJPEG(const vpImage &I, const std::string &filename); - static void writeJPEG(const vpImage &I, const std::string &filename); - - static void writePNG(const vpImage &I, const std::string &filename); - static void writePNG(const vpImage &I, const std::string &filename); + static void writeJPEG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); + static void writeJPEG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); - //TODO: - static void writeSimdlib(vpImage &I, const std::string &filename); - static void writeStb(vpImage &I, const std::string &filename); + static void writePNG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); + static void writePNG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); }; #endif diff --git a/modules/io/src/image/private/vpImageIoBackend.h b/modules/io/src/image/private/vpImageIoBackend.h new file mode 100644 index 0000000000..e1b434c030 --- /dev/null +++ b/modules/io/src/image/private/vpImageIoBackend.h @@ -0,0 +1,104 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. 
+ * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Read/write images. + * + * Authors: + * Eric Marchand + * + *****************************************************************************/ + +/*! 
+ \file vpImageIo.h + \brief Read/write images +*/ + +#ifndef vpIMAGEIOBACKEND_H +#define vpIMAGEIOBACKEND_H + +#include + + +// +void vp_writePFM(const vpImage &I, const std::string &filename); +void vp_writePGM(const vpImage &I, const std::string &filename); +void vp_writePGM(const vpImage &I, const std::string &filename); +void vp_writePGM(const vpImage &I, const std::string &filename); +void vp_readPFM(vpImage &I, const std::string &filename); +void vp_readPGM(vpImage &I, const std::string &filename); +void vp_readPGM(vpImage &I, const std::string &filename); +void vp_readPPM(vpImage &I, const std::string &filename); +void vp_readPPM(vpImage &I, const std::string &filename); +void vp_writePPM(const vpImage &I, const std::string &filename); +void vp_writePPM(const vpImage &I, const std::string &filename); + +// +void readJPEGLibjpeg(vpImage &I, const std::string &filename); +void readJPEGLibjpeg(vpImage &I, const std::string &filename); + +void writeJPEGLibjpeg(const vpImage &I, const std::string &filename); +void writeJPEGLibjpeg(const vpImage &I, const std::string &filename); + +// +void readPNGLibpng(vpImage &I, const std::string &filename); +void readPNGLibpng(vpImage &I, const std::string &filename); + +void writePNGLibpng(const vpImage &I, const std::string &filename); +void writePNGLibpng(const vpImage &I, const std::string &filename); + +// +void readOpenCV(vpImage &I, const std::string &filename); +void readOpenCV(vpImage &I, const std::string &filename); + +void writeOpenCV(const vpImage &I, const std::string &filename); +void writeOpenCV(const vpImage &I, const std::string &filename); + +// +void readSimdlib(vpImage &I, const std::string &filename); +void readSimdlib(vpImage &I, const std::string &filename); + +void writeJPEGSimdlib(const vpImage &I, const std::string &filename); +void writeJPEGSimdlib(const vpImage &I, const std::string &filename); + +void writePNGSimdlib(const vpImage &I, const std::string &filename); +void writePNGSimdlib(const 
vpImage &I, const std::string &filename); + +// +void readStb(vpImage &I, const std::string &filename); +void readStb(vpImage &I, const std::string &filename); + +void writeJPEGStb(const vpImage &I, const std::string &filename); +void writeJPEGStb(const vpImage &I, const std::string &filename); + +void writePNGStb(const vpImage &I, const std::string &filename); +void writePNGStb(const vpImage &I, const std::string &filename); + +#endif diff --git a/modules/io/src/image/private/vpImageIoLibjpeg.cpp b/modules/io/src/image/private/vpImageIoLibjpeg.cpp new file mode 100644 index 0000000000..99debb3021 --- /dev/null +++ b/modules/io/src/image/private/vpImageIoLibjpeg.cpp @@ -0,0 +1,345 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Read/write images. 
+ * + * Authors: + * Eric Marchand + * + *****************************************************************************/ + +/*! + \file vpImageIo.cpp + \brief Read/write images +*/ + +#include "vpImageIoBackend.h" +#include + +//TODO: +#if defined(_WIN32) +// Include WinSock2.h before windows.h to ensure that winsock.h is not +// included by windows.h since winsock.h and winsock2.h are incompatible +#include +#include +#endif + +#if defined(VISP_HAVE_JPEG) +#include +#include +#endif + + +//-------------------------------------------------------------------------- +// JPEG +//-------------------------------------------------------------------------- + +#if defined(VISP_HAVE_JPEG) + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a JPEG file. + + \param I : Image to save as a JPEG file. + \param filename : Name of the file containing the image. +*/ +void writeJPEGLibjpeg(const vpImage &I, const std::string &filename) +{ + struct jpeg_compress_struct cinfo; + struct jpeg_error_mgr jerr; + FILE *file; + + cinfo.err = jpeg_std_error(&jerr); + jpeg_create_compress(&cinfo); + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty")); + } + + file = fopen(filename.c_str(), "wb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str())); + } + + unsigned int width = I.getWidth(); + unsigned int height = I.getHeight(); + + jpeg_stdio_dest(&cinfo, file); + + cinfo.image_width = width; + cinfo.image_height = height; + cinfo.input_components = 1; + cinfo.in_color_space = JCS_GRAYSCALE; + jpeg_set_defaults(&cinfo); + + jpeg_start_compress(&cinfo, TRUE); + + unsigned char *line; + line = new unsigned char[width]; + unsigned char *input = (unsigned char *)I.bitmap; + while (cinfo.next_scanline < cinfo.image_height) { + for (unsigned int i = 0; i < width; 
i++) { + line[i] = *(input); + input++; + } + jpeg_write_scanlines(&cinfo, &line, 1); + } + + jpeg_finish_compress(&cinfo); + jpeg_destroy_compress(&cinfo); + delete[] line; + fclose(file); +} + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a JPEG file. + + \param I : Image to save as a JPEG file. + \param filename : Name of the file containing the image. +*/ +void writeJPEGLibjpeg(const vpImage &I, const std::string &filename) +{ + struct jpeg_compress_struct cinfo; + struct jpeg_error_mgr jerr; + FILE *file; + + cinfo.err = jpeg_std_error(&jerr); + jpeg_create_compress(&cinfo); + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty")); + } + + file = fopen(filename.c_str(), "wb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str())); + } + + unsigned int width = I.getWidth(); + unsigned int height = I.getHeight(); + + jpeg_stdio_dest(&cinfo, file); + + cinfo.image_width = width; + cinfo.image_height = height; + cinfo.input_components = 3; + cinfo.in_color_space = JCS_RGB; + jpeg_set_defaults(&cinfo); + + jpeg_start_compress(&cinfo, TRUE); + + unsigned char *line; + line = new unsigned char[3 * width]; + unsigned char *input = (unsigned char *)I.bitmap; + while (cinfo.next_scanline < cinfo.image_height) { + for (unsigned int i = 0; i < width; i++) { + line[i * 3] = *(input); + input++; + line[i * 3 + 1] = *(input); + input++; + line[i * 3 + 2] = *(input); + input++; + input++; + } + jpeg_write_scanlines(&cinfo, &line, 1); + } + + jpeg_finish_compress(&cinfo); + jpeg_destroy_compress(&cinfo); + delete[] line; + fclose(file); +} + +/*! + Read the contents of the JPEG file, allocate memory + for the corresponding gray level image, if necessary convert the data in + gray level, and set the bitmap whith the gray level data. 
That means that + the image \e I is a "black and white" rendering of the original image in \e + filename, as in a black and white photograph. If necessary, the quantization + formula used is \f$0,299 r + 0,587 g + 0,114 b\f$. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + \param I : Image to set with the \e filename content. + \param filename : Name of the file containing the image. +*/ +void readJPEGLibjpeg(vpImage &I, const std::string &filename) +{ + struct jpeg_decompress_struct cinfo; + struct jpeg_error_mgr jerr; + FILE *file; + + cinfo.err = jpeg_std_error(&jerr); + jpeg_create_decompress(&cinfo); + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty")); + } + + file = fopen(filename.c_str(), "rb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str())); + } + + jpeg_stdio_src(&cinfo, file); + jpeg_read_header(&cinfo, TRUE); + + unsigned int width = cinfo.image_width; + unsigned int height = cinfo.image_height; + + if ((width != I.getWidth()) || (height != I.getHeight())) + I.resize(height, width); + + jpeg_start_decompress(&cinfo); + + unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components); + JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1); + + if (cinfo.out_color_space == JCS_RGB) { + vpImage Ic(height, width); + unsigned char *output = (unsigned char *)Ic.bitmap; + while (cinfo.output_scanline < cinfo.output_height) { + jpeg_read_scanlines(&cinfo, buffer, 1); + for (unsigned int i = 0; i < width; i++) { + *(output++) = buffer[0][i * 3]; + *(output++) = buffer[0][i * 3 + 1]; + *(output++) = buffer[0][i * 3 + 2]; + *(output++) = vpRGBa::alpha_default; + } + } + vpImageConvert::convert(Ic, I); + } + + else if 
(cinfo.out_color_space == JCS_GRAYSCALE) { + while (cinfo.output_scanline < cinfo.output_height) { + unsigned int row = cinfo.output_scanline; + jpeg_read_scanlines(&cinfo, buffer, 1); + memcpy(I[row], buffer[0], rowbytes); + } + } + + jpeg_finish_decompress(&cinfo); + jpeg_destroy_decompress(&cinfo); + fclose(file); +} + +/*! + Read a JPEG file and initialize a scalar image. + + Read the contents of the JPEG file, allocate + memory for the corresponding image, and set + the bitmap whith the content of + the file. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + If the file corresponds to a grayscaled image, a conversion is done to deal + with \e I which is a color image. + + \param I : Color image to set with the \e filename content. + \param filename : Name of the file containing the image. +*/ +void readJPEGLibjpeg(vpImage &I, const std::string &filename) +{ + struct jpeg_decompress_struct cinfo; + struct jpeg_error_mgr jerr; + FILE *file; + + cinfo.err = jpeg_std_error(&jerr); + jpeg_create_decompress(&cinfo); + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty")); + } + + file = fopen(filename.c_str(), "rb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str())); + } + + jpeg_stdio_src(&cinfo, file); + + jpeg_read_header(&cinfo, TRUE); + + unsigned int width = cinfo.image_width; + unsigned int height = cinfo.image_height; + + if ((width != I.getWidth()) || (height != I.getHeight())) + I.resize(height, width); + + jpeg_start_decompress(&cinfo); + + unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components); + JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1); + + if (cinfo.out_color_space == JCS_RGB) { + unsigned char 
*output = (unsigned char *)I.bitmap; + while (cinfo.output_scanline < cinfo.output_height) { + jpeg_read_scanlines(&cinfo, buffer, 1); + for (unsigned int i = 0; i < width; i++) { + *(output++) = buffer[0][i * 3]; + *(output++) = buffer[0][i * 3 + 1]; + *(output++) = buffer[0][i * 3 + 2]; + *(output++) = vpRGBa::alpha_default; + } + } + } + + else if (cinfo.out_color_space == JCS_GRAYSCALE) { + vpImage Ig(height, width); + + while (cinfo.output_scanline < cinfo.output_height) { + unsigned int row = cinfo.output_scanline; + jpeg_read_scanlines(&cinfo, buffer, 1); + memcpy(Ig[row], buffer[0], rowbytes); + } + + vpImageConvert::convert(Ig, I); + } + + jpeg_finish_decompress(&cinfo); + jpeg_destroy_decompress(&cinfo); + fclose(file); +} +#endif diff --git a/modules/io/src/image/private/vpImageIoLibpng.cpp b/modules/io/src/image/private/vpImageIoLibpng.cpp new file mode 100644 index 0000000000..e350e4260b --- /dev/null +++ b/modules/io/src/image/private/vpImageIoLibpng.cpp @@ -0,0 +1,615 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information. 
+ * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Read/write images. + * + * Authors: + * Eric Marchand + * + *****************************************************************************/ + +/*! + \file vpImageIo.cpp + \brief Read/write images +*/ + +#include "vpImageIoBackend.h" +#include + +//TODO: +#if defined(_WIN32) +// Include WinSock2.h before windows.h to ensure that winsock.h is not +// included by windows.h since winsock.h and winsock2.h are incompatible +#include +#include +#endif + +#if defined(VISP_HAVE_PNG) +#include +#endif + + +//-------------------------------------------------------------------------- +// PNG +//-------------------------------------------------------------------------- + +#if defined(VISP_HAVE_PNG) + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a PNG file. + + \param I : Image to save as a PNG file. + \param filename : Name of the file containing the image. 
+*/ +void writePNGLibpng(const vpImage &I, const std::string &filename) +{ + FILE *file; + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty")); + } + + file = fopen(filename.c_str(), "wb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str())); + } + + /* create a png info struct */ + png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); + if (!png_ptr) { + fclose(file); + vpERROR_TRACE("Error during png_create_write_struct()\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + png_infop info_ptr = png_create_info_struct(png_ptr); + if (!info_ptr) { + fclose(file); + png_destroy_write_struct(&png_ptr, NULL); + vpERROR_TRACE("Error during png_create_info_struct()\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + /* initialize the setjmp for returning properly after a libpng error occured + */ + if (setjmp(png_jmpbuf(png_ptr))) { + fclose(file); + png_destroy_write_struct(&png_ptr, &info_ptr); + vpERROR_TRACE("Error during init_io\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + /* setup libpng for using standard C fwrite() function with our FILE pointer + */ + png_init_io(png_ptr, file); + + unsigned int width = I.getWidth(); + unsigned int height = I.getHeight(); + int bit_depth = 8; + int color_type = PNG_COLOR_TYPE_GRAY; + /* set some useful information from header */ + + if (setjmp(png_jmpbuf(png_ptr))) { + fclose(file); + png_destroy_write_struct(&png_ptr, &info_ptr); + vpERROR_TRACE("Error during write header\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE, + PNG_FILTER_TYPE_BASE); + + png_write_info(png_ptr, 
info_ptr); + + png_bytep *row_ptrs = new png_bytep[height]; + for (unsigned int i = 0; i < height; i++) + row_ptrs[i] = new png_byte[width]; + + unsigned char *input = (unsigned char *)I.bitmap; + + for (unsigned int i = 0; i < height; i++) { + png_byte *row = row_ptrs[i]; + for (unsigned int j = 0; j < width; j++) { + row[j] = *(input); + input++; + } + } + + png_write_image(png_ptr, row_ptrs); + + png_write_end(png_ptr, NULL); + + for (unsigned int j = 0; j < height; j++) + delete[] row_ptrs[j]; + + delete[] row_ptrs; + + png_destroy_write_struct(&png_ptr, &info_ptr); + + fclose(file); +} + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a PNG file. + + \param I : Image to save as a PNG file. + \param filename : Name of the file containing the image. +*/ +void writePNGLibpng(const vpImage &I, const std::string &filename) +{ + FILE *file; + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty")); + } + + file = fopen(filename.c_str(), "wb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str())); + } + + /* create a png info struct */ + png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); + if (!png_ptr) { + fclose(file); + vpERROR_TRACE("Error during png_create_write_struct()\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + png_infop info_ptr = png_create_info_struct(png_ptr); + if (!info_ptr) { + fclose(file); + png_destroy_write_struct(&png_ptr, NULL); + vpERROR_TRACE("Error during png_create_info_struct()\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + /* initialize the setjmp for returning properly after a libpng error occured + */ + if (setjmp(png_jmpbuf(png_ptr))) { + fclose(file); + png_destroy_write_struct(&png_ptr, 
&info_ptr); + vpERROR_TRACE("Error during init_io\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + /* setup libpng for using standard C fwrite() function with our FILE pointer + */ + png_init_io(png_ptr, file); + + unsigned int width = I.getWidth(); + unsigned int height = I.getHeight(); + int bit_depth = 8; + int color_type = PNG_COLOR_TYPE_RGB; + /* set some useful information from header */ + + if (setjmp(png_jmpbuf(png_ptr))) { + fclose(file); + png_destroy_write_struct(&png_ptr, &info_ptr); + vpERROR_TRACE("Error during write header\n"); + throw(vpImageException(vpImageException::ioError, "PNG write error")); + } + + png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE, + PNG_FILTER_TYPE_BASE); + + png_write_info(png_ptr, info_ptr); + + png_bytep *row_ptrs = new png_bytep[height]; + for (unsigned int i = 0; i < height; i++) + row_ptrs[i] = new png_byte[3 * width]; + + unsigned char *input = (unsigned char *)I.bitmap; + + for (unsigned int i = 0; i < height; i++) { + png_byte *row = row_ptrs[i]; + for (unsigned int j = 0; j < width; j++) { + row[3 * j] = *(input); + input++; + row[3 * j + 1] = *(input); + input++; + row[3 * j + 2] = *(input); + input++; + input++; + } + } + + png_write_image(png_ptr, row_ptrs); + + png_write_end(png_ptr, NULL); + + for (unsigned int j = 0; j < height; j++) + delete[] row_ptrs[j]; + + delete[] row_ptrs; + + png_destroy_write_struct(&png_ptr, &info_ptr); + + fclose(file); +} + +/*! + Read the contents of the PNG file, allocate memory + for the corresponding gray level image, if necessary convert the data in + gray level, and set the bitmap whith the gray level data. That means that + the image \e I is a "black and white" rendering of the original image in \e + filename, as in a black and white photograph. If necessary, the quantization + formula used is \f$0,299 r + 0,587 g + 0,114 b\f$. 
+ + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + \param I : Image to set with the \e filename content. + \param filename : Name of the file containing the image. +*/ +void readPNGLibpng(vpImage &I, const std::string &filename) +{ + FILE *file; + png_byte magic[8]; + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty")); + } + + file = fopen(filename.c_str(), "rb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str())); + } + + /* read magic number */ + if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) { + fclose(file); + throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str())); + } + + /* check for valid magic number */ + if (png_sig_cmp(magic, 0, sizeof(magic))) { + fclose(file); + throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image", + filename.c_str())); + } + + /* create a png read struct */ + // printf("version %s\n", PNG_LIBPNG_VER_STRING); + png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); + if (png_ptr == NULL) { + fprintf(stderr, "error: can't create a png read structure!\n"); + fclose(file); + throw(vpImageException(vpImageException::ioError, "error reading png file")); + } + + /* create a png info struct */ + png_infop info_ptr = png_create_info_struct(png_ptr); + if (info_ptr == NULL) { + fprintf(stderr, "error: can't create a png info structure!\n"); + fclose(file); + png_destroy_read_struct(&png_ptr, NULL, NULL); + throw(vpImageException(vpImageException::ioError, "error reading png file")); + } + + /* initialize the setjmp for returning properly after a libpng error occured + */ + if (setjmp(png_jmpbuf(png_ptr))) { + fclose(file); + 
png_destroy_read_struct(&png_ptr, &info_ptr, NULL); + vpERROR_TRACE("Error during init io\n"); + throw(vpImageException(vpImageException::ioError, "PNG read error")); + } + + /* setup libpng for using standard C fread() function with our FILE pointer + */ + png_init_io(png_ptr, file); + + /* tell libpng that we have already read the magic number */ + png_set_sig_bytes(png_ptr, sizeof(magic)); + + /* read png info */ + png_read_info(png_ptr, info_ptr); + + unsigned int width = png_get_image_width(png_ptr, info_ptr); + unsigned int height = png_get_image_height(png_ptr, info_ptr); + + unsigned int bit_depth, channels, color_type; + /* get some useful information from header */ + bit_depth = png_get_bit_depth(png_ptr, info_ptr); + channels = png_get_channels(png_ptr, info_ptr); + color_type = png_get_color_type(png_ptr, info_ptr); + + /* convert index color images to RGB images */ + if (color_type == PNG_COLOR_TYPE_PALETTE) + png_set_palette_to_rgb(png_ptr); + + /* convert 1-2-4 bits grayscale images to 8 bits grayscale. 
*/ + if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) + png_set_expand(png_ptr); + + // if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS)) + // png_set_tRNS_to_alpha (png_ptr); + + if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA) + png_set_strip_alpha(png_ptr); + + if (bit_depth == 16) + png_set_strip_16(png_ptr); + else if (bit_depth < 8) + png_set_packing(png_ptr); + + /* update info structure to apply transformations */ + png_read_update_info(png_ptr, info_ptr); + + channels = png_get_channels(png_ptr, info_ptr); + + if ((width != I.getWidth()) || (height != I.getHeight())) + I.resize(height, width); + + png_bytep *rowPtrs = new png_bytep[height]; + + unsigned int stride = png_get_rowbytes(png_ptr, info_ptr); + unsigned char *data = new unsigned char[stride * height]; + + for (unsigned int i = 0; i < height; i++) + rowPtrs[i] = (png_bytep)data + (i * stride); + + png_read_image(png_ptr, rowPtrs); + + vpImage Ic(height, width); + unsigned char *output; + + switch (channels) { + case 1: + output = (unsigned char *)I.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i]; + } + break; + + case 2: + output = (unsigned char *)I.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i * 2]; + } + break; + + case 3: + output = (unsigned char *)Ic.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i * 3]; + *(output++) = data[i * 3 + 1]; + *(output++) = data[i * 3 + 2]; + *(output++) = vpRGBa::alpha_default; + } + vpImageConvert::convert(Ic, I); + break; + + case 4: + output = (unsigned char *)Ic.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i * 4]; + *(output++) = data[i * 4 + 1]; + *(output++) = data[i * 4 + 2]; + *(output++) = data[i * 4 + 3]; + } + vpImageConvert::convert(Ic, I); + break; + } + + delete[](png_bytep) rowPtrs; + delete[] data; + png_read_end(png_ptr, NULL); + png_destroy_read_struct(&png_ptr, &info_ptr, NULL); + 
fclose(file); +} + +/*! + Read a PNG file and initialize a scalar image. + + Read the contents of the PNG file, allocate + memory for the corresponding image, and set + the bitmap whith the content of + the file. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + If the file corresponds to a grayscaled image, a conversion is done to deal + with \e I which is a color image. + + \param I : Color image to set with the \e filename content. + \param filename : Name of the file containing the image. +*/ +void readPNGLibpng(vpImage &I, const std::string &filename) +{ + FILE *file; + png_byte magic[8]; + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty")); + } + + file = fopen(filename.c_str(), "rb"); + + if (file == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str())); + } + + /* read magic number */ + if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) { + fclose(file); + throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str())); + } + + /* check for valid magic number */ + if (png_sig_cmp(magic, 0, sizeof(magic))) { + fclose(file); + throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image", + filename.c_str())); + } + + /* create a png read struct */ + png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); + if (!png_ptr) { + fclose(file); + vpERROR_TRACE("Error during png_create_read_struct()\n"); + throw(vpImageException(vpImageException::ioError, "PNG read error")); + } + + /* create a png info struct */ + png_infop info_ptr = png_create_info_struct(png_ptr); + if (!info_ptr) { + fclose(file); + png_destroy_read_struct(&png_ptr, NULL, NULL); + vpERROR_TRACE("Error during 
png_create_info_struct()\n"); + throw(vpImageException(vpImageException::ioError, "PNG read error")); + } + + /* initialize the setjmp for returning properly after a libpng error occured + */ + if (setjmp(png_jmpbuf(png_ptr))) { + fclose(file); + png_destroy_read_struct(&png_ptr, &info_ptr, NULL); + vpERROR_TRACE("Error during init io\n"); + throw(vpImageException(vpImageException::ioError, "PNG read error")); + } + + /* setup libpng for using standard C fread() function with our FILE pointer + */ + png_init_io(png_ptr, file); + + /* tell libpng that we have already read the magic number */ + png_set_sig_bytes(png_ptr, sizeof(magic)); + + /* read png info */ + png_read_info(png_ptr, info_ptr); + + unsigned int width = png_get_image_width(png_ptr, info_ptr); + unsigned int height = png_get_image_height(png_ptr, info_ptr); + + unsigned int bit_depth, channels, color_type; + /* get some useful information from header */ + bit_depth = png_get_bit_depth(png_ptr, info_ptr); + channels = png_get_channels(png_ptr, info_ptr); + color_type = png_get_color_type(png_ptr, info_ptr); + + /* convert index color images to RGB images */ + if (color_type == PNG_COLOR_TYPE_PALETTE) + png_set_palette_to_rgb(png_ptr); + + /* convert 1-2-4 bits grayscale images to 8 bits grayscale. 
*/ + if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) + png_set_expand(png_ptr); + + // if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS)) + // png_set_tRNS_to_alpha (png_ptr); + + if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA) + png_set_strip_alpha(png_ptr); + + if (bit_depth == 16) + png_set_strip_16(png_ptr); + else if (bit_depth < 8) + png_set_packing(png_ptr); + + /* update info structure to apply transformations */ + png_read_update_info(png_ptr, info_ptr); + + channels = png_get_channels(png_ptr, info_ptr); + + if ((width != I.getWidth()) || (height != I.getHeight())) + I.resize(height, width); + + png_bytep *rowPtrs = new png_bytep[height]; + + unsigned int stride = png_get_rowbytes(png_ptr, info_ptr); + unsigned char *data = new unsigned char[stride * height]; + + for (unsigned int i = 0; i < height; i++) + rowPtrs[i] = (png_bytep)data + (i * stride); + + png_read_image(png_ptr, rowPtrs); + + vpImage Ig(height, width); + unsigned char *output; + + switch (channels) { + case 1: + output = (unsigned char *)Ig.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i]; + } + vpImageConvert::convert(Ig, I); + break; + + case 2: + output = (unsigned char *)Ig.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i * 2]; + } + vpImageConvert::convert(Ig, I); + break; + + case 3: + output = (unsigned char *)I.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i * 3]; + *(output++) = data[i * 3 + 1]; + *(output++) = data[i * 3 + 2]; + *(output++) = vpRGBa::alpha_default; + } + break; + + case 4: + output = (unsigned char *)I.bitmap; + for (unsigned int i = 0; i < width * height; i++) { + *(output++) = data[i * 4]; + *(output++) = data[i * 4 + 1]; + *(output++) = data[i * 4 + 2]; + *(output++) = data[i * 4 + 3]; + } + break; + } + + delete[](png_bytep) rowPtrs; + delete[] data; + png_read_end(png_ptr, NULL); + png_destroy_read_struct(&png_ptr, &info_ptr, NULL); + 
fclose(file); +} +#endif diff --git a/modules/io/src/image/private/vpImageIoOpenCV.cpp b/modules/io/src/image/private/vpImageIoOpenCV.cpp new file mode 100644 index 0000000000..93b6a1ca1d --- /dev/null +++ b/modules/io/src/image/private/vpImageIoOpenCV.cpp @@ -0,0 +1,205 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Read/write images. + * + * Authors: + * Eric Marchand + * + *****************************************************************************/ + +/*! 
+ \file vpImageIo.cpp + \brief Read/write images +*/ + +#include "vpImageIoBackend.h" + +//TODO: +#ifdef VISP_HAVE_OPENCV +#if (VISP_HAVE_OPENCV_VERSION >= 0x030000) // Require opencv >= 3.0.0 +# include +#elif (VISP_HAVE_OPENCV_VERSION >= 0x020408) // Require opencv >= 2.4.8 +# include +# include +# include +#elif (VISP_HAVE_OPENCV_VERSION >= 0x020101) // Require opencv >= 2.1.1 +# include +# include +# include +# include +#else +# include +#endif +#endif + +#include + + +#if defined(VISP_HAVE_OPENCV) + +/*! + Read the contents of the JPEG file, allocate memory + for the corresponding gray level image, if necessary convert the data in + gray level, and set the bitmap whith the gray level data. That means that + the image \e I is a "black and white" rendering of the original image in \e + filename, as in a black and white photograph. If necessary, the quantization + formula used is \f$0,299 r + 0,587 g + 0,114 b\f$. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + If EXIF information is embedded in the image file, the EXIF orientation is ignored. + + \param I : Image to set with the \e filename content. + \param filename : Name of the file containing the image. 
+ +*/ +void readOpenCV(vpImage &I, const std::string &filename) +{ +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 +#if VISP_HAVE_OPENCV_VERSION >= 0x030200 + int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION; +#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 + int flags = cv::IMREAD_GRAYSCALE; +#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 + int flags = CV_LOAD_IMAGE_GRAYSCALE; +#endif + cv::Mat Ip = cv::imread(filename.c_str(), flags); + if (!Ip.empty()) + vpImageConvert::convert(Ip, I); + else + throw(vpImageException(vpImageException::ioError, "Can't read the image")); +#else + IplImage *Ip = NULL; + Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE); + if (Ip != NULL) + vpImageConvert::convert(Ip, I); + else + throw(vpImageException(vpImageException::ioError, "Can't read the image")); + cvReleaseImage(&Ip); +#endif +} + +/*! + Read a JPEG file and initialize a scalar image. + + Read the contents of the JPEG file, allocate + memory for the corresponding image, and set + the bitmap whith the content of + the file. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + If the file corresponds to a grayscaled image, a conversion is done to deal + with \e I which is a color image. + + If EXIF information is embedded in the image file, the EXIF orientation is ignored. + + \param I : Color image to set with the \e filename content. + \param filename : Name of the file containing the image. 
+*/ +void readOpenCV(vpImage &I, const std::string &filename) +{ +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 +#if VISP_HAVE_OPENCV_VERSION >= 0x030200 + int flags = cv::IMREAD_COLOR | cv::IMREAD_IGNORE_ORIENTATION; // color overload: decode as color, same as the legacy branch below +#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 + int flags = cv::IMREAD_COLOR; +#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 + int flags = CV_LOAD_IMAGE_COLOR; +#endif + cv::Mat Ip = cv::imread(filename.c_str(), flags); + if (!Ip.empty()) + vpImageConvert::convert(Ip, I); + else + throw(vpImageException(vpImageException::ioError, "Can't read the image")); +#else + IplImage *Ip = NULL; + Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_COLOR); + if (Ip != NULL) + vpImageConvert::convert(Ip, I); + else + throw(vpImageException(vpImageException::ioError, "Can't read the image")); + cvReleaseImage(&Ip); +#endif +} + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a JPEG file. + + \param I : Image to save as a JPEG file. + \param filename : Name of the file containing the image. +*/ +void writeOpenCV(const vpImage &I, const std::string &filename) +{ +#if (VISP_HAVE_OPENCV_VERSION >= 0x020408) + cv::Mat Ip; + vpImageConvert::convert(I, Ip); + cv::imwrite(filename.c_str(), Ip); +#else + IplImage *Ip = NULL; + vpImageConvert::convert(I, Ip); + + cvSaveImage(filename.c_str(), Ip); + + cvReleaseImage(&Ip); +#endif +} + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a JPEG file. + + \param I : Image to save as a JPEG file. + \param filename : Name of the file containing the image.
+*/ +void writeOpenCV(const vpImage &I, const std::string &filename) +{ +#if (VISP_HAVE_OPENCV_VERSION >= 0x020408) + cv::Mat Ip; + vpImageConvert::convert(I, Ip); + cv::imwrite(filename.c_str(), Ip); +#else + IplImage *Ip = NULL; + vpImageConvert::convert(I, Ip); + + cvSaveImage(filename.c_str(), Ip); + + cvReleaseImage(&Ip); +#endif +} + +#endif diff --git a/modules/io/src/image/private/vpImageIoPortable.cpp b/modules/io/src/image/private/vpImageIoPortable.cpp new file mode 100644 index 0000000000..0031e4c96a --- /dev/null +++ b/modules/io/src/image/private/vpImageIoPortable.cpp @@ -0,0 +1,569 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Read/write images. + * + * Authors: + * Eric Marchand + * + *****************************************************************************/ + +/*! 
+ \file vpImageIo.cpp + \brief Read/write images +*/ + +#include "vpImageIoBackend.h" +#include +#include + +//TODO: +#if defined(_WIN32) +// Include WinSock2.h before windows.h to ensure that winsock.h is not +// included by windows.h since winsock.h and winsock2.h are incompatible +#include +#include +#endif + + +void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w, + unsigned int &h, unsigned int &maxval); + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +/*! + * Decode the PNM image header. + * \param filename[in] : File name. + * \param fd[in] : File desdcriptor. + * \param magic[in] : Magic number for identifying the file type. + * \param w[out] : Image width. + * \param h[out] : Image height. + * \param maxval[out] : Maximum pixel value. + */ +void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w, + unsigned int &h, unsigned int &maxval) +{ + std::string line; + unsigned int nb_elt = 4, cpt_elt = 0; + while (cpt_elt != nb_elt) { + // Skip empty lines or lines starting with # (comment) + while (std::getline(fd, line) && (line.compare(0, 1, "#") == 0 || line.size() == 0)) { + } + + if (fd.eof()) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str())); + } + + std::vector header = vpIoTools::splitChain(line, std::string(" ")); + + if (header.size() == 0) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str())); + } + + if (cpt_elt == 0) { // decode magic + if (header[0].compare(0, magic.size(), magic) != 0) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "\"%s\" is not a PNM file with magic number %s", + filename.c_str(), magic.c_str())); + } + cpt_elt++; + header.erase(header.begin(), + header.begin() + 1); // erase first element that is processed + } + while (header.size()) { + if (cpt_elt == 1) { // 
decode width + std::istringstream ss(header[0]); + ss >> w; + cpt_elt++; + header.erase(header.begin(), + header.begin() + 1); // erase first element that is processed + } else if (cpt_elt == 2) { // decode height + std::istringstream ss(header[0]); + ss >> h; + cpt_elt++; + header.erase(header.begin(), + header.begin() + 1); // erase first element that is processed + } else if (cpt_elt == 3) { // decode maxval + std::istringstream ss(header[0]); + ss >> maxval; + cpt_elt++; + header.erase(header.begin(), + header.begin() + 1); // erase first element that is processed + } + } + } +} +#endif + +//-------------------------------------------------------------------------- +// PFM +//-------------------------------------------------------------------------- + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function is built like portable gray pixmap (eg PGM P5) file. + but considers float image data. + + \param I : Image to save as a (PFM P8) file. + \param filename : Name of the file containing the image. 
+*/ +void vp_writePFM(const vpImage &I, const std::string &filename) +{ + FILE *fd; + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot write PFM image: filename empty")); + } + + fd = fopen(filename.c_str(), "wb"); + + if (fd == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create PFM file \"%s\"", filename.c_str())); + } + + // Write the head + fprintf(fd, "P8\n"); // Magic number + fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size + fprintf(fd, "255\n"); // Max level + + // Write the bitmap + size_t ierr; + size_t nbyte = I.getWidth() * I.getHeight(); + + ierr = fwrite(I.bitmap, sizeof(float), nbyte, fd); + if (ierr != nbyte) { + fclose(fd); + throw(vpImageException(vpImageException::ioError, "Cannot save PFM file \"%s\": only %lu bytes over %lu saved ", + filename.c_str(), static_cast<unsigned long>(ierr), static_cast<unsigned long>(nbyte))); // size_t must not be passed to %d through varargs + } + + fflush(fd); + fclose(fd); +} + +//-------------------------------------------------------------------------- +// PGM +//-------------------------------------------------------------------------- + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a portable gray pixmap (PGM P5) file. + + \param I : Image to save as a (PGM P5) file. + \param filename : Name of the file containing the image.
+*/ +void vp_writePGM(const vpImage &I, const std::string &filename) +{ + FILE *fd; + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty")); + } + + fd = fopen(filename.c_str(), "wb"); + + if (fd == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str())); + } + + // Write the head + fprintf(fd, "P5\n"); // Magic number + fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size + fprintf(fd, "255\n"); // Max level + + // Write the bitmap + size_t ierr; + size_t nbyte = I.getWidth() * I.getHeight(); + + ierr = fwrite(I.bitmap, sizeof(unsigned char), nbyte, fd); + if (ierr != nbyte) { + fclose(fd); + throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %lu over %lu bytes saved", + filename.c_str(), static_cast<unsigned long>(ierr), static_cast<unsigned long>(nbyte))); // size_t must not be passed to %d through varargs + } + + fflush(fd); + fclose(fd); +} + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a portable gray pixmap (PGM P5) file. + + \param I : Image to save as a (PGM P5) file. + \param filename : Name of the file containing the image. +*/ +void vp_writePGM(const vpImage &I, const std::string &filename) +{ + vpImage Iuc; + unsigned int nrows = I.getHeight(); + unsigned int ncols = I.getWidth(); + + Iuc.resize(nrows, ncols); + + for (unsigned int i = 0; i < nrows * ncols; i++) + Iuc.bitmap[i] = (unsigned char)I.bitmap[i]; + + vp_writePGM(Iuc, filename); +} + +/*! + Write the content of the image bitmap in the file which name is given by \e + filename. This function writes a portable gray pixmap (PGM P5) file. + Color image is converted into a grayscale image. + + \param I : Image to save as a (PGM P5) file. + \param filename : Name of the file containing the image.
+*/ +void vp_writePGM(const vpImage &I, const std::string &filename) +{ + + FILE *fd; + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty")); + } + + fd = fopen(filename.c_str(), "wb"); + + if (fd == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str())); + } + + // Write the head + fprintf(fd, "P5\n"); // Magic number + fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size + fprintf(fd, "255\n"); // Max level + + // Write the bitmap + size_t ierr; + size_t nbyte = I.getWidth() * I.getHeight(); + + vpImage Itmp; + vpImageConvert::convert(I, Itmp); + + ierr = fwrite(Itmp.bitmap, sizeof(unsigned char), nbyte, fd); + if (ierr != nbyte) { + fclose(fd); + throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %lu over %lu bytes saved", + filename.c_str(), static_cast<unsigned long>(ierr), static_cast<unsigned long>(nbyte))); // size_t must not be passed to %d through varargs + } + + fflush(fd); + fclose(fd); +} + +/*! + Read a PFM P8 file and initialize a float image. + + Read the contents of the portable gray pixmap (PFM P8) filename, allocate + memory for the corresponding image, and set the bitmap with the content of + the file. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + \param I : Image to set with the \e filename content. + \param filename : Name of the file containing the image.
+*/ +void vp_readPFM(vpImage &I, const std::string &filename) +{ + unsigned int w = 0, h = 0, maxval = 0; + unsigned int w_max = 100000, h_max = 100000, maxval_max = 255; + std::string magic("P8"); + + std::ifstream fd(filename.c_str(), std::ios::binary); + + // Open the filename + if (!fd.is_open()) { + throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str())); + } + + vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval); + + if (w > w_max || h > h_max) { + fd.close(); + throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str())); + } + if (maxval > maxval_max) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str())); + } + + if ((h != I.getHeight()) || (w != I.getWidth())) { + I.resize(h, w); + } + + unsigned int nbyte = I.getHeight() * I.getWidth(); + fd.read((char *)I.bitmap, sizeof(float) * nbyte); + if (!fd) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Read only %ld of %lu bytes in file \"%s\"", static_cast<long>(fd.gcount()), static_cast<unsigned long>(sizeof(float) * nbyte), + filename.c_str())); // gcount() is std::streamsize; report the byte count actually requested from read() + } + + fd.close(); +} + +/*! + Read a PGM P5 file and initialize a scalar image. + + Read the contents of the portable gray pixmap (PGM P5) filename, allocate + memory for the corresponding image, and set the bitmap with the content of + the file. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + \param I : Image to set with the \e filename content. + \param filename : Name of the file containing the image.
+*/ +void vp_readPGM(vpImage &I, const std::string &filename) +{ + unsigned int w = 0, h = 0, maxval = 0; + unsigned int w_max = 100000, h_max = 100000, maxval_max = 255; + std::string magic("P5"); + + std::ifstream fd(filename.c_str(), std::ios::binary); + + // Open the filename + if (!fd.is_open()) { + throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str())); + } + + vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval); + + if (w > w_max || h > h_max) { + fd.close(); + throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str())); + } + if (maxval > maxval_max) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str())); + } + + if ((h != I.getHeight()) || (w != I.getWidth())) { + I.resize(h, w); + } + + unsigned int nbyte = I.getHeight() * I.getWidth(); + fd.read((char *)I.bitmap, nbyte); + if (!fd) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Read only %ld of %u bytes in file \"%s\"", static_cast<long>(fd.gcount()), nbyte, + filename.c_str())); // gcount() is std::streamsize and must not be passed to %d through varargs + } + + fd.close(); +} + +/*! + Read a PGM P5 file and initialize a scalar image. + + Read the contents of the portable gray pixmap (PGM P5) filename, allocate + memory for the corresponding image, and set the bitmap with the content of + the file. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + The gray level image contained in the \e filename is converted in a + color image in \e I. + + \param I : Color image to set with the \e filename content. + \param filename : Name of the file containing the image.
+*/ +void vp_readPGM(vpImage &I, const std::string &filename) +{ + vpImage Itmp; + + vp_readPGM(Itmp, filename); + + vpImageConvert::convert(Itmp, I); +} + +//-------------------------------------------------------------------------- +// PPM +//-------------------------------------------------------------------------- + +/*! + Read the contents of the portable pixmap (PPM P6) filename, allocate memory + for the corresponding gray level image, convert the data in gray level, and + set the bitmap whith the gray level data. That means that the image \e I is + a "black and white" rendering of the original image in \e filename, as in a + black and white photograph. The quantization formula used is \f$0,299 r + + 0,587 g + 0,114 b\f$. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + \param I : Image to set with the \e filename content. + \param filename : Name of the file containing the image. +*/ +void vp_readPPM(vpImage &I, const std::string &filename) +{ + vpImage Itmp; + + vp_readPPM(Itmp, filename); + + vpImageConvert::convert(Itmp, I); +} + +/*! + Read the contents of the portable pixmap (PPM P6) filename, + allocate memory for the corresponding vpRGBa image. + + If the image has been already initialized, memory allocation is done + only if the new image size is different, else we re-use the same + memory space. + + \param I : Image to set with the \e filename content. + \param filename : Name of the file containing the image. 
+*/ +void vp_readPPM(vpImage &I, const std::string &filename) +{ + unsigned int w = 0, h = 0, maxval = 0; + unsigned int w_max = 100000, h_max = 100000, maxval_max = 255; + std::string magic("P6"); + + std::ifstream fd(filename.c_str(), std::ios::binary); + + // Open the filename + if (!fd.is_open()) { + throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str())); + } + + vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval); + + if (w > w_max || h > h_max) { + fd.close(); + throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str())); + } + if (maxval > maxval_max) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str())); + } + + if ((h != I.getHeight()) || (w != I.getWidth())) { + I.resize(h, w); + } + + for (unsigned int i = 0; i < I.getHeight(); i++) { + for (unsigned int j = 0; j < I.getWidth(); j++) { + unsigned char rgb[3]; + fd.read((char *)&rgb, 3); + + if (!fd) { + fd.close(); + throw(vpImageException(vpImageException::ioError, "Read only %lu of %lu bytes in file \"%s\"", + static_cast<unsigned long>((i * I.getWidth() + j) * 3 + fd.gcount()), static_cast<unsigned long>(I.getSize() * 3), filename.c_str())); // the sum is std::streamsize-typed; cast so it matches the format specifier + } + + I[i][j].R = rgb[0]; + I[i][j].G = rgb[1]; + I[i][j].B = rgb[2]; + I[i][j].A = vpRGBa::alpha_default; + } + } + + fd.close(); +} + +/*! + Write the content of the bitmap in the file which name is given by \e + filename. This function writes a portable pixmap (PPM P6) file. + grayscale image is converted into a color image vpRGBa. + + \param I : Image to save as a (PPM P6) file. + \param filename : Name of the file containing the image. +*/ +void vp_writePPM(const vpImage &I, const std::string &filename) +{ + vpImage Itmp; + + vpImageConvert::convert(I, Itmp); + + vp_writePPM(Itmp, filename); +} + +/*! + Write the content of the bitmap in the file which name is given by \e + filename. This function writes a portable pixmap (PPM P6) file.
+ + \param I : Image to save as a (PPM P6) file. + \param filename : Name of the file containing the image. +*/ +void vp_writePPM(const vpImage &I, const std::string &filename) +{ + FILE *f; + + // Test the filename + if (filename.empty()) { + throw(vpImageException(vpImageException::ioError, "Cannot create PPM file: filename empty")); + } + + f = fopen(filename.c_str(), "wb"); + + if (f == NULL) { + throw(vpImageException(vpImageException::ioError, "Cannot create PPM file \"%s\"", filename.c_str())); + } + + fprintf(f, "P6\n"); // Magic number + fprintf(f, "%u %u\n", I.getWidth(), I.getHeight()); // Image size + fprintf(f, "%d\n", 255); // Max level + + for (unsigned int i = 0; i < I.getHeight(); i++) { + for (unsigned int j = 0; j < I.getWidth(); j++) { + vpRGBa v = I[i][j]; + unsigned char rgb[3]; + rgb[0] = v.R; + rgb[1] = v.G; + rgb[2] = v.B; + + size_t res = fwrite(&rgb, 1, 3, f); + if (res != 3) { + fclose(f); + throw(vpImageException(vpImageException::ioError, "cannot write file \"%s\"", filename.c_str())); + } + } + } + + fflush(f); + fclose(f); +} diff --git a/modules/io/src/image/private/vpImageIoSimd.cpp b/modules/io/src/image/private/vpImageIoSimd.cpp new file mode 100644 index 0000000000..40986bf743 --- /dev/null +++ b/modules/io/src/image/private/vpImageIoSimd.cpp @@ -0,0 +1,87 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. 
+ * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information. + * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Read/write images. + * + * Authors: + * Eric Marchand + * + *****************************************************************************/ + +/*! + \file vpImageIoSimd.cpp + \brief Read/write images +*/ + +#include "vpImageIoBackend.h" + +//TODO: +#include + + +//TODO: +void readSimdlib(vpImage &I, const std::string &filename) +{ + size_t stride = 0, width = 0, height = 0; + SimdPixelFormatType format = SimdPixelFormatGray8; + uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format); + const bool copyData = false; // NOTE(review): data is not checked for NULL, stride is ignored and the buffer is never handed to SimdFree - confirm + I.init(data, (unsigned int)height, (unsigned int)width, copyData); +} + +void readSimdlib(vpImage &I, const std::string &filename) +{ + size_t stride = 0, width = 0, height = 0; + SimdPixelFormatType format = SimdPixelFormatRgba32; + uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format); + const bool copyData = false; // NOTE(review): same concerns as the gray overload (NULL result, stride, buffer release) - confirm + I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData); +} + +void writeJPEGSimdlib(const vpImage &I, const std::string &filename) +{ + SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth(), I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFileJpeg, 90, filename.c_str()); // Gray8: row stride is one byte per pixel +} + +void writeJPEGSimdlib(const vpImage &I, const std::string &filename) +{ + SimdImageSaveToFile((const uint8_t
*)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, 90, filename.c_str()); +} + +void writePNGSimdlib(const vpImage &I, const std::string &filename) +{ + SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth(), I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFilePng, 90, filename.c_str()); // Gray8: one byte per pixel, and the PNG writer must request the PNG file format +} + +void writePNGSimdlib(const vpImage &I, const std::string &filename) +{ + SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFilePng, 90, filename.c_str()); +} diff --git a/modules/io/src/image/private/vpImageIoStb.cpp b/modules/io/src/image/private/vpImageIoStb.cpp new file mode 100644 index 0000000000..97b453d841 --- /dev/null +++ b/modules/io/src/image/private/vpImageIoStb.cpp @@ -0,0 +1,121 @@ +/**************************************************************************** + * + * ViSP, open source Visual Servoing Platform software. + * Copyright (C) 2005 - 2019 by Inria. All rights reserved. + * + * This software is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * See the file LICENSE.txt at the root directory of this source + * distribution for additional information about the GNU GPL. + * + * For using ViSP with software that can not be combined with the GNU + * GPL, please contact Inria about acquiring a ViSP Professional + * Edition License. + * + * See http://visp.inria.fr for more information.
+ * + * This software was developed at: + * Inria Rennes - Bretagne Atlantique + * Campus Universitaire de Beaulieu + * 35042 Rennes Cedex + * France + * + * If you have questions regarding the use of this file, please contact + * Inria at visp@inria.fr + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE + * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * Description: + * Read/write images. + * + * Authors: + * Eric Marchand + * + *****************************************************************************/ + +/*! + \file vpImageIo.cpp + \brief Read/write images +*/ + +#include "vpImageIoBackend.h" + +//TODO: +#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) +# define VISP_HAVE_SSE2 1 +#endif + +#ifndef VISP_HAVE_SSE2 +# define STBI_NO_SIMD +#endif + +#define STB_IMAGE_IMPLEMENTATION +#include + +#define STB_IMAGE_WRITE_IMPLEMENTATION +#include + + +//TODO: +void readStb(vpImage &I, const std::string &filename) +{ + int width = 0, height = 0, channels = 0; + unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_grey); + if (image == NULL) { + throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); + } + I.init(image, static_cast(height), static_cast(width), true); + stbi_image_free(image); +} + +void readStb(vpImage &I, const std::string &filename) +{ + int width = 0, height = 0, channels = 0; + unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha); + if (image == NULL) { + throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); + } + I.init(reinterpret_cast(image), static_cast(height), static_cast(width), true); + stbi_image_free(image); +} + +void writeJPEGStb(const vpImage &I, const std::string &filename) +{ + int res = stbi_write_jpg(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), 
STBI_grey, + reinterpret_cast(I.bitmap), 90); + if (res == 0) { + throw(vpImageException(vpImageException::ioError, "JEPG write error")); + } +} + +void writeJPEGStb(const vpImage &I, const std::string &filename) +{ + int res = stbi_write_jpg(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, + reinterpret_cast(I.bitmap), 90); + if (res == 0) { + throw(vpImageException(vpImageException::ioError, "JEPG write error")); + } +} + +void writePNGStb(const vpImage &I, const std::string &filename) +{ + const int stride_in_bytes = static_cast(I.getWidth()); + int res = stbi_write_png(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_grey, + reinterpret_cast(I.bitmap), stride_in_bytes); + if (res == 0) { + throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str())); + } +} + +void writePNGStb(const vpImage &I, const std::string &filename) +{ + const int stride_in_bytes = static_cast(4 * I.getWidth()); + int res = stbi_write_png(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, + reinterpret_cast(I.bitmap), stride_in_bytes); + if (res == 0) { + throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str())); + } +} diff --git a/modules/io/src/image/vpImageIo.cpp b/modules/io/src/image/vpImageIo.cpp index cc7799d158..e8b221049e 100644 --- a/modules/io/src/image/vpImageIo.cpp +++ b/modules/io/src/image/vpImageIo.cpp @@ -46,119 +46,9 @@ #include #include -#if defined(_WIN32) -// Include WinSock2.h before windows.h to ensure that winsock.h is not -// included by windows.h since winsock.h and winsock2.h are incompatible -#include -#include -#endif - -#if defined(VISP_HAVE_JPEG) -#include -#include -#endif - -#if defined(VISP_HAVE_PNG) -#include -#endif - //TODO: -#include -//TODO: -#define STB_IMAGE_IMPLEMENTATION -#include - -#define STB_IMAGE_WRITE_IMPLEMENTATION -#include - -#if !defined(VISP_HAVE_OPENCV) -#if 
!defined(VISP_HAVE_JPEG) || !defined(VISP_HAVE_PNG) - -#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) -# define VISP_HAVE_SSE2 1 -#endif - -#ifndef VISP_HAVE_SSE2 -# define STBI_NO_SIMD -#endif - -#define STB_IMAGE_IMPLEMENTATION -#include +#include "private/vpImageIoBackend.h" -#define STB_IMAGE_WRITE_IMPLEMENTATION -#include -#endif -#endif - -void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w, - unsigned int &h, unsigned int &maxval); - -#ifndef DOXYGEN_SHOULD_SKIP_THIS -/*! - * Decode the PNM image header. - * \param filename[in] : File name. - * \param fd[in] : File desdcriptor. - * \param magic[in] : Magic number for identifying the file type. - * \param w[out] : Image width. - * \param h[out] : Image height. - * \param maxval[out] : Maximum pixel value. - */ -void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w, - unsigned int &h, unsigned int &maxval) -{ - std::string line; - unsigned int nb_elt = 4, cpt_elt = 0; - while (cpt_elt != nb_elt) { - // Skip empty lines or lines starting with # (comment) - while (std::getline(fd, line) && (line.compare(0, 1, "#") == 0 || line.size() == 0)) { - } - - if (fd.eof()) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str())); - } - - std::vector header = vpIoTools::splitChain(line, std::string(" ")); - - if (header.size() == 0) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str())); - } - - if (cpt_elt == 0) { // decode magic - if (header[0].compare(0, magic.size(), magic) != 0) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "\"%s\" is not a PNM file with magic number %s", - filename.c_str(), magic.c_str())); - } - cpt_elt++; - header.erase(header.begin(), - header.begin() + 1); // erase first element that is 
processed - } - while (header.size()) { - if (cpt_elt == 1) { // decode width - std::istringstream ss(header[0]); - ss >> w; - cpt_elt++; - header.erase(header.begin(), - header.begin() + 1); // erase first element that is processed - } else if (cpt_elt == 2) { // decode height - std::istringstream ss(header[0]); - ss >> h; - cpt_elt++; - header.erase(header.begin(), - header.begin() + 1); // erase first element that is processed - } else if (cpt_elt == 3) { // decode maxval - std::istringstream ss(header[0]); - ss >> maxval; - cpt_elt++; - header.erase(header.begin(), - header.begin() + 1); // erase first element that is processed - } - } - } -} -#endif vpImageIo::vpImageFormatType vpImageIo::getFormat(const std::string &filename) { @@ -271,18 +161,10 @@ void vpImageIo::read(vpImage &I, const std::string &filename) readPPM(I, final_filename); break; case FORMAT_JPEG: -#ifdef VISP_HAVE_JPEG readJPEG(I, final_filename); -#else - try_opencv_reader = true; -#endif break; case FORMAT_PNG: -#if defined(VISP_HAVE_PNG) readPNG(I, final_filename); -#else - try_opencv_reader = true; -#endif break; case FORMAT_TIFF: case FORMAT_BMP: @@ -297,39 +179,10 @@ void vpImageIo::read(vpImage &I, const std::string &filename) if (try_opencv_reader) { #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 -#if VISP_HAVE_OPENCV_VERSION >= 0x030200 - int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION; -#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 - int flags = cv::IMREAD_GRAYSCALE; -#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 - int flags = CV_LOAD_IMAGE_GRAYSCALE; -#endif - // std::cout << "Use opencv to read the image" << std::endl; - cv::Mat cvI = cv::imread(final_filename, flags); - if (cvI.cols == 0 && cvI.rows == 0) { - std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported"; - throw(vpImageException(vpImageException::ioError, message)); - } - vpImageConvert::convert(cvI, I); + readOpenCV(I, filename); 
#else - switch (getFormat(final_filename)) { - case FORMAT_JPEG: - readJPEG(I, final_filename); - break; - case FORMAT_PNG: - readPNG(I, final_filename); - break; - case FORMAT_BMP: - case FORMAT_TIFF: - case FORMAT_DIB: - case FORMAT_PBM: - case FORMAT_RASTER: - case FORMAT_JPEG2000: - case FORMAT_UNKNOWN: - default: - std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported"; - throw(vpImageException(vpImageException::ioError, message)); - } + std::string message = "Cannot read file \"" + filename + "\": No backend able to support this image format"; + throw(vpImageException(vpImageException::ioError, message)); #endif } } @@ -374,18 +227,10 @@ void vpImageIo::read(vpImage &I, const std::string &filename) readPPM(I, final_filename); break; case FORMAT_JPEG: -#ifdef VISP_HAVE_JPEG readJPEG(I, final_filename); -#else - try_opencv_reader = true; -#endif break; case FORMAT_PNG: -#if defined(VISP_HAVE_PNG) readPNG(I, final_filename); -#else - try_opencv_reader = true; -#endif break; case FORMAT_TIFF: case FORMAT_BMP: @@ -400,39 +245,10 @@ void vpImageIo::read(vpImage &I, const std::string &filename) if (try_opencv_reader) { #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 -#if VISP_HAVE_OPENCV_VERSION >= 0x030200 - int flags = cv::IMREAD_COLOR | cv::IMREAD_IGNORE_ORIENTATION; -#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 - int flags = cv::IMREAD_COLOR; -#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 - int flags = CV_LOAD_IMAGE_COLOR; -#endif - // std::cout << "Use opencv to read the image" << std::endl; - cv::Mat cvI = cv::imread(final_filename, flags); - if (cvI.cols == 0 && cvI.rows == 0) { - std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported"; - throw(vpImageException(vpImageException::ioError, message)); - } - vpImageConvert::convert(cvI, I); + readOpenCV(I, filename); #else - switch (getFormat(final_filename)) { - case FORMAT_JPEG: - 
readJPEG(I, final_filename); - break; - case FORMAT_PNG: - readPNG(I, final_filename); - break; - case FORMAT_BMP: - case FORMAT_TIFF: - case FORMAT_DIB: - case FORMAT_PBM: - case FORMAT_RASTER: - case FORMAT_JPEG2000: - case FORMAT_UNKNOWN: - default: - std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported"; - throw(vpImageException(vpImageException::ioError, message)); - } + std::string message = "Cannot read file \"" + filename + "\": No backend able to support this image format"; + throw(vpImageException(vpImageException::ioError, message)); #endif } } @@ -463,18 +279,10 @@ void vpImageIo::write(const vpImage &I, const std::string &filena writePPM(I, filename); break; case FORMAT_JPEG: -#ifdef VISP_HAVE_JPEG writeJPEG(I, filename); -#else - try_opencv_writer = true; -#endif break; case FORMAT_PNG: -#ifdef VISP_HAVE_PNG writePNG(I, filename); -#else - try_opencv_writer = true; -#endif break; case FORMAT_TIFF: case FORMAT_BMP: @@ -488,30 +296,11 @@ void vpImageIo::write(const vpImage &I, const std::string &filena } if (try_opencv_writer) { -#if VISP_HAVE_OPENCV_VERSION >= 0x020100 - // std::cout << "Use opencv to write the image" << std::endl; - cv::Mat cvI; - vpImageConvert::convert(I, cvI); - cv::imwrite(filename, cvI); +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + writeOpenCV(I, filename); #else - switch (getFormat(filename)) { - case FORMAT_JPEG: - writeJPEG(I, filename); - break; - case FORMAT_PNG: - writePNG(I, filename); - break; - case FORMAT_BMP: - case FORMAT_TIFF: - case FORMAT_DIB: - case FORMAT_PBM: - case FORMAT_RASTER: - case FORMAT_JPEG2000: - case FORMAT_UNKNOWN: - default: - vpCERROR << "Cannot write file: Image format not supported..." 
<< std::endl; - throw(vpImageException(vpImageException::ioError, "Cannot write file: Image format not supported")); - } + std::string message = "Cannot write file \"" + filename + "\": No backend able to support this image format"; + throw(vpImageException(vpImageException::ioError, message)); #endif } } @@ -542,18 +331,10 @@ void vpImageIo::write(const vpImage &I, const std::string &filename) writePPM(I, filename); break; case FORMAT_JPEG: -#ifdef VISP_HAVE_JPEG writeJPEG(I, filename); -#else - try_opencv_writer = true; -#endif break; case FORMAT_PNG: -#ifdef VISP_HAVE_PNG writePNG(I, filename); -#else - try_opencv_writer = true; -#endif break; case FORMAT_TIFF: case FORMAT_BMP: @@ -567,1735 +348,250 @@ void vpImageIo::write(const vpImage &I, const std::string &filename) } if (try_opencv_writer) { -#if VISP_HAVE_OPENCV_VERSION >= 0x020100 - // std::cout << "Use opencv to write the image" << std::endl; - cv::Mat cvI; - vpImageConvert::convert(I, cvI); - cv::imwrite(filename, cvI); +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + writeOpenCV(I, filename); #else - switch (getFormat(filename)) { - case FORMAT_JPEG: - writeJPEG(I, filename); - break; - case FORMAT_PNG: - writePNG(I, filename); - break; - case FORMAT_BMP: - case FORMAT_TIFF: - case FORMAT_DIB: - case FORMAT_PBM: - case FORMAT_RASTER: - case FORMAT_JPEG2000: - case FORMAT_UNKNOWN: - default: - vpCERROR << "Cannot write file: Image format not supported..." << std::endl; - throw(vpImageException(vpImageException::ioError, "Cannot write file: Image format not supported")); - } + std::string message = "Cannot write file \"" + filename + "\": No backend able to support this image format"; + throw(vpImageException(vpImageException::ioError, message)); #endif } } -//-------------------------------------------------------------------------- -// PFM -//-------------------------------------------------------------------------- - -/*! 
- Write the content of the image bitmap in the file which name is given by \e - filename. This function is built like portable gray pixmap (eg PGM P5) file. - but considers float image data. - - \param I : Image to save as a (PFM P8) file. - \param filename : Name of the file containing the image. -*/ - -void vpImageIo::writePFM(const vpImage &I, const std::string &filename) +void vpImageIo::readJPEG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) { - FILE *fd; - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot write PFM image: filename empty")); + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_JPEG) + readJPEGLibjpeg(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": Libjpeg backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + readOpenCV(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + readSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + readStb(I, filename); } +} - fd = fopen(filename.c_str(), "wb"); - - if (fd == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create PFM file \"%s\"", filename.c_str())); +void vpImageIo::readJPEG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) +{ + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_JPEG) + readJPEGLibjpeg(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": Libjpeg backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if 
defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + readOpenCV(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + readSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + readStb(I, filename); } +} - // Write the head - fprintf(fd, "P8\n"); // Magic number - fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size - fprintf(fd, "255\n"); // Max level - - // Write the bitmap - size_t ierr; - size_t nbyte = I.getWidth() * I.getHeight(); - - ierr = fwrite(I.bitmap, sizeof(float), nbyte, fd); - if (ierr != nbyte) { - fclose(fd); - throw(vpImageException(vpImageException::ioError, "Cannot save PFM file \"%s\": only %d bytes over %d saved ", - filename.c_str(), ierr, nbyte)); +void vpImageIo::readPNG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) +{ + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_PNG) + readPNGLibpng(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": Libpng backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + readOpenCV(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + readSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + readStb(I, filename); } - - fflush(fd); - fclose(fd); } -//-------------------------------------------------------------------------- -// PGM -//-------------------------------------------------------------------------- - -/*! 
- Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a portable gray pixmap (PGM P5) file. - - \param I : Image to save as a (PGM P5) file. - \param filename : Name of the file containing the image. -*/ -void vpImageIo::writePGM(const vpImage &I, const std::string &filename) +void vpImageIo::readPNG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) { - - FILE *fd; - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty")); + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_PNG) + readPNGLibpng(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": Libpng backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + readOpenCV(I, filename); +#else + std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + readSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + readStb(I, filename); } +} - fd = fopen(filename.c_str(), "wb"); - - if (fd == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str())); +void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) +{ + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_JPEG) + writeJPEGLibjpeg(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": Libjpeg backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 
0x020100 + writeOpenCV(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + writeJPEGSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + writeJPEGStb(I, filename); } +} - // Write the head - fprintf(fd, "P5\n"); // Magic number - fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size - fprintf(fd, "255\n"); // Max level +void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) +{ + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_JPEG) + writeJPEGLibjpeg(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": Libjpeg backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + writeOpenCV(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + writeJPEGSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + writeJPEGStb(I, filename); + } +} - // Write the bitmap - size_t ierr; - size_t nbyte = I.getWidth() * I.getHeight(); +void vpImageIo::writePNG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) +{ + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_PNG) + writePNGLibpng(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": Libpng backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + 
writeOpenCV(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + writePNGSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + writePNGStb(I, filename); + } +} - ierr = fwrite(I.bitmap, sizeof(unsigned char), nbyte, fd); - if (ierr != nbyte) { - fclose(fd); - throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %d over %d bytes saved", - filename.c_str(), ierr, nbyte)); +void vpImageIo::writePNG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) +{ + if (backend == IO_LIB_BACKEND) { +#if defined(VISP_HAVE_PNG) + writePNGLibpng(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": Libpng backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_OPENCV_BACKEND) { +#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 + writeOpenCV(I, filename); +#else + std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available"; + throw(vpImageException(vpImageException::ioError, message)); +#endif + } else if (backend == IO_SIMDLIB_BACKEND) { + writePNGSimdlib(I, filename); + } else if (backend == IO_STB_IMAGE_BACKEND) { + writePNGStb(I, filename); } +} - fflush(fd); - fclose(fd); +void vpImageIo::writePFM(const vpImage &I, const std::string &filename) +{ + vp_writePFM(I, filename); } -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a portable gray pixmap (PGM P5) file. +void vpImageIo::writePGM(const vpImage &I, const std::string &filename) +{ + vp_writePGM(I, filename); +} - \param I : Image to save as a (PGM P5) file. - \param filename : Name of the file containing the image. 
-*/ void vpImageIo::writePGM(const vpImage &I, const std::string &filename) { - vpImage Iuc; - unsigned int nrows = I.getHeight(); - unsigned int ncols = I.getWidth(); - - Iuc.resize(nrows, ncols); - - for (unsigned int i = 0; i < nrows * ncols; i++) - Iuc.bitmap[i] = (unsigned char)I.bitmap[i]; - - vpImageIo::writePGM(Iuc, filename); + vp_writePGM(I, filename); } -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a portable gray pixmap (PGM P5) file. - Color image is converted into a grayscale image. - - \param I : Image to save as a (PGM P5) file. - \param filename : Name of the file containing the image. -*/ void vpImageIo::writePGM(const vpImage &I, const std::string &filename) { - - FILE *fd; - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty")); - } - - fd = fopen(filename.c_str(), "wb"); - - if (fd == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str())); - } - - // Write the head - fprintf(fd, "P5\n"); // Magic number - fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size - fprintf(fd, "255\n"); // Max level - - // Write the bitmap - size_t ierr; - size_t nbyte = I.getWidth() * I.getHeight(); - - vpImage Itmp; - vpImageConvert::convert(I, Itmp); - - ierr = fwrite(Itmp.bitmap, sizeof(unsigned char), nbyte, fd); - if (ierr != nbyte) { - fclose(fd); - throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %d over %d bytes saved", - filename.c_str(), ierr, nbyte)); - } - - fflush(fd); - fclose(fd); + vp_writePGM(I, filename); } -/*! - Read a PFM P8 file and initialize a float image. - - Read the contents of the portable gray pixmap (PFM P8) filename, allocate - memory for the corresponding image, and set the bitmap whith the content of - the file. 
- - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. - -*/ - void vpImageIo::readPFM(vpImage &I, const std::string &filename) { - unsigned int w = 0, h = 0, maxval = 0; - unsigned int w_max = 100000, h_max = 100000, maxval_max = 255; - std::string magic("P8"); - - std::ifstream fd(filename.c_str(), std::ios::binary); - - // Open the filename - if (!fd.is_open()) { - throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str())); - } - - vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval); - - if (w > w_max || h > h_max) { - fd.close(); - throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str())); - } - if (maxval > maxval_max) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str())); - } - - if ((h != I.getHeight()) || (w != I.getWidth())) { - I.resize(h, w); - } - - unsigned int nbyte = I.getHeight() * I.getWidth(); - fd.read((char *)I.bitmap, sizeof(float) * nbyte); - if (!fd) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", fd.gcount(), nbyte, - filename.c_str())); - } - - fd.close(); + vp_readPFM(I, filename); } -/*! - Read a PGM P5 file and initialize a scalar image. - - Read the contents of the portable gray pixmap (PGM P5) filename, allocate - memory for the corresponding image, and set the bitmap whith the content of - the file. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. 
-*/ - void vpImageIo::readPGM(vpImage &I, const std::string &filename) { - unsigned int w = 0, h = 0, maxval = 0; - unsigned int w_max = 100000, h_max = 100000, maxval_max = 255; - std::string magic("P5"); - - std::ifstream fd(filename.c_str(), std::ios::binary); - - // Open the filename - if (!fd.is_open()) { - throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str())); - } - - vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval); - - if (w > w_max || h > h_max) { - fd.close(); - throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str())); - } - if (maxval > maxval_max) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str())); - } - - if ((h != I.getHeight()) || (w != I.getWidth())) { - I.resize(h, w); - } - - unsigned int nbyte = I.getHeight() * I.getWidth(); - fd.read((char *)I.bitmap, nbyte); - if (!fd) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", fd.gcount(), nbyte, - filename.c_str())); - } - - fd.close(); + vp_readPGM(I, filename); } -/*! - Read a PGM P5 file and initialize a scalar image. - - Read the contents of the portable gray pixmap (PGM P5) filename, allocate - memory for the corresponding image, and set the bitmap whith the content of - the file. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - The gray level image contained in the \e filename is converted in a - color image in \e I. - - \param I : Color image to set with the \e filename content. - \param filename : Name of the file containing the image. 
-*/ - void vpImageIo::readPGM(vpImage &I, const std::string &filename) { - vpImage Itmp; + vp_readPGM(I, filename); +} - vpImageIo::readPGM(Itmp, filename); +void vpImageIo::readPPM(vpImage &I, const std::string &filename) +{ + vp_readPPM(I, filename); +} - vpImageConvert::convert(Itmp, I); +void vpImageIo::readPPM(vpImage &I, const std::string &filename) +{ + vp_readPPM(I, filename); } -//-------------------------------------------------------------------------- -// PPM -//-------------------------------------------------------------------------- - -/*! - Read the contents of the portable pixmap (PPM P6) filename, allocate memory - for the corresponding gray level image, convert the data in gray level, and - set the bitmap whith the gray level data. That means that the image \e I is - a "black and white" rendering of the original image in \e filename, as in a - black and white photograph. The quantization formula used is \f$0,299 r + - 0,587 g + 0,114 b\f$. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. - -*/ -void vpImageIo::readPPM(vpImage &I, const std::string &filename) -{ - vpImage Itmp; - - vpImageIo::readPPM(Itmp, filename); - - vpImageConvert::convert(Itmp, I); -} - -/*! - Read the contents of the portable pixmap (PPM P6) filename, - allocate memory for the corresponding vpRGBa image. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::readPPM(vpImage &I, const std::string &filename) -{ - unsigned int w = 0, h = 0, maxval = 0; - unsigned int w_max = 100000, h_max = 100000, maxval_max = 255; - std::string magic("P6"); - - std::ifstream fd(filename.c_str(), std::ios::binary); - - // Open the filename - if (!fd.is_open()) { - throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str())); - } - - vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval); - - if (w > w_max || h > h_max) { - fd.close(); - throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str())); - } - if (maxval > maxval_max) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str())); - } - - if ((h != I.getHeight()) || (w != I.getWidth())) { - I.resize(h, w); - } - - for (unsigned int i = 0; i < I.getHeight(); i++) { - for (unsigned int j = 0; j < I.getWidth(); j++) { - unsigned char rgb[3]; - fd.read((char *)&rgb, 3); - - if (!fd) { - fd.close(); - throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", - (i * I.getWidth() + j) * 3 + fd.gcount(), I.getSize() * 3, filename.c_str())); - } - - I[i][j].R = rgb[0]; - I[i][j].G = rgb[1]; - I[i][j].B = rgb[2]; - I[i][j].A = vpRGBa::alpha_default; - } - } - - fd.close(); -} - -/*! - Write the content of the bitmap in the file which name is given by \e - filename. This function writes a portable gray pixmap (PPM P6) file. - grayscale image is converted into a color image vpRGBa. - - \param I : Image to save as a (PPM P6) file. - \param filename : Name of the file containing the image. - -*/ - void vpImageIo::writePPM(const vpImage &I, const std::string &filename) { - vpImage Itmp; - - vpImageConvert::convert(I, Itmp); - - vpImageIo::writePPM(Itmp, filename); + vp_writePPM(I, filename); } -/*! - Write the content of the bitmap in the file which name is given by \e - filename. 
This function writes a portable gray pixmap (PPM P6) file. - - \param I : Image to save as a (PPM P6) file. - \param filename : Name of the file containing the image. -*/ void vpImageIo::writePPM(const vpImage &I, const std::string &filename) { - FILE *f; - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot create PPM file: filename empty")); - } - - f = fopen(filename.c_str(), "wb"); - - if (f == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create PPM file \"%s\"", filename.c_str())); - } - - fprintf(f, "P6\n"); // Magic number - fprintf(f, "%u %u\n", I.getWidth(), I.getHeight()); // Image size - fprintf(f, "%d\n", 255); // Max level - - for (unsigned int i = 0; i < I.getHeight(); i++) { - for (unsigned int j = 0; j < I.getWidth(); j++) { - vpRGBa v = I[i][j]; - unsigned char rgb[3]; - rgb[0] = v.R; - rgb[1] = v.G; - rgb[2] = v.B; - - size_t res = fwrite(&rgb, 1, 3, f); - if (res != 3) { - fclose(f); - throw(vpImageException(vpImageException::ioError, "cannot write file \"%s\"", filename.c_str())); - } - } - } - - fflush(f); - fclose(f); -} - -//-------------------------------------------------------------------------- -// JPEG -//-------------------------------------------------------------------------- - -#if defined(VISP_HAVE_JPEG) - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a JPEG file. - - \param I : Image to save as a JPEG file. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename) -{ - struct jpeg_compress_struct cinfo; - struct jpeg_error_mgr jerr; - FILE *file; - - cinfo.err = jpeg_std_error(&jerr); - jpeg_create_compress(&cinfo); - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty")); - } - - file = fopen(filename.c_str(), "wb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str())); - } - - unsigned int width = I.getWidth(); - unsigned int height = I.getHeight(); - - jpeg_stdio_dest(&cinfo, file); - - cinfo.image_width = width; - cinfo.image_height = height; - cinfo.input_components = 1; - cinfo.in_color_space = JCS_GRAYSCALE; - jpeg_set_defaults(&cinfo); - - jpeg_start_compress(&cinfo, TRUE); - - unsigned char *line; - line = new unsigned char[width]; - unsigned char *input = (unsigned char *)I.bitmap; - while (cinfo.next_scanline < cinfo.image_height) { - for (unsigned int i = 0; i < width; i++) { - line[i] = *(input); - input++; - } - jpeg_write_scanlines(&cinfo, &line, 1); - } - - jpeg_finish_compress(&cinfo); - jpeg_destroy_compress(&cinfo); - delete[] line; - fclose(file); -} - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a JPEG file. - - \param I : Image to save as a JPEG file. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename) -{ - struct jpeg_compress_struct cinfo; - struct jpeg_error_mgr jerr; - FILE *file; - - cinfo.err = jpeg_std_error(&jerr); - jpeg_create_compress(&cinfo); - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty")); - } - - file = fopen(filename.c_str(), "wb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str())); - } - - unsigned int width = I.getWidth(); - unsigned int height = I.getHeight(); - - jpeg_stdio_dest(&cinfo, file); - - cinfo.image_width = width; - cinfo.image_height = height; - cinfo.input_components = 3; - cinfo.in_color_space = JCS_RGB; - jpeg_set_defaults(&cinfo); - - jpeg_start_compress(&cinfo, TRUE); - - unsigned char *line; - line = new unsigned char[3 * width]; - unsigned char *input = (unsigned char *)I.bitmap; - while (cinfo.next_scanline < cinfo.image_height) { - for (unsigned int i = 0; i < width; i++) { - line[i * 3] = *(input); - input++; - line[i * 3 + 1] = *(input); - input++; - line[i * 3 + 2] = *(input); - input++; - input++; - } - jpeg_write_scanlines(&cinfo, &line, 1); - } - - jpeg_finish_compress(&cinfo); - jpeg_destroy_compress(&cinfo); - delete[] line; - fclose(file); -} - -/*! - Read the contents of the JPEG file, allocate memory - for the corresponding gray level image, if necessary convert the data in - gray level, and set the bitmap whith the gray level data. That means that - the image \e I is a "black and white" rendering of the original image in \e - filename, as in a black and white photograph. If necessary, the quantization - formula used is \f$0,299 r + 0,587 g + 0,114 b\f$. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. 
- - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. - -*/ -void vpImageIo::readJPEG(vpImage &I, const std::string &filename) -{ - struct jpeg_decompress_struct cinfo; - struct jpeg_error_mgr jerr; - FILE *file; - - cinfo.err = jpeg_std_error(&jerr); - jpeg_create_decompress(&cinfo); - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty")); - } - - file = fopen(filename.c_str(), "rb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str())); - } - - jpeg_stdio_src(&cinfo, file); - jpeg_read_header(&cinfo, TRUE); - - unsigned int width = cinfo.image_width; - unsigned int height = cinfo.image_height; - - if ((width != I.getWidth()) || (height != I.getHeight())) - I.resize(height, width); - - jpeg_start_decompress(&cinfo); - - unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components); - JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1); - - if (cinfo.out_color_space == JCS_RGB) { - vpImage Ic(height, width); - unsigned char *output = (unsigned char *)Ic.bitmap; - while (cinfo.output_scanline < cinfo.output_height) { - jpeg_read_scanlines(&cinfo, buffer, 1); - for (unsigned int i = 0; i < width; i++) { - *(output++) = buffer[0][i * 3]; - *(output++) = buffer[0][i * 3 + 1]; - *(output++) = buffer[0][i * 3 + 2]; - *(output++) = vpRGBa::alpha_default; - } - } - vpImageConvert::convert(Ic, I); - } - - else if (cinfo.out_color_space == JCS_GRAYSCALE) { - while (cinfo.output_scanline < cinfo.output_height) { - unsigned int row = cinfo.output_scanline; - jpeg_read_scanlines(&cinfo, buffer, 1); - memcpy(I[row], buffer[0], rowbytes); - } - } - - jpeg_finish_decompress(&cinfo); - jpeg_destroy_decompress(&cinfo); - fclose(file); -} - -/*! - Read a JPEG file and initialize a scalar image. 
- - Read the contents of the JPEG file, allocate - memory for the corresponding image, and set - the bitmap whith the content of - the file. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - If the file corresponds to a grayscaled image, a conversion is done to deal - with \e I which is a color image. - - \param I : Color image to set with the \e filename content. - \param filename : Name of the file containing the image. -*/ -void vpImageIo::readJPEG(vpImage &I, const std::string &filename) -{ - struct jpeg_decompress_struct cinfo; - struct jpeg_error_mgr jerr; - FILE *file; - - cinfo.err = jpeg_std_error(&jerr); - jpeg_create_decompress(&cinfo); - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty")); - } - - file = fopen(filename.c_str(), "rb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str())); - } - - jpeg_stdio_src(&cinfo, file); - - jpeg_read_header(&cinfo, TRUE); - - unsigned int width = cinfo.image_width; - unsigned int height = cinfo.image_height; - - if ((width != I.getWidth()) || (height != I.getHeight())) - I.resize(height, width); - - jpeg_start_decompress(&cinfo); - - unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components); - JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1); - - if (cinfo.out_color_space == JCS_RGB) { - unsigned char *output = (unsigned char *)I.bitmap; - while (cinfo.output_scanline < cinfo.output_height) { - jpeg_read_scanlines(&cinfo, buffer, 1); - for (unsigned int i = 0; i < width; i++) { - *(output++) = buffer[0][i * 3]; - *(output++) = buffer[0][i * 3 + 1]; - *(output++) = buffer[0][i * 3 + 2]; - *(output++) = vpRGBa::alpha_default; - } - } - } - - else if (cinfo.out_color_space == 
JCS_GRAYSCALE) { - vpImage Ig(height, width); - - while (cinfo.output_scanline < cinfo.output_height) { - unsigned int row = cinfo.output_scanline; - jpeg_read_scanlines(&cinfo, buffer, 1); - memcpy(Ig[row], buffer[0], rowbytes); - } - - vpImageConvert::convert(Ig, I); - } - - jpeg_finish_decompress(&cinfo); - jpeg_destroy_decompress(&cinfo); - fclose(file); -} - -#elif defined(VISP_HAVE_OPENCV) - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a JPEG file. - - \param I : Image to save as a JPEG file. - \param filename : Name of the file containing the image. -*/ -void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename) -{ -#if (VISP_HAVE_OPENCV_VERSION >= 0x020408) - cv::Mat Ip; - vpImageConvert::convert(I, Ip); - cv::imwrite(filename.c_str(), Ip); -#else - IplImage *Ip = NULL; - vpImageConvert::convert(I, Ip); - - cvSaveImage(filename.c_str(), Ip); - - cvReleaseImage(&Ip); -#endif -} - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a JPEG file. - - \param I : Image to save as a JPEG file. - \param filename : Name of the file containing the image. -*/ -void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename) -{ -#if (VISP_HAVE_OPENCV_VERSION >= 0x020408) - cv::Mat Ip; - vpImageConvert::convert(I, Ip); - cv::imwrite(filename.c_str(), Ip); -#else - IplImage *Ip = NULL; - vpImageConvert::convert(I, Ip); - - cvSaveImage(filename.c_str(), Ip); - - cvReleaseImage(&Ip); -#endif -} - -/*! - Read the contents of the JPEG file, allocate memory - for the corresponding gray level image, if necessary convert the data in - gray level, and set the bitmap whith the gray level data. That means that - the image \e I is a "black and white" rendering of the original image in \e - filename, as in a black and white photograph. If necessary, the quantization - formula used is \f$0,299 r + 0,587 g + 0,114 b\f$. 
- - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - If EXIF information is embedded in the image file, the EXIF orientation is ignored. - - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. - -*/ -void vpImageIo::readJPEG(vpImage &I, const std::string &filename) -{ -#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 -#if VISP_HAVE_OPENCV_VERSION >= 0x030200 - int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION; -#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 - int flags = cv::IMREAD_GRAYSCALE; -#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 - int flags = CV_LOAD_IMAGE_GRAYSCALE; -#endif - cv::Mat Ip = cv::imread(filename.c_str(), flags); - if (!Ip.empty()) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); -#else - IplImage *Ip = NULL; - Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE); - if (Ip != NULL) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); - cvReleaseImage(&Ip); -#endif -} - -/*! - Read a JPEG file and initialize a scalar image. - - Read the contents of the JPEG file, allocate - memory for the corresponding image, and set - the bitmap whith the content of - the file. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - If the file corresponds to a grayscaled image, a conversion is done to deal - with \e I which is a color image. - - If EXIF information is embedded in the image file, the EXIF orientation is ignored. - - \param I : Color image to set with the \e filename content. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::readJPEG(vpImage &I, const std::string &filename) -{ -#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 -#if VISP_HAVE_OPENCV_VERSION >= 0x030200 - int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION; -#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 - int flags = cv::IMREAD_GRAYSCALE; -#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 - int flags = CV_LOAD_IMAGE_GRAYSCALE; -#endif - cv::Mat Ip = cv::imread(filename.c_str(), flags); - if (!Ip.empty()) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); -#else - IplImage *Ip = NULL; - Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_COLOR); - if (Ip != NULL) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); - cvReleaseImage(&Ip); -#endif -} -#else -void vpImageIo::readJPEG(vpImage &I, const std::string &filename) -{ - int width = 0, height = 0, channels = 0; - unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_grey); - if (image == NULL) { - throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); - } - I.init(image, static_cast(height), static_cast(width), true); - stbi_image_free(image); -} -void vpImageIo::readJPEG(vpImage &I, const std::string &filename) -{ - int width = 0, height = 0, channels = 0; - unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha); - if (image == NULL) { - throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); - } - I.init(reinterpret_cast(image), static_cast(height), static_cast(width), true); - stbi_image_free(image); -} -void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename) -{ - int res = stbi_write_jpg(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_grey, - reinterpret_cast(I.bitmap), 90); - if (res == 0) { 
- throw(vpImageException(vpImageException::ioError, "JPEG write error")); - } -} -void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename) -{ - int res = stbi_write_jpg(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, - reinterpret_cast(I.bitmap), 90); - if (res == 0) { - throw(vpImageException(vpImageException::ioError, "JEPG write error")); - } + vp_writePPM(I, filename); } -#endif - -//-------------------------------------------------------------------------- -// PNG -//-------------------------------------------------------------------------- - -#if defined(VISP_HAVE_PNG) - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a PNG file. - - \param I : Image to save as a PNG file. - \param filename : Name of the file containing the image. -*/ -void vpImageIo::writePNG(const vpImage &I, const std::string &filename) -{ - FILE *file; - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty")); - } - - file = fopen(filename.c_str(), "wb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str())); - } - - /* create a png info struct */ - png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); - if (!png_ptr) { - fclose(file); - vpERROR_TRACE("Error during png_create_write_struct()\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - png_infop info_ptr = png_create_info_struct(png_ptr); - if (!info_ptr) { - fclose(file); - png_destroy_write_struct(&png_ptr, NULL); - vpERROR_TRACE("Error during png_create_info_struct()\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - /* initialize the setjmp for returning properly after a libpng error occured - */ - if (setjmp(png_jmpbuf(png_ptr))) { - 
fclose(file); - png_destroy_write_struct(&png_ptr, &info_ptr); - vpERROR_TRACE("Error during init_io\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - /* setup libpng for using standard C fwrite() function with our FILE pointer - */ - png_init_io(png_ptr, file); - - unsigned int width = I.getWidth(); - unsigned int height = I.getHeight(); - int bit_depth = 8; - int color_type = PNG_COLOR_TYPE_GRAY; - /* set some useful information from header */ - - if (setjmp(png_jmpbuf(png_ptr))) { - fclose(file); - png_destroy_write_struct(&png_ptr, &info_ptr); - vpERROR_TRACE("Error during write header\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE, - PNG_FILTER_TYPE_BASE); - - png_write_info(png_ptr, info_ptr); - - png_bytep *row_ptrs = new png_bytep[height]; - for (unsigned int i = 0; i < height; i++) - row_ptrs[i] = new png_byte[width]; - - unsigned char *input = (unsigned char *)I.bitmap; - - for (unsigned int i = 0; i < height; i++) { - png_byte *row = row_ptrs[i]; - for (unsigned int j = 0; j < width; j++) { - row[j] = *(input); - input++; - } - } - - png_write_image(png_ptr, row_ptrs); - - png_write_end(png_ptr, NULL); - - for (unsigned int j = 0; j < height; j++) - delete[] row_ptrs[j]; - - delete[] row_ptrs; - - png_destroy_write_struct(&png_ptr, &info_ptr); - - fclose(file); -} - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a PNG file. - - \param I : Image to save as a PNG file. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::writePNG(const vpImage &I, const std::string &filename) -{ - FILE *file; - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty")); - } - - file = fopen(filename.c_str(), "wb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str())); - } - - /* create a png info struct */ - png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); - if (!png_ptr) { - fclose(file); - vpERROR_TRACE("Error during png_create_write_struct()\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - png_infop info_ptr = png_create_info_struct(png_ptr); - if (!info_ptr) { - fclose(file); - png_destroy_write_struct(&png_ptr, NULL); - vpERROR_TRACE("Error during png_create_info_struct()\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - /* initialize the setjmp for returning properly after a libpng error occured - */ - if (setjmp(png_jmpbuf(png_ptr))) { - fclose(file); - png_destroy_write_struct(&png_ptr, &info_ptr); - vpERROR_TRACE("Error during init_io\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - /* setup libpng for using standard C fwrite() function with our FILE pointer - */ - png_init_io(png_ptr, file); - - unsigned int width = I.getWidth(); - unsigned int height = I.getHeight(); - int bit_depth = 8; - int color_type = PNG_COLOR_TYPE_RGB; - /* set some useful information from header */ - - if (setjmp(png_jmpbuf(png_ptr))) { - fclose(file); - png_destroy_write_struct(&png_ptr, &info_ptr); - vpERROR_TRACE("Error during write header\n"); - throw(vpImageException(vpImageException::ioError, "PNG write error")); - } - - png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE, - PNG_FILTER_TYPE_BASE); - - 
png_write_info(png_ptr, info_ptr); - - png_bytep *row_ptrs = new png_bytep[height]; - for (unsigned int i = 0; i < height; i++) - row_ptrs[i] = new png_byte[3 * width]; - - unsigned char *input = (unsigned char *)I.bitmap; - ; - - for (unsigned int i = 0; i < height; i++) { - png_byte *row = row_ptrs[i]; - for (unsigned int j = 0; j < width; j++) { - row[3 * j] = *(input); - input++; - row[3 * j + 1] = *(input); - input++; - row[3 * j + 2] = *(input); - input++; - input++; - } - } - - png_write_image(png_ptr, row_ptrs); - - png_write_end(png_ptr, NULL); - - for (unsigned int j = 0; j < height; j++) - delete[] row_ptrs[j]; - - delete[] row_ptrs; - - png_destroy_write_struct(&png_ptr, &info_ptr); - - fclose(file); -} - -/*! - Read the contents of the PNG file, allocate memory - for the corresponding gray level image, if necessary convert the data in - gray level, and set the bitmap whith the gray level data. That means that - the image \e I is a "black and white" rendering of the original image in \e - filename, as in a black and white photograph. If necessary, the quantization - formula used is \f$0,299 r + 0,587 g + 0,114 b\f$. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. 
- -*/ -void vpImageIo::readPNG(vpImage &I, const std::string &filename) -{ - FILE *file; - png_byte magic[8]; - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty")); - } - - file = fopen(filename.c_str(), "rb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str())); - } - - /* read magic number */ - if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) { - fclose(file); - throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str())); - } - - /* check for valid magic number */ - if (png_sig_cmp(magic, 0, sizeof(magic))) { - fclose(file); - throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image", - filename.c_str())); - } - - /* create a png read struct */ - // printf("version %s\n", PNG_LIBPNG_VER_STRING); - png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); - if (png_ptr == NULL) { - fprintf(stderr, "error: can't create a png read structure!\n"); - fclose(file); - throw(vpImageException(vpImageException::ioError, "error reading png file")); - } - - /* create a png info struct */ - png_infop info_ptr = png_create_info_struct(png_ptr); - if (info_ptr == NULL) { - fprintf(stderr, "error: can't create a png info structure!\n"); - fclose(file); - png_destroy_read_struct(&png_ptr, NULL, NULL); - throw(vpImageException(vpImageException::ioError, "error reading png file")); - } - - /* initialize the setjmp for returning properly after a libpng error occured - */ - if (setjmp(png_jmpbuf(png_ptr))) { - fclose(file); - png_destroy_read_struct(&png_ptr, &info_ptr, NULL); - vpERROR_TRACE("Error during init io\n"); - throw(vpImageException(vpImageException::ioError, "PNG read error")); - } - - /* setup libpng for using standard C fread() function with our FILE pointer - */ - 
png_init_io(png_ptr, file); - - /* tell libpng that we have already read the magic number */ - png_set_sig_bytes(png_ptr, sizeof(magic)); - - /* read png info */ - png_read_info(png_ptr, info_ptr); - - unsigned int width = png_get_image_width(png_ptr, info_ptr); - unsigned int height = png_get_image_height(png_ptr, info_ptr); - - unsigned int bit_depth, channels, color_type; - /* get some useful information from header */ - bit_depth = png_get_bit_depth(png_ptr, info_ptr); - channels = png_get_channels(png_ptr, info_ptr); - color_type = png_get_color_type(png_ptr, info_ptr); - - /* convert index color images to RGB images */ - if (color_type == PNG_COLOR_TYPE_PALETTE) - png_set_palette_to_rgb(png_ptr); - - /* convert 1-2-4 bits grayscale images to 8 bits grayscale. */ - if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) - png_set_expand(png_ptr); - - // if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS)) - // png_set_tRNS_to_alpha (png_ptr); - - if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA) - png_set_strip_alpha(png_ptr); - - if (bit_depth == 16) - png_set_strip_16(png_ptr); - else if (bit_depth < 8) - png_set_packing(png_ptr); - - /* update info structure to apply transformations */ - png_read_update_info(png_ptr, info_ptr); - - channels = png_get_channels(png_ptr, info_ptr); - - if ((width != I.getWidth()) || (height != I.getHeight())) - I.resize(height, width); - - png_bytep *rowPtrs = new png_bytep[height]; - - unsigned int stride = png_get_rowbytes(png_ptr, info_ptr); - unsigned char *data = new unsigned char[stride * height]; - - for (unsigned int i = 0; i < height; i++) - rowPtrs[i] = (png_bytep)data + (i * stride); - - png_read_image(png_ptr, rowPtrs); - - vpImage Ic(height, width); - unsigned char *output; - - switch (channels) { - case 1: - output = (unsigned char *)I.bitmap; - for (unsigned int i = 0; i < width * height; i++) { - *(output++) = data[i]; - } - break; - - case 2: - output = (unsigned char *)I.bitmap; - for (unsigned int i = 0; i < 
width * height; i++) { - *(output++) = data[i * 2]; - } - break; - - case 3: - output = (unsigned char *)Ic.bitmap; - for (unsigned int i = 0; i < width * height; i++) { - *(output++) = data[i * 3]; - *(output++) = data[i * 3 + 1]; - *(output++) = data[i * 3 + 2]; - *(output++) = vpRGBa::alpha_default; - } - vpImageConvert::convert(Ic, I); - break; - - case 4: - output = (unsigned char *)Ic.bitmap; - for (unsigned int i = 0; i < width * height; i++) { - *(output++) = data[i * 4]; - *(output++) = data[i * 4 + 1]; - *(output++) = data[i * 4 + 2]; - *(output++) = data[i * 4 + 3]; - } - vpImageConvert::convert(Ic, I); - break; - } - - delete[](png_bytep) rowPtrs; - delete[] data; - png_read_end(png_ptr, NULL); - png_destroy_read_struct(&png_ptr, &info_ptr, NULL); - fclose(file); -} - -/*! - Read a PNG file and initialize a scalar image. - - Read the contents of the PNG file, allocate - memory for the corresponding image, and set - the bitmap whith the content of - the file. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - If the file corresponds to a grayscaled image, a conversion is done to deal - with \e I which is a color image. - - \param I : Color image to set with the \e filename content. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::readPNG(vpImage &I, const std::string &filename) -{ - FILE *file; - png_byte magic[8]; - - // Test the filename - if (filename.empty()) { - throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty")); - } - - file = fopen(filename.c_str(), "rb"); - - if (file == NULL) { - throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str())); - } - - /* read magic number */ - if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) { - fclose(file); - throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str())); - } - - /* check for valid magic number */ - if (png_sig_cmp(magic, 0, sizeof(magic))) { - fclose(file); - throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image", - filename.c_str())); - } - - /* create a png read struct */ - png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); - if (!png_ptr) { - fclose(file); - vpERROR_TRACE("Error during png_create_read_struct()\n"); - throw(vpImageException(vpImageException::ioError, "PNG read error")); - } - - /* create a png info struct */ - png_infop info_ptr = png_create_info_struct(png_ptr); - if (!info_ptr) { - fclose(file); - png_destroy_read_struct(&png_ptr, NULL, NULL); - vpERROR_TRACE("Error during png_create_info_struct()\n"); - throw(vpImageException(vpImageException::ioError, "PNG read error")); - } - - /* initialize the setjmp for returning properly after a libpng error occured - */ - if (setjmp(png_jmpbuf(png_ptr))) { - fclose(file); - png_destroy_read_struct(&png_ptr, &info_ptr, NULL); - vpERROR_TRACE("Error during init io\n"); - throw(vpImageException(vpImageException::ioError, "PNG read error")); - } - - /* setup libpng for using standard C fread() function with our FILE pointer - */ - png_init_io(png_ptr, file); - - /* tell libpng that we have already read the magic number */ - 
png_set_sig_bytes(png_ptr, sizeof(magic)); - - /* read png info */ - png_read_info(png_ptr, info_ptr); - - unsigned int width = png_get_image_width(png_ptr, info_ptr); - unsigned int height = png_get_image_height(png_ptr, info_ptr); - - unsigned int bit_depth, channels, color_type; - /* get some useful information from header */ - bit_depth = png_get_bit_depth(png_ptr, info_ptr); - channels = png_get_channels(png_ptr, info_ptr); - color_type = png_get_color_type(png_ptr, info_ptr); - - /* convert index color images to RGB images */ - if (color_type == PNG_COLOR_TYPE_PALETTE) - png_set_palette_to_rgb(png_ptr); - - /* convert 1-2-4 bits grayscale images to 8 bits grayscale. */ - if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) - png_set_expand(png_ptr); - - // if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS)) - // png_set_tRNS_to_alpha (png_ptr); - - if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA) - png_set_strip_alpha(png_ptr); - - if (bit_depth == 16) - png_set_strip_16(png_ptr); - else if (bit_depth < 8) - png_set_packing(png_ptr); - - /* update info structure to apply transformations */ - png_read_update_info(png_ptr, info_ptr); - - channels = png_get_channels(png_ptr, info_ptr); - - if ((width != I.getWidth()) || (height != I.getHeight())) - I.resize(height, width); - - png_bytep *rowPtrs = new png_bytep[height]; - - unsigned int stride = png_get_rowbytes(png_ptr, info_ptr); - unsigned char *data = new unsigned char[stride * height]; - - for (unsigned int i = 0; i < height; i++) - rowPtrs[i] = (png_bytep)data + (i * stride); - - png_read_image(png_ptr, rowPtrs); - - vpImage Ig(height, width); - unsigned char *output; - - switch (channels) { - case 1: - output = (unsigned char *)Ig.bitmap; - for (unsigned int i = 0; i < width * height; i++) { - *(output++) = data[i]; - } - vpImageConvert::convert(Ig, I); - break; - - case 2: - output = (unsigned char *)Ig.bitmap; - for (unsigned int i = 0; i < width * height; i++) { - *(output++) = data[i * 2]; - } - 
vpImageConvert::convert(Ig, I); - break; - - case 3: - output = (unsigned char *)I.bitmap; - for (unsigned int i = 0; i < width * height; i++) { - *(output++) = data[i * 3]; - *(output++) = data[i * 3 + 1]; - *(output++) = data[i * 3 + 2]; - *(output++) = vpRGBa::alpha_default; - } - break; - - case 4: - output = (unsigned char *)I.bitmap; - for (unsigned int i = 0; i < width * height; i++) { - *(output++) = data[i * 4]; - *(output++) = data[i * 4 + 1]; - *(output++) = data[i * 4 + 2]; - *(output++) = data[i * 4 + 3]; - } - break; - } - - delete[](png_bytep) rowPtrs; - delete[] data; - png_read_end(png_ptr, NULL); - png_destroy_read_struct(&png_ptr, &info_ptr, NULL); - fclose(file); -} - -//TODO: -void vpImageIo::readSimdlib(vpImage &I, const std::string &filename) -{ - size_t stride = 0, width = 0, height = 0; - SimdPixelFormatType format = SimdPixelFormatRgba32; - uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format); - const bool copyData = false; - I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData); -} - -void vpImageIo::readStb(vpImage &I, const std::string &filename) -{ - int width = 0, height = 0, channels = 0; - unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha); - if (image == NULL) { - throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); - } - I.init(reinterpret_cast(image), static_cast(height), static_cast(width), true); - stbi_image_free(image); -} - -inline bool ends_with(std::string const & value, std::string const & ending) -{ - if (ending.size() > value.size()) return false; - return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); -} - -void vpImageIo::writeSimdlib(vpImage &I, const std::string &filename) -{ - if (ends_with(filename, ".png")) { - SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFilePng, 90, 
filename.c_str()); - } else { - SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, 90, filename.c_str()); - } -} - -void vpImageIo::writeStb(vpImage &I, const std::string &filename) -{ - if (ends_with(filename, ".png")) { - const int stride_in_bytes = static_cast(4 * I.getWidth()); - int res = stbi_write_png(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, - reinterpret_cast(I.bitmap), stride_in_bytes); - if (res == 0) { - throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str())); - } - } else { - int res = stbi_write_jpg(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, - reinterpret_cast(I.bitmap), 90); - if (res == 0) { - throw(vpImageException(vpImageException::ioError, "JEPG write error")); - } - } -} - -#elif defined(VISP_HAVE_OPENCV) - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a PNG file. - - \param I : Image to save as a PNG file. - \param filename : Name of the file containing the image. -*/ -void vpImageIo::writePNG(const vpImage &I, const std::string &filename) -{ -#if (VISP_HAVE_OPENCV_VERSION >= 0x020408) - cv::Mat Ip; - vpImageConvert::convert(I, Ip); - cv::imwrite(filename.c_str(), Ip); -#else - IplImage *Ip = NULL; - vpImageConvert::convert(I, Ip); - - cvSaveImage(filename.c_str(), Ip); - - cvReleaseImage(&Ip); -#endif -} - -/*! - Write the content of the image bitmap in the file which name is given by \e - filename. This function writes a PNG file. - - \param I : Image to save as a PNG file. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::writePNG(const vpImage &I, const std::string &filename) -{ -#if (VISP_HAVE_OPENCV_VERSION >= 0x020408) - cv::Mat Ip; - vpImageConvert::convert(I, Ip); - cv::imwrite(filename.c_str(), Ip); -#else - IplImage *Ip = NULL; - vpImageConvert::convert(I, Ip); - - cvSaveImage(filename.c_str(), Ip); - - cvReleaseImage(&Ip); -#endif -} - -/*! - Read the contents of the PNG file, allocate memory - for the corresponding gray level image, if necessary convert the data in - gray level, and set the bitmap whith the gray level data. That means that - the image \e I is a "black and white" rendering of the original image in \e - filename, as in a black and white photograph. If necessary, the quantization - formula used is \f$0,299 r + 0,587 g + 0,114 b\f$. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - If EXIF information is embedded in the image file, the EXIF orientation is ignored. - - \param I : Image to set with the \e filename content. - \param filename : Name of the file containing the image. 
- -*/ -void vpImageIo::readPNG(vpImage &I, const std::string &filename) -{ -#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 -#if VISP_HAVE_OPENCV_VERSION >= 0x030200 - int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION; -#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 - int flags = cv::IMREAD_GRAYSCALE; -#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 - int flags = CV_LOAD_IMAGE_GRAYSCALE; -#endif - cv::Mat Ip = cv::imread(filename.c_str(), flags); - if (!Ip.empty()) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); -#else - IplImage *Ip = NULL; - Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE); - if (Ip != NULL) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); - cvReleaseImage(&Ip); -#endif -} - -/*! - Read a PNG file and initialize a scalar image. - - Read the contents of the PNG file, allocate - memory for the corresponding image, and set - the bitmap whith the content of - the file. - - If the image has been already initialized, memory allocation is done - only if the new image size is different, else we re-use the same - memory space. - - If the file corresponds to a grayscaled image, a conversion is done to deal - with \e I which is a color image. - - If EXIF information is embedded in the image file, the EXIF orientation is ignored. - - \param I : Color image to set with the \e filename content. - \param filename : Name of the file containing the image. 
-*/ -void vpImageIo::readPNG(vpImage &I, const std::string &filename) -{ -#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 -#if VISP_HAVE_OPENCV_VERSION >= 0x030200 - int flags = cv::IMREAD_COLOR | cv::IMREAD_IGNORE_ORIENTATION; -#elif VISP_HAVE_OPENCV_VERSION >= 0x030000 - int flags = cv::IMREAD_COLOR; -#elif VISP_HAVE_OPENCV_VERSION >= 0x020100 - int flags = CV_LOAD_IMAGE_COLOR; -#endif - cv::Mat Ip = cv::imread(filename.c_str(), flags); - if (!Ip.empty()) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); -#else - IplImage *Ip = NULL; - Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_COLOR); - if (Ip != NULL) - vpImageConvert::convert(Ip, I); - else - throw(vpImageException(vpImageException::ioError, "Can't read the image")); - cvReleaseImage(&Ip); -#endif -} -#else -void vpImageIo::readPNG(vpImage &I, const std::string &filename) -{ - int width = 0, height = 0, channels = 0; - unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_grey); - if (image == NULL) { - throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); - } - I.init(image, static_cast(height), static_cast(width), true); - stbi_image_free(image); -} -void vpImageIo::readPNG(vpImage &I, const std::string &filename) -{ - int width = 0, height = 0, channels = 0; - unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha); - if (image == NULL) { - throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str())); - } - I.init(reinterpret_cast(image), static_cast(height), static_cast(width), true); - stbi_image_free(image); -} -void vpImageIo::writePNG(const vpImage &I, const std::string &filename) -{ - const int stride_in_bytes = static_cast(I.getWidth()); - int res = stbi_write_png(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_grey, - 
reinterpret_cast(I.bitmap), stride_in_bytes); - if (res == 0) { - throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str())); - } -} -void vpImageIo::writePNG(const vpImage &I, const std::string &filename) -{ - const int stride_in_bytes = static_cast(4 * I.getWidth()); - int res = stbi_write_png(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, - reinterpret_cast(I.bitmap), stride_in_bytes); - if (res == 0) { - throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str())); - } -} -#endif diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp index 8efe2c759e..3bf19a465e 100644 --- a/modules/io/test/perfImageLoadSave.cpp +++ b/modules/io/test/perfImageLoadSave.cpp @@ -64,7 +64,7 @@ TEST_CASE("Benchmark Jpeg image loading", "[benchmark]") { vpImage I; BENCHMARK("vpImageIo::readSimdlib()") { - vpImageIo::readSimdlib(I, imagePathJpeg); + vpImageIo::readJPEG(I, imagePathJpeg, vpImageIo::IO_SIMDLIB_BACKEND); return I; }; } @@ -73,7 +73,7 @@ TEST_CASE("Benchmark Jpeg image loading", "[benchmark]") { vpImage I; BENCHMARK("vpImageIo::readStb()") { - vpImageIo::readStb(I, imagePathJpeg); + vpImageIo::readJPEG(I, imagePathJpeg, vpImageIo::IO_STB_IMAGE_BACKEND); return I; }; } @@ -93,7 +93,7 @@ TEST_CASE("Benchmark Png image loading", "[benchmark]") { vpImage I; BENCHMARK("vpImageIo::readSimdlib()") { - vpImageIo::readSimdlib(I, imagePathPng); + vpImageIo::readPNG(I, imagePathPng, vpImageIo::IO_SIMDLIB_BACKEND); return I; }; } @@ -102,7 +102,7 @@ TEST_CASE("Benchmark Png image loading", "[benchmark]") { vpImage I; BENCHMARK("vpImageIo::readStb()") { - vpImageIo::readStb(I, imagePathPng); + vpImageIo::readPNG(I, imagePathPng, vpImageIo::IO_STB_IMAGE_BACKEND); return I; }; } @@ -122,7 +122,7 @@ TEST_CASE("Benchmark big Png image loading", "[benchmark]") { vpImage I; BENCHMARK("vpImageIo::readSimdlib()") { - vpImageIo::readSimdlib(I, 
imagePathPngBig); + vpImageIo::readPNG(I, imagePathPngBig, vpImageIo::IO_SIMDLIB_BACKEND); return I; }; } @@ -131,7 +131,7 @@ TEST_CASE("Benchmark big Png image loading", "[benchmark]") { vpImage I; BENCHMARK("vpImageIo::readStb()") { - vpImageIo::readStb(I, imagePathPngBig); + vpImageIo::readPNG(I, imagePathPngBig, vpImageIo::IO_STB_IMAGE_BACKEND); return I; }; } @@ -153,7 +153,7 @@ TEST_CASE("Benchmark Jpeg image saving", "[benchmark]") { const std::string filename = "/tmp/Klimt_Simd.jpg"; BENCHMARK("vpImageIo::writeSimdlib()") { - vpImageIo::writeSimdlib(I, filename); + vpImageIo::writeJPEG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND); return I; }; } @@ -162,7 +162,7 @@ TEST_CASE("Benchmark Jpeg image saving", "[benchmark]") { const std::string filename = "/tmp/Klimt_stb.jpg"; BENCHMARK("vpImageIo::writeStb()") { - vpImageIo::writeStb(I, filename); + vpImageIo::writeJPEG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND); return I; }; } @@ -184,7 +184,7 @@ TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") { const std::string filename = "/tmp/Big_images_Simd.jpg"; BENCHMARK("vpImageIo::writeSimdlib()") { - vpImageIo::writeSimdlib(I, filename); + vpImageIo::writeJPEG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND); return I; }; } @@ -193,7 +193,7 @@ TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") { const std::string filename = "/tmp/Big_images_stb.jpg"; BENCHMARK("vpImageIo::writeStb()") { - vpImageIo::writeStb(I, filename); + vpImageIo::writeJPEG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND); return I; }; } @@ -215,7 +215,7 @@ TEST_CASE("Benchmark Png image saving", "[benchmark]") { const std::string filename = "/tmp/Klimt_Simd.png"; BENCHMARK("vpImageIo::writeSimdlib()") { - vpImageIo::writeSimdlib(I, filename); + vpImageIo::writePNG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND); return I; }; } @@ -224,7 +224,7 @@ TEST_CASE("Benchmark Png image saving", "[benchmark]") { const std::string filename = "/tmp/Klimt_stb.png"; 
BENCHMARK("vpImageIo::writeStb()") { - vpImageIo::writeStb(I, filename); + vpImageIo::writePNG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND); return I; }; } @@ -246,7 +246,7 @@ TEST_CASE("Benchmark big Png image saving", "[benchmark]") { const std::string filename = "/tmp/Big_images_Simd.png"; BENCHMARK("vpImageIo::writeSimdlib()") { - vpImageIo::writeSimdlib(I, filename); + vpImageIo::writePNG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND); return I; }; } @@ -255,155 +255,12 @@ TEST_CASE("Benchmark big Png image saving", "[benchmark]") { const std::string filename = "/tmp/Big_images_stb.png"; BENCHMARK("vpImageIo::writeStb()") { - vpImageIo::writeStb(I, filename); + vpImageIo::writePNG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND); return I; }; } } -//TEST_CASE("Benchmark bgr to grayscale (ViSP)", "[benchmark]") { -// vpImage I; -// vpImageIo::read(I, imagePathColor); - -// std::vector bgr; -// common_tools::RGBaToBGR(I, bgr); - -// vpImage I_gray(I.getHeight(), I.getWidth()); - -// BENCHMARK("Benchmark bgr to grayscale (ViSP)") { -// vpImageConvert::BGRToGrey(bgr.data(), -// I_gray.bitmap, -// I.getWidth(), I.getHeight(), -// false, nThreads); -// return I_gray; -// }; - -//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101) -// SECTION("OpenCV Mat type") -// { -// cv::Mat img; -// vpImageConvert::convert(I, img); - -// BENCHMARK("Benchmark bgr to grayscale (ViSP + OpenCV Mat type)") { -// vpImageConvert::convert(img, I_gray, false, nThreads); -// return I_gray; -// }; -// } -//#endif -//} -//#endif - -//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101) -//TEST_CASE("Benchmark bgr to grayscale (OpenCV)", "[benchmark]") { -// cv::Mat img = cv::imread(imagePathColor); -// cv::Mat img_gray(img.size(), CV_8UC1); - -// BENCHMARK("Benchmark bgr to grayscale (OpenCV)") { -// cv::cvtColor(img, img_gray, cv::COLOR_BGR2GRAY); -// return img_gray; -// }; -//} -//#endif - -//// C++11 to be able to do bgr.data() -//#if VISP_CXX_STANDARD >= VISP_CXX_STANDARD_11 -//TEST_CASE("Benchmark bgr to 
rgba (naive code)", "[benchmark]") { -// vpImage I; -// vpImageIo::read(I, imagePathColor); - -// std::vector bgr; -// common_tools::RGBaToBGR(I, bgr); - -// vpImage I_bench(I.getHeight(), I.getWidth()); -// BENCHMARK("Benchmark bgr to rgba (naive code)") { -// common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast(I_bench.bitmap), -// I.getWidth(), I.getHeight(), false); -// return I_bench; -// }; -//} - -//TEST_CASE("Benchmark bgr to rgba (ViSP)", "[benchmark]") { -// vpImage I; -// vpImageIo::read(I, imagePathColor); - -// std::vector bgr; -// common_tools::RGBaToBGR(I, bgr); - -// SECTION("Check BGR to RGBa conversion") -// { -// vpImage ref(I.getHeight(), I.getWidth()); -// common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast(ref.bitmap), -// I.getWidth(), I.getHeight(), false); -// vpImage rgba(I.getHeight(), I.getWidth()); -// vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast(rgba.bitmap), -// I.getWidth(), I.getHeight(), false); - -// CHECK((rgba == ref)); -// } - -// vpImage I_rgba(I.getHeight(), I.getWidth()); -// BENCHMARK("Benchmark bgr to rgba (ViSP)") { -// vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast(I_rgba.bitmap), -// I.getWidth(), I.getHeight(), false); -// return I_rgba; -// }; - -//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101) -// SECTION("OpenCV Mat type") -// { -// cv::Mat img; -// vpImageConvert::convert(I, img); - -// BENCHMARK("Benchmark bgr to rgba (ViSP + OpenCV Mat type)") { -// vpImageConvert::convert(img, I_rgba); -// return I_rgba; -// }; -// } -//#endif -//} - -//TEST_CASE("Benchmark bgra to rgba (naive code)", "[benchmark]") { -// vpImage I; -// vpImageIo::read(I, imagePathColor); - -// std::vector bgra; -// common_tools::RGBaToBGRa(I, bgra); - -// vpImage I_bench(I.getHeight(), I.getWidth()); -// BENCHMARK("Benchmark bgra to rgba (naive code)") { -// common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast(I_bench.bitmap), -// I.getWidth(), I.getHeight(), false); -// return I_bench; -// }; -//} - 
-//TEST_CASE("Benchmark bgra to rgba (ViSP)", "[benchmark]") { -// vpImage I; -// vpImageIo::read(I, imagePathColor); - -// std::vector bgra; -// common_tools::RGBaToBGRa(I, bgra); - -// SECTION("Check BGRa to RGBa conversion") -// { -// vpImage ref(I.getHeight(), I.getWidth()); -// common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast(ref.bitmap), -// I.getWidth(), I.getHeight(), false); -// vpImage rgba(I.getHeight(), I.getWidth()); -// vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast(rgba.bitmap), -// I.getWidth(), I.getHeight(), false); - -// CHECK((rgba == ref)); -// } -// vpImage I_rgba(I.getHeight(), I.getWidth()); -// BENCHMARK("Benchmark bgra to rgba (ViSP)") { -// vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast(I_rgba.bitmap), -// I.getWidth(), I.getHeight(), false); -// return I_rgba; -// }; -//} -//#endif - int main(int argc, char *argv[]) { Catch::Session session; // There must be exactly one instance From 7dcc2a1d02ffe3b5bd27777d0e55930e44b9e6eb Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Sun, 21 Nov 2021 21:11:01 +0100 Subject: [PATCH 13/18] Update vpImageIo backend option. Add JPEG compression quality. Update perfImageLoadSave.cpp. 
--- modules/io/include/visp3/io/vpImageIo.h | 12 +- .../io/src/image/private/vpImageIoBackend.h | 39 +- .../io/src/image/private/vpImageIoLibjpeg.cpp | 37 +- .../io/src/image/private/vpImageIoLibpng.cpp | 23 +- .../io/src/image/private/vpImageIoOpenCV.cpp | 25 +- .../src/image/private/vpImageIoPortable.cpp | 23 +- .../io/src/image/private/vpImageIoSimd.cpp | 19 +- modules/io/src/image/private/vpImageIoStb.cpp | 15 +- modules/io/src/image/vpImageIo.cpp | 117 +++--- modules/io/test/perfImageLoadSave.cpp | 356 ++++++++---------- 10 files changed, 305 insertions(+), 361 deletions(-) diff --git a/modules/io/include/visp3/io/vpImageIo.h b/modules/io/include/visp3/io/vpImageIo.h index fa395e3882..7edbb765e7 100644 --- a/modules/io/include/visp3/io/vpImageIo.h +++ b/modules/io/include/visp3/io/vpImageIo.h @@ -134,11 +134,11 @@ class VISP_EXPORT vpImageIo IO_STB_IMAGE_BACKEND }; - static void read(vpImage &I, const std::string &filename); - static void read(vpImage &I, const std::string &filename); + static void read(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); + static void read(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); - static void write(const vpImage &I, const std::string &filename); - static void write(const vpImage &I, const std::string &filename); + static void write(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); + static void write(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); static void readPFM(vpImage &I, const std::string &filename); @@ -163,8 +163,8 @@ class VISP_EXPORT vpImageIo static void writePPM(const vpImage &I, const std::string &filename); static void writePPM(const vpImage &I, const std::string &filename); - static void writeJPEG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); 
- static void writeJPEG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); + static void writeJPEG(const vpImage &I, const std::string &filename, int quality=90, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); + static void writeJPEG(const vpImage &I, const std::string &filename, int quality=90, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); static void writePNG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); static void writePNG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND); diff --git a/modules/io/src/image/private/vpImageIoBackend.h b/modules/io/src/image/private/vpImageIoBackend.h index e1b434c030..75a33d1793 100644 --- a/modules/io/src/image/private/vpImageIoBackend.h +++ b/modules/io/src/image/private/vpImageIoBackend.h @@ -29,16 +29,13 @@ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * * Description: - * Read/write images. - * - * Authors: - * Eric Marchand + * Backend functions implementation for image I/O operations. * *****************************************************************************/ /*! - \file vpImageIo.h - \brief Read/write images + \file vpImageIoBackend.h + \brief Backend functions implementation for image I/O operations. 
*/ #ifndef vpIMAGEIOBACKEND_H @@ -47,7 +44,9 @@ #include -// +// Portable FloatMap format (PFM) +// Portable Graymap format (PGM) +// Portable Pixmap format (PPM) void vp_writePFM(const vpImage &I, const std::string &filename); void vp_writePGM(const vpImage &I, const std::string &filename); void vp_writePGM(const vpImage &I, const std::string &filename); @@ -60,43 +59,43 @@ void vp_readPPM(vpImage &I, const std::string &filename); void vp_writePPM(const vpImage &I, const std::string &filename); void vp_writePPM(const vpImage &I, const std::string &filename); -// +// libjpeg void readJPEGLibjpeg(vpImage &I, const std::string &filename); void readJPEGLibjpeg(vpImage &I, const std::string &filename); -void writeJPEGLibjpeg(const vpImage &I, const std::string &filename); -void writeJPEGLibjpeg(const vpImage &I, const std::string &filename); +void writeJPEGLibjpeg(const vpImage &I, const std::string &filename, int quality); +void writeJPEGLibjpeg(const vpImage &I, const std::string &filename, int quality); -// +// libpng void readPNGLibpng(vpImage &I, const std::string &filename); void readPNGLibpng(vpImage &I, const std::string &filename); void writePNGLibpng(const vpImage &I, const std::string &filename); void writePNGLibpng(const vpImage &I, const std::string &filename); -// +// OpenCV void readOpenCV(vpImage &I, const std::string &filename); void readOpenCV(vpImage &I, const std::string &filename); -void writeOpenCV(const vpImage &I, const std::string &filename); -void writeOpenCV(const vpImage &I, const std::string &filename); +void writeOpenCV(const vpImage &I, const std::string &filename, int quality); +void writeOpenCV(const vpImage &I, const std::string &filename, int quality); -// +// Simd lib void readSimdlib(vpImage &I, const std::string &filename); void readSimdlib(vpImage &I, const std::string &filename); -void writeJPEGSimdlib(const vpImage &I, const std::string &filename); -void writeJPEGSimdlib(const vpImage &I, const std::string &filename); +void 
writeJPEGSimdlib(const vpImage &I, const std::string &filename, int quality); +void writeJPEGSimdlib(const vpImage &I, const std::string &filename, int quality); void writePNGSimdlib(const vpImage &I, const std::string &filename); void writePNGSimdlib(const vpImage &I, const std::string &filename); -// +// stb lib void readStb(vpImage &I, const std::string &filename); void readStb(vpImage &I, const std::string &filename); -void writeJPEGStb(const vpImage &I, const std::string &filename); -void writeJPEGStb(const vpImage &I, const std::string &filename); +void writeJPEGStb(const vpImage &I, const std::string &filename, int quality); +void writeJPEGStb(const vpImage &I, const std::string &filename, int quality); void writePNGStb(const vpImage &I, const std::string &filename); void writePNGStb(const vpImage &I, const std::string &filename); diff --git a/modules/io/src/image/private/vpImageIoLibjpeg.cpp b/modules/io/src/image/private/vpImageIoLibjpeg.cpp index 99debb3021..8f5b021c8c 100644 --- a/modules/io/src/image/private/vpImageIoLibjpeg.cpp +++ b/modules/io/src/image/private/vpImageIoLibjpeg.cpp @@ -29,28 +29,25 @@ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * * Description: - * Read/write images. - * - * Authors: - * Eric Marchand + * Libjpeg backend for JPEG image I/O operations. * *****************************************************************************/ /*! - \file vpImageIo.cpp - \brief Read/write images + \file vpImageIoLibjpeg.cpp + \brief Libjpeg backend for JPEG image I/O operations. */ #include "vpImageIoBackend.h" #include -//TODO: -#if defined(_WIN32) -// Include WinSock2.h before windows.h to ensure that winsock.h is not -// included by windows.h since winsock.h and winsock2.h are incompatible -#include -#include -#endif +//TODO: is it needed? 
+//#if defined(_WIN32) +//// Include WinSock2.h before windows.h to ensure that winsock.h is not +//// included by windows.h since winsock.h and winsock2.h are incompatible +//#include +//#include +//#endif #if defined(VISP_HAVE_JPEG) #include @@ -70,8 +67,9 @@ \param I : Image to save as a JPEG file. \param filename : Name of the file containing the image. + \param quality : JPEG quality for compression. */ -void writeJPEGLibjpeg(const vpImage &I, const std::string &filename) +void writeJPEGLibjpeg(const vpImage &I, const std::string &filename, int quality) { struct jpeg_compress_struct cinfo; struct jpeg_error_mgr jerr; @@ -96,11 +94,13 @@ void writeJPEGLibjpeg(const vpImage &I, const std::string &filena jpeg_stdio_dest(&cinfo, file); + jpeg_set_defaults(&cinfo); cinfo.image_width = width; cinfo.image_height = height; cinfo.input_components = 1; cinfo.in_color_space = JCS_GRAYSCALE; - jpeg_set_defaults(&cinfo); + //TODO: + jpeg_set_quality(&cinfo, quality, TRUE); jpeg_start_compress(&cinfo, TRUE); @@ -127,8 +127,9 @@ void writeJPEGLibjpeg(const vpImage &I, const std::string &filena \param I : Image to save as a JPEG file. \param filename : Name of the file containing the image. + \param quality : JPEG quality for compression. 
*/ -void writeJPEGLibjpeg(const vpImage &I, const std::string &filename) +void writeJPEGLibjpeg(const vpImage &I, const std::string &filename, int quality) { struct jpeg_compress_struct cinfo; struct jpeg_error_mgr jerr; @@ -153,11 +154,13 @@ void writeJPEGLibjpeg(const vpImage &I, const std::string &filename) jpeg_stdio_dest(&cinfo, file); + jpeg_set_defaults(&cinfo); cinfo.image_width = width; cinfo.image_height = height; cinfo.input_components = 3; cinfo.in_color_space = JCS_RGB; - jpeg_set_defaults(&cinfo); + //TODO: + jpeg_set_quality(&cinfo, quality, TRUE); jpeg_start_compress(&cinfo, TRUE); diff --git a/modules/io/src/image/private/vpImageIoLibpng.cpp b/modules/io/src/image/private/vpImageIoLibpng.cpp index e350e4260b..e87a956a28 100644 --- a/modules/io/src/image/private/vpImageIoLibpng.cpp +++ b/modules/io/src/image/private/vpImageIoLibpng.cpp @@ -29,28 +29,25 @@ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * * Description: - * Read/write images. - * - * Authors: - * Eric Marchand + * Libpng backend for PNG image I/O operations. * *****************************************************************************/ /*! - \file vpImageIo.cpp - \brief Read/write images + \file vpImageIoLibpng.cpp + \brief Libpng backend for PNG image I/O operations. */ #include "vpImageIoBackend.h" #include -//TODO: -#if defined(_WIN32) -// Include WinSock2.h before windows.h to ensure that winsock.h is not -// included by windows.h since winsock.h and winsock2.h are incompatible -#include -#include -#endif +//TODO: is it needed? 
+//#if defined(_WIN32) +//// Include WinSock2.h before windows.h to ensure that winsock.h is not +//// included by windows.h since winsock.h and winsock2.h are incompatible +//#include +//#include +//#endif #if defined(VISP_HAVE_PNG) #include diff --git a/modules/io/src/image/private/vpImageIoOpenCV.cpp b/modules/io/src/image/private/vpImageIoOpenCV.cpp index 93b6a1ca1d..d13ed07216 100644 --- a/modules/io/src/image/private/vpImageIoOpenCV.cpp +++ b/modules/io/src/image/private/vpImageIoOpenCV.cpp @@ -29,16 +29,13 @@ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * * Description: - * Read/write images. - * - * Authors: - * Eric Marchand + * OpenCV backend for image I/O operations. * *****************************************************************************/ /*! - \file vpImageIo.cpp - \brief Read/write images + \file vpImageIoOpenCV.cpp + \brief OpenCV backend for image I/O operations. */ #include "vpImageIoBackend.h" @@ -163,12 +160,16 @@ void readOpenCV(vpImage &I, const std::string &filename) \param I : Image to save as a JPEG file. \param filename : Name of the file containing the image. */ -void writeOpenCV(const vpImage &I, const std::string &filename) +void writeOpenCV(const vpImage &I, const std::string &filename, int quality) { #if (VISP_HAVE_OPENCV_VERSION >= 0x020408) cv::Mat Ip; vpImageConvert::convert(I, Ip); - cv::imwrite(filename.c_str(), Ip); + + std::vector compression_params; + compression_params.push_back(cv::IMWRITE_JPEG_QUALITY); + compression_params.push_back(quality); + cv::imwrite(filename.c_str(), Ip, compression_params); #else IplImage *Ip = NULL; vpImageConvert::convert(I, Ip); @@ -186,12 +187,16 @@ void writeOpenCV(const vpImage &I, const std::string &filename) \param I : Image to save as a JPEG file. \param filename : Name of the file containing the image. 
*/ -void writeOpenCV(const vpImage &I, const std::string &filename) +void writeOpenCV(const vpImage &I, const std::string &filename, int quality) { #if (VISP_HAVE_OPENCV_VERSION >= 0x020408) cv::Mat Ip; vpImageConvert::convert(I, Ip); - cv::imwrite(filename.c_str(), Ip); + + std::vector compression_params; + compression_params.push_back(cv::IMWRITE_JPEG_QUALITY); + compression_params.push_back(quality); + cv::imwrite(filename.c_str(), Ip, compression_params); #else IplImage *Ip = NULL; vpImageConvert::convert(I, Ip); diff --git a/modules/io/src/image/private/vpImageIoPortable.cpp b/modules/io/src/image/private/vpImageIoPortable.cpp index 0031e4c96a..10a4a35fcd 100644 --- a/modules/io/src/image/private/vpImageIoPortable.cpp +++ b/modules/io/src/image/private/vpImageIoPortable.cpp @@ -29,29 +29,26 @@ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * * Description: - * Read/write images. - * - * Authors: - * Eric Marchand + * Backend for portable image format I/O operations. * *****************************************************************************/ /*! - \file vpImageIo.cpp - \brief Read/write images + \file vpImageIoPortable.cpp + \brief Backend for portable image format I/O operations. */ #include "vpImageIoBackend.h" #include #include -//TODO: -#if defined(_WIN32) -// Include WinSock2.h before windows.h to ensure that winsock.h is not -// included by windows.h since winsock.h and winsock2.h are incompatible -#include -#include -#endif +//TODO: is it needed? 
+//#if defined(_WIN32) +//// Include WinSock2.h before windows.h to ensure that winsock.h is not +//// included by windows.h since winsock.h and winsock2.h are incompatible +//#include +//#include +//#endif void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w, diff --git a/modules/io/src/image/private/vpImageIoSimd.cpp b/modules/io/src/image/private/vpImageIoSimd.cpp index 40986bf743..4612aa5f7f 100644 --- a/modules/io/src/image/private/vpImageIoSimd.cpp +++ b/modules/io/src/image/private/vpImageIoSimd.cpp @@ -29,21 +29,16 @@ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * * Description: - * Read/write images. - * - * Authors: - * Eric Marchand + * Simd backend for JPEG and PNG image I/O operations. * *****************************************************************************/ /*! \file vpImageIo.cpp - \brief Read/write images + \brief Simd backend for JPEG and PNG image I/O operations. */ #include "vpImageIoBackend.h" - -//TODO: #include @@ -66,19 +61,19 @@ void readSimdlib(vpImage &I, const std::string &filename) I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData); } -void writeJPEGSimdlib(const vpImage &I, const std::string &filename) +void writeJPEGSimdlib(const vpImage &I, const std::string &filename, int quality) { - SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFileJpeg, 90, filename.c_str()); + SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFileJpeg, quality, filename.c_str()); } -void writeJPEGSimdlib(const vpImage &I, const std::string &filename) +void writeJPEGSimdlib(const vpImage &I, const std::string &filename, int quality) { - SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, 90, filename.c_str()); + 
SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, quality, filename.c_str()); } void writePNGSimdlib(const vpImage &I, const std::string &filename) { - SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFileJpeg, 90, filename.c_str()); + SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFilePng, 90, filename.c_str()); } void writePNGSimdlib(const vpImage &I, const std::string &filename) diff --git a/modules/io/src/image/private/vpImageIoStb.cpp b/modules/io/src/image/private/vpImageIoStb.cpp index 97b453d841..4b6626b0cc 100644 --- a/modules/io/src/image/private/vpImageIoStb.cpp +++ b/modules/io/src/image/private/vpImageIoStb.cpp @@ -29,16 +29,13 @@ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * * Description: - * Read/write images. - * - * Authors: - * Eric Marchand + * stb backend for JPEG and PNG image I/O operations. * *****************************************************************************/ /*! \file vpImageIo.cpp - \brief Read/write images + \brief stb backend for JPEG and PNG image I/O operations. 
*/ #include "vpImageIoBackend.h" @@ -82,19 +79,19 @@ void readStb(vpImage &I, const std::string &filename) stbi_image_free(image); } -void writeJPEGStb(const vpImage &I, const std::string &filename) +void writeJPEGStb(const vpImage &I, const std::string &filename, int quality) { int res = stbi_write_jpg(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_grey, - reinterpret_cast(I.bitmap), 90); + reinterpret_cast(I.bitmap), quality); if (res == 0) { throw(vpImageException(vpImageException::ioError, "JEPG write error")); } } -void writeJPEGStb(const vpImage &I, const std::string &filename) +void writeJPEGStb(const vpImage &I, const std::string &filename, int quality) { int res = stbi_write_jpg(filename.c_str(), static_cast(I.getWidth()), static_cast(I.getHeight()), STBI_rgb_alpha, - reinterpret_cast(I.bitmap), 90); + reinterpret_cast(I.bitmap), quality); if (res == 0) { throw(vpImageException(vpImageException::ioError, "JEPG write error")); } diff --git a/modules/io/src/image/vpImageIo.cpp b/modules/io/src/image/vpImageIo.cpp index e8b221049e..241a408e27 100644 --- a/modules/io/src/image/vpImageIo.cpp +++ b/modules/io/src/image/vpImageIo.cpp @@ -41,14 +41,19 @@ \brief Read/write images */ -#include -#include //image conversion #include #include //TODO: #include "private/vpImageIoBackend.h" +//TODO: +// priority order for backend selection is: +// - libjpeg / libpng if available +// - OpenCV if available +// - stb backend for image reading / Simd backend for image writing +// - Simd backend for image reading / stb backend for image writing + vpImageIo::vpImageFormatType vpImageIo::getFormat(const std::string &filename) { @@ -140,7 +145,7 @@ std::string vpImageIo::getExtension(const std::string &filename) \param I : Image to set with the \e filename content. \param filename : Name of the file containing the image. 
*/ -void vpImageIo::read(vpImage &I, const std::string &filename) +void vpImageIo::read(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) { bool exist = vpIoTools::checkFilename(filename); if (!exist) { @@ -161,10 +166,10 @@ void vpImageIo::read(vpImage &I, const std::string &filename) readPPM(I, final_filename); break; case FORMAT_JPEG: - readJPEG(I, final_filename); + readJPEG(I, final_filename, backend); break; case FORMAT_PNG: - readPNG(I, final_filename); + readPNG(I, final_filename, backend); break; case FORMAT_TIFF: case FORMAT_BMP: @@ -207,7 +212,7 @@ void vpImageIo::read(vpImage &I, const std::string &filename) \param I : Image to set with the \e filename content. \param filename : Name of the file containing the image. */ -void vpImageIo::read(vpImage &I, const std::string &filename) +void vpImageIo::read(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) { bool exist = vpIoTools::checkFilename(filename); if (!exist) { @@ -227,10 +232,10 @@ void vpImageIo::read(vpImage &I, const std::string &filename) readPPM(I, final_filename); break; case FORMAT_JPEG: - readJPEG(I, final_filename); + readJPEG(I, final_filename, backend); break; case FORMAT_PNG: - readPNG(I, final_filename); + readPNG(I, final_filename, backend); break; case FORMAT_TIFF: case FORMAT_BMP: @@ -267,7 +272,7 @@ void vpImageIo::read(vpImage &I, const std::string &filename) \param I : Image to write. \param filename : Name of the file containing the image. 
*/ -void vpImageIo::write(const vpImage &I, const std::string &filename) +void vpImageIo::write(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) { bool try_opencv_writer = false; @@ -279,10 +284,10 @@ void vpImageIo::write(const vpImage &I, const std::string &filena writePPM(I, filename); break; case FORMAT_JPEG: - writeJPEG(I, filename); + writeJPEG(I, filename, backend); break; case FORMAT_PNG: - writePNG(I, filename); + writePNG(I, filename, backend); break; case FORMAT_TIFF: case FORMAT_BMP: @@ -297,7 +302,7 @@ void vpImageIo::write(const vpImage &I, const std::string &filena if (try_opencv_writer) { #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 - writeOpenCV(I, filename); + writeOpenCV(I, filename, 90); #else std::string message = "Cannot write file \"" + filename + "\": No backend able to support this image format"; throw(vpImageException(vpImageException::ioError, message)); @@ -319,7 +324,7 @@ void vpImageIo::write(const vpImage &I, const std::string &filena \param I : Image to write. \param filename : Name of the file containing the image. 
*/ -void vpImageIo::write(const vpImage &I, const std::string &filename) +void vpImageIo::write(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) { bool try_opencv_writer = false; @@ -331,10 +336,10 @@ void vpImageIo::write(const vpImage &I, const std::string &filename) writePPM(I, filename); break; case FORMAT_JPEG: - writeJPEG(I, filename); + writeJPEG(I, filename, backend); break; case FORMAT_PNG: - writePNG(I, filename); + writePNG(I, filename, backend); break; case FORMAT_TIFF: case FORMAT_BMP: @@ -349,7 +354,7 @@ void vpImageIo::write(const vpImage &I, const std::string &filename) if (try_opencv_writer) { #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 - writeOpenCV(I, filename); + writeOpenCV(I, filename, 90); #else std::string message = "Cannot write file \"" + filename + "\": No backend able to support this image format"; throw(vpImageException(vpImageException::ioError, message)); @@ -359,159 +364,159 @@ void vpImageIo::write(const vpImage &I, const std::string &filename) void vpImageIo::readJPEG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) { - if (backend == IO_LIB_BACKEND) { + if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_JPEG) readJPEGLibjpeg(I, filename); #else std::string message = "Cannot read file \"" + filename + "\": Libjpeg backend is not available"; throw(vpImageException(vpImageException::ioError, message)); #endif - } else if (backend == IO_OPENCV_BACKEND) { + } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 readOpenCV(I, filename); #else std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available"; throw(vpImageException(vpImageException::ioError, message)); #endif + } else if (backend == IO_STB_IMAGE_BACKEND || backend == IO_DEFAULT_BACKEND) { + readStb(I, filename); } else if 
(backend == IO_SIMDLIB_BACKEND) { readSimdlib(I, filename); - } else if (backend == IO_STB_IMAGE_BACKEND) { - readStb(I, filename); } } void vpImageIo::readJPEG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) { - if (backend == IO_LIB_BACKEND) { + if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_JPEG) readJPEGLibjpeg(I, filename); #else std::string message = "Cannot read file \"" + filename + "\": Libjpeg backend is not available"; throw(vpImageException(vpImageException::ioError, message)); #endif - } else if (backend == IO_OPENCV_BACKEND) { + } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 readOpenCV(I, filename); #else std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available"; throw(vpImageException(vpImageException::ioError, message)); #endif + } else if (backend == IO_STB_IMAGE_BACKEND || backend == IO_DEFAULT_BACKEND) { + readStb(I, filename); } else if (backend == IO_SIMDLIB_BACKEND) { readSimdlib(I, filename); - } else if (backend == IO_STB_IMAGE_BACKEND) { - readStb(I, filename); } } void vpImageIo::readPNG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) { - if (backend == IO_LIB_BACKEND) { + if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_PNG) readPNGLibpng(I, filename); #else std::string message = "Cannot read file \"" + filename + "\": Libpng backend is not available"; throw(vpImageException(vpImageException::ioError, message)); #endif - } else if (backend == IO_OPENCV_BACKEND) { + } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 readOpenCV(I, filename); #else std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available"; 
throw(vpImageException(vpImageException::ioError, message)); #endif + } else if (backend == IO_STB_IMAGE_BACKEND || backend == IO_DEFAULT_BACKEND) { + readStb(I, filename); } else if (backend == IO_SIMDLIB_BACKEND) { readSimdlib(I, filename); - } else if (backend == IO_STB_IMAGE_BACKEND) { - readStb(I, filename); } } void vpImageIo::readPNG(vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) { - if (backend == IO_LIB_BACKEND) { + if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_PNG) readPNGLibpng(I, filename); #else std::string message = "Cannot read file \"" + filename + "\": Libpng backend is not available"; throw(vpImageException(vpImageException::ioError, message)); #endif - } else if (backend == IO_OPENCV_BACKEND) { + } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 readOpenCV(I, filename); #else std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available"; throw(vpImageException(vpImageException::ioError, message)); #endif + } else if (backend == IO_STB_IMAGE_BACKEND || backend == IO_DEFAULT_BACKEND) { + readStb(I, filename); } else if (backend == IO_SIMDLIB_BACKEND) { readSimdlib(I, filename); - } else if (backend == IO_STB_IMAGE_BACKEND) { - readStb(I, filename); } } -void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) +void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename, int quality, const vpImageIoBackendType& backend) { - if (backend == IO_LIB_BACKEND) { + if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_JPEG) - writeJPEGLibjpeg(I, filename); + writeJPEGLibjpeg(I, filename, quality); #else std::string message = "Cannot write file \"" + filename + "\": Libjpeg backend is not available"; throw(vpImageException(vpImageException::ioError, message)); 
#endif - } else if (backend == IO_OPENCV_BACKEND) { + } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 - writeOpenCV(I, filename); + writeOpenCV(I, filename, quality); #else std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available"; throw(vpImageException(vpImageException::ioError, message)); #endif - } else if (backend == IO_SIMDLIB_BACKEND) { - writeJPEGSimdlib(I, filename); + } else if (backend == IO_SIMDLIB_BACKEND || backend == IO_DEFAULT_BACKEND) { + writeJPEGSimdlib(I, filename, quality); } else if (backend == IO_STB_IMAGE_BACKEND) { - writeJPEGStb(I, filename); + writeJPEGStb(I, filename, quality); } } -void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) +void vpImageIo::writeJPEG(const vpImage &I, const std::string &filename, int quality, const vpImageIoBackendType& backend) { - if (backend == IO_LIB_BACKEND) { + if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_JPEG) - writeJPEGLibjpeg(I, filename); + writeJPEGLibjpeg(I, filename, quality); #else std::string message = "Cannot write file \"" + filename + "\": Libjpeg backend is not available"; throw(vpImageException(vpImageException::ioError, message)); #endif - } else if (backend == IO_OPENCV_BACKEND) { + } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 - writeOpenCV(I, filename); + writeOpenCV(I, filename, quality); #else std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available"; throw(vpImageException(vpImageException::ioError, message)); #endif - } else if (backend == IO_SIMDLIB_BACKEND) { - writeJPEGSimdlib(I, filename); + } else if (backend == IO_SIMDLIB_BACKEND || backend == IO_DEFAULT_BACKEND) { + writeJPEGSimdlib(I, filename, quality); } 
else if (backend == IO_STB_IMAGE_BACKEND) { - writeJPEGStb(I, filename); + writeJPEGStb(I, filename, quality); } } void vpImageIo::writePNG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) { - if (backend == IO_LIB_BACKEND) { + if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_PNG) writePNGLibpng(I, filename); #else std::string message = "Cannot write file \"" + filename + "\": Libpng backend is not available"; throw(vpImageException(vpImageException::ioError, message)); #endif - } else if (backend == IO_OPENCV_BACKEND) { + } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 - writeOpenCV(I, filename); + writeOpenCV(I, filename, 90); #else std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available"; throw(vpImageException(vpImageException::ioError, message)); #endif - } else if (backend == IO_SIMDLIB_BACKEND) { + } else if (backend == IO_SIMDLIB_BACKEND || backend == IO_DEFAULT_BACKEND) { writePNGSimdlib(I, filename); } else if (backend == IO_STB_IMAGE_BACKEND) { writePNGStb(I, filename); @@ -520,21 +525,21 @@ void vpImageIo::writePNG(const vpImage &I, const std::string &fil void vpImageIo::writePNG(const vpImage &I, const std::string &filename, const vpImageIoBackendType& backend) { - if (backend == IO_LIB_BACKEND) { + if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_PNG) writePNGLibpng(I, filename); #else std::string message = "Cannot write file \"" + filename + "\": Libpng backend is not available"; throw(vpImageException(vpImageException::ioError, message)); #endif - } else if (backend == IO_OPENCV_BACKEND) { + } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) { #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100 - writeOpenCV(I, filename); + writeOpenCV(I, filename, 90); #else 
std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available"; throw(vpImageException(vpImageException::ioError, message)); #endif - } else if (backend == IO_SIMDLIB_BACKEND) { + } else if (backend == IO_SIMDLIB_BACKEND || backend == IO_DEFAULT_BACKEND) { writePNGSimdlib(I, filename); } else if (backend == IO_STB_IMAGE_BACKEND) { writePNGStb(I, filename); diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp index 3bf19a465e..6182df06e4 100644 --- a/modules/io/test/perfImageLoadSave.cpp +++ b/modules/io/test/perfImageLoadSave.cpp @@ -45,221 +45,179 @@ #include static std::string ipath = vpIoTools::getViSPImagesDataPath(); -static std::string imagePathJpeg = vpIoTools::createFilePath(ipath, "Klimt/Klimt.jpeg"); -static std::string imagePathPng = vpIoTools::createFilePath(ipath, "Klimt/Klimt.png"); -static std::string imagePathPngBig = vpIoTools::createFilePath(ipath, "Klimt/test_image_resize.png"); +static std::vector paths { + ipath + "/Solvay/Solvay_conference_1927_Version2_640x440", + ipath + "/Solvay/Solvay_conference_1927_Version2_1024x705", + ipath + "/Solvay/Solvay_conference_1927_Version2_1280x881", + ipath + "/Solvay/Solvay_conference_1927_Version2_2126x1463", +}; +static std::vector names { + "Solvay (640x440)", "Solvay (1024x705)", "Solvay (1280x881)", "Solvay (2126x1463)" +}; +static std::vector backends { + vpImageIo::IO_LIB_BACKEND, vpImageIo::IO_OPENCV_BACKEND, vpImageIo::IO_SIMDLIB_BACKEND, vpImageIo::IO_STB_IMAGE_BACKEND +}; +static std::vector backendNamesJpeg { + "libjpeg", "OpenCV", "simd", "stb" +}; +static std::vector backendNamesPng { + "libpng", "OpenCV", "simd", "stb" +}; static int nThreads = 0; -TEST_CASE("Benchmark Jpeg image loading", "[benchmark]") { - { - vpImage I; - - BENCHMARK("vpImageIo::read()") { - vpImageIo::read(I, imagePathJpeg); - return I; - }; +TEST_CASE("Benchmark JPEG image loading", "[benchmark]") { + SECTION("Grayscale") { + for (size_t i = 0; i < 
paths.size(); i++) { + SECTION(names[i]) { + for (size_t j = 0; j < backends.size(); j++) { + vpImage I; + + BENCHMARK(backendNamesJpeg[j] + " backend") { + vpImageIo::read(I, paths[i] + ".jpg", backends[j]); + return I; + }; + } + } + } } - { - vpImage I; - - BENCHMARK("vpImageIo::readSimdlib()") { - vpImageIo::readJPEG(I, imagePathJpeg, vpImageIo::IO_SIMDLIB_BACKEND); - return I; - }; - } - - { - vpImage I; - - BENCHMARK("vpImageIo::readStb()") { - vpImageIo::readJPEG(I, imagePathJpeg, vpImageIo::IO_STB_IMAGE_BACKEND); - return I; - }; + SECTION("vpRGBa") { + for (size_t i = 0; i < paths.size(); i++) { + SECTION(names[i]) { + for (size_t j = 0; j < backends.size(); j++) { + vpImage I; + + BENCHMARK(backendNamesJpeg[j] + " backend") { + vpImageIo::read(I, paths[i] + ".jpg", backends[j]); + return I; + }; + } + } + } } } -TEST_CASE("Benchmark Png image loading", "[benchmark]") { - { - vpImage I; - - BENCHMARK("vpImageIo::read()") { - vpImageIo::read(I, imagePathPng); - return I; - }; +TEST_CASE("Benchmark PNG image loading", "[benchmark]") { + SECTION("Grayscale") { + for (size_t i = 0; i < paths.size(); i++) { + SECTION(names[i]) { + for (size_t j = 0; j < backends.size(); j++) { + vpImage I; + + BENCHMARK(backendNamesPng[j] + " backend") { + vpImageIo::read(I, paths[i] + ".png", backends[j]); + return I; + }; + } + } + } } - { - vpImage I; - - BENCHMARK("vpImageIo::readSimdlib()") { - vpImageIo::readPNG(I, imagePathPng, vpImageIo::IO_SIMDLIB_BACKEND); - return I; - }; - } - - { - vpImage I; - - BENCHMARK("vpImageIo::readStb()") { - vpImageIo::readPNG(I, imagePathPng, vpImageIo::IO_STB_IMAGE_BACKEND); - return I; - }; + SECTION("vpRGBa") { + for (size_t i = 0; i < paths.size(); i++) { + SECTION(names[i]) { + for (size_t j = 0; j < backends.size(); j++) { + vpImage I; + + BENCHMARK(backendNamesPng[j] + " backend") { + vpImageIo::read(I, paths[i] + ".png", backends[j]); + return I; + }; + } + } + } } } -TEST_CASE("Benchmark big Png image loading", "[benchmark]") { - 
{ - vpImage I; - - BENCHMARK("vpImageIo::read()") { - vpImageIo::read(I, imagePathPngBig); - return I; - }; +#if !defined(_WIN32) && (defined(__unix__) || defined(__unix) || (defined(__APPLE__) && defined(__MACH__))) // UNIX +// makeTempDirectory is only implemented for Unix platform + +std::string username, directory_filename_tmp; + +TEST_CASE("Benchmark JPEG image saving", "[benchmark]") { + vpIoTools::getUserName(username); + std::string tmp_dir = "/tmp/" + username; + vpIoTools::makeDirectory(tmp_dir); + directory_filename_tmp = tmp_dir + "/" + "vpIoTools_perfImageLoadSave_XXXXXX"; + std::string converted_dirname_tmp = vpIoTools::makeTempDirectory(directory_filename_tmp); + REQUIRE(vpIoTools::checkDirectory(converted_dirname_tmp)); + + SECTION("Grayscale") { + for (size_t i = 0; i < paths.size(); i++) { + vpImage I; + vpImageIo::read(I, paths[i] + ".png"); + + SECTION(names[i]) { + for (size_t j = 0; j < backends.size(); j++) { + BENCHMARK(backendNamesJpeg[j] + " backend") { + vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.jpg", backends[j]); + return I; + }; + } + } + } } - { - vpImage I; - - BENCHMARK("vpImageIo::readSimdlib()") { - vpImageIo::readPNG(I, imagePathPngBig, vpImageIo::IO_SIMDLIB_BACKEND); - return I; - }; + SECTION("vpRGBa") { + for (size_t i = 0; i < paths.size(); i++) { + vpImage I; + vpImageIo::read(I, paths[i] + ".png"); + + SECTION(names[i]) { + for (size_t j = 0; j < backends.size(); j++) { + BENCHMARK(backendNamesJpeg[j] + " backend") { + vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.jpg", backends[j]); + return I; + }; + } + } + } } - { - vpImage I; - - BENCHMARK("vpImageIo::readStb()") { - vpImageIo::readPNG(I, imagePathPngBig, vpImageIo::IO_STB_IMAGE_BACKEND); - return I; - }; - } + REQUIRE(vpIoTools::remove(converted_dirname_tmp)); } -TEST_CASE("Benchmark Jpeg image saving", "[benchmark]") { - vpImage I; - vpImageIo::read(I, imagePathJpeg); - { - const std::string filename = 
"/tmp/Klimt_ViSP.jpg"; - - BENCHMARK("vpImageIo::write()") { - vpImageIo::write(I, filename); - return I; - }; +TEST_CASE("Benchmark PNG image saving", "[benchmark]") { + vpIoTools::getUserName(username); + std::string tmp_dir = "/tmp/" + username; + vpIoTools::makeDirectory(tmp_dir); + directory_filename_tmp = tmp_dir + "/" + "vpIoTools_perfImageLoadSave_XXXXXX"; + std::string converted_dirname_tmp = vpIoTools::makeTempDirectory(directory_filename_tmp); + REQUIRE(vpIoTools::checkDirectory(converted_dirname_tmp)); + + SECTION("Grayscale") { + for (size_t i = 0; i < paths.size(); i++) { + vpImage I; + vpImageIo::read(I, paths[i] + ".png"); + + SECTION(names[i]) { + for (size_t j = 0; j < backends.size(); j++) { + BENCHMARK(backendNamesPng[j] + " backend") { + vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.png", backends[j]); + return I; + }; + } + } + } } - { - const std::string filename = "/tmp/Klimt_Simd.jpg"; - - BENCHMARK("vpImageIo::writeSimdlib()") { - vpImageIo::writeJPEG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND); - return I; - }; - } - - { - const std::string filename = "/tmp/Klimt_stb.jpg"; - - BENCHMARK("vpImageIo::writeStb()") { - vpImageIo::writeJPEG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND); - return I; - }; - } -} - -TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") { - vpImage I; - vpImageIo::read(I, imagePathPngBig); - { - const std::string filename = "/tmp/Big_images_ViSP.jpg"; - - BENCHMARK("vpImageIo::write()") { - vpImageIo::write(I, filename); - return I; - }; - } - - { - const std::string filename = "/tmp/Big_images_Simd.jpg"; - - BENCHMARK("vpImageIo::writeSimdlib()") { - vpImageIo::writeJPEG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND); - return I; - }; - } - - { - const std::string filename = "/tmp/Big_images_stb.jpg"; - - BENCHMARK("vpImageIo::writeStb()") { - vpImageIo::writeJPEG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND); - return I; - }; - } -} - -TEST_CASE("Benchmark Png image saving", "[benchmark]") 
{ - vpImage I; - vpImageIo::read(I, imagePathPng); - { - const std::string filename = "/tmp/Klimt_ViSP.png"; - - BENCHMARK("vpImageIo::write()") { - vpImageIo::write(I, filename); - return I; - }; - } - - { - const std::string filename = "/tmp/Klimt_Simd.png"; - - BENCHMARK("vpImageIo::writeSimdlib()") { - vpImageIo::writePNG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND); - return I; - }; - } - - { - const std::string filename = "/tmp/Klimt_stb.png"; - - BENCHMARK("vpImageIo::writeStb()") { - vpImageIo::writePNG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND); - return I; - }; - } -} - -TEST_CASE("Benchmark big Png image saving", "[benchmark]") { - vpImage I; - vpImageIo::read(I, imagePathPngBig); - { - const std::string filename = "/tmp/Big_images_ViSP.png"; - - BENCHMARK("vpImageIo::write()") { - vpImageIo::write(I, filename); - return I; - }; - } - - { - const std::string filename = "/tmp/Big_images_Simd.png"; - - BENCHMARK("vpImageIo::writeSimdlib()") { - vpImageIo::writePNG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND); - return I; - }; - } - - { - const std::string filename = "/tmp/Big_images_stb.png"; - - BENCHMARK("vpImageIo::writeStb()") { - vpImageIo::writePNG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND); - return I; - }; + SECTION("vpRGBa") { + for (size_t i = 0; i < paths.size(); i++) { + vpImage I; + vpImageIo::read(I, paths[i] + ".png"); + + SECTION(names[i]) { + for (size_t j = 0; j < backends.size(); j++) { + BENCHMARK(backendNamesPng[j] + " backend") { + vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.png", backends[j]); + return I; + }; + } + } + } } } +#endif int main(int argc, char *argv[]) { @@ -272,11 +230,6 @@ int main(int argc, char *argv[]) | Opt(runBenchmark) // bind variable to a new option, with a hint string ["--benchmark"] // the option names it will respond to ("run benchmark?") // description string for the help output - | Opt(imagePathJpeg, "imagePathColor") - ["--imagePathColor"] - ("Path to color image") - | 
Opt(imagePathPng, "imagePathColor") - ["--imagePathGray"] ("Path to gray image") | Opt(nThreads, "nThreads") ["--nThreads"] @@ -289,13 +242,6 @@ int main(int argc, char *argv[]) session.applyCommandLine(argc, argv); if (runBenchmark) { -// vpImage I_color; -// vpImageIo::read(I_color, imagePathColor); -// std::cout << "imagePathColor:\n\t" << imagePathColor << "\n\t" << I_color.getWidth() << "x" << I_color.getHeight() << std::endl; - -// vpImage I_gray; -// vpImageIo::read(I_gray, imagePathGray); -// std::cout << "imagePathGray:\n\t" << imagePathGray << "\n\t" << I_gray.getWidth() << "x" << I_gray.getHeight() << std::endl; std::cout << "nThreads: " << nThreads << " / available threads: " << std::thread::hardware_concurrency() << std::endl; int numFailed = session.run(); From 66b12c526f1991740867f56d4eefc395f38a8aee Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Sun, 21 Nov 2021 21:40:24 +0100 Subject: [PATCH 14/18] Update stb_image.h to 2.27 version. --- 3rdparty/stb_image/CMakeLists.txt | 2 +- 3rdparty/stb_image/README.md | 2 +- 3rdparty/stb_image/stb_image.h | 576 ++++++++++++++++++++++++------ 3 files changed, 459 insertions(+), 121 deletions(-) diff --git a/3rdparty/stb_image/CMakeLists.txt b/3rdparty/stb_image/CMakeLists.txt index 84ded2f220..f344e7f27d 100644 --- a/3rdparty/stb_image/CMakeLists.txt +++ b/3rdparty/stb_image/CMakeLists.txt @@ -1,5 +1,5 @@ project(${STBIMAGE_LIBRARY}) set(STBIMAGE_MAJOR_VERSION 2 PARENT_SCOPE) -set(STBIMAGE_MINOR_VERSION 22 PARENT_SCOPE) +set(STBIMAGE_MINOR_VERSION 27 PARENT_SCOPE) set(STBIMAGE_PATCH_VERSION 0 PARENT_SCOPE) diff --git a/3rdparty/stb_image/README.md b/3rdparty/stb_image/README.md index 80019a1405..efa37458eb 100644 --- a/3rdparty/stb_image/README.md +++ b/3rdparty/stb_image/README.md @@ -12,7 +12,7 @@ by Jorge L. "VinoBS" Rodriguez, and stb_sprintf by Jeff Roberts. 
library | lastest version | category | LoC | description --------------------- | ---- | -------- | --- | -------------------------------- **[stb_vorbis.c](stb_vorbis.c)** | 1.16 | audio | 5486 | decode ogg vorbis files from file/memory to float/16-bit signed output -**[stb_image.h](stb_image.h)** | 2.22 | graphics | 7547 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC +**[stb_image.h](stb_image.h)** | 2.27 | graphics | 7897 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC **[stb_truetype.h](stb_truetype.h)** | 1.21 | graphics | 4882 | parse, decode, and rasterize characters from truetype fonts **[stb_image_write.h](stb_image_write.h)** | 1.13 | graphics | 1617 | image writing to disk: PNG, TGA, BMP **[stb_image_resize.h](stb_image_resize.h)** | 0.96 | graphics | 2630 | resize images larger/smaller with good quality diff --git a/3rdparty/stb_image/stb_image.h b/3rdparty/stb_image/stb_image.h index eb8d215b40..d60371b95f 100644 --- a/3rdparty/stb_image/stb_image.h +++ b/3rdparty/stb_image/stb_image.h @@ -1,4 +1,4 @@ -/* stb_image - v2.23 - public domain image loader - http://nothings.org/stb +/* stb_image - v2.27 - public domain image loader - http://nothings.org/stb no warranty implied; use at your own risk Do this: @@ -48,6 +48,10 @@ LICENSE RECENT REVISION HISTORY: + 2.27 (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes + 2.26 (2020-07-13) many minor fixes + 2.25 (2020-02-02) fix warnings + 2.24 (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically 2.23 (2019-08-11) fix clang static analysis warning 2.22 (2019-03-04) gif fixes, fix warnings 2.21 (2019-02-25) fix typo in comment @@ -86,26 +90,37 @@ RECENT REVISION HISTORY: Jeremy Sawicki (handle all ImageNet JPGs) Optimizations & bugfixes Mikhail Morozov (1-bit BMP) Fabian "ryg" Giesen Anael Seghezzi (is-16-bit query) - Arseny Kapoulkine + Arseny Kapoulkine Simon Breuss (16-bit PNM) John-Mark Allen Carmelo J 
Fdez-Aguera Bug & warning fixes - Marc LeBlanc David Woo Guillaume George Martins Mozeiko - Christpher Lloyd Jerry Jansson Joseph Thomson Phil Jordan - Dave Moore Roy Eltham Hayaki Saito Nathan Reed - Won Chun Luke Graham Johan Duparc Nick Verigakis - the Horde3D community Thomas Ruf Ronny Chevalier github:rlyeh - Janez Zemva John Bartholomew Michal Cichon github:romigrou - Jonathan Blow Ken Hamada Tero Hanninen github:svdijk - Laurent Gomila Cort Stratton Sergio Gonzalez github:snagar - Aruelien Pocheville Thibault Reuille Cass Everitt github:Zelex - Ryamond Barbiero Paul Du Bois Engin Manap github:grim210 - Aldo Culquicondor Philipp Wiesemann Dale Weiler github:sammyhw - Oriol Ferrer Mesia Josh Tobin Matthew Gregan github:phprus - Julian Raschke Gregory Mullen Baldur Karlsson github:poppolopoppo - Christian Floisand Kevin Schmidt JR Smith github:darealshinji - Blazej Dariusz Roszkowski github:Michaelangel007 + Marc LeBlanc David Woo Guillaume George Martins Mozeiko + Christpher Lloyd Jerry Jansson Joseph Thomson Blazej Dariusz Roszkowski + Phil Jordan Dave Moore Roy Eltham + Hayaki Saito Nathan Reed Won Chun + Luke Graham Johan Duparc Nick Verigakis the Horde3D community + Thomas Ruf Ronny Chevalier github:rlyeh + Janez Zemva John Bartholomew Michal Cichon github:romigrou + Jonathan Blow Ken Hamada Tero Hanninen github:svdijk + Eugene Golushkov Laurent Gomila Cort Stratton github:snagar + Aruelien Pocheville Sergio Gonzalez Thibault Reuille github:Zelex + Cass Everitt Ryamond Barbiero github:grim210 + Paul Du Bois Engin Manap Aldo Culquicondor github:sammyhw + Philipp Wiesemann Dale Weiler Oriol Ferrer Mesia github:phprus + Josh Tobin Matthew Gregan github:poppolopoppo + Julian Raschke Gregory Mullen Christian Floisand github:darealshinji + Baldur Karlsson Kevin Schmidt JR Smith github:Michaelangel007 + Brad Weinberger Matvey Cherevko github:mosra + Luca Sas Alexander Veselov Zack Middleton [reserved] + Ryan C. 
Gordon [reserved] [reserved] + DO NOT ADD YOUR NAME HERE + + Jacko Dirks + + To add your name to the credits, pick a random blank space in the middle and fill it. + 80% of merge conflicts on stb PRs are due to people adding their name at the end + of the credits. */ #ifndef STBI_INCLUDE_STB_IMAGE_H @@ -164,6 +179,32 @@ RECENT REVISION HISTORY: // // Paletted PNG, BMP, GIF, and PIC images are automatically depalettized. // +// To query the width, height and component count of an image without having to +// decode the full file, you can use the stbi_info family of functions: +// +// int x,y,n,ok; +// ok = stbi_info(filename, &x, &y, &n); +// // returns ok=1 and sets x, y, n if image is a supported format, +// // 0 otherwise. +// +// Note that stb_image pervasively uses ints in its public API for sizes, +// including sizes of memory buffers. This is now part of the API and thus +// hard to change without causing breakage. As a result, the various image +// loaders all have certain limits on image size; these differ somewhat +// by format but generally boil down to either just under 2GB or just under +// 1GB. When the decoded image would be larger than this, stb_image decoding +// will fail. +// +// Additionally, stb_image will reject image files that have any of their +// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS, +// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit, +// the only way to have an image with such dimensions load correctly +// is for it to have a rather extreme aspect ratio. Either way, the +// assumption here is that such larger images are likely to be malformed +// or malicious. If you do need to load an image with individual dimensions +// larger than that, and it still fits in the overall size limit, you can +// #define STBI_MAX_DIMENSIONS on your own to be something larger. 
+// // =========================================================================== // // UNICODE: @@ -269,11 +310,10 @@ RECENT REVISION HISTORY: // // iPhone PNG support: // -// By default we convert iphone-formatted PNGs back to RGB, even though -// they are internally encoded differently. You can disable this conversion -// by calling stbi_convert_iphone_png_to_rgb(0), in which case -// you will always just get the native iphone "format" through (which -// is BGR stored in RGB). +// We optionally support converting iPhone-formatted PNGs (which store +// premultiplied BGRA) back to RGB, even though they're internally encoded +// differently. To enable this conversion, call +// stbi_convert_iphone_png_to_rgb(1). // // Call stbi_set_unpremultiply_on_load(1) as well to force a divide per // pixel to remove any premultiplied alpha *only* if the image file explicitly @@ -315,7 +355,14 @@ RECENT REVISION HISTORY: // - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still // want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB // - +// - If you define STBI_MAX_DIMENSIONS, stb_image will reject images greater +// than that size (in either width or height) without further processing. +// This is to let programs in the wild set an upper bound to prevent +// denial-of-service attacks on untrusted data, as one could generate a +// valid image of gigantic dimensions and force stb_image to allocate a +// huge block of memory and spend disproportionate time decoding it. By +// default this is set to (1 << 24), which is 16777216, but that's still +// very big. 
#ifndef STBI_NO_STDIO #include @@ -434,7 +481,7 @@ STBIDEF int stbi_is_hdr_from_file(FILE *f); // get a VERY brief reason for failure -// NOT THREADSAFE +// on most compilers (and ALL modern mainstream compilers) this is threadsafe STBIDEF const char *stbi_failure_reason (void); // free the loaded image -- this is just free() @@ -467,6 +514,13 @@ STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert); // flip the image vertically, so the first pixel in the output array is the bottom left STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip); +// as above, but only applies to images loaded on the thread that calls the function +// this function is only available if your compiler supports thread-local variables; +// calling it will fail to link if your compiler doesn't +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply); +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert); +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip); + // ZLIB client - used by PNG, available for other purposes STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen); @@ -563,6 +617,23 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch #define stbi_inline __forceinline #endif +#ifndef STBI_NO_THREAD_LOCALS + #if defined(__cplusplus) && __cplusplus >= 201103L + #define STBI_THREAD_LOCAL thread_local + #elif defined(__GNUC__) && __GNUC__ < 5 + #define STBI_THREAD_LOCAL __thread + #elif defined(_MSC_VER) + #define STBI_THREAD_LOCAL __declspec(thread) + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) + #define STBI_THREAD_LOCAL _Thread_local + #endif + + #ifndef STBI_THREAD_LOCAL + #if defined(__GNUC__) + #define STBI_THREAD_LOCAL __thread + #endif + #endif +#endif #ifdef _MSC_VER typedef unsigned short stbi__uint16; @@ 
-593,7 +664,7 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1]; #ifdef STBI_HAS_LROTL #define stbi_lrot(x,y) _lrotl(x,y) #else - #define stbi_lrot(x,y) (((x) << (y)) | ((x) >> (32 - (y)))) + #define stbi_lrot(x,y) (((x) << (y)) | ((x) >> (-(y) & 31))) #endif #if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED)) @@ -707,14 +778,21 @@ static int stbi__sse2_available(void) #ifdef STBI_NEON #include -// assume GCC or Clang on ARM targets +#ifdef _MSC_VER +#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name +#else #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) #endif +#endif #ifndef STBI_SIMD_ALIGN #define STBI_SIMD_ALIGN(type, name) type name #endif +#ifndef STBI_MAX_DIMENSIONS +#define STBI_MAX_DIMENSIONS (1 << 24) +#endif + /////////////////////////////////////////////// // // stbi__context struct and start_xxx functions @@ -732,6 +810,7 @@ typedef struct int read_from_callbacks; int buflen; stbi_uc buffer_start[128]; + int callback_already_read; stbi_uc *img_buffer, *img_buffer_end; stbi_uc *img_buffer_original, *img_buffer_original_end; @@ -745,6 +824,7 @@ static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len) { s->io.read = NULL; s->read_from_callbacks = 0; + s->callback_already_read = 0; s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer; s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len; } @@ -756,7 +836,8 @@ static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void * s->io_user_data = user; s->buflen = sizeof(s->buffer_start); s->read_from_callbacks = 1; - s->img_buffer_original = s->buffer_start; + s->callback_already_read = 0; + s->img_buffer = s->img_buffer_original = s->buffer_start; stbi__refill_buffer(s); s->img_buffer_original_end = s->img_buffer_end; } @@ -770,12 +851,17 @@ static int stbi__stdio_read(void *user, char *data, int size) static void 
stbi__stdio_skip(void *user, int n) { + int ch; fseek((FILE*) user, n, SEEK_CUR); + ch = fgetc((FILE*) user); /* have to read a byte to reset feof()'s flag */ + if (ch != EOF) { + ungetc(ch, (FILE *) user); /* push byte back onto stream if valid. */ + } } static int stbi__stdio_eof(void *user) { - return feof((FILE*) user); + return feof((FILE*) user) || ferror((FILE *) user); } static stbi_io_callbacks stbi__stdio_callbacks = @@ -871,21 +957,27 @@ static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp); static int stbi__pnm_test(stbi__context *s); static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__pnm_is16(stbi__context *s); #endif -// this is not threadsafe -static const char *stbi__g_failure_reason; +static +#ifdef STBI_THREAD_LOCAL +STBI_THREAD_LOCAL +#endif +const char *stbi__g_failure_reason; STBIDEF const char *stbi_failure_reason(void) { return stbi__g_failure_reason; } +#ifndef STBI_NO_FAILURE_STRINGS static int stbi__err(const char *str) { stbi__g_failure_reason = str; return 0; } +#endif static void *stbi__malloc(size_t size) { @@ -924,11 +1016,13 @@ static int stbi__mul2sizes_valid(int a, int b) return a <= INT_MAX/b; } +#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) // returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow static int stbi__mad2sizes_valid(int a, int b, int add) { return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add); } +#endif // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow static int stbi__mad3sizes_valid(int a, int b, int c, int add) @@ -938,7 +1032,7 @@ static int stbi__mad3sizes_valid(int a, int b, int c, int add) } // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow -#if !defined(STBI_NO_LINEAR) || 
!defined(STBI_NO_HDR) +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) { return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && @@ -946,12 +1040,14 @@ static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) } #endif +#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) // mallocs with size overflow checking static void *stbi__malloc_mad2(int a, int b, int add) { if (!stbi__mad2sizes_valid(a, b, add)) return NULL; return stbi__malloc(a*b + add); } +#endif static void *stbi__malloc_mad3(int a, int b, int c, int add) { @@ -959,7 +1055,7 @@ static void *stbi__malloc_mad3(int a, int b, int c, int add) return stbi__malloc(a*b*c + add); } -#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) static void *stbi__malloc_mad4(int a, int b, int c, int d, int add) { if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL; @@ -995,13 +1091,29 @@ static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp); static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp); #endif -static int stbi__vertically_flip_on_load = 0; +static int stbi__vertically_flip_on_load_global = 0; STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) { - stbi__vertically_flip_on_load = flag_true_if_should_flip; + stbi__vertically_flip_on_load_global = flag_true_if_should_flip; +} + +#ifndef STBI_THREAD_LOCAL +#define stbi__vertically_flip_on_load stbi__vertically_flip_on_load_global +#else +static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set; + +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip) +{ + stbi__vertically_flip_on_load_local = flag_true_if_should_flip; + stbi__vertically_flip_on_load_set = 1; } +#define 
stbi__vertically_flip_on_load (stbi__vertically_flip_on_load_set \ + ? stbi__vertically_flip_on_load_local \ + : stbi__vertically_flip_on_load_global) +#endif // STBI_THREAD_LOCAL + static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc) { memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields @@ -1009,9 +1121,8 @@ static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int re ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order ri->num_channels = 0; - #ifndef STBI_NO_JPEG - if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri); - #endif + // test the formats with a very explicit header first (at least a FOURCC + // or distinctive magic number first) #ifndef STBI_NO_PNG if (stbi__png_test(s)) return stbi__png_load(s,x,y,comp,req_comp, ri); #endif @@ -1023,10 +1134,19 @@ static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int re #endif #ifndef STBI_NO_PSD if (stbi__psd_test(s)) return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc); + #else + STBI_NOTUSED(bpc); #endif #ifndef STBI_NO_PIC if (stbi__pic_test(s)) return stbi__pic_load(s,x,y,comp,req_comp, ri); #endif + + // then the formats that can end up attempting to load with just 1 or 2 + // bytes matching expectations; these are prone to false positives, so + // try them later + #ifndef STBI_NO_JPEG + if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri); + #endif #ifndef STBI_NO_PNM if (stbi__pnm_test(s)) return stbi__pnm_load(s,x,y,comp,req_comp, ri); #endif @@ -1125,8 +1245,10 @@ static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, if (result == NULL) return NULL; + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. 
+ STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + if (ri.bits_per_channel != 8) { - STBI_ASSERT(ri.bits_per_channel == 16); result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp); ri.bits_per_channel = 8; } @@ -1149,8 +1271,10 @@ static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, if (result == NULL) return NULL; + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + if (ri.bits_per_channel != 16) { - STBI_ASSERT(ri.bits_per_channel == 8); result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp); ri.bits_per_channel = 16; } @@ -1178,33 +1302,33 @@ static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, in #ifndef STBI_NO_STDIO -#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide); STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default); #endif -#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) { - return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); + return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); } #endif static FILE *stbi__fopen(char const *filename, char const *mode) { FILE *f; -#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) wchar_t wMode[64]; wchar_t wFilename[1024]; 
- if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename))) + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename))) return 0; - if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode))) + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode))) return 0; -#if _MSC_VER >= 1400 - if (0 != _wfopen_s(&f, wFilename, wMode)) - f = 0; +#if defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != _wfopen_s(&f, wFilename, wMode)) + f = 0; #else f = _wfopen(wFilename, wMode); #endif @@ -1453,6 +1577,7 @@ enum static void stbi__refill_buffer(stbi__context *s) { int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen); + s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original); if (n == 0) { // at end of file, treat same as if from memory, but need to handle case // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file @@ -1477,6 +1602,9 @@ stbi_inline static stbi_uc stbi__get8(stbi__context *s) return 0; } +#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else stbi_inline static int stbi__at_eof(stbi__context *s) { if (s->io.read) { @@ -1488,9 +1616,14 @@ stbi_inline static int stbi__at_eof(stbi__context *s) return s->img_buffer >= s->img_buffer_end; } +#endif +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) +// nothing +#else static void stbi__skip(stbi__context *s, int n) { + if (n == 0) return; // already there! 
if (n < 0) { s->img_buffer = s->img_buffer_end; return; @@ -1505,7 +1638,11 @@ static void stbi__skip(stbi__context *s, int n) } s->img_buffer += n; } +#endif +#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM) +// nothing +#else static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n) { if (s->io.read) { @@ -1529,18 +1666,27 @@ static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n) } else return 0; } +#endif +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else static int stbi__get16be(stbi__context *s) { int z = stbi__get8(s); return (z << 8) + stbi__get8(s); } +#endif +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else static stbi__uint32 stbi__get32be(stbi__context *s) { stbi__uint32 z = stbi__get16be(s); return (z << 16) + stbi__get16be(s); } +#endif #if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) // nothing @@ -1556,13 +1702,16 @@ static int stbi__get16le(stbi__context *s) static stbi__uint32 stbi__get32le(stbi__context *s) { stbi__uint32 z = stbi__get16le(s); - return z + (stbi__get16le(s) << 16); + z += (stbi__uint32)stbi__get16le(s) << 16; + return z; } #endif #define STBI__BYTECAST(x) ((stbi_uc) ((x) & 255)) // truncate int to byte without warnings - +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else ////////////////////////////////////////////////////////////////////////////// // // generic converter from built-in img_n to req_comp @@ -1578,7 +1727,11 @@ static stbi_uc stbi__compute_y(int r, int g, int b) { return (stbi_uc) (((r*77) + (g*150) + (29*b)) >> 8); } +#endif +#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && 
defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y) { int i,j; @@ -1614,7 +1767,7 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); } break; STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break; STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; - default: STBI_ASSERT(0); + default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion"); } #undef STBI__CASE } @@ -1622,12 +1775,20 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r STBI_FREE(data); return good; } +#endif +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else static stbi__uint16 stbi__compute_y_16(int r, int g, int b) { return (stbi__uint16) (((r*77) + (g*150) + (29*b)) >> 8); } +#endif +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y) { int i,j; @@ -1663,7 +1824,7 @@ static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int r STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); } break; STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break; STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; - default: STBI_ASSERT(0); + default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion"); } #undef STBI__CASE } @@ -1671,6 +1832,7 @@ static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int r STBI_FREE(data); return good; } +#endif #ifndef STBI_NO_LINEAR static float 
*stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp) @@ -1969,13 +2131,12 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n) int sgn; if (j->code_bits < n) stbi__grow_buffer_unsafe(j); - sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB + sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative) k = stbi_lrot(j->code_buffer, n); - STBI_ASSERT(n >= 0 && n < (int) (sizeof(stbi__bmask)/sizeof(*stbi__bmask))); j->code_buffer = k & ~stbi__bmask[n]; k &= stbi__bmask[n]; j->code_bits -= n; - return k + (stbi__jbias[n] & ~sgn); + return k + (stbi__jbias[n] & (sgn - 1)); } // get some unsigned bits @@ -2025,7 +2186,7 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); t = stbi__jpeg_huff_decode(j, hdc); - if (t < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG"); // 0 all the ac values now so we can do it 32-bits at a time memset(data,0,64*sizeof(data[0])); @@ -2082,11 +2243,12 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__ // first scan for DC coefficient, must be first memset(data,0,64*sizeof(data[0])); // 0 all the ac values now t = stbi__jpeg_huff_decode(j, hdc); + if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); diff = t ? 
stbi__extend_receive(j, t) : 0; dc = j->img_comp[b].dc_pred + diff; j->img_comp[b].dc_pred = dc; - data[0] = (short) (dc << j->succ_low); + data[0] = (short) (dc * (1 << j->succ_low)); } else { // refinement scan for DC coefficient if (stbi__jpeg_get_bit(j)) @@ -2123,7 +2285,7 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__ j->code_buffer <<= s; j->code_bits -= s; zig = stbi__jpeg_dezigzag[k++]; - data[zig] = (short) ((r >> 8) << shift); + data[zig] = (short) ((r >> 8) * (1 << shift)); } else { int rs = stbi__jpeg_huff_decode(j, hac); if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); @@ -2141,7 +2303,7 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__ } else { k += r; zig = stbi__jpeg_dezigzag[k++]; - data[zig] = (short) (stbi__extend_receive(j,s) << shift); + data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift)); } } } while (k <= j->spec_end); @@ -3072,6 +3234,8 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan) p = stbi__get8(s); if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline s->img_y = stbi__get16be(s); if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG s->img_x = stbi__get16be(s); if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); c = stbi__get8(s); if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG"); s->img_n = c; @@ -3103,6 +3267,13 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan) if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v; } + // check that plane subsampling factors are integer ratios; our resamplers can't 
deal with fractional ratios + // and I've never seen a non-corrupted JPEG file actually use them + for (i=0; i < s->img_n; ++i) { + if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG"); + if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG"); + } + // compute interleaved mcu info z->img_h_max = h_max; z->img_v_max = v_max; @@ -3658,6 +3829,10 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp else decode_n = z->s->img_n; + // nothing to do if no components requested; check this now to avoid + // accessing uninitialized coutput[0] later + if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; } + // resample and color-convert { int k; @@ -3800,6 +3975,7 @@ static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int re { unsigned char* result; stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg)); + if (!j) return stbi__errpuc("outofmem", "Out of memory"); STBI_NOTUSED(ri); j->s = s; stbi__setup_jpeg(j); @@ -3812,6 +3988,7 @@ static int stbi__jpeg_test(stbi__context *s) { int r; stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg)); + if (!j) return stbi__err("outofmem", "Out of memory"); j->s = s; stbi__setup_jpeg(j); r = stbi__decode_jpeg_header(j, STBI__SCAN_type); @@ -3836,6 +4013,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) { int result; stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg))); + if (!j) return stbi__err("outofmem", "Out of memory"); j->s = s; result = stbi__jpeg_info_raw(j, x, y, comp); STBI_FREE(j); @@ -3855,6 +4033,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) // fast-way is faster to check than jpeg huffman, but slow way is slower #define STBI__ZFAST_BITS 9 // accelerate all cases in default tables #define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1) +#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet // zlib-style huffman encoding // 
(jpegs packs from left, zlib from right, so can't share code) @@ -3864,8 +4043,8 @@ typedef struct stbi__uint16 firstcode[16]; int maxcode[17]; stbi__uint16 firstsymbol[16]; - stbi_uc size[288]; - stbi__uint16 value[288]; + stbi_uc size[STBI__ZNSYMS]; + stbi__uint16 value[STBI__ZNSYMS]; } stbi__zhuffman; stbi_inline static int stbi__bitreverse16(int n) @@ -3952,16 +4131,23 @@ typedef struct stbi__zhuffman z_length, z_distance; } stbi__zbuf; +stbi_inline static int stbi__zeof(stbi__zbuf *z) +{ + return (z->zbuffer >= z->zbuffer_end); +} + stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z) { - if (z->zbuffer >= z->zbuffer_end) return 0; - return *z->zbuffer++; + return stbi__zeof(z) ? 0 : *z->zbuffer++; } static void stbi__fill_bits(stbi__zbuf *z) { do { - STBI_ASSERT(z->code_buffer < (1U << z->num_bits)); + if (z->code_buffer >= (1U << z->num_bits)) { + z->zbuffer = z->zbuffer_end; /* treat this as EOF so we fail. */ + return; + } z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits; z->num_bits += 8; } while (z->num_bits <= 24); @@ -3986,10 +4172,11 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) for (s=STBI__ZFAST_BITS+1; ; ++s) if (k < z->maxcode[s]) break; - if (s == 16) return -1; // invalid code! + if (s >= 16) return -1; // invalid code! // code size is s, so: b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s]; - STBI_ASSERT(z->size[b] == s); + if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere! + if (z->size[b] != s) return -1; // was originally an assert, but report failure instead. a->code_buffer >>= s; a->num_bits -= s; return z->value[b]; @@ -3998,7 +4185,12 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) { int b,s; - if (a->num_bits < 16) stbi__fill_bits(a); + if (a->num_bits < 16) { + if (stbi__zeof(a)) { + return -1; /* report error for unexpected end of data. 
*/ + } + stbi__fill_bits(a); + } b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; if (b) { s = b >> 9; @@ -4012,13 +4204,16 @@ stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) static int stbi__zexpand(stbi__zbuf *z, char *zout, int n) // need to make room for n bytes { char *q; - int cur, limit, old_limit; + unsigned int cur, limit, old_limit; z->zout = zout; if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG"); - cur = (int) (z->zout - z->zout_start); - limit = old_limit = (int) (z->zout_end - z->zout_start); - while (cur + n > limit) + cur = (unsigned int) (z->zout - z->zout_start); + limit = old_limit = (unsigned) (z->zout_end - z->zout_start); + if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory"); + while (cur + n > limit) { + if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory"); limit *= 2; + } q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); STBI_NOTUSED(old_limit); if (q == NULL) return stbi__err("outofmem", "Out of memory"); @@ -4116,11 +4311,12 @@ static int stbi__compute_huffman_codes(stbi__zbuf *a) c = stbi__zreceive(a,2)+3; if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG"); fill = lencodes[n-1]; - } else if (c == 17) + } else if (c == 17) { c = stbi__zreceive(a,3)+3; - else { - STBI_ASSERT(c == 18); + } else if (c == 18) { c = stbi__zreceive(a,7)+11; + } else { + return stbi__err("bad codelengths", "Corrupt PNG"); } if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG"); memset(lencodes+n, fill, c); @@ -4146,7 +4342,7 @@ static int stbi__parse_uncompressed_block(stbi__zbuf *a) a->code_buffer >>= 8; a->num_bits -= 8; } - STBI_ASSERT(a->num_bits == 0); + if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG"); // now fill header the normal way while (k < 4) header[k++] = stbi__zget8(a); @@ -4168,6 +4364,7 @@ static int stbi__parse_zlib_header(stbi__zbuf *a) int cm = cmf & 15; /* int cinfo = cmf >> 4; */ 
int flg = stbi__zget8(a); + if (stbi__zeof(a)) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png @@ -4175,7 +4372,7 @@ static int stbi__parse_zlib_header(stbi__zbuf *a) return 1; } -static const stbi_uc stbi__zdefault_length[288] = +static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] = { 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, @@ -4221,7 +4418,7 @@ static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) } else { if (type == 1) { // use fixed code lengths - if (!stbi__zbuild_huffman(&a->z_length , stbi__zdefault_length , 288)) return 0; + if (!stbi__zbuild_huffman(&a->z_length , stbi__zdefault_length , STBI__ZNSYMS)) return 0; if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) return 0; } else { if (!stbi__compute_huffman_codes(a)) return 0; @@ -4429,7 +4626,7 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r return stbi__err("invalid filter","Corrupt PNG"); if (depth < 8) { - STBI_ASSERT(img_width_bytes <= x); + if (img_width_bytes > x) return stbi__err("invalid width","Corrupt PNG"); cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place filter_bytes = 1; width = img_width_bytes; @@ -4617,6 +4814,7 @@ static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint3 // de-interlacing final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + if (!final) return stbi__err("outofmem", "Out of memory"); for (p=0; p < 7; ++p) { int xorig[] = { 0,4,0,2,0,1,0 }; int yorig[] = { 0,0,4,0,2,0,1 }; @@ -4737,19 +4935,46 @@ static int 
stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int return 1; } -static int stbi__unpremultiply_on_load = 0; -static int stbi__de_iphone_flag = 0; +static int stbi__unpremultiply_on_load_global = 0; +static int stbi__de_iphone_flag_global = 0; STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) { - stbi__unpremultiply_on_load = flag_true_if_should_unpremultiply; + stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply; } STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert) { - stbi__de_iphone_flag = flag_true_if_should_convert; + stbi__de_iphone_flag_global = flag_true_if_should_convert; } +#ifndef STBI_THREAD_LOCAL +#define stbi__unpremultiply_on_load stbi__unpremultiply_on_load_global +#define stbi__de_iphone_flag stbi__de_iphone_flag_global +#else +static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set; +static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set; + +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply) +{ + stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply; + stbi__unpremultiply_on_load_set = 1; +} + +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert) +{ + stbi__de_iphone_flag_local = flag_true_if_should_convert; + stbi__de_iphone_flag_set = 1; +} + +#define stbi__unpremultiply_on_load (stbi__unpremultiply_on_load_set \ + ? stbi__unpremultiply_on_load_local \ + : stbi__unpremultiply_on_load_global) +#define stbi__de_iphone_flag (stbi__de_iphone_flag_set \ + ?
stbi__de_iphone_flag_local \ + : stbi__de_iphone_flag_global) +#endif // STBI_THREAD_LOCAL + static void stbi__de_iphone(stbi__png *z) { stbi__context *s = z->s; @@ -4824,8 +5049,10 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) if (!first) return stbi__err("multiple IHDR","Corrupt PNG"); first = 0; if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG"); - s->img_x = stbi__get32be(s); if (s->img_x > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)"); - s->img_y = stbi__get32be(s); if (s->img_y > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)"); + s->img_x = stbi__get32be(s); + s->img_y = stbi__get32be(s); + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); z->depth = stbi__get8(s); if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only"); color = stbi__get8(s); if (color > 6) return stbi__err("bad ctype","Corrupt PNG"); if (color == 3 && z->depth == 16) return stbi__err("bad ctype","Corrupt PNG"); @@ -4942,6 +5169,8 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) ++s->img_n; } STBI_FREE(z->expanded); z->expanded = NULL; + // end of PNG chunk, read and skip CRC + stbi__get32be(s); return 1; } @@ -4972,10 +5201,12 @@ static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, st void *result=NULL; if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) { - if (p->depth < 8) + if (p->depth <= 8) ri->bits_per_channel = 8; + else if (p->depth == 16) + ri->bits_per_channel = 16; else - ri->bits_per_channel = p->depth; + return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth"); 
result = p->out; p->out = NULL; if (req_comp && req_comp != p->s->img_out_n) { @@ -5036,7 +5267,7 @@ static int stbi__png_is16(stbi__context *s) stbi__png p; p.s = s; if (!stbi__png_info_raw(&p, NULL, NULL, NULL)) - return 0; + return 0; if (p.depth != 16) { stbi__rewind(p.s); return 0; @@ -5111,7 +5342,7 @@ static int stbi__shiftsigned(unsigned int v, int shift, int bits) v <<= -shift; else v >>= shift; - STBI_ASSERT(/* v >= 0 && */ v < 256); + STBI_ASSERT(v < 256); v >>= (8-bits); STBI_ASSERT(bits >= 0 && bits <= 8); return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits]; @@ -5121,8 +5352,35 @@ typedef struct { int bpp, offset, hsz; unsigned int mr,mg,mb,ma, all_a; + int extra_read; } stbi__bmp_data; +static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress) +{ + // BI_BITFIELDS specifies masks explicitly, don't override + if (compress == 3) + return 1; + + if (compress == 0) { + if (info->bpp == 16) { + info->mr = 31u << 10; + info->mg = 31u << 5; + info->mb = 31u << 0; + } else if (info->bpp == 32) { + info->mr = 0xffu << 16; + info->mg = 0xffu << 8; + info->mb = 0xffu << 0; + info->ma = 0xffu << 24; + info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 + } else { + // otherwise, use defaults, which is all-0 + info->mr = info->mg = info->mb = info->ma = 0; + } + return 1; + } + return 0; // error +} + static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) { int hsz; @@ -5133,6 +5391,9 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) info->offset = stbi__get32le(s); info->hsz = hsz = stbi__get32le(s); info->mr = info->mg = info->mb = info->ma = 0; + info->extra_read = 14; + + if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP"); if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown"); if (hsz == 12) { @@ -5147,6 +5408,8 @@ static void 
*stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) if (hsz != 12) { int compress = stbi__get32le(s); if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE"); + if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes + if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel stbi__get32le(s); // discard sizeof stbi__get32le(s); // discard hres stbi__get32le(s); // discard vres @@ -5161,21 +5424,12 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) } if (info->bpp == 16 || info->bpp == 32) { if (compress == 0) { - if (info->bpp == 32) { - info->mr = 0xffu << 16; - info->mg = 0xffu << 8; - info->mb = 0xffu << 0; - info->ma = 0xffu << 24; - info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 - } else { - info->mr = 31u << 10; - info->mg = 31u << 5; - info->mb = 31u << 0; - } + stbi__bmp_set_mask_defaults(info, compress); } else if (compress == 3) { info->mr = stbi__get32le(s); info->mg = stbi__get32le(s); info->mb = stbi__get32le(s); + info->extra_read += 12; // not documented, but generated by photoshop and handled by mspaint if (info->mr == info->mg && info->mg == info->mb) { // ?!?!? 
@@ -5185,6 +5439,7 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) return stbi__errpuc("bad BMP", "bad BMP"); } } else { + // V4/V5 header int i; if (hsz != 108 && hsz != 124) return stbi__errpuc("bad BMP", "bad BMP"); @@ -5192,6 +5447,8 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) info->mg = stbi__get32le(s); info->mb = stbi__get32le(s); info->ma = stbi__get32le(s); + if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs + stbi__bmp_set_mask_defaults(info, compress); stbi__get32le(s); // discard color space for (i=0; i < 12; ++i) stbi__get32le(s); // discard color space parameters @@ -5224,6 +5481,9 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req flip_vertically = ((int) s->img_y) > 0; s->img_y = abs((int) s->img_y); + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + mr = info.mr; mg = info.mg; mb = info.mb; @@ -5232,10 +5492,15 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req if (info.hsz == 12) { if (info.bpp < 24) - psize = (info.offset - 14 - 24) / 3; + psize = (info.offset - info.extra_read - 24) / 3; } else { if (info.bpp < 16) - psize = (info.offset - 14 - info.hsz) >> 2; + psize = (info.offset - info.extra_read - info.hsz) >> 2; + } + if (psize == 0) { + if (info.offset != s->callback_already_read + (s->img_buffer - s->img_buffer_original)) { + return stbi__errpuc("bad offset", "Corrupt BMP"); + } } if (info.bpp == 24 && ma == 0xff000000) @@ -5263,7 +5528,7 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req if (info.hsz != 12) stbi__get8(s); pal[i][3] = 255; } - stbi__skip(s, info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 
3 : 4)); + stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4)); if (info.bpp == 1) width = (s->img_x + 7) >> 3; else if (info.bpp == 4) width = (s->img_x + 1) >> 1; else if (info.bpp == 8) width = s->img_x; @@ -5312,7 +5577,7 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0; int z = 0; int easy=0; - stbi__skip(s, info.offset - 14 - info.hsz); + stbi__skip(s, info.offset - info.extra_read - info.hsz); if (info.bpp == 24) width = 3 * s->img_x; else if (info.bpp == 16) width = 2*s->img_x; else /* bpp = 32 and pad = 0 */ width=0; @@ -5330,6 +5595,7 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg); bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb); ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma); + if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); } } for (j=0; j < (int) s->img_y; ++j) { if (easy) { @@ -5554,6 +5820,9 @@ static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req STBI_NOTUSED(tga_x_origin); // @TODO STBI_NOTUSED(tga_y_origin); // @TODO + if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + // do a tiny bit of precessing if ( tga_image_type >= 8 ) { @@ -5593,6 +5862,11 @@ static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req // do I need to load a palette? if ( tga_indexed) { + if (tga_palette_len == 0) { /* you have to have at least one entry! */ + STBI_FREE(tga_data); + return stbi__errpuc("bad palette", "Corrupt TGA"); + } + // any data to skip? 
(offset usually = 0) stbi__skip(s, tga_palette_start ); // load the palette @@ -5801,6 +6075,9 @@ static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req h = stbi__get32be(s); w = stbi__get32be(s); + if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + // Make sure the depth is 8 bits. bitdepth = stbi__get16be(s); if (bitdepth != 8 && bitdepth != 16) @@ -6155,6 +6432,10 @@ static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_c x = stbi__get16be(s); y = stbi__get16be(s); + + if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (stbi__at_eof(s)) return stbi__errpuc("bad file","file too short (pic header)"); if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode"); @@ -6164,6 +6445,7 @@ static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_c // intermediate buffer is RGBA result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0); + if (!result) return stbi__errpuc("outofmem", "Out of memory"); memset(result, 0xff, x*y*4); if (!stbi__pic_load_core(s,x,y,comp, result)) { @@ -6263,6 +6545,9 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_in g->ratio = stbi__get8(s); g->transparent = -1; + if (g->w > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (g->h > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (comp != 0) *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments if (is_info) return 1; @@ -6276,6 +6561,7 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_in static int stbi__gif_info_raw(stbi__context *s, int *x, 
int *y, int *comp) { stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif)); + if (!g) return stbi__err("outofmem", "Out of memory"); if (!stbi__gif_header(s, g, comp, 1)) { STBI_FREE(g); stbi__rewind( s ); @@ -6440,7 +6726,7 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i memset(g->history, 0x00, pcount); // pixels that were affected previous frame first_frame = 1; } else { - // second frame - how do we dispoase of the previous one? + // second frame - how do we dispose of the previous one? dispose = (g->eflags & 0x1C) >> 2; pcount = g->w * g->h; @@ -6585,6 +6871,17 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i } } +static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays) +{ + STBI_FREE(g->out); + STBI_FREE(g->history); + STBI_FREE(g->background); + + if (out) STBI_FREE(out); + if (delays && *delays) STBI_FREE(*delays); + return stbi__errpuc("outofmem", "Out of memory"); +} + static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp) { if (stbi__gif_test(s)) { @@ -6594,6 +6891,12 @@ static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, stbi_uc *two_back = 0; stbi__gif g; int stride; + int out_size = 0; + int delays_size = 0; + + STBI_NOTUSED(out_size); + STBI_NOTUSED(delays_size); + memset(&g, 0, sizeof(g)); if (delays) { *delays = 0; @@ -6610,14 +6913,31 @@ static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, stride = g.w * g.h * 4; if (out) { - out = (stbi_uc*) STBI_REALLOC( out, layers * stride ); + void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride ); + if (!tmp) + return stbi__load_gif_main_outofmem(&g, out, delays); + else { + out = (stbi_uc*) tmp; + out_size = layers * stride; + } + if (delays) { - *delays = (int*) STBI_REALLOC( *delays, sizeof(int) * layers ); + int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, 
delays_size, sizeof(int) * layers ); + if (!new_delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + *delays = new_delays; + delays_size = layers * sizeof(int); } } else { out = (stbi_uc*)stbi__malloc( layers * stride ); + if (!out) + return stbi__load_gif_main_outofmem(&g, out, delays); + out_size = layers * stride; if (delays) { *delays = (int*) stbi__malloc( layers * sizeof(int) ); + if (!*delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + delays_size = layers * sizeof(int); } } memcpy( out + ((layers - 1) * stride), u, stride ); @@ -6796,6 +7116,9 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re token += 3; width = (int) strtol(token, NULL, 10); + if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)"); + if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)"); + *x = width; *y = height; @@ -6938,9 +7261,10 @@ static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp) info.all_a = 255; p = stbi__bmp_parse_header(s, &info); - stbi__rewind( s ); - if (p == NULL) + if (p == NULL) { + stbi__rewind( s ); return 0; + } if (x) *x = s->img_x; if (y) *y = s->img_y; if (comp) { @@ -7006,8 +7330,8 @@ static int stbi__psd_is16(stbi__context *s) stbi__rewind( s ); return 0; } - (void) stbi__get32be(s); - (void) stbi__get32be(s); + STBI_NOTUSED(stbi__get32be(s)); + STBI_NOTUSED(stbi__get32be(s)); depth = stbi__get16be(s); if (depth != 16) { stbi__rewind( s ); @@ -7086,7 +7410,6 @@ static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp) // Known limitations: // Does not support comments in the header section // Does not support ASCII image data (formats P2 and P3) -// Does not support 16-bit-per-channel #ifndef STBI_NO_PNM @@ -7107,19 +7430,23 @@ static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req stbi_uc *out; STBI_NOTUSED(ri); - if (!stbi__pnm_info(s, (int *)&s->img_x, (int 
*)&s->img_y, (int *)&s->img_n)) + ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n); + if (ri->bits_per_channel == 0) return 0; + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + *x = s->img_x; *y = s->img_y; if (comp) *comp = s->img_n; - if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0)) + if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0)) return stbi__errpuc("too large", "PNM too large"); - out = (stbi_uc *) stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0); + out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0); if (!out) return stbi__errpuc("outofmem", "Out of memory"); - stbi__getn(s, out, s->img_n * s->img_x * s->img_y); + stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8)); if (req_comp && req_comp != s->img_n) { out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y); @@ -7195,11 +7522,19 @@ static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp) stbi__pnm_skip_whitespace(s, &c); maxv = stbi__pnm_getinteger(s, &c); // read max value - - if (maxv > 255) - return stbi__err("max value > 255", "PPM image not 8-bit"); + if (maxv > 65535) + return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images"); + else if (maxv > 255) + return 16; else - return 1; + return 8; +} + +static int stbi__pnm_is16(stbi__context *s) +{ + if (stbi__pnm_info(s, NULL, NULL, NULL) == 16) + return 1; + return 0; } #endif @@ -7255,6 +7590,9 @@ static int stbi__is_16_main(stbi__context *s) if (stbi__psd_is16(s)) return 1; #endif + #ifndef STBI_NO_PNM + if (stbi__pnm_is16(s)) return 1; + #endif return 0; } From fd312faaa2fd37de36d104ab779cc9463abe5c67 Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Sun, 21 Nov 2021 
21:44:34 +0100 Subject: [PATCH 15/18] Update stb_image_write.h to 1.16 version. Do we need to introduce another STBIMAGE_LIBRARY and STBIMAGE_*_VERSION CMake variables? --- 3rdparty/stb_image/README.md | 2 +- 3rdparty/stb_image/stb_image_write.h | 251 +++++++++++++++++++-------- 2 files changed, 179 insertions(+), 74 deletions(-) diff --git a/3rdparty/stb_image/README.md b/3rdparty/stb_image/README.md index efa37458eb..736bef4a74 100644 --- a/3rdparty/stb_image/README.md +++ b/3rdparty/stb_image/README.md @@ -14,7 +14,7 @@ library | lastest version | category | LoC | description **[stb_vorbis.c](stb_vorbis.c)** | 1.16 | audio | 5486 | decode ogg vorbis files from file/memory to float/16-bit signed output **[stb_image.h](stb_image.h)** | 2.27 | graphics | 7897 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC **[stb_truetype.h](stb_truetype.h)** | 1.21 | graphics | 4882 | parse, decode, and rasterize characters from truetype fonts -**[stb_image_write.h](stb_image_write.h)** | 1.13 | graphics | 1617 | image writing to disk: PNG, TGA, BMP +**[stb_image_write.h](stb_image_write.h)** | 1.16 | graphics | 1724 | image writing to disk: PNG, TGA, BMP **[stb_image_resize.h](stb_image_resize.h)** | 0.96 | graphics | 2630 | resize images larger/smaller with good quality **[stb_rect_pack.h](stb_rect_pack.h)** | 1.00 | graphics | 628 | simple 2D rectangle packer with decent quality **[stb_ds.h](stb_ds.h)** | 0.5 | utility | 1691 | typesafe dynamic array and hash tables for C, will compile in C++ diff --git a/3rdparty/stb_image/stb_image_write.h b/3rdparty/stb_image/stb_image_write.h index c989bc1418..e4b32ed1bc 100644 --- a/3rdparty/stb_image/stb_image_write.h +++ b/3rdparty/stb_image/stb_image_write.h @@ -1,4 +1,4 @@ -/* stb_image_write - v1.13 - public domain - http://nothings.org/stb +/* stb_image_write - v1.16 - public domain - http://nothings.org/stb writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015 no warranty 
implied; use at your own risk @@ -140,6 +140,7 @@ Ivan Tikhonov github:ignotion Adam Schackart + Andrew Kensler LICENSE @@ -166,9 +167,9 @@ LICENSE #endif #ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations -extern int stbi_write_tga_with_rle; -extern int stbi_write_png_compression_level; -extern int stbi_write_force_png_filter; +STBIWDEF int stbi_write_tga_with_rle; +STBIWDEF int stbi_write_png_compression_level; +STBIWDEF int stbi_write_force_png_filter; #endif #ifndef STBI_WRITE_NO_STDIO @@ -178,7 +179,7 @@ STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data); STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality); -#ifdef STBI_WINDOWS_UTF8 +#ifdef STBIW_WINDOWS_UTF8 STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input); #endif #endif @@ -247,17 +248,17 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean); #define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff) #ifdef STB_IMAGE_WRITE_STATIC -static int stbi__flip_vertically_on_write=0; static int stbi_write_png_compression_level = 8; static int stbi_write_tga_with_rle = 1; static int stbi_write_force_png_filter = -1; #else int stbi_write_png_compression_level = 8; -int stbi__flip_vertically_on_write=0; int stbi_write_tga_with_rle = 1; int stbi_write_force_png_filter = -1; #endif +static int stbi__flip_vertically_on_write = 0; + STBIWDEF void stbi_flip_vertically_on_write(int flag) { stbi__flip_vertically_on_write = flag; @@ -267,6 +268,8 @@ typedef struct { stbi_write_func *func; void *context; + unsigned char buffer[64]; + int buf_used; } stbi__write_context; // initialize a callback-based context @@ -283,7 +286,7 @@ static void stbi__stdio_write(void *context, void *data, int size) fwrite(data,1,size,(FILE*) context); } -#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) 
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8) #ifdef __cplusplus #define STBIW_EXTERN extern "C" #else @@ -294,25 +297,25 @@ STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned in STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) { - return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); + return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); } #endif static FILE *stbiw__fopen(char const *filename, char const *mode) { FILE *f; -#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) +#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8) wchar_t wMode[64]; wchar_t wFilename[1024]; - if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename))) + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename))) return 0; - if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode))) + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode))) return 0; -#if _MSC_VER >= 1400 - if (0 != _wfopen_s(&f, wFilename, wMode)) - f = 0; +#if defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != _wfopen_s(&f, wFilename, wMode)) + f = 0; #else f = _wfopen(wFilename, wMode); #endif @@ -380,16 +383,36 @@ static void stbiw__writef(stbi__write_context *s, const char *fmt, ...) 
va_end(v); } +static void stbiw__write_flush(stbi__write_context *s) +{ + if (s->buf_used) { + s->func(s->context, &s->buffer, s->buf_used); + s->buf_used = 0; + } +} + static void stbiw__putc(stbi__write_context *s, unsigned char c) { s->func(s->context, &c, 1); } +static void stbiw__write1(stbi__write_context *s, unsigned char a) +{ + if ((size_t)s->buf_used + 1 > sizeof(s->buffer)) + stbiw__write_flush(s); + s->buffer[s->buf_used++] = a; +} + static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c) { - unsigned char arr[3]; - arr[0] = a; arr[1] = b; arr[2] = c; - s->func(s->context, arr, 3); + int n; + if ((size_t)s->buf_used + 3 > sizeof(s->buffer)) + stbiw__write_flush(s); + n = s->buf_used; + s->buf_used = n+3; + s->buffer[n+0] = a; + s->buffer[n+1] = b; + s->buffer[n+2] = c; } static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d) @@ -398,7 +421,7 @@ static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, in int k; if (write_alpha < 0) - s->func(s->context, &d[comp - 1], 1); + stbiw__write1(s, d[comp - 1]); switch (comp) { case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case @@ -406,7 +429,7 @@ static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, in if (expand_mono) stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp else - s->func(s->context, d, 1); // monochrome TGA + stbiw__write1(s, d[0]); // monochrome TGA break; case 4: if (!write_alpha) { @@ -422,7 +445,7 @@ static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, in break; } if (write_alpha > 0) - s->func(s->context, &d[comp - 1], 1); + stbiw__write1(s, d[comp - 1]); } static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono) @@ -447,6 +470,7 @@ static void 
stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, i unsigned char *d = (unsigned char *) data + (j*x+i)*comp; stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d); } + stbiw__write_flush(s); s->func(s->context, &zero, scanline_pad); } } @@ -467,16 +491,27 @@ static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data) { - int pad = (-x*3) & 3; - return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad, - "11 4 22 4" "4 44 22 444444", - 'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40, // file header - 40, x,y, 1,24, 0,0,0,0,0,0); // bitmap header + if (comp != 4) { + // write RGB bitmap + int pad = (-x*3) & 3; + return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad, + "11 4 22 4" "4 44 22 444444", + 'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40, // file header + 40, x,y, 1,24, 0,0,0,0,0,0); // bitmap header + } else { + // RGBA bitmaps need a v4 header + // use BI_BITFIELDS mode with 32bpp and alpha mask + // (straight BI_RGB with alpha mask doesn't work in most readers) + return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0, + "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444", + 'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header + 108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header + } } STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) { - stbi__write_context s; + stbi__write_context s = { 0 }; stbi__start_write_callbacks(&s, func, context); return stbi_write_bmp_core(&s, x, y, comp, data); } @@ -484,7 +519,7 @@ STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, #ifndef STBI_WRITE_NO_STDIO STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data) { - stbi__write_context s; + stbi__write_context s = { 0 }; if 
(stbi__start_write_file(&s,filename)) { int r = stbi_write_bmp_core(&s, x, y, comp, data); stbi__end_write_file(&s); @@ -557,24 +592,25 @@ static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, v if (diff) { unsigned char header = STBIW_UCHAR(len - 1); - s->func(s->context, &header, 1); + stbiw__write1(s, header); for (k = 0; k < len; ++k) { stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp); } } else { unsigned char header = STBIW_UCHAR(len - 129); - s->func(s->context, &header, 1); + stbiw__write1(s, header); stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin); } } } + stbiw__write_flush(s); } return 1; } STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) { - stbi__write_context s; + stbi__write_context s = { 0 }; stbi__start_write_callbacks(&s, func, context); return stbi_write_tga_core(&s, x, y, comp, (void *) data); } @@ -582,7 +618,7 @@ STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, #ifndef STBI_WRITE_NO_STDIO STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data) { - stbi__write_context s; + stbi__write_context s = { 0 }; if (stbi__start_write_file(&s,filename)) { int r = stbi_write_tga_core(&s, x, y, comp, (void *) data); stbi__end_write_file(&s); @@ -598,6 +634,8 @@ STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const #define stbiw__max(a, b) ((a) > (b) ? 
(a) : (b)) +#ifndef STBI_WRITE_NO_STDIO + static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear) { int exponent; @@ -732,7 +770,7 @@ static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, f char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n"; s->func(s->context, header, sizeof(header)-1); -#ifdef __STDC_WANT_SECURE_LIB__ +#ifdef __STDC_LIB_EXT1__ len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x); #else len = sprintf(buffer, "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x); @@ -748,15 +786,14 @@ static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, f STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data) { - stbi__write_context s; + stbi__write_context s = { 0 }; stbi__start_write_callbacks(&s, func, context); return stbi_write_hdr_core(&s, x, y, comp, (float *) data); } -#ifndef STBI_WRITE_NO_STDIO STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data) { - stbi__write_context s; + stbi__write_context s = { 0 }; if (stbi__start_write_file(&s,filename)) { int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data); stbi__end_write_file(&s); @@ -774,7 +811,7 @@ STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const #ifndef STBIW_ZLIB_COMPRESS // stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size() -#define stbiw__sbraw(a) ((int *) (a) - 2) +#define stbiw__sbraw(a) ((int *) (void *) (a) - 2) #define stbiw__sbm(a) stbiw__sbraw(a)[0] #define stbiw__sbn(a) stbiw__sbraw(a)[1] @@ -944,6 +981,23 @@ STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, i (void) stbiw__sbfree(hash_table[i]); STBIW_FREE(hash_table); + // store uncompressed instead if compression was worse + if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) { + 
stbiw__sbn(out) = 2; // truncate to DEFLATE 32K window and FLEVEL = 1 + for (j = 0; j < data_len;) { + int blocklen = data_len - j; + if (blocklen > 32767) blocklen = 32767; + stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression + stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN + stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8)); + stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN + stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8)); + memcpy(out+stbiw__sbn(out), data+j, blocklen); + stbiw__sbn(out) += blocklen; + j += blocklen; + } + } + { // compute adler32 on input unsigned int s1=1, s2=0; @@ -1271,26 +1325,31 @@ static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) { bits[0] = val & ((1< 100 ? 100 : quality; quality = quality < 50 ? 5000 / quality : 200 - quality * 2; @@ -1439,7 +1499,7 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, in static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 }; static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 }; const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width), - 3,1,0x11,0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 }; + 3,1,(unsigned char)(subsample?0x22:0x11),0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 }; s->func(s->context, (void*)head0, sizeof(head0)); s->func(s->context, (void*)YTable, sizeof(YTable)); stbiw__putc(s, 1); @@ -1462,36 +1522,74 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, in // Encode 8x8 macroblocks { static const unsigned short fillBits[] = {0x7F, 7}; - const unsigned char *imageData = (const unsigned char *)data; int DCY=0, DCU=0, DCV=0; int bitBuf=0, bitCnt=0; // comp == 2 is grey+alpha (alpha is ignored) int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 
2 : 0; + const unsigned char *dataR = (const unsigned char *)data; + const unsigned char *dataG = dataR + ofsG; + const unsigned char *dataB = dataR + ofsB; int x, y, pos; - for(y = 0; y < height; y += 8) { - for(x = 0; x < width; x += 8) { - float YDU[64], UDU[64], VDU[64]; - for(row = y, pos = 0; row < y+8; ++row) { - // row >= height => use last input row - int clamped_row = (row < height) ? row : height - 1; - int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp; - for(col = x; col < x+8; ++col, ++pos) { - float r, g, b; - // if col >= width => use pixel from last input column - int p = base_p + ((col < width) ? col : (width-1))*comp; - - r = imageData[p+0]; - g = imageData[p+ofsG]; - b = imageData[p+ofsB]; - YDU[pos]=+0.29900f*r+0.58700f*g+0.11400f*b-128; - UDU[pos]=-0.16874f*r-0.33126f*g+0.50000f*b; - VDU[pos]=+0.50000f*r-0.41869f*g-0.08131f*b; + if(subsample) { + for(y = 0; y < height; y += 16) { + for(x = 0; x < width; x += 16) { + float Y[256], U[256], V[256]; + for(row = y, pos = 0; row < y+16; ++row) { + // row >= height => use last input row + int clamped_row = (row < height) ? row : height - 1; + int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp; + for(col = x; col < x+16; ++col, ++pos) { + // if col >= width => use pixel from last input column + int p = base_p + ((col < width) ? 
col : (width-1))*comp; + float r = dataR[p], g = dataG[p], b = dataB[p]; + Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128; + U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b; + V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b; + } + } + DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+0, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); + DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+8, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); + DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+128, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); + DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+136, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); + + // subsample U,V + { + float subU[64], subV[64]; + int yy, xx; + for(yy = 0, pos = 0; yy < 8; ++yy) { + for(xx = 0; xx < 8; ++xx, ++pos) { + int j = yy*32+xx*2; + subU[pos] = (U[j+0] + U[j+1] + U[j+16] + U[j+17]) * 0.25f; + subV[pos] = (V[j+0] + V[j+1] + V[j+16] + V[j+17]) * 0.25f; + } + } + DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT); + DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT); } } + } + } else { + for(y = 0; y < height; y += 8) { + for(x = 0; x < width; x += 8) { + float Y[64], U[64], V[64]; + for(row = y, pos = 0; row < y+8; ++row) { + // row >= height => use last input row + int clamped_row = (row < height) ? row : height - 1; + int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp; + for(col = x; col < x+8; ++col, ++pos) { + // if col >= width => use pixel from last input column + int p = base_p + ((col < width) ? 
col : (width-1))*comp; + float r = dataR[p], g = dataG[p], b = dataB[p]; + Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128; + U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b; + V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b; + } + } - DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, YDU, fdtbl_Y, DCY, YDC_HT, YAC_HT); - DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, UDU, fdtbl_UV, DCU, UVDC_HT, UVAC_HT); - DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, VDU, fdtbl_UV, DCV, UVDC_HT, UVAC_HT); + DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y, DCY, YDC_HT, YAC_HT); + DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT); + DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT); + } } } @@ -1508,7 +1606,7 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, in STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality) { - stbi__write_context s; + stbi__write_context s = { 0 }; stbi__start_write_callbacks(&s, func, context); return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality); } @@ -1517,7 +1615,7 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, #ifndef STBI_WRITE_NO_STDIO STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality) { - stbi__write_context s; + stbi__write_context s = { 0 }; if (stbi__start_write_file(&s,filename)) { int r = stbi_write_jpg_core(&s, x, y, comp, data, quality); stbi__end_write_file(&s); @@ -1530,6 +1628,13 @@ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const #endif // STB_IMAGE_WRITE_IMPLEMENTATION /* Revision history + 1.16 (2021-07-11) + make Deflate code emit uncompressed blocks when it would otherwise expand + support writing BMPs with alpha channel + 1.15 (2020-07-13) unknown + 1.14 (2020-02-02) updated JPEG writer to downsample chroma 
channels + 1.13 + 1.12 1.11 (2019-08-11) 1.10 (2019-02-07) @@ -1564,7 +1669,7 @@ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const add HDR output fix monochrome BMP 0.95 (2014-08-17) - add monochrome TGA output + add monochrome TGA output 0.94 (2014-05-31) rename private functions to avoid conflicts with stb_image.h 0.93 (2014-05-27) From 02895d0ec39d3d2fbeb039fecc0789445aec8e05 Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Sun, 21 Nov 2021 23:05:58 +0100 Subject: [PATCH 16/18] Add missing file. Update Simd CMakeLists.txt. Add SimdFree(). --- 3rdparty/simdlib/CMakeLists.txt | 10 +- .../simdlib/Simd/SimdNeonImageSavePng.cpp | 409 ++++++++++++++++++ .../io/src/image/private/vpImageIoSimd.cpp | 8 +- 3 files changed, 423 insertions(+), 4 deletions(-) create mode 100644 3rdparty/simdlib/Simd/SimdNeonImageSavePng.cpp diff --git a/3rdparty/simdlib/CMakeLists.txt b/3rdparty/simdlib/CMakeLists.txt index 95b3358ad2..f737f8ea89 100644 --- a/3rdparty/simdlib/CMakeLists.txt +++ b/3rdparty/simdlib/CMakeLists.txt @@ -89,11 +89,19 @@ if(X86 OR X86_64) set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}") file(GLOB_RECURSE SIMD_AVX1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx1*.cpp) - set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}") + if(MSVC) + set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG} -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store") + elseif((CMAKE_CXX_COMPILER MATCHES "clang") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) + set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}") + else() + set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG} -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store") + endif() file(GLOB_RECURSE SIMD_AVX2_SRC 
${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx2*.cpp) if(MSVC) set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store") + elseif((CMAKE_CXX_COMPILER MATCHES "clang") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) + set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mfma -mbmi -mbmi2 -mlzcnt") else() set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mfma -mbmi -mbmi2 -mlzcnt -fabi-version=4 -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store") endif() diff --git a/3rdparty/simdlib/Simd/SimdNeonImageSavePng.cpp b/3rdparty/simdlib/Simd/SimdNeonImageSavePng.cpp new file mode 100644 index 0000000000..330a64374d --- /dev/null +++ b/3rdparty/simdlib/Simd/SimdNeonImageSavePng.cpp @@ -0,0 +1,409 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2021 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdImageSave.h" +#include "Simd/SimdImageSavePng.h" +#include "Simd/SimdBase.h" +#include "Simd/SimdNeon.h" +#include "Simd/SimdSet.h" +#include "Simd/SimdExtract.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_NEON_ENABLE + namespace Neon + { + uint32_t ZlibAdler32(uint8_t* data, int size) + { + int32x4_t _i0 = SetI32(0, -1, -2, -3), _4 = vdupq_n_s32(4); + uint32_t lo = 1, hi = 0; + for (int b = 0, n = (int)(size % 5552); b < size;) + { + int n8 = n & (~7), i = 0; + int32x4_t _i = vaddq_s32(_i0, vdupq_n_s32(n)); + int32x4_t _l = vdupq_n_s32(0), _h = vdupq_n_s32(0); + for (; i < n8; i += 8) + { + uint8x8_t d8 = LoadHalf(data + b + i); + int16x8_t d16 = (int16x8_t)vmovl_u8(d8); + int32x4_t d0 = vmovl_s16(Half<0>(d16)); + _l = vaddq_s32(_l, d0); + _h = vmlaq_s32(_h, d0, _i); + _i = vsubq_s32(_i, _4); + int32x4_t d1 = vmovl_s16((int16x4_t)Half<1>(d16)); + _l = vaddq_s32(_l, d1); + _h = vmlaq_s32(_h, d1, _i); + _i = vsubq_s32(_i, _4); + } + int l = ExtractSum32s(_l), h = ExtractSum32s(_h); + for (; i < n; ++i) + { + l += data[b + i]; + h += data[b + i]*(n - i); + } + hi = (hi + h + lo*n) % 65521; + lo = (lo + l) % 65521; + b += n; + n = 5552; + } + return (hi << 16) | lo; + } + + void ZlibCompress(uint8_t* data, int size, int quality, OutputMemoryStream& stream) + { + const int ZHASH = 16384; + if (quality < 5) + quality = 5; + const int basket = quality * 2; + Array32i hashTable(ZHASH * basket); + memset(hashTable.data, -1, hashTable.RawSize()); + + stream.Write(uint8_t(0x78)); + stream.Write(uint8_t(0x5e)); + stream.WriteBits(1, 1); + stream.WriteBits(1, 2); + + int i = 0, j; + while (i < size - 3) + { + int h = Base::ZlibHash(data + 
i) & (ZHASH - 1), best = 3; + uint8_t* bestLoc = 0; + int* hList = hashTable.data + h * basket; + for (j = 0; hList[j] != -1 && j < basket; ++j) + { + if (hList[j] > i - 32768) + { + int d = Base::ZlibCount(data + hList[j], data + i, size - i); + if (d >= best) + { + best = d; + bestLoc = data + hList[j]; + } + } + } + if (j == basket) + { + memcpy(hList, hList + quality, quality * sizeof(int)); + memset(hList + quality, -1, quality * sizeof(int)); + j = quality; + } + hList[j] = i; + + if (bestLoc) + { + h = Base::ZlibHash(data + i + 1) & (ZHASH - 1); + int* hList = hashTable.data + h * basket; + for (j = 0; hList[j] != -1 && j < basket; ++j) + { + if (hList[j] > i - 32767) + { + int e = Base::ZlibCount(data + hList[j], data + i + 1, size - i - 1); + if (e > best) + { + bestLoc = NULL; + break; + } + } + } + } + + if (bestLoc) + { + int d = (int)(data + i - bestLoc); + assert(d <= 32767 && best <= 258); + for (j = 0; best > Base::ZlibLenC[j + 1] - 1; ++j); + Base::ZlibHuff(j + 257, stream); + if (Base::ZlibLenEb[j]) + stream.WriteBits(best - Base::ZlibLenC[j], Base::ZlibLenEb[j]); + for (j = 0; d > Base::ZlibDistC[j + 1] - 1; ++j); + stream.WriteBits(Base::ZlibBitRev(j, 5), 5); + if (Base::ZlibDistEb[j]) + stream.WriteBits(d - Base::ZlibDistC[j], Base::ZlibDistEb[j]); + i += best; + } + else + { + Base::ZlibHuffB(data[i], stream); + ++i; + } + } + for (; i < size; ++i) + Base::ZlibHuffB(data[i], stream); + Base::ZlibHuff(256, stream); + stream.FlushBits(); + stream.WriteBe32u(ZlibAdler32(data, size)); + } + + uint32_t EncodeLine0(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + size_t i = 0, sizeA = AlignLo(size, A), bS = A << 7, bC = (sizeA >> 7) + 1; + uint32x4_t _sum = vdupq_n_u32(0); + for (size_t b = 0; b < bC; ++b) + { + uint16x8_t bSum = vdupq_n_u16(0); + for (size_t end = Min(i + bS, sizeA); i < end; i += A) + { + int8x16_t _src = (int8x16_t)Load(src + i); + Store(dst + i, _src); + bSum = vaddq_u16(bSum, 
vpaddlq_u8((uint8x16_t)vabsq_s8(_src))); + } + _sum = vaddq_u32(_sum, vpaddlq_u16(bSum)); + } + uint32_t sum = ExtractSum32u(_sum); + for (; i < size; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine1(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n, bS = A << 7, bC = (sizeA >> 7) + 1; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + uint32x4_t _sum = vdupq_n_u32(0); + for (size_t b = 0; b < bC; ++b) + { + uint16x8_t bSum = vdupq_n_u16(0); + for (size_t end = Min(i + bS, sizeA); i < end; i += A) + { + int8x16_t _src0 = (int8x16_t)Load(src + i); + int8x16_t _src1 = (int8x16_t)Load(src + i - n); + int8x16_t _dst = vsubq_s8(_src0, _src1); + Store(dst + i, _dst); + bSum = vaddq_u16(bSum, vpaddlq_u8((uint8x16_t)vabsq_s8(_dst))); + } + _sum = vaddq_u32(_sum, vpaddlq_u16(bSum)); + } + sum += ExtractSum32u(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - src[i - n]; + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine2(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n, bS = A << 7, bC = (sizeA >> 7) + 1; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = src[i] - src[i - stride]; + sum += ::abs(dst[i]); + } + uint32x4_t _sum = vdupq_n_u32(0); + for (size_t b = 0; b < bC; ++b) + { + uint16x8_t bSum = vdupq_n_u16(0); + for (size_t end = Min(i + bS, sizeA); i < end; i += A) + { + int8x16_t _src0 = (int8x16_t)Load(src + i); + int8x16_t _src1 = (int8x16_t)Load(src + i - stride); + int8x16_t _dst = vsubq_s8(_src0, _src1); + Store(dst + i, _dst); + bSum = vaddq_u16(bSum, vpaddlq_u8((uint8x16_t)vabsq_s8(_dst))); + } + _sum = vaddq_u32(_sum, vpaddlq_u16(bSum)); + } + sum += ExtractSum32u(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - src[i - stride]; + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t 
EncodeLine3(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n, bS = A << 7, bC = (sizeA >> 7) + 1; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = src[i] - (src[i - stride] >> 1); + sum += ::abs(dst[i]); + } + uint32x4_t _sum = vdupq_n_u32(0); + for (size_t b = 0; b < bC; ++b) + { + uint16x8_t bSum = vdupq_n_u16(0); + for (size_t end = Min(i + bS, sizeA); i < end; i += A) + { + uint8x16_t _src0 = Load(src + i); + uint8x16_t _src1 = Load(src + i - n); + uint8x16_t _src2 = Load(src + i - stride); + int8x16_t _dst = (int8x16_t)vsubq_u8(_src0, vhaddq_u8(_src1, _src2)); + Store(dst + i, _dst); + bSum = vaddq_u16(bSum, vpaddlq_u8((uint8x16_t)vabsq_s8(_dst))); + } + _sum = vaddq_u32(_sum, vpaddlq_u16(bSum)); + } + sum += ExtractSum32u(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - ((src[i - n] + src[i - stride]) >> 1); + sum += ::abs(dst[i]); + } + return sum; + } + + SIMD_INLINE uint16x8_t Paeth(uint16x8_t a, uint16x8_t b, uint16x8_t c) + { + int16x8_t p = (int16x8_t)vsubq_u16(vaddq_u16(a, b), c); + int16x8_t pa = vabsq_s16(vsubq_s16(p, (int16x8_t)a)); + int16x8_t pb = vabsq_s16(vsubq_s16(p, (int16x8_t)b)); + int16x8_t pc = vabsq_s16(vsubq_s16(p, (int16x8_t)c)); + uint16x8_t mbc = vorrq_u16(vcgtq_s16(pa, pb), vcgtq_s16(pa, pc)); + uint16x8_t mc = vcgtq_s16(pb, pc); + return (uint16x8_t)vbslq_u16(mbc, vbslq_u16(mc, c, b), a); + } + + uint32_t EncodeLine4(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n, bS = A << 7, bC = (sizeA >> 7) + 1; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = (int8_t)(src[i] - src[i - stride]); + sum += ::abs(dst[i]); + } + uint32x4_t _sum = vdupq_n_u32(0); + for (size_t b = 0; b < bC; ++b) + { + uint16x8_t bSum = vdupq_n_u16(0); + for (size_t end = Min(i + bS, sizeA); i < end; i += A) + { + uint8x16_t _src0 = Load(src + i); + uint8x16_t _src1 = Load(src + i - n); + 
uint8x16_t _src2 = Load(src + i - stride); + uint8x16_t _src3 = Load(src + i - stride - n); + uint16x8_t lo = Paeth(UnpackU8<0>(_src1), UnpackU8<0>(_src2), UnpackU8<0>(_src3)); + uint16x8_t hi = Paeth(UnpackU8<1>(_src1), UnpackU8<1>(_src2), UnpackU8<1>(_src3)); + int8x16_t _dst = (int8x16_t)vsubq_u8(_src0, PackU16(lo, hi)); + Store(dst + i, _dst); + bSum = vaddq_u16(bSum, vpaddlq_u8((uint8x16_t)vabsq_s8(_dst))); + } + _sum = vaddq_u32(_sum, vpaddlq_u16(bSum)); + } + sum += ExtractSum32u(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - Base::Paeth(src[i - n], src[i - stride], src[i - stride - n]); + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine5(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n, bS = A << 7, bC = (sizeA >> 7) + 1; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + uint32x4_t _sum = vdupq_n_u32(0); + for (size_t b = 0; b < bC; ++b) + { + uint16x8_t bSum = vdupq_n_u16(0); + for (size_t end = Min(i + bS, sizeA); i < end; i += A) + { + uint8x16_t _src0 = Load(src + i); + uint8x16_t _src1 = Load(src + i - n); + int8x16_t _dst = (int8x16_t)vsubq_u8(_src0, vshrq_n_u8(_src1, 1)); + Store(dst + i, _dst); + bSum = vaddq_u16(bSum, vpaddlq_u8((uint8x16_t)vabsq_s8(_dst))); + } + _sum = vaddq_u32(_sum, vpaddlq_u16(bSum)); + } + sum += ExtractSum32u(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - (src[i - n] >> 1); + sum += ::abs(dst[i]); + } + return sum; + } + + uint32_t EncodeLine6(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst) + { + size_t i = 0, sizeA = AlignLo(size - n, A) + n, bS = A << 7, bC = (sizeA >> 7) + 1; + uint32_t sum = 0; + for (; i < n; ++i) + { + dst[i] = src[i]; + sum += ::abs(dst[i]); + } + uint32x4_t _sum = vdupq_n_u32(0); + for (size_t b = 0; b < bC; ++b) + { + uint16x8_t bSum = vdupq_n_u16(0); + for (size_t end = Min(i + bS, sizeA); i < end; i += A) + { + int8x16_t 
_src0 = (int8x16_t)Load(src + i); + int8x16_t _src1 = (int8x16_t)Load(src + i - n); + int8x16_t _dst = vsubq_s8(_src0, _src1); + Store(dst + i, _dst); + bSum = vaddq_u16(bSum, vpaddlq_u8((uint8x16_t)vabsq_s8(_dst))); + } + _sum = vaddq_u32(_sum, vpaddlq_u16(bSum)); + } + sum += ExtractSum32u(_sum); + for (; i < size; ++i) + { + dst[i] = src[i] - src[i - n]; + sum += ::abs(dst[i]); + } + return sum; + } + + ImagePngSaver::ImagePngSaver(const ImageSaverParam& param) + : Base::ImagePngSaver(param) + { + if (_param.format == SimdPixelFormatBgr24) + _convert = Neon::BgrToRgb; + else if (_param.format == SimdPixelFormatBgra32) + _convert = Neon::BgraToRgba; + _encode[0] = Neon::EncodeLine0; + _encode[1] = Neon::EncodeLine1; + _encode[2] = Neon::EncodeLine2; + _encode[3] = Neon::EncodeLine3; + _encode[4] = Neon::EncodeLine4; + _encode[5] = Neon::EncodeLine5; + _encode[6] = Neon::EncodeLine6; + _compress = Neon::ZlibCompress; + } + } +#endif// SIMD_NEON_ENABLE +} diff --git a/modules/io/src/image/private/vpImageIoSimd.cpp b/modules/io/src/image/private/vpImageIoSimd.cpp index 4612aa5f7f..424286dc70 100644 --- a/modules/io/src/image/private/vpImageIoSimd.cpp +++ b/modules/io/src/image/private/vpImageIoSimd.cpp @@ -39,7 +39,7 @@ */ #include "vpImageIoBackend.h" -#include +#include //TODO: @@ -48,8 +48,9 @@ void readSimdlib(vpImage &I, const std::string &filename) size_t stride = 0, width = 0, height = 0; SimdPixelFormatType format = SimdPixelFormatGray8; uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format); - const bool copyData = false; + const bool copyData = true; I.init(data, (unsigned int)height, (unsigned int)width, copyData); + SimdFree(data); } void readSimdlib(vpImage &I, const std::string &filename) @@ -57,8 +58,9 @@ void readSimdlib(vpImage &I, const std::string &filename) size_t stride = 0, width = 0, height = 0; SimdPixelFormatType format = SimdPixelFormatRgba32; uint8_t* data = SimdImageLoadFromFile(filename.c_str(), 
&stride, &width, &height, &format); - const bool copyData = false; + const bool copyData = true; I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData); + SimdFree(data); } void writeJPEGSimdlib(const vpImage &I, const std::string &filename, int quality) From 557f1beda01f36ca886ec039d0a1a80a7446ca59 Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Sun, 21 Nov 2021 23:58:41 +0100 Subject: [PATCH 17/18] Fix write with libjpeg. Try to fix ARM build. --- 3rdparty/simdlib/Simd/SimdStore.h | 35 +++++++++++++++++++ .../io/src/image/private/vpImageIoLibjpeg.cpp | 4 +-- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/3rdparty/simdlib/Simd/SimdStore.h b/3rdparty/simdlib/Simd/SimdStore.h index 2b22a9616d..465c757dc4 100755 --- a/3rdparty/simdlib/Simd/SimdStore.h +++ b/3rdparty/simdlib/Simd/SimdStore.h @@ -62,6 +62,14 @@ namespace Simd { __m128 old = Load(p); Store(p, Combine(mask, value, old)); + } + + SIMD_INLINE void Store(float* ptr, __m128 val, size_t size) + { + SIMD_ALIGNED(16) float buf[F]; + _mm_store_ps(buf, val); + for (size_t i = 0; i < size; ++i) + ptr[i] = buf[i]; } template SIMD_INLINE void Store(__m128i * p, __m128i a); @@ -113,6 +121,14 @@ namespace Simd _mm256_store_ps(p, a); } + SIMD_INLINE void Store(float* ptr, __m256 val, size_t size) + { + SIMD_ALIGNED(32) float buf[F]; + _mm256_store_ps(buf, val); + for (size_t i = 0; i < size; ++i) + ptr[i] = buf[i]; + } + template SIMD_INLINE void Store(float * p0, float * p1, __m256 a) { Sse2::Store(p0, _mm256_extractf128_ps(a, 0)); @@ -144,6 +160,12 @@ namespace Simd _mm256_store_si256(p, a); } + template SIMD_INLINE void Store(__m128i* p0, __m128i* p1, __m256i a) + { + Sse2::Store(p0, _mm256_extractf128_si256(a, 0)); + Sse2::Store(p1, _mm256_extractf128_si256(a, 1)); + } + template SIMD_INLINE void StoreMasked(__m256i * p, __m256i value, __m256i mask) { __m256i old = Load(p); @@ -207,6 +229,11 @@ namespace Simd #endif } + template SIMD_INLINE void Store(int8_t* p, int8x16_t a) 
+ { + Store((uint8_t*)p, vreinterpretq_u8_s8(a)); + } + template SIMD_INLINE void Store(uint8_t * p, uint8x8_t a); template <> SIMD_INLINE void Store(uint8_t * p, uint8x8_t a) @@ -403,6 +430,14 @@ namespace Simd #endif } + SIMD_INLINE void Store(float* ptr, float32x4_t val, size_t size) + { + SIMD_ALIGNED(16) float buf[F]; + Store(buf, val); + for (size_t i = 0; i < size; ++i) + ptr[i] = buf[i]; + } + template SIMD_INLINE void Store(float * p, float32x2_t a); template <> SIMD_INLINE void Store(float * p, float32x2_t a) diff --git a/modules/io/src/image/private/vpImageIoLibjpeg.cpp b/modules/io/src/image/private/vpImageIoLibjpeg.cpp index 8f5b021c8c..77777b0814 100644 --- a/modules/io/src/image/private/vpImageIoLibjpeg.cpp +++ b/modules/io/src/image/private/vpImageIoLibjpeg.cpp @@ -94,11 +94,11 @@ void writeJPEGLibjpeg(const vpImage &I, const std::string &filena jpeg_stdio_dest(&cinfo, file); - jpeg_set_defaults(&cinfo); cinfo.image_width = width; cinfo.image_height = height; cinfo.input_components = 1; cinfo.in_color_space = JCS_GRAYSCALE; + jpeg_set_defaults(&cinfo); //TODO: jpeg_set_quality(&cinfo, quality, TRUE); @@ -154,11 +154,11 @@ void writeJPEGLibjpeg(const vpImage &I, const std::string &filename, int jpeg_stdio_dest(&cinfo, file); - jpeg_set_defaults(&cinfo); cinfo.image_width = width; cinfo.image_height = height; cinfo.input_components = 3; cinfo.in_color_space = JCS_RGB; + jpeg_set_defaults(&cinfo); //TODO: jpeg_set_quality(&cinfo, quality, TRUE); From ca1cca3e4ea431410964cbd62bb878d6ce37f80b Mon Sep 17 00:00:00 2001 From: Souriya Trinh Date: Wed, 24 Nov 2021 10:06:16 +0100 Subject: [PATCH 18/18] This should allow running the benchmarks on Windows and Unix. 
--- modules/io/test/perfImageLoadSave.cpp | 49 ++++++++++++++++----------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp index 6182df06e4..b4a1bd97dd 100644 --- a/modules/io/test/perfImageLoadSave.cpp +++ b/modules/io/test/perfImageLoadSave.cpp @@ -55,7 +55,14 @@ static std::vector names { "Solvay (640x440)", "Solvay (1024x705)", "Solvay (1280x881)", "Solvay (2126x1463)" }; static std::vector backends { - vpImageIo::IO_LIB_BACKEND, vpImageIo::IO_OPENCV_BACKEND, vpImageIo::IO_SIMDLIB_BACKEND, vpImageIo::IO_STB_IMAGE_BACKEND +#if defined(VISP_HAVE_JPEG) && defined(VISP_HAVE_PNG) + vpImageIo::IO_LIB_BACKEND, +#endif +#if defined(VISP_HAVE_OPENCV) + vpImageIo::IO_OPENCV_BACKEND, +#endif + vpImageIo::IO_SIMDLIB_BACKEND, + vpImageIo::IO_STB_IMAGE_BACKEND }; static std::vector backendNamesJpeg { "libjpeg", "OpenCV", "simd", "stb" @@ -129,18 +136,20 @@ TEST_CASE("Benchmark PNG image loading", "[benchmark]") { } } -#if !defined(_WIN32) && (defined(__unix__) || defined(__unix) || (defined(__APPLE__) && defined(__MACH__))) // UNIX -// makeTempDirectory is only implemented for Unix platform - std::string username, directory_filename_tmp; +#if !defined(_WIN32) && (defined(__unix__) || defined(__unix) || (defined(__APPLE__) && defined(__MACH__))) // UNIX +std::string tmp_dir = "/tmp/"; +#else +std::string tmp_dir = "C:/Temp/"; +#endif + TEST_CASE("Benchmark JPEG image saving", "[benchmark]") { vpIoTools::getUserName(username); - std::string tmp_dir = "/tmp/" + username; - vpIoTools::makeDirectory(tmp_dir); - directory_filename_tmp = tmp_dir + "/" + "vpIoTools_perfImageLoadSave_XXXXXX"; - std::string converted_dirname_tmp = vpIoTools::makeTempDirectory(directory_filename_tmp); - REQUIRE(vpIoTools::checkDirectory(converted_dirname_tmp)); + vpIoTools::makeDirectory(tmp_dir + username); + directory_filename_tmp = tmp_dir + username + "/vpIoTools_perfImageLoadSave_" + 
vpTime::getDateTime("%Y-%m-%d_%H.%M.%S"); + vpIoTools::makeDirectory(directory_filename_tmp); + REQUIRE(vpIoTools::checkDirectory(directory_filename_tmp)); SECTION("Grayscale") { for (size_t i = 0; i < paths.size(); i++) { @@ -150,7 +159,7 @@ TEST_CASE("Benchmark JPEG image saving", "[benchmark]") { SECTION(names[i]) { for (size_t j = 0; j < backends.size(); j++) { BENCHMARK(backendNamesJpeg[j] + " backend") { - vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.jpg", backends[j]); + vpImageIo::write(I, directory_filename_tmp + "/ViSP_tmp_perf_write.jpg", backends[j]); return I; }; } @@ -166,7 +175,7 @@ TEST_CASE("Benchmark JPEG image saving", "[benchmark]") { SECTION(names[i]) { for (size_t j = 0; j < backends.size(); j++) { BENCHMARK(backendNamesJpeg[j] + " backend") { - vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.jpg", backends[j]); + vpImageIo::write(I, directory_filename_tmp + "/ViSP_tmp_perf_write.jpg", backends[j]); return I; }; } @@ -174,16 +183,15 @@ TEST_CASE("Benchmark JPEG image saving", "[benchmark]") { } } - REQUIRE(vpIoTools::remove(converted_dirname_tmp)); + REQUIRE(vpIoTools::remove(directory_filename_tmp)); } TEST_CASE("Benchmark PNG image saving", "[benchmark]") { vpIoTools::getUserName(username); - std::string tmp_dir = "/tmp/" + username; - vpIoTools::makeDirectory(tmp_dir); - directory_filename_tmp = tmp_dir + "/" + "vpIoTools_perfImageLoadSave_XXXXXX"; - std::string converted_dirname_tmp = vpIoTools::makeTempDirectory(directory_filename_tmp); - REQUIRE(vpIoTools::checkDirectory(converted_dirname_tmp)); + vpIoTools::makeDirectory(tmp_dir + username); + directory_filename_tmp = tmp_dir + username + "/vpIoTools_perfImageLoadSave_" + vpTime::getDateTime("%Y-%m-%d_%H.%M.%S"); + vpIoTools::makeDirectory(directory_filename_tmp); + REQUIRE(vpIoTools::checkDirectory(directory_filename_tmp)); SECTION("Grayscale") { for (size_t i = 0; i < paths.size(); i++) { @@ -193,7 +201,7 @@ TEST_CASE("Benchmark PNG image 
saving", "[benchmark]") { SECTION(names[i]) { for (size_t j = 0; j < backends.size(); j++) { BENCHMARK(backendNamesPng[j] + " backend") { - vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.png", backends[j]); + vpImageIo::write(I, directory_filename_tmp + "/ViSP_tmp_perf_write.png", backends[j]); return I; }; } @@ -209,15 +217,16 @@ TEST_CASE("Benchmark PNG image saving", "[benchmark]") { SECTION(names[i]) { for (size_t j = 0; j < backends.size(); j++) { BENCHMARK(backendNamesPng[j] + " backend") { - vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.png", backends[j]); + vpImageIo::write(I, directory_filename_tmp + "/ViSP_tmp_perf_write.png", backends[j]); return I; }; } } } } + + REQUIRE(vpIoTools::remove(directory_filename_tmp)); } -#endif int main(int argc, char *argv[]) {