From de1b75d62667927842a7acd463bd5b2549f37c69 Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Wed, 3 Nov 2021 01:06:57 +0100
Subject: [PATCH 01/18] Update Simd lib to 4.9.107 version.

---
 3rdparty/simdlib/CMakeLists.txt               |  16 +-
 .../Simd/{SimdSse1.h => SimdAlignment.h}      | 113 +++--
 3rdparty/simdlib/Simd/SimdAllocator.hpp       |   6 +-
 3rdparty/simdlib/Simd/SimdArray.h             |  30 +-
 3rdparty/simdlib/Simd/SimdAvx1.h              |   9 +-
 ...SimdBaseRgbaToGray.cpp => SimdAvx1Cpu.cpp} |  45 +-
 3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp     |  14 +-
 3rdparty/simdlib/Simd/SimdAvx2.h              |  22 +-
 3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp   |  43 +-
 3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp   |  61 ++-
 3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp    |  10 +-
 3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp   |  74 ---
 3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp   | 149 ++++++
 3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp  |  56 ++-
 3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp  |  72 ---
 3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp         |  68 +++
 .../simdlib/Simd/SimdAvx2Deinterleave.cpp     |  59 ++-
 .../simdlib/Simd/SimdAvx2GaussianBlur.cpp     |   3 +-
 3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp      |   4 +-
 .../simdlib/Simd/SimdAvx2ReduceGray2x2.cpp    |   6 +-
 .../simdlib/Simd/SimdAvx2ReduceGray3x3.cpp    |   4 +-
 .../simdlib/Simd/SimdAvx2ReduceGray4x4.cpp    |   4 +-
 .../simdlib/Simd/SimdAvx2ReduceGray5x5.cpp    |   6 +-
 .../simdlib/Simd/SimdAvx2ResizeBilinear.cpp   |   4 +-
 3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp     |  23 +-
 3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp   |  92 ----
 3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp  |  97 ----
 3rdparty/simdlib/Simd/SimdBase.h              |  18 +-
 3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp   |  20 +-
 3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp   |  15 +-
 3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp    |   4 +-
 3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp   |  80 ---
 3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp   |  37 +-
 3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp  |  15 +-
 3rdparty/simdlib/Simd/SimdBaseCpu.cpp         | 234 +++++++++
 .../simdlib/Simd/SimdBaseDeinterleave.cpp     |  43 +-
 .../simdlib/Simd/SimdBaseGaussianBlur.cpp     |   2 +-
 3rdparty/simdlib/Simd/SimdBaseResizer.cpp     | 243 ++++++++-
 3rdparty/simdlib/Simd/SimdConfig.h            |  10 +-
 3rdparty/simdlib/Simd/SimdConst.h             |  70 +--
 3rdparty/simdlib/Simd/SimdConversion.h        |  55 +--
 3rdparty/simdlib/Simd/SimdCopyPixel.h         |  17 +
 3rdparty/simdlib/Simd/SimdCpu.h               | 101 +++-
 3rdparty/simdlib/Simd/SimdDefs.h              |  80 +--
 3rdparty/simdlib/Simd/SimdEnable.h            | 415 +---------------
 3rdparty/simdlib/Simd/SimdExp.h               | 176 ++++++-
 3rdparty/simdlib/Simd/SimdExtract.h           |  22 +-
 3rdparty/simdlib/Simd/SimdFrame.hpp           |  98 +++-
 3rdparty/simdlib/Simd/SimdInit.h              |  35 +-
 3rdparty/simdlib/Simd/SimdLib.cpp             | 279 ++++++-----
 3rdparty/simdlib/Simd/SimdLib.h               | 239 +++++----
 3rdparty/simdlib/Simd/SimdLib.hpp             | 465 +++++++++++++++++-
 3rdparty/simdlib/Simd/SimdLoad.h              | 277 +----------
 3rdparty/simdlib/Simd/SimdLoadBlock.h         | 251 ++++++++++
 3rdparty/simdlib/Simd/SimdLog.h               |  28 +-
 3rdparty/simdlib/Simd/SimdMath.h              |  47 +-
 3rdparty/simdlib/Simd/SimdMemory.h            | 104 ++--
 3rdparty/simdlib/Simd/SimdNeon.h              |  20 +-
 3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp   |  45 +-
 3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp   |  63 ++-
 3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp    |  10 +-
 3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp   |  81 ---
 3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp   |  83 +++-
 3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp  |  41 +-
 3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp  |  78 ---
 .../simdlib/Simd/SimdNeonDeinterleave.cpp     |  79 ++-
 .../simdlib/Simd/SimdNeonGaussianBlur.cpp     |   1 +
 3rdparty/simdlib/Simd/SimdNeonResizer.cpp     |   8 +-
 3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp   |  71 ---
 3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp  |  71 ---
 3rdparty/simdlib/Simd/SimdPixel.hpp           | 200 +++++++-
 3rdparty/simdlib/Simd/SimdPow.h               |   2 +-
 3rdparty/simdlib/Simd/SimdResizer.h           | 148 ++++--
 3rdparty/simdlib/Simd/SimdResizerCommon.h     |  97 ++++
 3rdparty/simdlib/Simd/SimdRuntime.h           |  34 +-
 3rdparty/simdlib/Simd/SimdSet.h               |   8 +-
 3rdparty/simdlib/Simd/SimdSse1Resizer.cpp     | 129 -----
 3rdparty/simdlib/Simd/SimdSse2.h              |   8 +-
 3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp  |  54 +-
 ...SimdBaseBgraToRgba.cpp => SimdSse2Cpu.cpp} |  44 +-
 .../simdlib/Simd/SimdSse2GaussianBlur3x3.cpp  |   3 +-
 3rdparty/simdlib/Simd/SimdSse2Resizer.cpp     |   8 +-
 3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp   |  75 ---
 3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp  |  96 ----
 3rdparty/simdlib/Simd/SimdSse41.h             |  76 +++
 ...e3BgrToBgra.cpp => SimdSse41BgrToBgra.cpp} | 185 ++++---
 ...e3BgrToGray.cpp => SimdSse41BgrToGray.cpp} | 241 +++++----
 ...sse3BgrToRgb.cpp => SimdSse41BgrToRgb.cpp} | 163 +++---
 ...e3BgraToBgr.cpp => SimdSse41BgraToBgr.cpp} | 257 ++++++----
 ...SimdBaseRgbToGray.cpp => SimdSse41Cpu.cpp} |  46 +-
 ...terleave.cpp => SimdSse41Deinterleave.cpp} |  60 ++-
 .../simdlib/Simd/SimdSse41GaussianBlur.cpp    |   3 +-
 ...ur3x3.cpp => SimdSse41GaussianBlur3x3.cpp} |  12 +-
 ...e3GrayToBgr.cpp => SimdSse41GrayToBgr.cpp} | 147 +++---
 ...Interleave.cpp => SimdSse41Interleave.cpp} |  11 +-
 ...imdSsse3Reduce.cpp => SimdSse41Reduce.cpp} | 401 ++++++++-------
 ...Gray2x2.cpp => SimdSse41ReduceGray2x2.cpp} | 189 ++++---
 ...Gray4x4.cpp => SimdSse41ReduceGray4x4.cpp} |  11 +-
 ...linear.cpp => SimdSse41ResizeBilinear.cpp} |   9 +-
 3rdparty/simdlib/Simd/SimdSse41Resizer.cpp    | 311 +++++++++++-
 3rdparty/simdlib/Simd/SimdSsse3.h             |  77 ---
 3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp  |  74 ---
 3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp |  73 ---
 .../simdlib/Simd/SimdSsse3CustomFunctions.cpp |  69 ---
 3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp    | 350 -------------
 3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp  |  93 ----
 3rdparty/simdlib/Simd/SimdStore.h             |  45 +-
 3rdparty/simdlib/Simd/SimdStream.h            |  21 +-
 3rdparty/simdlib/Simd/SimdUpdate.h            |  17 +-
 3rdparty/simdlib/Simd/SimdVersion.h           |   2 +-
 3rdparty/simdlib/Simd/SimdView.hpp            |   6 +-
 modules/core/src/image/vpImageConvert.cpp     |   4 +-
 112 files changed, 5013 insertions(+), 4067 deletions(-)
 rename 3rdparty/simdlib/Simd/{SimdSse1.h => SimdAlignment.h} (53%)
 mode change 100644 => 100755
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAllocator.hpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdArray.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx1.h
 rename 3rdparty/simdlib/Simd/{SimdBaseRgbaToGray.cpp => SimdAvx1Cpu.cpp} (57%)
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp
 create mode 100755 3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBase.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseCpu.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseResizer.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdConfig.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdConst.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdConversion.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdCopyPixel.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdCpu.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdDefs.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdEnable.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdExp.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdExtract.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdFrame.hpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdInit.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLib.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLib.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLib.hpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLoad.h
 create mode 100755 3rdparty/simdlib/Simd/SimdLoadBlock.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLog.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdMath.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdMemory.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeon.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonResizer.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdPixel.hpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdPow.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdResizer.h
 create mode 100755 3rdparty/simdlib/Simd/SimdResizerCommon.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdRuntime.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSet.h
 delete mode 100644 3rdparty/simdlib/Simd/SimdSse1Resizer.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp
 rename 3rdparty/simdlib/Simd/{SimdBaseBgraToRgba.cpp => SimdSse2Cpu.cpp} (62%)
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2Resizer.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp
 create mode 100755 3rdparty/simdlib/Simd/SimdSse41.h
 rename 3rdparty/simdlib/Simd/{SimdSsse3BgrToBgra.cpp => SimdSse41BgrToBgra.cpp} (57%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdSsse3BgrToGray.cpp => SimdSse41BgrToGray.cpp} (56%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdSsse3BgrToRgb.cpp => SimdSse41BgrToRgb.cpp} (84%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdSsse3BgraToBgr.cpp => SimdSse41BgraToBgr.cpp} (53%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdBaseRgbToGray.cpp => SimdSse41Cpu.cpp} (54%)
 rename 3rdparty/simdlib/Simd/{SimdSsse3Deinterleave.cpp => SimdSse41Deinterleave.cpp} (74%)
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp
 rename 3rdparty/simdlib/Simd/{SimdSsse3GaussianBlur3x3.cpp => SimdSse41GaussianBlur3x3.cpp} (95%)
 rename 3rdparty/simdlib/Simd/{SimdSsse3GrayToBgr.cpp => SimdSse41GrayToBgr.cpp} (92%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdSsse3Interleave.cpp => SimdSse41Interleave.cpp} (96%)
 rename 3rdparty/simdlib/Simd/{SimdSsse3Reduce.cpp => SimdSse41Reduce.cpp} (96%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdSsse3ReduceGray2x2.cpp => SimdSse41ReduceGray2x2.cpp} (94%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdSsse3ReduceGray4x4.cpp => SimdSse41ReduceGray4x4.cpp} (96%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdSsse3ResizeBilinear.cpp => SimdSse41ResizeBilinear.cpp} (98%)
 mode change 100644 => 100755
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse41Resizer.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3.h
 delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdStore.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdStream.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdUpdate.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdView.hpp

diff --git a/3rdparty/simdlib/CMakeLists.txt b/3rdparty/simdlib/CMakeLists.txt
index e6880b3800..dc6d111aae 100644
--- a/3rdparty/simdlib/CMakeLists.txt
+++ b/3rdparty/simdlib/CMakeLists.txt
@@ -109,23 +109,11 @@ if(X86 OR X86_64)
     file(GLOB_RECURSE SIMD_BASE_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdBase*.cpp)
     set_source_files_properties(${SIMD_BASE_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS}")
 
-    file(GLOB_RECURSE SIMD_SSE1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse1*.cpp)
-    set_source_files_properties(${SIMD_SSE1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG}")
-
     file(GLOB_RECURSE SIMD_SSE2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse2*.cpp)
-    set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE2_FLAG}")
-
-    file(GLOB_RECURSE SIMD_SSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse3*.cpp)
-    set_source_files_properties(${SIMD_SSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG}")
-
-    file(GLOB_RECURSE SIMD_SSSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSsse3*.cpp)
-    set_source_files_properties(${SIMD_SSSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSSE3_FLAG}")
+    set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG} ${SSE2_FLAG}")
 
     file(GLOB_RECURSE SIMD_SSE41_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse41*.cpp)
-    set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_1_FLAG}")
-
-    file(GLOB_RECURSE SIMD_SSE42_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse42*.cpp)
-    set_source_files_properties(${SIMD_SSE42_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}")
+    set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG} ${SSSE3_FLAG} ${SSE4_1_FLAG} ${SSE4_2_FLAG}")
 
     file(GLOB_RECURSE SIMD_AVX1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx1*.cpp)
     set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}")
diff --git a/3rdparty/simdlib/Simd/SimdSse1.h b/3rdparty/simdlib/Simd/SimdAlignment.h
old mode 100644
new mode 100755
similarity index 53%
rename from 3rdparty/simdlib/Simd/SimdSse1.h
rename to 3rdparty/simdlib/Simd/SimdAlignment.h
index e258d50ab3..9789cbb9e7
--- a/3rdparty/simdlib/Simd/SimdSse1.h
+++ b/3rdparty/simdlib/Simd/SimdAlignment.h
@@ -1,40 +1,73 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#ifndef __SimdSse_h__
-#define __SimdSse_h__
-
-#include "Simd/SimdDefs.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
-    {
-        void SquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum);
-
-        void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum);
-    }
-#endif// SIMD_SSE_ENABLE
-}
-#endif//__SimdSse_h__
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdAlignment_h__
+#define __SimdAlignment_h__
+
+#include "Simd/SimdEnable.h"
+
+namespace Simd
+{
+    SIMD_INLINE size_t GetAlignment()
+    {
+#ifdef SIMD_AVX2_ENABLE
+        if (Avx2::Enable)
+            return sizeof(__m256i);
+        else
+#endif
+#ifdef SIMD_AVX_ENABLE
+        if (Avx::Enable)
+            return sizeof(__m256);
+        else
+#endif
+#ifdef SIMD_SSE41_ENABLE
+        if (Sse41::Enable)
+            return sizeof(__m128i);
+        else
+#endif
+#ifdef SIMD_SSE2_ENABLE
+        if (Sse2::Enable)
+            return sizeof(__m128i);
+        else
+#endif
+#ifdef SIMD_NEON_ENABLE
+        if (Neon::Enable)
+            return sizeof(uint8x16_t);
+        else
+#endif
+            return sizeof(void *);
+    }
+
+    extern const size_t ALIGNMENT;
+
+    SIMD_INLINE size_t Alignment()
+    {
+#if defined(WIN32)
+        return GetAlignment();
+#else
+        return ALIGNMENT;
+#endif
+    }
+}
+
+#endif//__SimdAlignment_h__
diff --git a/3rdparty/simdlib/Simd/SimdAllocator.hpp b/3rdparty/simdlib/Simd/SimdAllocator.hpp
old mode 100644
new mode 100755
index cd65f196f4..8ee548e5ae
--- a/3rdparty/simdlib/Simd/SimdAllocator.hpp
+++ b/3rdparty/simdlib/Simd/SimdAllocator.hpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -125,8 +125,8 @@ namespace Simd
         */
         static SIMD_INLINE size_t Alignment()
         {
-#if defined(__SimdEnable_h__) && defined(WIN32)
-            return Simd::ALIGNMENT;
+#if defined(__SimdAlignment_h__) && defined(WIN32)
+            return Simd::Alignment();
 #else
             return SimdAlignment();
 #endif
diff --git a/3rdparty/simdlib/Simd/SimdArray.h b/3rdparty/simdlib/Simd/SimdArray.h
old mode 100644
new mode 100755
index 30e793080f..2f7f1bbbe0
--- a/3rdparty/simdlib/Simd/SimdArray.h
+++ b/3rdparty/simdlib/Simd/SimdArray.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -57,15 +57,28 @@ namespace Simd
                 }
                 *(size_t*)&size = size_;
                 if (size_)
-                    *(T**)&data = (T*)Simd::Allocate(size * sizeof(T), align);
+                    *(T**)&data = (T*)Simd::Allocate(RawSize(), align);
             }
             if (clear)
                 Clear();
         }
 
+        SIMD_INLINE void Assign(const T * src, size_t size_)
+        {
+            Resize(size_, src == NULL);
+            if(src)
+                memcpy(data, src, RawSize());
+        }
+
         SIMD_INLINE void Clear()
         {
-            ::memset(data, 0, size * sizeof(T));
+            memset(data, 0, RawSize());
+        }
+
+        SIMD_INLINE void Swap(const Array & array)
+        {
+            Simd::Swap((T*&)data, (T*&)(array.data));
+            Simd::Swap((size_t&)size, (size_t&)(array.size));
         }
 
         SIMD_INLINE T & operator[] (size_t i)
@@ -77,12 +90,19 @@ namespace Simd
         {
             return data[i];
         }
+
+        SIMD_INLINE size_t RawSize() const
+        {
+            return size * sizeof(T);
+        }
     };
 
+    typedef Array<int8_t> Array8i;
     typedef Array<uint8_t> Array8u;
     typedef Array<int16_t> Array16i;
     typedef Array<uint16_t> Array16u;
     typedef Array<int32_t> Array32i;
+    typedef Array<uint32_t> Array32u;
     typedef Array<float> Array32f;
 
 #if defined(__GNUC__) && __GNUC__ >= 6
@@ -90,8 +110,8 @@ namespace Simd
 #pragma GCC diagnostic ignored "-Wignored-attributes"
 #endif
 
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         typedef Array<__m128> Array128f;
     }
diff --git a/3rdparty/simdlib/Simd/SimdAvx1.h b/3rdparty/simdlib/Simd/SimdAvx1.h
old mode 100644
new mode 100755
index 25c070c459..48df913c02
--- a/3rdparty/simdlib/Simd/SimdAvx1.h
+++ b/3rdparty/simdlib/Simd/SimdAvx1.h
@@ -1,8 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
-*               2019-2019 Facundo Galan.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -22,8 +21,8 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#ifndef __SimdAvx1_h__
-#define __SimdAvx1_h__
+#ifndef __SimdAvx_h__
+#define __SimdAvx_h__
 
 #include "Simd/SimdDefs.h"
 
@@ -36,4 +35,4 @@ namespace Simd
     }
 #endif// SIMD_AVX_ENABLE
 }
-#endif//__SimdAvx1_h__
+#endif//__SimdAvx_h__
diff --git a/3rdparty/simdlib/Simd/SimdBaseRgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx1Cpu.cpp
similarity index 57%
rename from 3rdparty/simdlib/Simd/SimdBaseRgbaToGray.cpp
rename to 3rdparty/simdlib/Simd/SimdAvx1Cpu.cpp
index 22d37b17ee..9d9cbb29d3 100644
--- a/3rdparty/simdlib/Simd/SimdBaseRgbaToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx1Cpu.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,23 +21,46 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "Simd/SimdConversion.h"
+#include "Simd/SimdEnable.h"
+#include "Simd/SimdCpu.h"
+
+#if defined(_MSC_VER)
+#include <windows.h>
+#endif
 
 namespace Simd
 {
-    namespace Base
+#ifdef SIMD_AVX_ENABLE
+    namespace Avx
     {
-        void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride)
+        SIMD_INLINE bool SupportedByCPU()
         {
-            for (size_t row = 0; row < height; ++row)
+            return
+                Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) &&
+                Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::AVX);
+        }
+
+        SIMD_INLINE bool SupportedByOS()
+        {
+#if defined(_MSC_VER)
+            __try
             {
-                const uint8_t * pRgba = rgba + row*rgbaStride;
-                uint8_t * pGray = gray + row*grayStride;
-                for (const uint8_t *pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgba += 4)
-                {
-                    *pGray = RgbToGray(pRgba[0], pRgba[1], pRgba[2]);
-                }
+                __m256d value = _mm256_set1_pd(1.0);// try to execute of AVX instructions;
+                return true;
             }
+            __except (EXCEPTION_EXECUTE_HANDLER)
+            {
+                return false;
+            }
+#else
+            return true;
+#endif
+        }
+
+        bool GetEnable()
+        {
+            return SupportedByCPU() && SupportedByOS();
         }
     }
+#endif
 }
diff --git a/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp b/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp
old mode 100644
new mode 100755
index e409c17ff1..319c609408
--- a/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -42,7 +42,7 @@ namespace Simd
             float * pbx[2] = { _bx[0].data, _bx[1].data };
             int32_t prev = -2;
             size_t rsa = AlignLo(rs, Avx::F);
-            size_t rsh = AlignLo(rs, Sse::F);
+            size_t rsh = AlignLo(rs, Sse2::F);
             for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
             {
                 float fy1 = _ay[dy];
@@ -78,10 +78,10 @@ namespace Simd
                             __m256 m1 = _mm256_mul_ps(fx1, _mm256_shuffle_ps(s0145, s2367, 0xDD));
                             _mm256_store_ps(pb + dx, _mm256_add_ps(m0, m1));
                         }
-                        for (; dx < rsh; dx += Sse::F)
+                        for (; dx < rsh; dx += Sse2::F)
                         {
-                            __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]);
-                            __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]);
+                            __m128 s01 = Sse2::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]);
+                            __m128 s23 = Sse2::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]);
                             __m128 fx1 = _mm_load_ps(_ax.data + dx);
                             __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1);
                             __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88));
@@ -128,7 +128,7 @@ namespace Simd
                     __m256 m1 = _mm256_mul_ps(_mm256_load_ps(pbx[1] + dx), _fy1);
                     _mm256_storeu_ps(dst + dx, _mm256_add_ps(m0, m1));
                 }
-                for (; dx < rsh; dx += Sse::F)
+                for (; dx < rsh; dx += Sse2::F)
                 {
                     __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0));
                     __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1));
@@ -144,7 +144,7 @@ namespace Simd
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
         {
             ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m256));
-            if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp))
+            if (param.IsFloatBilinear())
                 return new ResizerFloatBilinear(param);
             else
                 return Sse41::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
diff --git a/3rdparty/simdlib/Simd/SimdAvx2.h b/3rdparty/simdlib/Simd/SimdAvx2.h
old mode 100644
new mode 100755
index 46d3b2d547..f5957b26c1
--- a/3rdparty/simdlib/Simd/SimdAvx2.h
+++ b/3rdparty/simdlib/Simd/SimdAvx2.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
 *               2019-2019 Facundo Galan.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -32,24 +32,22 @@ namespace Simd
 #ifdef SIMD_AVX2_ENABLE
     namespace Avx2
     {
+        void BgraToBgr(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* bgr, size_t bgrStride);
+
         void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride);
 
-        void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride);
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride);
+
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride);
 
         void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
             const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
         void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
-        void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha);
-
-        void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride);
-
         void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride);
 
-        void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride);
-
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
+        void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride);
 
         void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride);
 
@@ -87,6 +85,12 @@ namespace Simd
         void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
 
+        void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride);
+
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride);
+
         void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
     }
diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp
old mode 100644
new mode 100755
index b1f9ef8417..ffb4828e98
--- a/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -67,6 +67,8 @@ namespace Simd
                 BgrToBgra<false>(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
         }
 
+        //---------------------------------------------------------------------
+
         template <bool align> SIMD_INLINE void Bgr48pToBgra32(uint8_t * bgra,
             const uint8_t * blue, const uint8_t * green, const uint8_t * red, size_t offset, __m256i alpha)
         {
@@ -117,6 +119,45 @@ namespace Simd
             else
                 Bgr48pToBgra32<false>(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha);
         }
+
+        //---------------------------------------------------------------------
+
+        template <bool align> SIMD_INLINE void RgbToBgra(const uint8_t* rgb, uint8_t* bgra, __m256i alpha)
+        {
+            Store<align>((__m256i*)bgra + 0, RgbToBgra<false>(Load<align>((__m256i*)(rgb + 0)), alpha));
+            Store<align>((__m256i*)bgra + 1, RgbToBgra<false>(Load<false>((__m256i*)(rgb + 24)), alpha));
+            Store<align>((__m256i*)bgra + 2, RgbToBgra<false>(Load<false>((__m256i*)(rgb + 48)), alpha));
+            Store<align>((__m256i*)bgra + 3, RgbToBgra<true >(Load<align>((__m256i*)(rgb + 64)), alpha));
+        }
+
+        template <bool align> void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+
+            __m256i _alpha = _mm256_slli_si256(_mm256_set1_epi32(alpha), 3);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    RgbToBgra<align>(rgb + 3 * col, bgra + 4 * col, _alpha);
+                if (width != alignedWidth)
+                    RgbToBgra<false>(rgb + 3 * (width - A), bgra + 4 * (width - A), _alpha);
+                rgb += rgbStride;
+                bgra += bgraStride;
+            }
+        }
+
+        void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride))
+                RgbToBgra<true>(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+            else
+                RgbToBgra<false>(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+        }
     }
 #else
     // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToBgra.cpp.o) has no symbols
diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp
old mode 100644
new mode 100755
index d40b0f0cc6..7b922e7025
--- a/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -45,7 +45,7 @@ namespace Simd
         {
             const __m256i lo = PackI32ToI16(BgraToGray32(bgra[0]), BgraToGray32(bgra[1]));
             const __m256i hi = PackI32ToI16(BgraToGray32(bgra[2]), BgraToGray32(bgra[3]));
-            return PackU16ToU8(lo, hi);
+            return PackI16ToU8(lo, hi);
         }
 
         template <bool align> SIMD_INLINE __m256i BgrToGray(const uint8_t * bgr)
@@ -84,6 +84,63 @@ namespace Simd
             else
                 BgrToGray<false>(bgr, width, height, bgrStride, gray, grayStride);
         }
+
+
+        //---------------------------------------------------------------------
+
+        const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
+
+        SIMD_INLINE __m256i RgbaToGray32(__m256i rgba)
+        {
+            const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF);
+            const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF);
+            const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_ROUND), _mm256_madd_epi16(r0b0, K16_RED_BLUE));
+            return _mm256_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT);
+        }
+
+        SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4])
+        {
+            const __m256i lo = PackI32ToI16(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
+            const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
+            return PackI16ToU8(lo, hi);
+        }
+
+        template <bool align> SIMD_INLINE __m256i RgbToGray(const uint8_t* rgb)
+        {
+            __m256i rgba[4];
+            rgba[0] = BgrToBgra<false>(Load<align>((__m256i*)(rgb + 0)), K32_01000000);
+            rgba[1] = BgrToBgra<false>(Load<false>((__m256i*)(rgb + 24)), K32_01000000);
+            rgba[2] = BgrToBgra<false>(Load<false>((__m256i*)(rgb + 48)), K32_01000000);
+            rgba[3] = BgrToBgra<true>(Load<align>((__m256i*)(rgb + 64)), K32_01000000);
+            return RgbaToGray(rgba);
+        }
+
+        template <bool align> void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    Store<align>((__m256i*)(gray + col), RgbToGray<align>(rgb + 3 * col));
+                if (width != alignedWidth)
+                    Store<false>((__m256i*)(gray + width - A), RgbToGray<false>(rgb + 3 * (width - A)));
+                rgb += rgbStride;
+                gray += grayStride;
+            }
+        }
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+        {
+            if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride))
+                RgbToGray<true>(rgb, width, height, rgbStride, gray, grayStride);
+            else
+                RgbToGray<false>(rgb, width, height, rgbStride, gray, grayStride);
+        }
     }
 #else
     // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToGray.cpp.o) has no symbols
diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp
old mode 100644
new mode 100755
index 2daae1e7df..a64ed8035e
--- a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -64,7 +64,7 @@ namespace Simd
                 _mm256_shuffle_epi8(p1, K8_SHFL_2P1)), _mm256_shuffle_epi8(p2, K8_SHFL_2P2)));
         }
 
-        template <bool align> void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
+        template <bool align> void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride)
         {
             assert(width >= A);
             if (align)
@@ -85,12 +85,12 @@ namespace Simd
             }
         }
 
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
+        void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride)
         {
             if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride))
-                BgrToRgb<true>(bgr, bgrStride, width, height, rgb, rgbStride);
+                BgrToRgb<true>(bgr, width, height, bgrStride, rgb, rgbStride);
             else
-                BgrToRgb<false>(bgr, bgrStride, width, height, rgb, rgbStride);
+                BgrToRgb<false>(bgr, width, height, bgrStride, rgb, rgbStride);
         }
     }
 #else
diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp
deleted file mode 100644
index a4f9efdb2f..0000000000
--- a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdConversion.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX2_ENABLE
-    namespace Avx2
-    {
-        template <bool align> SIMD_INLINE void BgrToRgba(const uint8_t * bgr, uint8_t * rgba, __m256i alpha)
-        {
-            Store<align>((__m256i*)rgba + 0, BgrToRgba<false>(Load<align>((__m256i*)(bgr + 0)), alpha));
-            Store<align>((__m256i*)rgba + 1, BgrToRgba<false>(Load<false>((__m256i*)(bgr + 24)), alpha));
-            Store<align>((__m256i*)rgba + 2, BgrToRgba<false>(Load<false>((__m256i*)(bgr + 48)), alpha));
-            Store<align>((__m256i*)rgba + 3, BgrToRgba<true >(Load<align>((__m256i*)(bgr + 64)), alpha));
-        }
-
-        template <bool align> void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgba) && Aligned(rgbaStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            __m256i _alpha = _mm256_slli_si256(_mm256_set1_epi32(alpha), 3);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    BgrToRgba<align>(bgr + 3 * col, rgba + 4 * col, _alpha);
-                if (width != alignedWidth)
-                    BgrToRgba<false>(bgr + 3 * (width - A), rgba + 4 * (width - A), _alpha);
-                bgr += bgrStride;
-                rgba += rgbaStride;
-            }
-        }
-
-        void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha)
-        {
-            if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgrToRgba<true>(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-            else
-                BgrToRgba<false>(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToRgba.cpp.o) has no symbols
-    void dummy_SimdAvx2BgrToRgba(){};
-#endif//SIMD_AVX2_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp
new file mode 100755
index 0000000000..aac574d71c
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp
@@ -0,0 +1,149 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdStore.h"
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdConst.h"
+
+namespace Simd
+{
+#ifdef SIMD_AVX2_ENABLE  
+    namespace Avx2
+    {
+        template <bool align> SIMD_INLINE __m256i BgraToBgr(const uint8_t* bgra)
+        {
+            __m256i _bgra = Load<align>((__m256i*)bgra);
+            return _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(_bgra, K8_SHUFFLE_BGRA_TO_BGR), K32_PERMUTE_BGRA_TO_BGR);
+        }
+
+        template <bool align> void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
+        {
+            assert(width >= F);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride));
+
+            size_t widthF = AlignLo(width, F);
+            if (width == widthF)
+                widthF -= F;
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < widthF; col += F)
+                    Store<false>((__m256i*)(bgr + 3 * col), BgraToBgr<align>(bgra + 4 * col));
+                if (width != widthF)
+                    Store24<false>(bgr + 3 * (width - F), BgraToBgr<false>(bgra + 4 * (width - F)));
+                bgra += bgraStride;
+                bgr += bgrStride;
+            }
+        }
+
+        void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride))
+                BgraToBgr<true>(bgra, width, height, bgraStride, bgr, bgrStride);
+            else
+                BgraToBgr<false>(bgra, width, height, bgraStride, bgr, bgrStride);
+        }
+
+        //---------------------------------------------------------------------
+
+        const __m256i K8_SHUFFLE_BGRA_TO_RGB = SIMD_MM256_SETR_EPI8(
+            0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1,
+            0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1);
+
+        template <bool align> SIMD_INLINE __m256i BgraToRgb(const uint8_t* bgra)
+        {
+            __m256i _bgra = Load<align>((__m256i*)bgra);
+            return _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(_bgra, K8_SHUFFLE_BGRA_TO_RGB), K32_PERMUTE_BGRA_TO_BGR);
+        }
+
+        template <bool align> void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
+        {
+            assert(width >= F);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t widthF = AlignLo(width, F);
+            if (width == widthF)
+                widthF -= F;
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < widthF; col += F)
+                    Store<false>((__m256i*)(rgb + 3 * col), BgraToRgb<align>(bgra + 4 * col));
+                if (width != widthF)
+                    Store24<false>(rgb + 3 * (width - F), BgraToRgb<false>(bgra + 4 * (width - F)));
+                bgra += bgraStride;
+                rgb += rgbStride;
+            }
+        }
+
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride))
+                BgraToRgb<true>(bgra, width, height, bgraStride, rgb, rgbStride);
+            else
+                BgraToRgb<false>(bgra, width, height, bgraStride, rgb, rgbStride);
+        }
+
+        //---------------------------------------------------------------------
+
+        const __m256i K8_BGRA_TO_RGBA = SIMD_MM256_SETR_EPI8(
+            0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF,
+            0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF);
+
+        template <bool align> SIMD_INLINE void BgraToRgba(const uint8_t* bgra, uint8_t* rgba)
+        {
+            Store<align>((__m256i*)rgba, _mm256_shuffle_epi8(Load<align>((__m256i*)bgra), K8_BGRA_TO_RGBA));
+        }
+
+        template <bool align> void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride));
+
+            size_t size = width * 4;
+            size_t sizeA = AlignLo(size, A);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t i = 0; i < size; i += A)
+                    BgraToRgba<align>(bgra + i, rgba + i);
+                if (size != sizeA)
+                    BgraToRgba<false>(bgra + size - sizeA, rgba + size - sizeA);
+                bgra += bgraStride;
+                rgba += rgbaStride;
+            }
+        }
+
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride))
+                BgraToRgba<true>(bgra, width, height, bgraStride, rgba, rgbaStride);
+            else
+                BgraToRgba<false>(bgra, width, height, bgraStride, rgba, rgbaStride);
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp
old mode 100644
new mode 100755
index f203fcae78..7082801956
--- a/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -46,7 +46,7 @@ namespace Simd
         {
             const __m256i lo = PackI32ToI16(BgraToGray32(bgra[0]), BgraToGray32(bgra[1]));
             const __m256i hi = PackI32ToI16(BgraToGray32(bgra[2]), BgraToGray32(bgra[3]));
-            return PackU16ToU8(lo, hi);
+            return PackI16ToU8(lo, hi);
         }
 
         template <bool align> SIMD_INLINE void Load(const uint8_t* p, __m256i a[4])
@@ -89,6 +89,58 @@ namespace Simd
             else
                 BgraToGray<false>(bgra, width, height, bgraStride, gray, grayStride);
         }
+
+        //---------------------------------------------------------------------
+
+        const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
+
+        SIMD_INLINE __m256i RgbaToGray32(__m256i rgba)
+        {
+            const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF);
+            const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF);
+            const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_0000), _mm256_madd_epi16(r0b0, K16_RED_BLUE));
+            return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT);
+        }
+
+        SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4])
+        {
+            const __m256i lo = PackI32ToI16(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
+            const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
+            return PackI16ToU8(lo, hi);
+        }
+
+        template <bool align> void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            __m256i a[4];
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                {
+                    Load<align>(rgba + 4 * col, a);
+                    Store<align>((__m256i*)(gray + col), RgbaToGray(a));
+                }
+                if (alignedWidth != width)
+                {
+                    Load<false>(rgba + 4 * (width - A), a);
+                    Store<false>((__m256i*)(gray + width - A), RgbaToGray(a));
+                }
+                rgba += rgbaStride;
+                gray += grayStride;
+            }
+        }
+
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+        {
+            if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride))
+                RgbaToGray<true>(rgba, width, height, rgbaStride, gray, grayStride);
+            else
+                RgbaToGray<false>(rgba, width, height, rgbaStride, gray, grayStride);
+        }
     }
 #else
     // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgraToGray.cpp.o) has no symbols
diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp
deleted file mode 100644
index d64f184cbf..0000000000
--- a/3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdConversion.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX2_ENABLE
-    namespace Avx2
-    {
-        template <bool align> SIMD_INLINE void BgraToRgba(const uint8_t * bgra, uint8_t * rgba)
-        {
-            Store<align>((__m256i*)rgba + 0, BgraToRgba(Load<align>((__m256i*)(bgra + 0))));
-            Store<align>((__m256i*)rgba + 1, BgraToRgba(Load<align>((__m256i*)(bgra + 32))));
-            Store<align>((__m256i*)rgba + 2, BgraToRgba(Load<align>((__m256i*)(bgra + 64))));
-            Store<align>((__m256i*)rgba + 3, BgraToRgba(Load<align>((__m256i*)(bgra + 96))));
-        }
-
-        template <bool align> void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    BgraToRgba<align>(bgra + 4 * col, rgba + 4 * col);
-                if (width != alignedWidth)
-                    BgraToRgba<false>(bgra + 4 * (width - A), rgba + 4 * (width - A));
-                bgra += bgraStride;
-                rgba += rgbaStride;
-            }
-        }
-
-        void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride)
-        {
-            if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride))
-                BgraToRgba<true>(bgra, width, height, bgraStride, rgba, rgbaStride);
-            else
-                BgraToRgba<false>(bgra, width, height, bgraStride, rgba, rgbaStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToRgba.cpp.o) has no symbols
-    void dummy_SimdAvx2BgraToRgba(){};
-#endif//SIMD_AVX2_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp b/3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp
new file mode 100644
index 0000000000..778b11803a
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp
@@ -0,0 +1,68 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdEnable.h"
+#include "Simd/SimdCpu.h"
+
+#if defined(_MSC_VER)
+#include <windows.h>
+#endif
+
+namespace Simd
+{
+#ifdef SIMD_AVX2_ENABLE
+    namespace Avx2
+    {
+        SIMD_INLINE bool SupportedByCPU()
+        {
+            return
+                Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) &&
+                Base::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX2) &&
+                Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::FMA) &&
+                Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::F16C);
+        }
+
+        SIMD_INLINE bool SupportedByOS()
+        {
+#if defined(_MSC_VER)
+            __try
+            {
+                __m256i value = _mm256_abs_epi8(_mm256_set1_epi8(1));// try to execute of AVX2 instructions;
+                return true;
+            }
+            __except (EXCEPTION_EXECUTE_HANDLER)
+            {
+                return false;
+            }
+#else
+            return true;
+#endif
+        }
+
+        bool GetEnable()
+        {
+            return SupportedByCPU() && SupportedByOS();
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp b/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp
old mode 100644
new mode 100755
index 762d0f37ba..2bf5741a35
--- a/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -69,13 +69,15 @@ namespace Simd
                 DeinterleaveBgr<false>(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride);
         }
 
+        //---------------------------------------------------------------------
+
         const __m256i K8_SHUFFLE_BGRA = SIMD_MM256_SETR_EPI8(
             0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF,
             0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF);
 
         const __m256i K32_PERMUTE_BGRA = SIMD_MM256_SETR_EPI32(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7);
 
-        template <bool align> SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset)
+        template <bool align, bool alpha> SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset)
         {
             __m256i _bgra[4];
             _bgra[0] = _mm256_shuffle_epi8(Load<align>((__m256i*)bgra + 0), K8_SHUFFLE_BGRA);
@@ -93,39 +95,58 @@ namespace Simd
             __m256i rraa1 = _mm256_unpackhi_epi32(_bgra[2], _bgra[3]);
 
             Store<align>((__m256i*)(r + offset), _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(rraa0, rraa1), K32_PERMUTE_BGRA));
-            Store<align>((__m256i*)(a + offset), _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(rraa0, rraa1), K32_PERMUTE_BGRA));
+            if(alpha)
+                Store<align>((__m256i*)(a + offset), _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(rraa0, rraa1), K32_PERMUTE_BGRA));
         }
 
-        template <bool align> void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height,
-            uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride)
+        template <bool align> void DeinterleaveBgra(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height,
+            uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride)
         {
             assert(width >= A);
             if (align)
             {
                 assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride));
-                assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride));
+                assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL));
             }
 
             size_t alignedWidth = AlignLo(width, A);
 
-            for (size_t row = 0; row < height; ++row)
+            if (a)
             {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    DeinterleaveBgra<align>(bgra + col * 4, b, g, r, a, col);
-                if (width != alignedWidth)
-                    DeinterleaveBgra<false>(bgra + 4 * (width - A), b, g, r, a, width - A);
-                bgra += bgraStride;
-                b += bStride;
-                g += gStride;
-                r += rStride;
-                a += aStride;
+                for (size_t row = 0; row < height; ++row)
+                {
+                    for (size_t col = 0; col < alignedWidth; col += A)
+                        DeinterleaveBgra<align, true>(bgra + col * 4, b, g, r, a, col);
+                    if (width != alignedWidth)
+                        DeinterleaveBgra<false, true>(bgra + 4 * (width - A), b, g, r, a, width - A);
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
+                    a += aStride;
+                }
+            }
+            else
+            {
+                for (size_t row = 0; row < height; ++row)
+                {
+                    for (size_t col = 0; col < alignedWidth; col += A)
+                        DeinterleaveBgra<align, false>(bgra + col * 4, b, g, r, NULL, col);
+                    if (width != alignedWidth)
+                        DeinterleaveBgra<false, false>(bgra + 4 * (width - A), b, g, r, NULL, width - A);
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
+                }
             }
         }
 
-        void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height,
-            uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride)
+        void DeinterleaveBgra(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height,
+            uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride)
         {
-            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride))
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) &&
+                Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL))
                 DeinterleaveBgra<true>(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
             else
                 DeinterleaveBgra<false>(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
diff --git a/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp b/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp
old mode 100644
new mode 100755
index 243663a169..beefb55410
--- a/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -22,6 +22,7 @@
 * SOFTWARE.
 */
 #include "Simd/SimdMemory.h"
+#include "Simd/SimdLoadBlock.h"
 #include "Simd/SimdStore.h"
 #include "Simd/SimdGaussianBlur.h"
 #include "Simd/SimdExtract.h"
diff --git a/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp b/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp
old mode 100644
new mode 100755
index ca40f5a347..5a85a27334
--- a/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp
@@ -42,7 +42,7 @@ namespace Simd
                 _mm256_and_si256(_mm256_srli_si256(s01, 1), K16_00FF),
                 _mm256_and_si256(s11, K16_00FF),
                 _mm256_and_si256(_mm256_srli_si256(s11, 1), K16_00FF));
-            return PackU16ToU8(lo, hi);
+            return PackI16ToU8(lo, hi);
         }
 #else
         SIMD_INLINE __m256i Average16(const __m256i & s0, const __m256i & s1)
@@ -52,7 +52,7 @@ namespace Simd
 
         SIMD_INLINE __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11)
         {
-            return PackU16ToU8(Average16(s00, s10), Average16(s01, s11));
+            return PackI16ToU8(Average16(s00, s10), Average16(s01, s11));
         }
 #endif
 
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp
old mode 100644
new mode 100755
index c4ee30e989..d7caad1571
--- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2018 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -42,7 +42,7 @@ namespace Simd
                 _mm256_and_si256(_mm256_srli_si256(s01, 1), K16_00FF),
                 _mm256_and_si256(s11, K16_00FF),
                 _mm256_and_si256(_mm256_srli_si256(s11, 1), K16_00FF));
-            return PackU16ToU8(lo, hi);
+            return PackI16ToU8(lo, hi);
         }
 #else
         SIMD_INLINE __m256i Average16(const __m256i & s0, const __m256i & s1)
@@ -52,7 +52,7 @@ namespace Simd
 
         SIMD_INLINE __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11)
         {
-            return PackU16ToU8(Average16(s00, s10), Average16(s01, s11));
+            return PackI16ToU8(Average16(s00, s10), Average16(s01, s11));
         }
 #endif
 
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp
old mode 100644
new mode 100755
index 34b4a91ecb..71f36b978f
--- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -78,7 +78,7 @@ namespace Simd
 
         template <bool compensation> SIMD_INLINE __m256i ReduceRow(const __m256i lo[3], const __m256i hi[3])
         {
-            return PackU16ToU8(
+            return PackI16ToU8(
                 DivideBy16<compensation>(BinomialSum16(lo[0], lo[1], lo[2])),
                 DivideBy16<compensation>(BinomialSum16(hi[0], hi[1], hi[2])));
         }
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp
old mode 100644
new mode 100755
index bf732178ed..cea41815d3
--- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -119,7 +119,7 @@ namespace Simd
         {
             __m256i lo = ReduceRow16<align>(buffer, offset);
             __m256i hi = ReduceRow16<align>(buffer, offset + HA);
-            return PackU16ToU8(lo, hi);
+            return PackI16ToU8(lo, hi);
         }
 
         template <bool even> void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp
old mode 100644
new mode 100755
index 96771d8aee..fe2ebbd3cf
--- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -120,14 +120,14 @@ namespace Simd
         {
             const __m256i lo = MainRowX5x5<align, compensation>(buffer.dst + offset);
             const __m256i hi = MainRowX5x5<align, compensation>(buffer.dst + offset + HA);
-            return _mm256_and_si256(PackU16ToU8(lo, hi), K16_00FF);
+            return _mm256_and_si256(PackI16ToU8(lo, hi), K16_00FF);
         }
 
         template <bool align, bool compensation> SIMD_INLINE void MainRowX5x5(Buffer & buffer, size_t offset, uint8_t * dst)
         {
             __m256i lo = MainRowX5x5<align, compensation>(buffer, offset);
             __m256i hi = MainRowX5x5<align, compensation>(buffer, offset + A);
-            Store<false>((__m256i*)dst, PackU16ToU8(lo, hi));
+            Store<false>((__m256i*)dst, PackI16ToU8(lo, hi));
         }
 
         template <bool align, bool compensation> void ReduceGray5x5(
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp b/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp
old mode 100644
new mode 100755
index f00b174cb2..53c9cdc9f8
--- a/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -273,7 +273,7 @@ namespace Simd
         {
             __m256i lo = InterpolateY<align>((__m256i*)bx0 + 0, (__m256i*)bx1 + 0, alpha);
             __m256i hi = InterpolateY<align>((__m256i*)bx0 + 1, (__m256i*)bx1 + 1, alpha);
-            Store<false>((__m256i*)dst, PackU16ToU8(lo, hi));
+            Store<false>((__m256i*)dst, PackI16ToU8(lo, hi));
         }
 
         template <size_t channelCount> void ResizeBilinear(
diff --git a/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp b/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp
old mode 100644
new mode 100755
index ab739b7aa9..d75c24989d
--- a/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -23,6 +23,7 @@
 */
 #include "Simd/SimdMemory.h"
 #include "Simd/SimdResizer.h"
+#include "Simd/SimdResizerCommon.h"
 #include "Simd/SimdStore.h"
 #include "Simd/SimdSet.h"
 #include "Simd/SimdUpdate.h"
@@ -33,7 +34,7 @@ namespace Simd
     namespace Avx2
     {
         ResizerByteBilinear::ResizerByteBilinear(const ResParam & param)
-            : Ssse3::ResizerByteBilinear(param)
+            : Sse41::ResizerByteBilinear(param)
         {
         }
 
@@ -223,7 +224,7 @@ namespace Simd
         {
             __m256i lo = ResizerByteBilinearInterpolateY<align>((__m256i*)bx0 + 0, (__m256i*)bx1 + 0, alpha);
             __m256i hi = ResizerByteBilinearInterpolateY<align>((__m256i*)bx0 + 1, (__m256i*)bx1 + 1, alpha);
-            Store<false>((__m256i*)dst, PackU16ToU8(lo, hi));
+            Store<false>((__m256i*)dst, PackI16ToU8(lo, hi));
         }
 
         template<size_t N> void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride)
@@ -523,7 +524,7 @@ namespace Simd
             float * pbx[2] = { _bx[0].data, _bx[1].data };
             int32_t prev = -2;
             size_t rsa = AlignLo(rs, Avx::F);
-            size_t rsh = AlignLo(rs, Sse::F);
+            size_t rsh = AlignLo(rs, Sse2::F);
             for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
             {
                 float fy1 = _ay[dy];
@@ -560,10 +561,10 @@ namespace Simd
                             __m256 s1 = _mm256_shuffle_ps(s0145, s2367, 0xDD);
                             _mm256_store_ps(pb + dx, _mm256_fmadd_ps(s0, fx0, _mm256_mul_ps(s1, fx1)));
                         }
-                        for (; dx < rsh; dx += Sse::F)
+                        for (; dx < rsh; dx += Sse2::F)
                         {
-                            __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]);
-                            __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]);
+                            __m128 s01 = Sse2::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]);
+                            __m128 s23 = Sse2::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]);
                             __m128 fx1 = _mm_load_ps(_ax.data + dx);
                             __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1);
                             __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88));
@@ -625,7 +626,7 @@ namespace Simd
                     __m256 b1 = _mm256_load_ps(pbx[1] + dx);
                     _mm256_storeu_ps(dst + dx, _mm256_fmadd_ps(b0, _fy0, _mm256_mul_ps(b1, _fy1)));
                 }
-                for (; dx < rsh; dx += Sse::F)
+                for (; dx < rsh; dx += Sse2::F)
                 {
                     __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0));
                     __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1));
@@ -641,11 +642,11 @@ namespace Simd
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
         {
             ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m256i));
-            if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && dstX >= A)
+            if (param.IsByteBilinear() && dstX >= A)
                 return new ResizerByteBilinear(param);
-            else if (type == SimdResizeChannelByte && method == SimdResizeMethodArea)
+            else if (param.IsByteArea())
                 return new ResizerByteArea(param);
-            else if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp))
+            else if (param.IsFloatBilinear())
                 return new ResizerFloatBilinear(param);
             else
                 return Avx::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
diff --git a/3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp
deleted file mode 100644
index 1533d99dfb..0000000000
--- a/3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-#include "Simd/SimdConversion.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX2_ENABLE
-    namespace Avx2
-    {
-        const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
-        const __m256i K16_GREEN_ROUND = SIMD_MM256_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM);
-
-        SIMD_INLINE __m256i RgbaToGray32(__m256i rgba)
-        {
-            const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF);
-            const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF);
-            const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_ROUND), _mm256_madd_epi16(r0b0, K16_RED_BLUE));
-            return _mm256_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT);
-        }
-
-        SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4])
-        {
-            const __m256i lo = PackI32ToI16(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
-            const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
-            return PackU16ToU8(lo, hi);
-        }
-
-        template <bool align> SIMD_INLINE __m256i RgbToGray(const uint8_t * rgb)
-        {
-            __m256i rgba[4];
-            rgba[0] = BgrToBgra<false>(Load<align>((__m256i*)(rgb + 0)), K32_01000000);
-            rgba[1] = BgrToBgra<false>(Load<false>((__m256i*)(rgb + 24)), K32_01000000);
-            rgba[2] = BgrToBgra<false>(Load<false>((__m256i*)(rgb + 48)), K32_01000000);
-            rgba[3] = BgrToBgra<true>(Load<align>((__m256i*)(rgb + 64)), K32_01000000);
-            return RgbaToGray(rgba);
-        }
-
-        template <bool align> void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    Store<align>((__m256i*)(gray + col), RgbToGray<align>(rgb + 3 * col));
-                if (width != alignedWidth)
-                    Store<false>((__m256i*)(gray + width - A), RgbToGray<false>(rgb + 3 * (width - A)));
-                rgb += rgbStride;
-                gray += grayStride;
-            }
-        }
-
-        void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride)
-        {
-            if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride))
-                RgbToGray<true>(rgb, width, height, rgbStride, gray, grayStride);
-            else
-                RgbToGray<false>(rgb, width, height, rgbStride, gray, grayStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2RgbToGray.cpp.o) has no symbols
-    void dummy_SimdAvx2RgbToGray(){};
-#endif//SIMD_AVX2_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp
deleted file mode 100644
index d28cb39832..0000000000
--- a/3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdConversion.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX2_ENABLE
-    namespace Avx2
-    {
-        const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
-        const __m256i K16_GREEN_0000 = SIMD_MM256_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, 0x0000);
-        const __m256i K32_ROUND_TERM = SIMD_MM256_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM);
-
-        SIMD_INLINE __m256i RgbaToGray32(__m256i rgba)
-        {
-            const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF);
-            const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF);
-            const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_0000), _mm256_madd_epi16(r0b0, K16_RED_BLUE));
-            return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT);
-        }
-
-        SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4])
-        {
-            const __m256i lo = PackI32ToI16(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
-            const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
-            return PackU16ToU8(lo, hi);
-        }
-
-        template <bool align> SIMD_INLINE void Load(const uint8_t* p, __m256i a[4])
-        {
-            a[0] = Load<align>((__m256i*)p + 0);
-            a[1] = Load<align>((__m256i*)p + 1);
-            a[2] = Load<align>((__m256i*)p + 2);
-            a[3] = Load<align>((__m256i*)p + 3);
-        }
-
-        template <bool align> void RgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-            __m256i a[4];
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                {
-                    Load<align>(rgba + 4 * col, a);
-                    Store<align>((__m256i*)(gray + col), RgbaToGray(a));
-                }
-                if (alignedWidth != width)
-                {
-                    Load<false>(rgba + 4 * (width - A), a);
-                    Store<false>((__m256i*)(gray + width - A), RgbaToGray(a));
-                }
-                rgba += rgbaStride;
-                gray += grayStride;
-            }
-        }
-
-        void RgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride)
-        {
-            if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride))
-                RgbaToGray<true>(rgba, width, height, rgbaStride, gray, grayStride);
-            else
-                RgbaToGray<false>(rgba, width, height, rgbaStride, gray, grayStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2RgbaToGray.cpp.o) has no symbols
-    void dummy_SimdAvx2RgbaToGray(){};
-#endif// SIMD_AVX2_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdBase.h b/3rdparty/simdlib/Simd/SimdBase.h
old mode 100644
new mode 100755
index 57d654751e..998a7b7cbe
--- a/3rdparty/simdlib/Simd/SimdBase.h
+++ b/3rdparty/simdlib/Simd/SimdBase.h
@@ -38,7 +38,9 @@ namespace Simd
 
         void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride);
 
-        void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride);
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride);
+
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride);
 
         void BgrToBgra(const uint8_t * bgr, size_t size, uint8_t * bgra, bool fillAlpha, bool lastRow, uint8_t alpha);
 
@@ -47,15 +49,9 @@ namespace Simd
         void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
             const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
-        void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha);
-
-        void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride);
-
         void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride);
 
-        void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride);
-
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
+        void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride);
 
         void Copy(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, uint8_t * dst, size_t dstStride);
 
@@ -104,6 +100,12 @@ namespace Simd
         void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
 
+        void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride);
+
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride);
+
         void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
 
diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp
old mode 100644
new mode 100755
index b909ee9d20..b5b8140dbe
--- a/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -106,5 +106,23 @@ namespace Simd
                 bgra += bgraStride;
             }
         }
+
+        void RgbToBgra(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+        {
+            size_t rgbGap = rgbStride - width * 3;
+            size_t bgraGap = bgraStride - width * 4;
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < width; ++col, rgb += 3, bgra += 4)
+                {
+                    bgra[0] = rgb[2];
+                    bgra[1] = rgb[1];
+                    bgra[2] = rgb[0];
+                    bgra[3] = alpha;
+                }
+                rgb += rgbGap;
+                bgra += bgraGap;
+            }
+        }
     }
 }
diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp
old mode 100644
new mode 100755
index e6fa81ddb1..26f7bf171b
--- a/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -39,5 +39,18 @@ namespace Simd
                 }
             }
         }
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+        {
+            for (size_t row = 0; row < height; ++row)
+            {
+                const uint8_t* pRgb = rgb + row * rgbStride;
+                uint8_t* pGray = gray + row * grayStride;
+                for (const uint8_t* pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgb += 3)
+                {
+                    *pGray = BgrToGray(pRgb[2], pRgb[1], pRgb[0]);
+                }
+            }
+        }
     }
 }
diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp
old mode 100644
new mode 100755
index d508115a64..ece4ffc97f
--- a/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@ namespace Simd
 {
     namespace Base
     {
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
+        void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride)
         {
             size_t size = width * 3;
             for (size_t row = 0; row < height; ++row)
diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp
deleted file mode 100644
index b7003c067b..0000000000
--- a/3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdDefs.h"
-#include <algorithm>
-
-namespace Simd
-{
-    namespace Base
-    {
-        void BgrToRgba(const uint8_t *bgr, size_t size, uint8_t *rgba, bool fillAlpha, bool lastRow, uint8_t alpha)
-        {
-            if (fillAlpha)
-            {
-#ifdef SIMD_BIG_ENDIAN
-                const int32_t alphaMask = alpha;
-#else
-                const int32_t alphaMask = alpha << 24;
-#endif
-                for (size_t i = (lastRow ? 1 : 0); i < size; ++i, bgr += 3, rgba += 4)
-                {
-                    *(int32_t*)rgba = (*(int32_t*)bgr) | alphaMask;
-                    std::swap(rgba[0], rgba[2]);
-                }
-                if (lastRow)
-                {
-                    rgba[0] = bgr[2];
-                    rgba[1] = bgr[1];
-                    rgba[2] = bgr[0];
-                    rgba[3] = alpha;
-                }
-            }
-            else
-            {
-                for (size_t i = (lastRow ? 1 : 0); i < size; ++i, bgr += 3, rgba += 4)
-                {
-                    *(int32_t*)rgba = (*(int32_t*)bgr);
-                    std::swap(rgba[0], rgba[2]);
-                }
-                if (lastRow)
-                {
-                    rgba[0] = bgr[2];
-                    rgba[1] = bgr[1];
-                    rgba[2] = bgr[0];
-                }
-            }
-        }
-
-        void BgrToRgba(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *rgba, size_t bgraStride, uint8_t alpha)
-        {
-            for (size_t row = 1; row < height; ++row)
-            {
-                BgrToRgba(bgr, width, rgba, true, false, alpha);
-                bgr += bgrStride;
-                rgba += bgraStride;
-            }
-            BgrToRgba(bgr, width, rgba, true, true, alpha);
-        }
-    }
-}
diff --git a/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp
old mode 100644
new mode 100755
index 8d3b1bbc6c..6ee5d55355
--- a/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -51,5 +51,40 @@ namespace Simd
             }
             BgraToBgr(bgra, width, bgr, true);
         }
+
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
+        {
+            size_t bgraGap = bgraStride - width * 4;
+            size_t rgbGap = rgbStride - width * 3;
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < width; ++col, bgra += 4, rgb += 3)
+                {
+                    rgb[2] = bgra[0];
+                    rgb[1] = bgra[1];
+                    rgb[0] = bgra[2];
+                }
+                bgra += bgraGap;
+                rgb += rgbGap;
+            }
+        }
+
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
+        {
+            size_t bgraGap = bgraStride - width * 4;
+            size_t rgbaGap = rgbaStride - width * 4;
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < width; ++col, bgra += 4, rgba += 4)
+                {
+                    rgba[2] = bgra[0];
+                    rgba[1] = bgra[1];
+                    rgba[0] = bgra[2];
+                    rgba[3] = bgra[3];
+                }
+                bgra += bgraGap;
+                rgba += rgbaGap;
+            }
+        }
     }
 }
diff --git a/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp b/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp
old mode 100644
new mode 100755
index 3d855e749e..16fba3e7ce
--- a/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -39,5 +39,18 @@ namespace Simd
                 }
             }
         }
+
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+        {
+            for (size_t row = 0; row < height; ++row)
+            {
+                const uint8_t* pRgba = rgba + row * rgbaStride;
+                uint8_t* pGray = gray + row * grayStride;
+                for (const uint8_t* pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgba += 4)
+                {
+                    *pGray = BgrToGray(pRgba[2], pRgba[1], pRgba[0]);
+                }
+            }
+        }
     }
 }
diff --git a/3rdparty/simdlib/Simd/SimdBaseCpu.cpp b/3rdparty/simdlib/Simd/SimdBaseCpu.cpp
new file mode 100644
index 0000000000..77fc5718df
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseCpu.cpp
@@ -0,0 +1,234 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdCpu.h"
+
+#include <vector>
+#include <thread>
+#include <sstream>
+#include <iostream>
+
+#if defined(_MSC_VER)
+
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <intrin.h>
+
+#elif defined(__GNUC__)
+#include <unistd.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
+#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
+#include <cpuid.h>
+#endif
+
+#if defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)
+#include <fcntl.h>
+#include <sys/auxv.h>
+#if defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)
+#include <asm/hwcap.h>
+#endif
+#endif
+
+#else
+# error Do not know how to detect CPU info
+#endif
+
+namespace Simd
+{
+    namespace Base
+    {
+#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
+        bool CheckBit(Cpuid::Level level, Cpuid::Register index, Cpuid::Bit bit)
+        {
+            unsigned int registers[4] = { 0, 0, 0, 0 };
+#if defined(_MSC_VER)
+            __cpuid((int*)registers, level);
+#elif (defined __GNUC__)
+            if (__get_cpuid_max(0, NULL) < level)
+                return false;
+            __cpuid_count(level, 0, 
+                registers[Cpuid::Eax], 
+                registers[Cpuid::Ebx], 
+                registers[Cpuid::Ecx], 
+                registers[Cpuid::Edx]);
+#else
+#error Do not know how to detect CPU info!
+#endif
+            return (registers[index] & bit) == bit;
+        }
+#endif//defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
+
+#if defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE))
+        bool CheckBit(int at, int bit)
+        {
+            bool result = false;
+            int file = ::open("/proc/self/auxv", O_RDONLY);
+            if (file < 0)
+                return false;
+            const ssize_t size = 64;
+            unsigned long buffer[size];
+            for (ssize_t count = size; count == size;)
+            {
+                count = ::read(file, buffer, sizeof(buffer)) / sizeof(unsigned long);
+                for (int i = 0; i < count; i += 2)
+                {
+                    if (buffer[i] == (unsigned)at)
+                    {
+                        result = !!(buffer[i + 1] & bit);
+                        count = 0;
+                    }
+                    if (buffer[i] == AT_NULL)
+                        count = 0;
+                }
+            }
+            ::close(file);
+            return result;
+        }
+#endif//defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE))
+
+        size_t CpuThreadNumber()
+        {
+            return std::thread::hardware_concurrency();
+        }
+
+#if defined(_MSC_VER)
+        typedef SYSTEM_LOGICAL_PROCESSOR_INFORMATION Info;
+
+        void GetLogicalProcessorInformation(std::vector<Info> & info)
+        {
+            DWORD size = 0;
+            ::GetLogicalProcessorInformation(0, &size); 
+            info.resize(size / sizeof(Info));
+            ::GetLogicalProcessorInformation(info.data(), &size);
+        }
+
+        size_t CpuSocketNumber()
+        {
+            std::vector<Info> info;
+            GetLogicalProcessorInformation(info);
+            size_t number = 0;
+            for (size_t i = 0; i < info.size(); ++i)
+                if (info[i].Relationship == ::RelationNumaNode)
+                    number++;
+            return number;
+        }            
+
+        size_t CpuCoreNumber()
+        {
+            std::vector<Info> info;
+            GetLogicalProcessorInformation(info);
+            size_t number = 0;
+            for (size_t i = 0; i < info.size(); ++i)
+                if (info[i].Relationship == ::RelationProcessorCore)
+                    number++;
+            return number;
+        }
+
+        size_t CpuCacheSize(size_t level)
+        {
+            std::vector<Info> info;
+            GetLogicalProcessorInformation(info);
+            for (size_t i = 0; i < info.size(); ++i)
+                if (info[i].Relationship == ::RelationCache && info[i].Cache.Level == level && (info[i].Cache.Type == ::CacheData || info[i].Cache.Type == CacheUnified))
+                    return info[i].Cache.Size;
+            return 0;
+        }
+#elif defined(__GNUC__)
+        size_t CpuSocketNumber()
+        {
+            uint32_t number = 0;
+            ::FILE * p = ::popen("lscpu -b -p=Socket | grep -v '^#' | sort -u | wc -l", "r");
+            if (p)
+            {
+                char buffer[PATH_MAX];
+                while (::fgets(buffer, PATH_MAX, p));
+                number = ::atoi(buffer);
+                ::pclose(p);
+            }
+            return number;
+        }
+
+        size_t CpuCoreNumber()
+        {
+            uint32_t number = 0;
+            ::FILE * p = ::popen("lscpu -b -p=Core | grep -v '^#' | sort -u | wc -l", "r");
+            if (p)
+            {
+                char buffer[PATH_MAX];
+                while (::fgets(buffer, PATH_MAX, p));
+                number = ::atoi(buffer);
+                ::pclose(p);
+            }
+            return number;
+        }
+
+        SIMD_INLINE size_t CorrectIfZero(size_t value, size_t otherwise)
+        {
+            return value ? value : otherwise;
+        }
+
+#if defined(_SC_LEVEL1_DCACHE_SIZE) && defined(_SC_LEVEL2_CACHE_SIZE) && defined(_SC_LEVEL3_CACHE_SIZE)
+        size_t CpuCacheSize(size_t level)
+        {
+            switch (level)
+            {
+            case 1: return CorrectIfZero(::sysconf(_SC_LEVEL1_DCACHE_SIZE), 32 * 1024);
+            case 2: return CorrectIfZero(::sysconf(_SC_LEVEL2_CACHE_SIZE), 256 * 1024);
+            case 3: return CorrectIfZero(::sysconf(_SC_LEVEL3_CACHE_SIZE), 2048 * 1024);
+            default:
+                return 0;
+            }
+        }
+#else
+        size_t CpuCacheSize(size_t level)
+        {
+            switch (level)
+            {
+            case 1: return 32 * 1024;
+            case 2: return 256 * 1024;
+            case 3: return 2048 * 1024;
+            default:
+                return 0;
+            }
+        }
+#endif
+
+#else
+#error This platform is unsupported!
+#endif
+    }
+
+    namespace Cpu
+    {
+        const size_t SOCKET_NUMBER = Base::CpuSocketNumber();
+        const size_t CORE_NUMBER = Base::CpuCoreNumber();
+        const size_t THREAD_NUMBER = Base::CpuThreadNumber();
+        const size_t L1_CACHE_SIZE = Base::CpuCacheSize(1);
+        const size_t L2_CACHE_SIZE = Base::CpuCacheSize(2);
+        const size_t L3_CACHE_SIZE = Base::CpuCacheSize(3);
+    }
+}
diff --git a/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp b/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp
old mode 100644
new mode 100755
index ecb22ed4b0..366ce1bc0e
--- a/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -48,20 +48,39 @@ namespace Simd
         void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height,
             uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride)
         {
-            for (size_t row = 0; row < height; ++row)
+            if (a)
             {
-                for (size_t col = 0, offset = 0; col < width; ++col, offset += 4)
+                for (size_t row = 0; row < height; ++row)
                 {
-                    b[col] = bgra[offset + 0];
-                    g[col] = bgra[offset + 1];
-                    r[col] = bgra[offset + 2];
-                    a[col] = bgra[offset + 3];
+                    for (size_t col = 0, offset = 0; col < width; ++col, offset += 4)
+                    {
+                        b[col] = bgra[offset + 0];
+                        g[col] = bgra[offset + 1];
+                        r[col] = bgra[offset + 2];
+                        a[col] = bgra[offset + 3];
+                    }
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
+                    a += aStride;
+                }
+            }
+            else
+            {
+                for (size_t row = 0; row < height; ++row)
+                {
+                    for (size_t col = 0, offset = 0; col < width; ++col, offset += 4)
+                    {
+                        b[col] = bgra[offset + 0];
+                        g[col] = bgra[offset + 1];
+                        r[col] = bgra[offset + 2];
+                    }
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
                 }
-                bgra += bgraStride;
-                b += bStride;
-                g += gStride;
-                r += rStride;
-                a += aStride;
             }
         }
     }
diff --git a/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp b/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp
old mode 100644
new mode 100755
index 560b9d3cb9..1394d919e1
--- a/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
diff --git a/3rdparty/simdlib/Simd/SimdBaseResizer.cpp b/3rdparty/simdlib/Simd/SimdBaseResizer.cpp
old mode 100644
new mode 100755
index 9585a4f1ac..b8c08d2b92
--- a/3rdparty/simdlib/Simd/SimdBaseResizer.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseResizer.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -23,6 +23,7 @@
 */
 #include "Simd/SimdMemory.h"
 #include "Simd/SimdResizer.h"
+#include "Simd/SimdCopyPixel.h"
 
 namespace Simd
 {
@@ -132,8 +133,6 @@ namespace Simd
         ResizerByteArea::ResizerByteArea(const ResParam & param)
             : Resizer(param)
         {
-            double scale = Simd::Max(float(_param.srcW) / _param.dstW, float(_param.srcH) / _param.dstH);
-
             _ay.Resize(_param.dstH + 1);
             _iy.Resize(_param.dstH + 1);
             EstimateParams(_param.srcH, _param.dstH, Base::AREA_RANGE, _ay.data, _iy.data);
@@ -234,28 +233,173 @@ namespace Simd
 
         //---------------------------------------------------------------------
 
+        ResizerShortBilinear::ResizerShortBilinear(const ResParam& param)
+            : Resizer(param)
+        {
+            _ay.Resize(_param.dstH, false, _param.align);
+            _iy.Resize(_param.dstH, false, _param.align);
+            EstimateIndexAlpha(_param.srcH, _param.dstH, 1, _iy.data, _ay.data);
+            size_t rs = _param.dstW * _param.channels;
+            _ax.Resize(rs, false, _param.align);
+            _ix.Resize(rs, false, _param.align);
+            EstimateIndexAlpha(_param.srcW, _param.dstW, _param.channels, _ix.data, _ax.data);
+            _bx[0].Resize(rs, false, _param.align);
+            _bx[1].Resize(rs, false, _param.align);
+        }
+
+        void ResizerShortBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t* indices, float* alphas)
+        {
+            float scale = (float)srcSize / dstSize;
+            for (size_t i = 0; i < dstSize; ++i)
+            {
+                float alpha = (float)((i + 0.5f) * scale - 0.5f);
+                ptrdiff_t index = (ptrdiff_t)::floor(alpha);
+                alpha -= index;
+                if (index < 0)
+                {
+                    index = 0;
+                    alpha = 0;
+                }
+                if (index > (ptrdiff_t)srcSize - 2)
+                {
+                    index = srcSize - 2;
+                    alpha = 1;
+                }
+                for (size_t c = 0; c < channels; c++)
+                {
+                    size_t offset = i * channels + c;
+                    indices[offset] = (int32_t)(channels * index + c);
+                    alphas[offset] = alpha;
+                }
+            }
+        }
+
+        void ResizerShortBilinear::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
+        {
+            Run((const uint16_t*)src, srcStride / sizeof(uint16_t), (uint16_t*)dst, dstStride / sizeof(uint16_t));
+        }
+
+        template<size_t N> void ResizerShortBilinear::RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride)
+        {
+            size_t rs = _param.dstW * N;
+            float* pbx[2] = { _bx[0].data, _bx[1].data };
+            int32_t prev = -2;
+            for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
+            {
+                float fy1 = _ay[dy];
+                float fy0 = 1.0f - fy1;
+                int32_t sy = _iy[dy];
+                int32_t k = 0;
+                if (sy == prev)
+                    k = 2;
+                else if (sy == prev + 1)
+                {
+                    Swap(pbx[0], pbx[1]);
+                    k = 1;
+                }
+                prev = sy;
+                for (; k < 2; k++)
+                {
+                    float* pb = pbx[k];
+                    const uint16_t* ps = src + (sy + k) * srcStride;
+                    for (size_t dx = 0; dx < rs; dx++)
+                    {
+                        int32_t sx = _ix[dx];
+                        float fx = _ax[dx];
+                        pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + N] * fx;
+                    }
+                }
+                for (size_t dx = 0; dx < rs; dx++)
+                    dst[dx] = Round(pbx[0][dx] * fy0 + pbx[1][dx] * fy1);
+            }
+        }
+
+        template<size_t N> void ResizerShortBilinear::RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride)
+        {
+            size_t rs = _param.dstW * N;
+            for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
+            {
+                float fy1 = _ay[dy];
+                float fy0 = 1.0f - fy1;
+                int32_t sy = _iy[dy];
+                const uint16_t* ps0 = src + (sy + 0) * srcStride;
+                const uint16_t* ps1 = src + (sy + 1) * srcStride;
+                for (size_t dx = 0; dx < rs; dx++)
+                {
+                    int32_t sx = _ix[dx];
+                    float fx1 = _ax[dx];
+                    float fx0 = 1.0f - fx1;
+                    float r0 = ps0[sx] * fx0 + ps0[sx + N] * fx1;
+                    float r1 = ps1[sx] * fx0 + ps1[sx + N] * fx1;
+                    dst[dx] = Round(r0 * fy0 + r1 * fy1);
+                }
+            }
+        }
+
+        void ResizerShortBilinear::Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride)
+        {
+            bool sparse = _param.dstH * 2.0 <= _param.srcH;
+            switch (_param.channels)
+            {
+            case 1: sparse ? RunS<1>(src, srcStride, dst, dstStride) : RunB<1>(src, srcStride, dst, dstStride); return;
+            case 2: sparse ? RunS<2>(src, srcStride, dst, dstStride) : RunB<2>(src, srcStride, dst, dstStride); return;
+            case 3: sparse ? RunS<3>(src, srcStride, dst, dstStride) : RunB<3>(src, srcStride, dst, dstStride); return;
+            case 4: sparse ? RunS<4>(src, srcStride, dst, dstStride) : RunB<4>(src, srcStride, dst, dstStride); return;
+            default:
+                assert(0);
+            }
+        }
+
+        //---------------------------------------------------------------------
+
         ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param)
             : Resizer(param)
         {
             _ay.Resize(_param.dstH, false, _param.align);
             _iy.Resize(_param.dstH, false, _param.align);
-            EstimateIndexAlpha(_param.srcH, _param.dstH, 1, _param.method == SimdResizeMethodCaffeInterp, _iy.data, _ay.data);
+            EstimateIndexAlpha(_param.srcH, _param.dstH, 1, _iy.data, _ay.data);
             size_t rs = _param.dstW * _param.channels;
             _ax.Resize(rs, false, _param.align);
             _ix.Resize(rs, false, _param.align);
-            EstimateIndexAlpha(_param.srcW, _param.dstW, _param.channels, _param.method == SimdResizeMethodCaffeInterp, _ix.data, _ax.data);
+            EstimateIndexAlpha(_param.srcW, _param.dstW, _param.channels, _ix.data, _ax.data);
             _bx[0].Resize(rs, false, _param.align);
             _bx[1].Resize(rs, false, _param.align);
         }
 
-        void ResizerFloatBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, bool caffeInterp, int32_t * indices, float * alphas)
+        void ResizerFloatBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t * indices, float * alphas)
         {
-            if (caffeInterp)
+            if (_param.method == SimdResizeMethodBilinear)
+            {
+                float scale = (float)srcSize / dstSize;
+                for (size_t i = 0; i < dstSize; ++i)
+                {
+                    float alpha = (float)((i + 0.5f) * scale - 0.5f);
+                    ptrdiff_t index = (ptrdiff_t)::floor(alpha);
+                    alpha -= index;
+                    if (index < 0)
+                    {
+                        index = 0;
+                        alpha = 0;
+                    }
+                    if (index > (ptrdiff_t)srcSize - 2)
+                    {
+                        index = srcSize - 2;
+                        alpha = 1;
+                    }
+                    for (size_t c = 0; c < channels; c++)
+                    {
+                        size_t offset = i * channels + c;
+                        indices[offset] = (int32_t)(channels * index + c);
+                        alphas[offset] = alpha;
+                    }
+                }
+            }            
+            else if (_param.method == SimdResizeMethodCaffeInterp)
             {
                 float scale = dstSize > 1 ? float(srcSize - 1) / float(dstSize - 1) : 0.0f;
                 for (size_t i = 0; i < dstSize; ++i)
                 {
-                    float alpha = float(i)*scale;
+                    float alpha = float(i) * scale;
                     ptrdiff_t index = (ptrdiff_t)::floor(alpha);
                     alpha -= index;
                     if (index > (ptrdiff_t)srcSize - 2)
@@ -266,17 +410,17 @@ namespace Simd
                     for (size_t c = 0; c < channels; c++)
                     {
                         size_t offset = i * channels + c;
-                        indices[offset] = (int32_t)(channels*index + c);
+                        indices[offset] = (int32_t)(channels * index + c);
                         alphas[offset] = alpha;
                     }
                 }
             }
-            else
+            else if (_param.method == SimdResizeMethodInferenceEngineInterp)
             {
                 float scale = (float)srcSize / dstSize;
                 for (size_t i = 0; i < dstSize; ++i)
                 {
-                    float alpha = (float)((i + 0.5f)*scale - 0.5f);
+                    float alpha = float(i) * scale;
                     ptrdiff_t index = (ptrdiff_t)::floor(alpha);
                     alpha -= index;
                     if (index < 0)
@@ -284,7 +428,7 @@ namespace Simd
                         index = 0;
                         alpha = 0;
                     }
-                    if (index >(ptrdiff_t)srcSize - 2)
+                    if (index > (ptrdiff_t)srcSize - 2)
                     {
                         index = srcSize - 2;
                         alpha = 1;
@@ -292,11 +436,13 @@ namespace Simd
                     for (size_t c = 0; c < channels; c++)
                     {
                         size_t offset = i * channels + c;
-                        indices[offset] = (int32_t)(channels*index + c);
+                        indices[offset] = (int32_t)(channels * index + c);
                         alphas[offset] = alpha;
                     }
                 }
             }
+            else
+                assert(0);
         }
 
         void ResizerFloatBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride)
@@ -346,15 +492,80 @@ namespace Simd
 
         //---------------------------------------------------------------------
 
+        ResizerNearest::ResizerNearest(const ResParam& param)
+            : Resizer(param)
+        {
+            _pixelSize = _param.PixelSize();
+            _iy.Resize(_param.dstH, false, _param.align);
+            EstimateIndex(_param.srcH, _param.dstH, 1, _iy.data);
+            _ix.Resize(_param.dstW, false, _param.align);
+            EstimateIndex(_param.srcW, _param.dstW, _pixelSize, _ix.data);
+        }
+
+        void ResizerNearest::EstimateIndex(size_t srcSize, size_t dstSize, size_t pixelSize, int32_t* indices)
+        {
+            float scale = (float)srcSize / dstSize;
+            for (size_t i = 0; i < dstSize; ++i)
+            {
+                float alpha = (i + 0.5f) * scale;
+                int index = RestrictRange((int)::floor(alpha), 0, (int)srcSize - 1);
+                indices[i] = (int)(index * pixelSize);
+            }
+        }
+
+        void ResizerNearest::Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
+        {
+            for (size_t dy = 0; dy < _param.dstH; dy++)
+            {
+                const uint8_t* srcRow = src + _iy[dy] * srcStride;
+                for (size_t dx = 0, offset = 0; dx < _param.dstW; dx++, offset += _pixelSize)
+                    memcpy(dst + offset, srcRow + _ix[dx], _pixelSize);
+                dst += dstStride;
+            }
+        }
+
+        template<size_t N> void ResizerNearest::Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
+        {
+            for (size_t dy = 0; dy < _param.dstH; dy++)
+            {
+                const uint8_t * srcRow = src + _iy[dy] * srcStride;
+                for (size_t dx = 0, offset = 0; dx < _param.dstW; dx++, offset += N)
+                    CopyPixel<N>(srcRow + _ix[dx], dst + offset);
+                dst += dstStride;
+            }
+        }
+
+        void ResizerNearest::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
+        {
+            switch (_pixelSize)
+            {
+            case 1: Resize<1>(src, srcStride, dst, dstStride); break;
+            case 2: Resize<2>(src, srcStride, dst, dstStride); break;
+            case 3: Resize<3>(src, srcStride, dst, dstStride); break;
+            case 4: Resize<4>(src, srcStride, dst, dstStride); break;
+            case 6: Resize<6>(src, srcStride, dst, dstStride); break;
+            case 8: Resize<8>(src, srcStride, dst, dstStride); break;
+            case 12: Resize<12>(src, srcStride, dst, dstStride); break;
+            default:
+                Resize(src, srcStride, dst, dstStride);
+            }
+        }
+
+        //---------------------------------------------------------------------
+
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
         {
             ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(void*));
-            if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear)
+            if (param.IsByteBilinear())
                 return new ResizerByteBilinear(param);
-            else  if (type == SimdResizeChannelByte && method == SimdResizeMethodArea)
+            else if (param.IsByteArea())
                 return new ResizerByteArea(param);
-            else if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp))
+            else if (param.IsShortBilinear())
+                return new ResizerShortBilinear(param);
+            else if (param.IsFloatBilinear())
                 return new ResizerFloatBilinear(param);
+            else if (param.IsNearest())
+                return new ResizerNearest(param);
             else
                 return NULL;
         }
diff --git a/3rdparty/simdlib/Simd/SimdConfig.h b/3rdparty/simdlib/Simd/SimdConfig.h
old mode 100644
new mode 100755
index 8e328e2495..22c7fdd8e6
--- a/3rdparty/simdlib/Simd/SimdConfig.h
+++ b/3rdparty/simdlib/Simd/SimdConfig.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -24,18 +24,10 @@
 #ifndef __SimdConfig_h__
 #define __SimdConfig_h__
 
-//#define SIMD_SSE_DISABLE
-
 //#define SIMD_SSE2_DISABLE
 
-//#define SIMD_SSE3_DISABLE
-
-//#define SIMD_SSSE3_DISABLE
-
 //#define SIMD_SSE41_DISABLE
 
-//#define SIMD_SSE42_DISABLE
-
 //#define SIMD_AVX_DISABLE
 
 //#define SIMD_AVX2_DISABLE
diff --git a/3rdparty/simdlib/Simd/SimdConst.h b/3rdparty/simdlib/Simd/SimdConst.h
old mode 100644
new mode 100755
index 38e217d6ca..e18c1b90d0
--- a/3rdparty/simdlib/Simd/SimdConst.h
+++ b/3rdparty/simdlib/Simd/SimdConst.h
@@ -76,25 +76,13 @@ namespace Simd
         const int DIVISION_BY_9_FACTOR = (1 << DIVISION_BY_9_SHIFT) / 9;
     }
 
-#ifdef SIMD_SSE_ENABLE    
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE    
+    namespace Sse2
     {
         const size_t F = sizeof(__m128) / sizeof(float);
         const size_t DF = 2 * F;
         const size_t QF = 4 * F;
         const size_t HF = F / 2;
-    }
-#endif// SIMD_SSE_ENABLE
-
-#ifdef SIMD_SSE2_ENABLE    
-    namespace Sse2
-    {
-        using namespace Sse;
-#if defined(_MSC_VER) && _MSC_VER >= 1700  && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug
-        using Sse::F;
-        using Sse::DF;
-        using Sse::QF;
-#endif
 
         const size_t A = sizeof(__m128i);
         const size_t DA = 2 * A;
@@ -128,6 +116,7 @@ namespace Simd
         const __m128i K16_0020 = SIMD_MM_SET1_EPI16(0x0020);
         const __m128i K16_0080 = SIMD_MM_SET1_EPI16(0x0080);
         const __m128i K16_00FF = SIMD_MM_SET1_EPI16(0x00FF);
+        const __m128i K16_0101 = SIMD_MM_SET1_EPI16(0x0101);
         const __m128i K16_FF00 = SIMD_MM_SET1_EPI16(0xFF00);
 
         const __m128i K32_00000001 = SIMD_MM_SET1_EPI32(0x00000001);
@@ -138,6 +127,7 @@ namespace Simd
         const __m128i K32_0000FFFF = SIMD_MM_SET1_EPI32(0x0000FFFF);
         const __m128i K32_00010000 = SIMD_MM_SET1_EPI32(0x00010000);
         const __m128i K32_01000000 = SIMD_MM_SET1_EPI32(0x01000000);
+        const __m128i K32_00FF0000 = SIMD_MM_SET1_EPI32(0x00FF0000);
         const __m128i K32_00FFFFFF = SIMD_MM_SET1_EPI32(0x00FFFFFF);
         const __m128i K32_FFFFFF00 = SIMD_MM_SET1_EPI32(0xFFFFFF00);
 
@@ -162,22 +152,15 @@ namespace Simd
     }
 #endif// SIMD_SSE2_ENABLE
 
-#ifdef SIMD_SSE3_ENABLE    
-    namespace Sse3
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
     {
         using namespace Sse2;
 #if defined(_MSC_VER) && _MSC_VER >= 1700  && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug      
-        using Sse::F;
-        using Sse::DF;
-        using Sse::QF;
+        using Sse2::F;
+        using Sse2::DF;
+        using Sse2::QF;
 #endif
-    }
-#endif// SIMD_SSE3_ENABLE
-
-#ifdef SIMD_SSSE3_ENABLE    
-    namespace Ssse3
-    {
-        using namespace Sse3;
 
         const __m128i K8_SHUFFLE_GRAY_TO_BGR0 = SIMD_MM_SETR_EPI8(0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5);
         const __m128i K8_SHUFFLE_GRAY_TO_BGR1 = SIMD_MM_SETR_EPI8(0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xA, 0xA);
@@ -207,27 +190,8 @@ namespace Simd
         const __m128i K8_SHUFFLE_BGR1_TO_RED = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1);
         const __m128i K8_SHUFFLE_BGR2_TO_RED = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF);
     }
-#endif// SIMD_SSSE3_ENABLE
-
-#ifdef SIMD_SSE41_ENABLE    
-    namespace Sse41
-    {
-        using namespace Ssse3;
-#if defined(_MSC_VER) && _MSC_VER >= 1700  && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug      
-        using Sse::F;
-        using Sse::DF;
-        using Sse::QF;
-#endif
-    }
 #endif// SIMD_SSE41_ENABLE
 
-#ifdef SIMD_SSE42_ENABLE    
-    namespace Sse42
-    {
-        using namespace Sse41;
-    }
-#endif// SIMD_SSE42_ENABLE
-
 #ifdef SIMD_AVX_ENABLE    
     namespace Avx
     {
@@ -282,6 +246,7 @@ namespace Simd
         const __m256i K16_0020 = SIMD_MM256_SET1_EPI16(0x0020);
         const __m256i K16_0080 = SIMD_MM256_SET1_EPI16(0x0080);
         const __m256i K16_00FF = SIMD_MM256_SET1_EPI16(0x00FF);
+        const __m256i K16_0101 = SIMD_MM256_SET1_EPI16(0x0101);
         const __m256i K16_FF00 = SIMD_MM256_SET1_EPI16(0xFF00);
 
         const __m256i K32_00000001 = SIMD_MM256_SET1_EPI32(0x00000001);
@@ -292,6 +257,7 @@ namespace Simd
         const __m256i K32_0000FFFF = SIMD_MM256_SET1_EPI32(0x0000FFFF);
         const __m256i K32_00010000 = SIMD_MM256_SET1_EPI32(0x00010000);
         const __m256i K32_01000000 = SIMD_MM256_SET1_EPI32(0x01000000);
+        const __m256i K32_00FF0000 = SIMD_MM256_SET1_EPI32(0x00FF0000);
         const __m256i K32_FFFFFF00 = SIMD_MM256_SET1_EPI32(0xFFFFFF00);
 
         const __m256i K16_Y_ADJUST = SIMD_MM256_SET1_EPI16(Base::Y_ADJUST);
@@ -311,6 +277,8 @@ namespace Simd
 
         const __m256i K16_DIVISION_BY_9_FACTOR = SIMD_MM256_SET1_EPI16(Base::DIVISION_BY_9_FACTOR);
 
+        const __m256i K64_00000000FFFFFFFF = SIMD_MM256_SET2_EPI32(0xFFFFFFFF, 0);
+
         const __m256i K8_SHUFFLE_0 = SIMD_MM256_SETR_EPI8(
             0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
             0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0);
@@ -389,11 +357,11 @@ namespace Simd
             -1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1,
             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF);
 
-        const __m256i K8_BGRA_TO_BGR_SHUFFLE = SIMD_MM256_SETR_EPI8(
+        const __m256i K8_BGR_TO_BGRA_SHUFFLE = SIMD_MM256_SETR_EPI8(
             0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1,
             0x4, 0x5, 0x6, -1, 0x7, 0x8, 0x9, -1, 0xA, 0xB, 0xC, -1, 0xD, 0xE, 0xF, -1);
 
-        const __m256i K8_BGRA_TO_RGB_SHUFFLE = SIMD_MM256_SETR_EPI8(
+        const __m256i K8_RGB_TO_BGRA_SHUFFLE = SIMD_MM256_SETR_EPI8(
             0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1,
             0x6, 0x5, 0x4, -1, 0x9, 0x8, 0x7, -1, 0xC, 0xB, 0xA, -1, 0xF, 0xE, 0xD, -1);
 
@@ -402,6 +370,12 @@ namespace Simd
             0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF);
 
         const __m256i K32_TWO_UNPACK_PERMUTE = SIMD_MM256_SETR_EPI32(0, 2, 4, 6, 1, 3, 5, 7);
+
+        const __m256i K8_SHUFFLE_BGRA_TO_BGR = SIMD_MM256_SETR_EPI8(
+            0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1,
+            0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1);
+
+        const __m256i K32_PERMUTE_BGRA_TO_BGR = SIMD_MM256_SETR_EPI32(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, -1, -1);
     }
 #endif// SIMD_AVX2_ENABLE
 
@@ -459,8 +433,10 @@ namespace Simd
         const uint32x4_t K32_000000FF = SIMD_VEC_SET1_EPI32(0x000000FF);
         const uint32x4_t K32_0000FFFF = SIMD_VEC_SET1_EPI32(0x0000FFFF);
         const uint32x4_t K32_00010000 = SIMD_VEC_SET1_EPI32(0x00010000);
+        const uint32x4_t K32_00FF0000 = SIMD_VEC_SET1_EPI32(0x00FF0000);
         const uint32x4_t K32_01000000 = SIMD_VEC_SET1_EPI32(0x01000000);
         const uint32x4_t K32_08080800 = SIMD_VEC_SET1_EPI32(0x08080800);
+        const uint32x4_t K32_FF000000 = SIMD_VEC_SET1_EPI32(0xFF000000);
         const uint32x4_t K32_FFFFFF00 = SIMD_VEC_SET1_EPI32(0xFFFFFF00);
         const uint32x4_t K32_FFFFFFFF = SIMD_VEC_SET1_EPI32(0xFFFFFFFF);
         const uint32x4_t K32_0123 = SIMD_VEC_SETR_EPI32(0, 1, 2, 3);
diff --git a/3rdparty/simdlib/Simd/SimdConversion.h b/3rdparty/simdlib/Simd/SimdConversion.h
old mode 100644
new mode 100755
index e0601a9f61..5f8f0a0b9b
--- a/3rdparty/simdlib/Simd/SimdConversion.h
+++ b/3rdparty/simdlib/Simd/SimdConversion.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
 *               2014-2015 Antonenka Mikhail.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -38,16 +38,10 @@ namespace Simd
             return (BLUE_TO_GRAY_WEIGHT*blue + GREEN_TO_GRAY_WEIGHT * green +
                 RED_TO_GRAY_WEIGHT * red + BGR_TO_GRAY_ROUND_TERM) >> BGR_TO_GRAY_AVERAGING_SHIFT;
         }
-
-        SIMD_INLINE int RgbToGray(int red, int green, int blue)
-        {
-            return (BLUE_TO_GRAY_WEIGHT*blue + GREEN_TO_GRAY_WEIGHT * green +
-                RED_TO_GRAY_WEIGHT * red + BGR_TO_GRAY_ROUND_TERM) >> BGR_TO_GRAY_AVERAGING_SHIFT;
-        }
     }
 
-#ifdef SIMD_SSSE3_ENABLE    
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
     {
         template <int index> __m128i InterleaveBgr(__m128i blue, __m128i green, __m128i red);
 
@@ -99,7 +93,7 @@ namespace Simd
                         _mm_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_RED)));
         }
     }
-#endif//SIMD_SSSE3_ENABLE
+#endif
 
 #ifdef SIMD_AVX2_ENABLE    
     namespace Avx2
@@ -181,41 +175,24 @@ namespace Simd
 
         template<> SIMD_INLINE __m256i BgrToBgra<false>(const __m256i & bgr, const __m256i & alpha)
         {
-            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0x94), K8_BGRA_TO_BGR_SHUFFLE), alpha);
+            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0x94), K8_BGR_TO_BGRA_SHUFFLE), alpha);
         }
 
         template<> SIMD_INLINE __m256i BgrToBgra<true>(const __m256i & bgr, const __m256i & alpha)
         {
-            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0xE9), K8_BGRA_TO_BGR_SHUFFLE), alpha);
-        }
-
-        template<bool tail> __m256i BgrToRgba(const __m256i & bgr, const __m256i & alpha);
-
-        template<> SIMD_INLINE __m256i BgrToRgba<false>(const __m256i & bgr, const __m256i & alpha)
-        {
-            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0x94), K8_BGRA_TO_RGB_SHUFFLE), alpha);
-        }
-
-        template<> SIMD_INLINE __m256i BgrToRgba<true>(const __m256i & bgr, const __m256i & alpha)
-        {
-            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0xE9), K8_BGRA_TO_RGB_SHUFFLE), alpha);
-        }
-
-        SIMD_INLINE __m256i BgraToRgba(const __m256i & bgra)
-        {
-            return _mm256_shuffle_epi8(bgra, K8_BGRA_TO_RGBA_SHUFFLE);
+            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0xE9), K8_BGR_TO_BGRA_SHUFFLE), alpha);
         }
 
         template<bool tail> __m256i RgbToBgra(const __m256i & rgb, const __m256i & alpha);
 
         template<> SIMD_INLINE __m256i RgbToBgra<false>(const __m256i & rgb, const __m256i & alpha)
         {
-            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0x94), K8_BGRA_TO_RGB_SHUFFLE), alpha);
+            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0x94), K8_RGB_TO_BGRA_SHUFFLE), alpha);
         }
 
         template<> SIMD_INLINE __m256i RgbToBgra<true>(const __m256i & rgb, const __m256i & alpha)
         {
-            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0xE9), K8_BGRA_TO_RGB_SHUFFLE), alpha);
+            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0xE9), K8_RGB_TO_BGRA_SHUFFLE), alpha);
         }
     }
 #endif// SIMD_AVX2_ENABLE
@@ -236,8 +213,20 @@ namespace Simd
 
         template <int part> SIMD_INLINE int32x4_t BgrToU(uint16x8_t blue, uint16x8_t green, uint16x8_t red)
         {
-            return vshrq_n_s32(vmlal_s16(vmlal_s16(vmlal_s16(K32_BGR_TO_YUV_ROUND_TERM, (int16x4_t)Half<part>(blue), K16_BLUE_TO_U_WEIGHT),
-                (int16x4_t)Half<part>(green), K16_GREEN_TO_U_WEIGHT), (int16x4_t)Half<part>(red), K16_RED_TO_U_WEIGHT), Base::BGR_TO_YUV_AVERAGING_SHIFT);
+            return vshrq_n_s32(vmlal_s16(vmlal_s16(vmlal_s16(K32_BGR_TO_YUV_ROUND_TERM, vreinterpret_s16_u16(Half<part>(blue)), K16_BLUE_TO_U_WEIGHT),
+                vreinterpret_s16_u16(Half<part>(green)), K16_GREEN_TO_U_WEIGHT), vreinterpret_s16_u16(Half<part>(red)), K16_RED_TO_U_WEIGHT), Base::BGR_TO_YUV_AVERAGING_SHIFT);
+        }
+
+        SIMD_INLINE int16x8_t BgrToU(uint16x8_t blue, uint16x8_t green, uint16x8_t red)
+        {
+            return vaddq_s16(K16_UV_ADJUST, PackI32(BgrToU<0>(blue, green, red), BgrToU<1>(blue, green, red)));
+        }
+
+        SIMD_INLINE uint8x16_t BgrToU(uint8x16_t blue, uint8x16_t green, uint8x16_t red)
+        {
+            return PackSaturatedI16(
+                BgrToU(UnpackU8<0>(blue), UnpackU8<0>(green), UnpackU8<0>(red)),
+                BgrToU(UnpackU8<1>(blue), UnpackU8<1>(green), UnpackU8<1>(red)));
         }
     }
 #endif// SIMD_NEON_ENABLE
diff --git a/3rdparty/simdlib/Simd/SimdCopyPixel.h b/3rdparty/simdlib/Simd/SimdCopyPixel.h
old mode 100644
new mode 100755
index 6f113e4c39..a5539eba35
--- a/3rdparty/simdlib/Simd/SimdCopyPixel.h
+++ b/3rdparty/simdlib/Simd/SimdCopyPixel.h
@@ -56,6 +56,23 @@ namespace Simd
         {
             ((uint32_t*)dst)[0] = ((uint32_t*)src)[0];
         }
+
+        template<> SIMD_INLINE void CopyPixel<6>(const uint8_t* src, uint8_t* dst)
+        {
+            ((uint32_t*)dst)[0] = ((uint32_t*)src)[0];
+            ((uint16_t*)dst)[2] = ((uint16_t*)src)[2];
+        }
+
+        template<> SIMD_INLINE void CopyPixel<8>(const uint8_t* src, uint8_t* dst)
+        {
+            ((uint64_t*)dst)[0] = ((uint64_t*)src)[0];
+        }
+
+        template<> SIMD_INLINE void CopyPixel<12>(const uint8_t* src, uint8_t* dst)
+        {
+            ((uint64_t*)dst)[0] = ((uint64_t*)src)[0];
+            ((uint32_t*)dst)[2] = ((uint32_t*)src)[2];
+        }
     }
 }
 
diff --git a/3rdparty/simdlib/Simd/SimdCpu.h b/3rdparty/simdlib/Simd/SimdCpu.h
old mode 100644
new mode 100755
index adaf916462..b10d9fa98f
--- a/3rdparty/simdlib/Simd/SimdCpu.h
+++ b/3rdparty/simdlib/Simd/SimdCpu.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -28,8 +28,103 @@
 
 namespace Simd
 {
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
+    namespace Cpuid
+    {
+        // See http://www.sandpile.org/x86/cpuid.htm for additional information.
+        enum Level
+        {
+            Ordinary = 1,
+            Extended = 7,
+        };
+
+        enum Register
+        {
+            Eax = 0,
+            Ebx = 1,
+            Ecx = 2,
+            Edx = 3,
+        };
+
+        enum Bit
+        {
+            // Ordinary:
+            // Edx:
+            SSE = 1 << 25,
+            SSE2 = 1 << 26,
+
+            // Ecx:
+            SSE3 = 1 << 0,
+            SSSE3 = 1 << 9,
+            FMA = 1 << 12,
+            SSE41 = 1 << 19,
+            SSE42 = 1 << 20,
+            OSXSAVE = 1 << 27,
+            AVX = 1 << 28,
+            F16C = 1 << 29,
+
+            // Extended:
+            // Ebx:
+            AVX2 = 1 << 5,
+            AVX512F = 1 << 16,
+            AVX512DQ = 1 << 17,
+            AVX512CD = 1 << 28,
+            AVX512BW = 1 << 30,
+            AVX512VL = 1 << 31,
+
+            // Ecx:
+            AVX512VBMI = 1 << 1,
+            AVX512VNNI = 1 << 11,
+        };
+    }
+#endif//defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
+
+    namespace Cpu
+    {
+        extern const size_t SOCKET_NUMBER;
+        extern const size_t CORE_NUMBER;
+        extern const size_t THREAD_NUMBER;
+        extern const size_t L1_CACHE_SIZE;
+        extern const size_t L2_CACHE_SIZE;
+        extern const size_t L3_CACHE_SIZE;
+    }
+
+    namespace Base
+    {
+#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
+        bool CheckBit(Cpuid::Level level, Cpuid::Register index, Cpuid::Bit bit);
+#endif
+
+#if defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE))
+        bool CheckBit(int at, int bit);
+#endif
+
+        size_t CpuSocketNumber();
+
+        size_t CpuCoreNumber();
+
+        size_t CpuThreadNumber();
+
+        size_t CpuCacheSize(size_t level);
+
+        SIMD_INLINE size_t AlgCacheL1()
+        {
+            return Cpu::L1_CACHE_SIZE;
+        }
+
+        SIMD_INLINE size_t AlgCacheL2()
+        {
+            return Cpu::L3_CACHE_SIZE ? Cpu::L2_CACHE_SIZE : Cpu::L2_CACHE_SIZE * Cpu::SOCKET_NUMBER / Cpu::CORE_NUMBER;
+        }
+
+        SIMD_INLINE size_t AlgCacheL3()
+        {
+            return Cpu::L3_CACHE_SIZE ? Cpu::L3_CACHE_SIZE * Cpu::SOCKET_NUMBER / Cpu::CORE_NUMBER : Cpu::L2_CACHE_SIZE;
+        }
+    }
+
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         const unsigned int SCR_FTZ = 1 << 15;
         const unsigned int SCR_DAZ = 1 << 6;
diff --git a/3rdparty/simdlib/Simd/SimdDefs.h b/3rdparty/simdlib/Simd/SimdDefs.h
old mode 100644
new mode 100755
index c2b9274ed4..97d8f06ad6
--- a/3rdparty/simdlib/Simd/SimdDefs.h
+++ b/3rdparty/simdlib/Simd/SimdDefs.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -37,10 +37,24 @@
 #include <math.h>
 #include <cmath>
 
+#if defined(SIMD_SSE2_DISABLE) && !defined(SIMD_SSE41_DISABLE)
+#define SIMD_SSE41_DISABLE
+#endif
+
+#if defined(SIMD_SSE41_DISABLE) && !defined(SIMD_AVX_DISABLE)
+#define SIMD_AVX_DISABLE
+#endif
+
+#if defined(SIMD_AVX_DISABLE) && !defined(SIMD_AVX2_DISABLE)
+#define SIMD_AVX2_DISABLE
+#endif
+
 #if defined(_MSC_VER) && defined(_MSC_FULL_VER)
 
 #define SIMD_ALIGNED(x) __declspec(align(x))
 
+#define SIMD_NOINLINE __declspec(noinline)
+
 #ifdef _M_IX86
 #define SIMD_X86_ENABLE
 #endif
@@ -55,30 +69,14 @@
 
 #if defined(SIMD_X64_ENABLE) || defined(SIMD_X86_ENABLE)
 
-#if !defined(SIMD_SSE_DISABLE) && _MSC_VER >= 1200
-#define SIMD_SSE_ENABLE
-#endif
-
 #if !defined(SIMD_SSE2_DISABLE) && _MSC_VER >= 1300
 #define SIMD_SSE2_ENABLE
 #endif
 
-#if !defined(SIMD_SSE3_DISABLE) && _MSC_VER >= 1500
-#define SIMD_SSE3_ENABLE
-#endif
-
-#if !defined(SIMD_SSSE3_DISABLE) && _MSC_VER >= 1500
-#define SIMD_SSSE3_ENABLE
-#endif
-
 #if !defined(SIMD_SSE41_DISABLE) && _MSC_VER >= 1500
 #define SIMD_SSE41_ENABLE
 #endif
 
-#if !defined(SIMD_SSE42_DISABLE) && _MSC_VER >= 1500
-#define SIMD_SSE42_ENABLE
-#endif
-
 #if !defined(SIMD_AVX_DISABLE) && _MSC_FULL_VER >= 160040219
 #define SIMD_AVX_ENABLE
 #endif
@@ -88,7 +86,7 @@
 #endif
 
 #if defined(NDEBUG) && _MSC_VER >= 1700 && _MSC_VER < 1900
-#define SIMD_MADDUBS_ERROR // Visual Studio 2012/2013 release mode compiler bug in function _mm256_maddubs_epi16:
+#define SIMD_MADDUBS_ERROR // Visual Studio 2012/2013 release mode compiler bug in function _mm256_maddubs_epi16.
 #endif
 
 #if defined(NDEBUG) && _MSC_VER == 1914
@@ -123,6 +121,8 @@
 
 #define SIMD_ALIGNED(x) __attribute__ ((aligned(x)))
 
+#define SIMD_NOINLINE __attribute__ ((noinline))
+
 #ifdef __i386__
 #define SIMD_X86_ENABLE
 #endif
@@ -159,36 +159,16 @@
 #define SIMD_ARM64_ENABLE
 #endif
 
-#if defined __mips__
-#define SIMD_MIPS_ENABLE
-#endif
-
 #if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
 
-#if !defined(SIMD_SSE_DISABLE) && defined(__SSE__)
-#define SIMD_SSE_ENABLE
-#endif
-
-#if !defined(SIMD_SSE2_DISABLE) && defined(__SSE2__)
+#if !defined(SIMD_SSE2_DISABLE) && defined(__SSE__) && defined(__SSE2__)
 #define SIMD_SSE2_ENABLE
 #endif
 
-#if !defined(SIMD_SSE3_DISABLE) && defined(__SSE3__)
-#define SIMD_SSE3_ENABLE
-#endif
-
-#if !defined(SIMD_SSSE3_DISABLE) && defined(__SSSE3__)
-#define SIMD_SSSE3_ENABLE
-#endif
-
-#if !defined(SIMD_SSE41_DISABLE) && defined(__SSE4_1__)
+#if !defined(SIMD_SSE41_DISABLE) && defined(__SSE3__) && defined(__SSSE3__) && defined(__SSE4_1__) && defined(__SSE4_2__)
 #define SIMD_SSE41_ENABLE
 #endif
 
-#if !defined(SIMD_SSE42_DISABLE) && defined(__SSE4_2__)
-#define SIMD_SSE42_ENABLE
-#endif
-
 #if !defined(SIMD_AVX_DISABLE) && defined(__AVX__)
 #define SIMD_AVX_ENABLE
 #endif
@@ -239,27 +219,11 @@
 
 #endif
 
-#ifdef SIMD_SSE_ENABLE
-#include <xmmintrin.h>
-#endif
-
 #ifdef SIMD_SSE2_ENABLE
 #include <emmintrin.h>
 #endif
 
-#ifdef SIMD_SSE3_ENABLE
-# include <pmmintrin.h>
-#endif
-
-#ifdef SIMD_SSSE3_ENABLE
-#include <tmmintrin.h>
-#endif
-
 #ifdef SIMD_SSE41_ENABLE
-#include <smmintrin.h>
-#endif
-
-#ifdef SIMD_SSE42_ENABLE
 #include <nmmintrin.h>
 #endif
 
@@ -273,10 +237,10 @@
 
 #if defined(SIMD_AVX_ENABLE) || defined(SIMD_AVX2_ENABLE)
 #define SIMD_ALIGN 32
-#elif defined(SIMD_SSE_ENABLE) || defined(SIMD_SSE2_ENABLE) || defined(SIMD_SSE3_ENABLE)  || defined(SIMD_SSSE3_ENABLE) || defined(SIMD_SSE41_ENABLE) || defined(SIMD_SSE42_ENABLE) \
+#elif defined(SIMD_SSE2_ENABLE) || defined(SIMD_SSE41_ENABLE) \
     || defined(SIMD_NEON_ENABLE)
 #define SIMD_ALIGN 16
-#elif defined (SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+#elif defined (SIMD_X64_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM64_ENABLE)
 #define SIMD_ALIGN 8
 #else
 #define SIMD_ALIGN 4
diff --git a/3rdparty/simdlib/Simd/SimdEnable.h b/3rdparty/simdlib/Simd/SimdEnable.h
old mode 100644
new mode 100755
index 6c79eb0d94..a501daf8ad
--- a/3rdparty/simdlib/Simd/SimdEnable.h
+++ b/3rdparty/simdlib/Simd/SimdEnable.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -62,455 +62,74 @@
 
 namespace Simd
 {
-#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
-    namespace Cpuid
-    {
-        // See http://www.sandpile.org/x86/cpuid.htm for additional information.
-        enum Level
-        {
-            Ordinary = 1,
-            Extended = 7,
-        };
-
-        enum Register
-        {
-            Eax = 0,
-            Ebx = 1,
-            Ecx = 2,
-            Edx = 3,
-        };
-
-        enum Bit
-        {
-            // Ordinary:
-            // Edx:
-            SSE = 1 << 25,
-            SSE2 = 1 << 26,
-
-            // Ecx:
-            SSE3 = 1 << 0,
-            SSSE3 = 1 << 9,
-            FMA = 1 << 12,
-            SSE41 = 1 << 19,
-            SSE42 = 1 << 20,
-            OSXSAVE = 1 << 27,
-            AVX = 1 << 28,
-            F16C = 1 << 29,
-
-            // Extended:
-            // Ebx:
-            AVX2 = 1 << 5,
-            AVX512F = 1 << 16,
-            AVX512BW = 1 << 30,
-
-            // Ecx:
-            AVX512VBMI = 1 << 1,
-        };
-
-        SIMD_INLINE bool CheckBit(Level level, Register index, Bit bit)
-        {
-            unsigned int registers[4] = { 0, 0, 0, 0 };
-#if defined(_MSC_VER)
-            __cpuid((int*)registers, level);
-#elif (defined __GNUC__)
-            if (__get_cpuid_max(0, NULL) < level)
-                return false;
-            __cpuid_count(level, 0, registers[Eax], registers[Ebx], registers[Ecx], registers[Edx]);
-#else
-#error Do not know how to detect CPU info!
-#endif
-            return (registers[index] & bit) == bit;
-        }
-    }
-#endif//defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
-
-#if !defined(__APPLE__) // not macOS, iOS
-#if defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE))
-    namespace CpuInfo
-    {
-        SIMD_INLINE bool CheckBit(int at, int bit)
-        {
-            bool result = false;
-            int file = ::open("/proc/self/auxv", O_RDONLY);
-            if (file < 0)
-                return false;
-            const ssize_t size = 64;
-            unsigned long buffer[size];
-            for (ssize_t count = size; count == size;)
-            {
-                count = ::read(file, buffer, sizeof(buffer)) / sizeof(unsigned long);
-                for (int i = 0; i < count; i += 2)
-                {
-                    if (buffer[i] == (unsigned)at)
-                    {
-                        result = !!(buffer[i + 1] & bit);
-                        count = 0;
-                    }
-                    if (buffer[i] == AT_NULL)
-                        count = 0;
-                }
-            }
-            ::close(file);
-            return result;
-        }
-    }
-#endif//defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE))
-#endif//(TARGET_OS_IOS == 0) not iOS
-
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
-    {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Edx, Cpuid::SSE);
-        }
-
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                __m128 value = _mm_set1_ps(1.0f);// try to execute of SSE instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
-    }
-#endif
-
 #ifdef SIMD_SSE2_ENABLE
     namespace Sse2
     {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Edx, Cpuid::SSE2);
-        }
+        bool GetEnable();
 
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                __m128d value = _mm_set1_pd(1.0);// try to execute of SSE2 instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
-    }
-#endif
-
-#ifdef SIMD_SSE3_ENABLE
-    namespace Sse3
-    {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE3);
-        }
-
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                __m128 value = _mm_hadd_ps(_mm_set1_ps(1.0f), _mm_set1_ps(2.0f)); //try to execute of SSE3 instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
-    }
-#endif
-
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSSE3);
-        }
-
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                __m128i value = _mm_abs_epi8(_mm_set1_epi8(-1)); //try to execute of SSSE3 instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
+        const bool Enable = GetEnable();
     }
 #endif
 
 #ifdef SIMD_SSE41_ENABLE
     namespace Sse41
     {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE41);
-        }
-
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                int value = _mm_testz_si128(_mm_set1_epi8(0), _mm_set1_epi8(-1)); // try to execute of SSE41 instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
-    }
-#endif
-
-#ifdef SIMD_SSE42_ENABLE
-    namespace Sse42
-    {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE42);
-        }
+        bool GetEnable();
 
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                uint32_t value = _mm_crc32_u8(0, 1); // try to execute of SSE42 instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
+        const bool Enable = GetEnable();
     }
 #endif
 
 #ifdef SIMD_AVX_ENABLE
     namespace Avx
     {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return
-                Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) &&
-                Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::AVX);
-        }
-
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                __m256d value = _mm256_set1_pd(1.0);// try to execute of AVX instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
+        bool GetEnable();
 
-        const bool Enable = SupportedByCPU() && SupportedByOS();
+        const bool Enable = GetEnable();
     }
 #endif
 
 #ifdef SIMD_AVX2_ENABLE
     namespace Avx2
     {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return
-                Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) &&
-                Cpuid::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX2) &&
-                Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::FMA) &&
-                Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::F16C);
-        }
+        bool GetEnable();
 
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                __m256i value = _mm256_abs_epi8(_mm256_set1_epi8(1));// try to execute of AVX2 instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
+        const bool Enable = GetEnable();
     }
 #endif
 
 #ifdef SIMD_NEON_ENABLE
     namespace Neon
     {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-#if defined(_MSC_VER)
-            return true;
-#elif defined(__GNUC__)
-#if defined(SIMD_ARM64_ENABLE) || (TARGET_OS_IOS != 0) // iOS
-            return true;
-#else
-            return CpuInfo::CheckBit(AT_HWCAP, HWCAP_NEON);
-#endif
-#else
-#error Do not know how to detect NEON support!
-#endif
-        }
+        bool GetEnable();
 
-        SIMD_INLINE bool SupportedByOS()
-        {
-            return true;
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
+        const bool Enable = GetEnable();
     }
 #endif
-
-    SIMD_INLINE size_t Alignment()
-    {
-#ifdef SIMD_AVX2_ENABLE
-        if (Avx2::Enable)
-            return sizeof(__m256i);
-        else
-#endif
-#ifdef SIMD_AVX_ENABLE
-        if (Avx::Enable)
-            return sizeof(__m256);
-        else
-#endif
-#ifdef SIMD_SSE41_ENABLE
-        if (Sse41::Enable)
-            return sizeof(__m128i);
-        else
-#endif
-#ifdef SIMD_SSSE3_ENABLE
-        if (Ssse3::Enable)
-            return sizeof(__m128i);
-        else
-#endif
-#ifdef SIMD_SSE2_ENABLE
-        if (Sse2::Enable)
-            return sizeof(__m128i);
-        else
-#endif
-#ifdef SIMD_SSE_ENABLE
-        if (Sse::Enable)
-            return sizeof(__m128);
-        else
-#endif
-#ifdef SIMD_NEON_ENABLE
-        if (Neon::Enable)
-            return sizeof(uint8x16_t);
-        else
-#endif
-            return sizeof(void *);
-    }
-
-    const size_t ALIGNMENT = Alignment();
 }
 
 #define SIMD_BASE_FUNC(func) Simd::Base::func
 
-#ifdef SIMD_SSE_ENABLE
-#define SIMD_SSE_FUNC(func) Simd::Sse::Enable ? Simd::Sse::func :
-#else
-#define SIMD_SSE_FUNC(func)
-#endif
-
 #ifdef SIMD_SSE2_ENABLE
-#define SIMD_SSE2_FUNC(func) Simd::Sse2::Enable ? Simd::Sse2::func :
-#else
-#define SIMD_SSE2_FUNC(func)
-#endif
-
-#ifdef SIMD_SSE3_ENABLE
-#define SIMD_SSE3_FUNC(func) Simd::Sse3::Enable ? Simd::Sse3::func :
+#define SIMD_SSE2_FUNC(func) Simd::Sse2::Enable ? Simd::Sse2::func : 
 #else
-#define SIMD_SSE3_FUNC(func)
-#endif
-
-#ifdef SIMD_SSSE3_ENABLE
-#define SIMD_SSSE3_FUNC(func) Simd::Ssse3::Enable ? Simd::Ssse3::func :
-#else
-#define SIMD_SSSE3_FUNC(func)
+#define SIMD_SSE2_FUNC(func) 
 #endif
 
 #ifdef SIMD_SSE41_ENABLE
-#define SIMD_SSE41_FUNC(func) Simd::Sse41::Enable ? Simd::Sse41::func :
-#else
-#define SIMD_SSE41_FUNC(func)
-#endif
-
-#ifdef SIMD_SSE42_ENABLE
-#define SIMD_SSE42_FUNC(func) Simd::Sse42::Enable ? Simd::Sse42::func :
+#define SIMD_SSE41_FUNC(func) Simd::Sse41::Enable ? Simd::Sse41::func : 
 #else
-#define SIMD_SSE42_FUNC(func)
+#define SIMD_SSE41_FUNC(func) 
 #endif
 
 #ifdef SIMD_AVX_ENABLE
-#define SIMD_AVX_FUNC(func) Simd::Avx::Enable ? Simd::Avx::func :
+#define SIMD_AVX_FUNC(func) Simd::Avx::Enable ? Simd::Avx::func : 
 #else
 #define SIMD_AVX_FUNC(func)
 #endif
 
 #ifdef SIMD_AVX2_ENABLE
-#define SIMD_AVX2_FUNC(func) Simd::Avx2::Enable ? Simd::Avx2::func :
+#define SIMD_AVX2_FUNC(func) Simd::Avx2::Enable ? Simd::Avx2::func : 
 #else
 #define SIMD_AVX2_FUNC(func)
 #endif
diff --git a/3rdparty/simdlib/Simd/SimdExp.h b/3rdparty/simdlib/Simd/SimdExp.h
old mode 100644
new mode 100755
index 3bfbc3f8f5..1600275b23
--- a/3rdparty/simdlib/Simd/SimdExp.h
+++ b/3rdparty/simdlib/Simd/SimdExp.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -34,6 +34,11 @@ namespace Simd
         {
             return ::expf(value);
         }
+
+        SIMD_INLINE float Log(float value)
+        {
+            return ::logf(value);
+        }
     }
 
 #ifdef SIMD_SSE2_ENABLE    
@@ -107,20 +112,20 @@ namespace Simd
                 __m128 exp = Exp2(_mm_mul_ps(_k, value));
                 __m128 neg = _mm_mul_ps(alpha, _mm_sub_ps(exp, _1_0));
                 __m128 mask = _mm_cmpgt_ps(_mm_setzero_ps(), value);
-                return Sse::Combine(mask, neg, value);
+                return Combine(mask, neg, value);
             }
         };
 
         namespace Detail
         {
-            SIMD_INLINE __m128 Poly5(__m128 x)
+            SIMD_INLINE __m128 Poly5(__m128 x, float a, float b, float c, float d, float e, float f)
             {
-                __m128 p = _mm_set1_ps(1.8775767e-3f);
-                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(8.9893397e-3f));
-                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(5.5826318e-2f));
-                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(2.4015361e-1f));
-                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(6.9315308e-1f));
-                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(9.9999994e-1f));
+                __m128 p = _mm_set1_ps(f);
+                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(e));
+                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(d));
+                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(c));
+                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(b));
+                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(a));
                 return p;
             }
 
@@ -130,9 +135,19 @@ namespace Simd
                 __m128i ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
                 __m128 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
                 __m128 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
-                __m128 expfpart = Poly5(fpart);
+                __m128 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
                 return _mm_mul_ps(expipart, expfpart);
             }
+
+            SIMD_INLINE __m128 Log2(__m128 x)
+            {
+                __m128 _1 = _mm_set1_ps(1.0f);
+                __m128i i = _mm_castps_si128(x);
+                __m128 e = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, _mm_set1_epi32(0x7F800000)), 23), _mm_set1_epi32(127)));
+                __m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, _mm_set1_epi32(0x007FFFFF))), _1);
+                __m128 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+                return _mm_add_ps(_mm_mul_ps(p, _mm_sub_ps(m, _1)), e);
+            }
         }
 
         SIMD_INLINE __m128 Exponent(__m128 value)
@@ -145,7 +160,36 @@ namespace Simd
             __m128 exp = Exponent(value);
             __m128 neg = _mm_mul_ps(alpha, _mm_sub_ps(exp, _mm_set1_ps(1.0f)));
             __m128 mask = _mm_cmpgt_ps(_mm_setzero_ps(), value);
-            return Sse::Combine(mask, neg, value);
+            return Combine(mask, neg, value);
+        }
+
+        SIMD_INLINE __m128 Logarithm(__m128 value)
+        {
+            return _mm_mul_ps(_mm_set1_ps(0.693147181f), Detail::Log2(value));
+        }
+
+        SIMD_INLINE __m128 Mish(__m128 value, __m128 threshold)
+        {
+            __m128 _1 = _mm_set1_ps(1.0f);
+            __m128 mish = _mm_add_ps(Exponent(value), _1);
+            mish = _mm_add_ps(_mm_mul_ps(mish, mish), _1);
+            mish = _mm_mul_ps(value, _mm_sub_ps(_1, _mm_div_ps(_mm_set1_ps(2.0f), mish)));
+            return Combine(_mm_cmpgt_ps(threshold, value), mish, value);
+        }
+
+        SIMD_INLINE __m128 Softplus(__m128 value, __m128 beta, __m128 threshold)
+        {
+            __m128 exp = Exponent(_mm_mul_ps(value, beta));
+            __m128 log = Logarithm(_mm_add_ps(_mm_set1_ps(1.0f), exp));
+            __m128 mask = _mm_cmpgt_ps(threshold, value);
+            return Combine(mask, _mm_div_ps(log, beta), value);
+        }
+
+        SIMD_INLINE __m128 Tanh(__m128 value)
+        {
+            __m128 _1 = _mm_set1_ps(1.0f);
+            __m128 exp = Detail::Exp2(_mm_mul_ps(_mm_set1_ps(2.88539008f), value));
+            return _mm_div_ps(_mm_sub_ps(exp, _1), _mm_add_ps(_1, exp));
         }
     }
 #endif //SIMD_SSE2_ENABLE   
@@ -227,14 +271,14 @@ namespace Simd
 
         namespace Detail
         {
-            SIMD_INLINE __m256 Poly5(__m256 x)
+            SIMD_INLINE __m256 Poly5(__m256 x, float a, float b, float c, float d, float e, float f)
             {
-                __m256 p = _mm256_set1_ps(1.8775767e-3f);
-                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(8.9893397e-3f));
-                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(5.5826318e-2f));
-                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(2.4015361e-1f));
-                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(6.9315308e-1f));
-                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(9.9999994e-1f));
+                __m256 p = _mm256_set1_ps(f);
+                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(e));
+                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(d));
+                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(c));
+                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(b));
+                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(a));
                 return p;
             }
 
@@ -244,9 +288,19 @@ namespace Simd
                 __m256i ipart = _mm256_cvtps_epi32(_mm256_sub_ps(x, _mm256_set1_ps(0.5f)));
                 __m256 fpart = _mm256_sub_ps(x, _mm256_cvtepi32_ps(ipart));
                 __m256 expipart = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(ipart, _mm256_set1_epi32(127)), 23));
-                __m256 expfpart = Poly5(fpart);
+                __m256 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
                 return _mm256_mul_ps(expipart, expfpart);
             }
+
+            SIMD_INLINE __m256 Log2(__m256 x)
+            {
+                __m256 _1 = _mm256_set1_ps(1.0f);
+                __m256i i = _mm256_castps_si256(x);
+                __m256 e = _mm256_cvtepi32_ps(_mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(i, _mm256_set1_epi32(0x7F800000)), 23), _mm256_set1_epi32(127)));
+                __m256 m = _mm256_or_ps(_mm256_castsi256_ps(_mm256_and_si256(i, _mm256_set1_epi32(0x007FFFFF))), _1);
+                __m256 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+                return _mm256_add_ps(_mm256_mul_ps(p, _mm256_sub_ps(m, _1)), e);
+            }
         }
 
         SIMD_INLINE __m256 Exponent(__m256 value)
@@ -261,6 +315,35 @@ namespace Simd
             __m256 mask = _mm256_cmp_ps(_mm256_setzero_ps(), value, _CMP_GT_OS);
             return _mm256_blendv_ps(value, neg, mask);
         }
+
+        SIMD_INLINE __m256 Logarithm(__m256 value)
+        {
+            return _mm256_mul_ps(_mm256_set1_ps(0.693147181f), Detail::Log2(value));
+        }
+
+        SIMD_INLINE __m256 Mish(__m256 value, __m256 threshold)
+        {
+            __m256 _1 = _mm256_set1_ps(1.0f);
+            __m256 mish = _mm256_add_ps(Exponent(value), _1);
+            mish = Fmadd<true>(mish, mish, _1);
+            mish = _mm256_mul_ps(value, _mm256_sub_ps(_1, _mm256_div_ps(_mm256_set1_ps(2.0f), mish)));
+            return _mm256_blendv_ps(value, mish, _mm256_cmp_ps(threshold, value, _CMP_GT_OS));
+        }
+
+        SIMD_INLINE __m256 Softplus(__m256 value, __m256 beta, __m256 threshold)
+        {
+            __m256 exp = Exponent(_mm256_mul_ps(value, beta));
+            __m256 log = Logarithm(_mm256_add_ps(_mm256_set1_ps(1.0f), exp));
+            __m256 mask = _mm256_cmp_ps(threshold, value, _CMP_GT_OS);
+            return _mm256_blendv_ps(value, _mm256_div_ps(log, beta), mask);
+        }
+
+        SIMD_INLINE __m256 Tanh(__m256 value)
+        {
+            __m256 _1 = _mm256_set1_ps(1.0f);
+            __m256 exp = Detail::Exp2(_mm256_mul_ps(_mm256_set1_ps(2.88539008f), value));
+            return _mm256_div_ps(_mm256_sub_ps(exp, _1), _mm256_add_ps(_1, exp));
+        }
     }
 #endif //SIMD_AVX2_ENABLE
 
@@ -341,14 +424,14 @@ namespace Simd
 
         namespace Detail
         {
-            SIMD_INLINE float32x4_t Poly5(float32x4_t x)
+            SIMD_INLINE float32x4_t Poly5(float32x4_t x, float a, float b, float c, float d, float e, float f)
             {
-                float32x4_t p = vdupq_n_f32(1.8775767e-3f);
-                p = vmlaq_f32(vdupq_n_f32(8.9893397e-3f), x, p);
-                p = vmlaq_f32(vdupq_n_f32(5.5826318e-2f), x, p);
-                p = vmlaq_f32(vdupq_n_f32(2.4015361e-1f), x, p);
-                p = vmlaq_f32(vdupq_n_f32(6.9315308e-1f), x, p);
-                p = vmlaq_f32(vdupq_n_f32(9.9999994e-1f), x, p);
+                float32x4_t p = vdupq_n_f32(f);
+                p = vmlaq_f32(vdupq_n_f32(e), x, p);
+                p = vmlaq_f32(vdupq_n_f32(d), x, p);
+                p = vmlaq_f32(vdupq_n_f32(c), x, p);
+                p = vmlaq_f32(vdupq_n_f32(b), x, p);
+                p = vmlaq_f32(vdupq_n_f32(a), x, p);
                 return p;
             }
 
@@ -358,9 +441,19 @@ namespace Simd
                 int32x4_t ipart = vcvtq_s32_f32(vsubq_f32(x, vdupq_n_f32(0.5f)));
                 float32x4_t fpart = vsubq_f32(x, vcvtq_f32_s32(ipart));
                 float32x4_t expipart = vreinterpretq_f32_s32(vshlq_n_s32(vaddq_s32(ipart, vdupq_n_s32(127)), 23));
-                float32x4_t expfpart = Poly5(fpart);
+                float32x4_t expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
                 return vmulq_f32(expipart, expfpart);
             }
+
+            SIMD_INLINE float32x4_t Log2(float32x4_t x)
+            {
+                float32x4_t _1 = vdupq_n_f32(1.0f);
+                int32x4_t i = vreinterpretq_s32_f32(x);
+                float32x4_t e = vcvtq_f32_s32(vsubq_s32(vshrq_n_s32(vandq_s32(i, vdupq_n_s32(0x7F800000)), 23), vdupq_n_s32(127)));
+                float32x4_t m = Or(vreinterpretq_f32_s32(vandq_s32(i, vdupq_n_s32(0x007FFFFF))), _1);
+                float32x4_t p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+                return vaddq_f32(vmulq_f32(p, vsubq_f32(m, _1)), e);
+            }
         }
 
         SIMD_INLINE float32x4_t Exponent(float32x4_t value)
@@ -375,6 +468,35 @@ namespace Simd
             uint32x4_t mask = vcgtq_f32(vdupq_n_f32(0.0f), value);
             return vbslq_f32(mask, neg, value);
         }
+
+        SIMD_INLINE float32x4_t Logarithm(float32x4_t value)
+        {
+            return vmulq_f32(vdupq_n_f32(0.693147181f), Detail::Log2(value));
+        }
+
+        template<int iter> SIMD_INLINE float32x4_t Mish(float32x4_t value, float32x4_t threshold)
+        {
+            float32x4_t _1 = vdupq_n_f32(1.0f);
+            float32x4_t mish = vaddq_f32(Exponent(value), _1);
+            mish = Fmadd<true>(mish, mish, _1);
+            mish = vmulq_f32(value, vsubq_f32(_1, Div<iter>(vdupq_n_f32(2.0f), mish)));
+            return vbslq_f32(vcgtq_f32(threshold, value), mish, value);
+        }
+
+        template<int iter> SIMD_INLINE float32x4_t Softplus(float32x4_t value, float32x4_t beta, float32x4_t threshold)
+        {
+            float32x4_t exp = Exponent(vmulq_f32(value, beta));
+            float32x4_t log = Logarithm(vaddq_f32(vdupq_n_f32(1.0f), exp));
+            uint32x4_t mask = vcgtq_f32(threshold, value);
+            return vbslq_f32(mask, Div<iter>(log, beta), value);
+        }
+
+        template<int iter> SIMD_INLINE float32x4_t Tanh(float32x4_t value)
+        {
+            float32x4_t _1 = vdupq_n_f32(1.0f);
+            float32x4_t exp = Detail::Exp2(vmulq_f32(vdupq_n_f32(2.88539008f), value));
+            return Div<iter>(vsubq_f32(exp, _1), vaddq_f32(_1, exp));
+        }
     }
 #endif //SIMD_NEON_ENABLE
 }
diff --git a/3rdparty/simdlib/Simd/SimdExtract.h b/3rdparty/simdlib/Simd/SimdExtract.h
old mode 100644
new mode 100755
index d0d8184d7c..e30a0c85e5
--- a/3rdparty/simdlib/Simd/SimdExtract.h
+++ b/3rdparty/simdlib/Simd/SimdExtract.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -28,8 +28,8 @@
 
 namespace Simd
 {
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         SIMD_INLINE float ExtractValue(__m128 a, int i)
         {
@@ -44,12 +44,7 @@ namespace Simd
             _mm_store_ps(_a, a);
             return _a[0] + _a[1] + _a[2] + _a[3];
         }
-    }
-#endif//SIMD_SSE_ENABLE
 
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
         template <int index> SIMD_INLINE int ExtractInt8(__m128i a)
         {
             return _mm_extract_epi16(_mm_srli_si128(a, index & 0x1), index >> 1) & 0xFF;
@@ -90,8 +85,8 @@ namespace Simd
     }
 #endif// SIMD_SSE2_ENABLE
 
-#ifdef SIMD_SSE3_ENABLE
-    namespace Sse3
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
     {
         SIMD_INLINE float ExtractSum(__m128 a)
         {
@@ -103,7 +98,7 @@ namespace Simd
             return _mm_hadd_ps(_mm_hadd_ps(a[0], a[1]), _mm_hadd_ps(a[2], a[3]));
         }
     }
-#endif//SIMD_SSE3_ENABLE
+#endif//SIMD_SSE41_ENABLE
 
 #ifdef SIMD_AVX_ENABLE
     namespace Avx
@@ -199,6 +194,11 @@ namespace Simd
             return vgetq_lane_u32(a, 0) + vgetq_lane_u32(a, 1) + vgetq_lane_u32(a, 2) + vgetq_lane_u32(a, 3);
         }
 
+        SIMD_INLINE int32_t ExtractSum32s(const int32x4_t& a)
+        {
+            return vgetq_lane_s32(a, 0) + vgetq_lane_s32(a, 1) + vgetq_lane_s32(a, 2) + vgetq_lane_s32(a, 3);
+        }
+
         SIMD_INLINE uint64_t ExtractSum64u(const uint64x2_t & a)
         {
             return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
diff --git a/3rdparty/simdlib/Simd/SimdFrame.hpp b/3rdparty/simdlib/Simd/SimdFrame.hpp
old mode 100644
new mode 100755
index 53cc33879d..45b0b6022a
--- a/3rdparty/simdlib/Simd/SimdFrame.hpp
+++ b/3rdparty/simdlib/Simd/SimdFrame.hpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
 *               2014-2019 Antonenka Mikhail,
 *               2019-2019 Artur Voronkov.
 *
@@ -58,6 +58,10 @@ namespace Simd
             Bgr24,
             /*! One plane 8-bit gray pixel format. */
             Gray8,
+            /*! One plane 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */
+            Rgb24,
+            /*! One plane 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. */
+            Rgba32,
         };
 
         const size_t width; /*!< \brief A width of the frame. */
@@ -373,6 +377,8 @@ namespace Simd
         case View<A>::Gray8: (Format&)format = Gray8; break;
         case View<A>::Bgr24: (Format&)format = Bgr24; break;
         case View<A>::Bgra32: (Format&)format = Bgra32; break;
+        case View<A>::Rgb24: (Format&)format = Rgb24; break;
+        case View<A>::Rgba32: (Format&)format = Rgba32; break;
         default:
             assert(0);
         }
@@ -420,6 +426,14 @@ namespace Simd
         case Gray8:
             planes[0] = View<A>(width, height, stride0, View<A>::Gray8, data0);
             break;
+        case Rgb24:
+            planes[0] = View<A>(width, height, stride0, View<A>::Rgb24, data0);
+            break;
+        case Rgba32:
+            planes[0] = View<A>(width, height, stride0, View<A>::Rgba32, data0);
+            break;
+        default:
+            assert(0);
         }
     }
 
@@ -494,6 +508,14 @@ namespace Simd
         case Gray8:
             planes[0].Recreate(width, height, View<A>::Gray8);
             break;
+        case Rgb24:
+            planes[0].Recreate(width, height, View<A>::Rgb24);
+            break;
+        case Rgba32:
+            planes[0].Recreate(width, height, View<A>::Rgba32);
+            break;
+        default:
+            assert(0);
         }
     }
 
@@ -591,6 +613,8 @@ namespace Simd
         case Bgra32:  return 1;
         case Bgr24:   return 1;
         case Gray8:   return 1;
+        case Rgb24:   return 1;
+        case Rgba32:  return 1;
         default: assert(0); return 0;
         }
     }
@@ -648,6 +672,12 @@ namespace Simd
             case Frame<A>::Gray8:
                 BgraToGray(src.planes[0], dst.planes[0]);
                 break;
+            case Frame<A>::Rgb24:
+                BgraToRgb(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Rgba32:
+                BgraToRgba(src.planes[0], dst.planes[0]);
+                break;
             default:
                 assert(0);
             }
@@ -662,6 +692,12 @@ namespace Simd
             case Frame<A>::Gray8:
                 BgrToGray(src.planes[0], dst.planes[0]);
                 break;
+            case Frame<A>::Rgb24:
+                BgrToRgb(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Rgba32:
+                BgrToRgba(src.planes[0], dst.planes[0]);
+                break;
             default:
                 assert(0);
             }
@@ -676,11 +712,71 @@ namespace Simd
             case Frame<A>::Bgr24:
                 GrayToBgr(src.planes[0], dst.planes[0]);
                 break;
+            case Frame<A>::Rgb24:
+                GrayToRgb(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Rgba32:
+                GrayToRgba(src.planes[0], dst.planes[0]);
+                break;
             default:
                 assert(0);
             }
             break;
 
+        case Frame<A>::Rgb24:
+            switch (dst.format)
+            {
+            case Frame<A>::Bgra32:
+                RgbToBgra(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Gray8:
+                RgbToGray(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Bgr24:
+                RgbToBgr(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Rgba32:
+                RgbToRgba(src.planes[0], dst.planes[0]);
+                break;
+            default:
+                assert(0);
+            }
+
+        case Frame<A>::Rgba32:
+            switch (dst.format)
+            {
+            case Frame<A>::Nv12:
+            {
+                View<A> bgr(src.Size(), View<A>::Bgr24);
+                RgbaToBgr(src.planes[0], bgr);
+                View<A> u(src.Size(), View<A>::Gray8), v(src.Size(), View<A>::Gray8);
+                BgrToYuv420p(bgr, dst.planes[0], u, v);
+                InterleaveUv(u, v, dst.planes[1]);
+                break;
+            }
+            case Frame<A>::Yuv420p:
+            {
+                View<A> bgr(src.Size(), View<A>::Bgr24);
+                RgbaToBgr(src.planes[0], bgr);
+                BgrToYuv420p(bgr, dst.planes[0], dst.planes[1], dst.planes[2]);
+                break;
+            }
+            case Frame<A>::Bgra32:
+                RgbaToBgra(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Gray8:
+                RgbaToGray(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Bgr24:
+                RgbaToBgr(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Rgb24:
+                RgbaToRgb(src.planes[0], dst.planes[0]);
+                break;
+            default:
+                assert(0);
+            }
+
         default:
             assert(0);
         }
diff --git a/3rdparty/simdlib/Simd/SimdInit.h b/3rdparty/simdlib/Simd/SimdInit.h
old mode 100644
new mode 100755
index 179e61bdb4..707ea4c8bc
--- a/3rdparty/simdlib/Simd/SimdInit.h
+++ b/3rdparty/simdlib/Simd/SimdInit.h
@@ -28,7 +28,22 @@
 
 namespace Simd
 {
-#if defined(_MSC_VER) && (defined(SIMD_X64_ENABLE) || defined(SIMD_X86_ENABLE))
+
+#if defined(_MSC_VER) && !defined(__clang__) && (defined(SIMD_X64_ENABLE) || defined(SIMD_X86_ENABLE))
+
+#define SIMD_INIT_AS_CHAR
+
+#elif defined(__GNUC__) || defined(__clang__) || (defined(_MSC_VER) && defined(SIMD_NEON_ENABLE))
+
+#define SIMD_INIT_AS_LONGLONG
+
+#else
+
+#error This platform is unsupported!
+
+#endif
+
+#if defined(SIMD_INIT_AS_CHAR)
 
     template <class T> SIMD_INLINE char GetChar(T value, size_t index)
     {
@@ -50,7 +65,7 @@ namespace Simd
 	Simd::GetChar(int64_t(a), 4), Simd::GetChar(int64_t(a), 5), \
 	Simd::GetChar(int64_t(a), 6), Simd::GetChar(int64_t(a), 7)
 
-#elif defined(__GNUC__) || (defined(_MSC_VER) && defined(SIMD_NEON_ENABLE))
+#elif defined(SIMD_INIT_AS_LONGLONG)
 
 #define SIMD_CHAR_AS_LONGLONG(a) (((long long)a) & 0xFF)
 
@@ -94,11 +109,15 @@ namespace Simd
 #define SIMD_LL_SET2_EPI32(a, b) \
     SIMD_INT_AS_LONGLONG(a) | (SIMD_INT_AS_LONGLONG(b) << 32)
 
-#endif//defined(__GNUC__) || (defined(_MSC_VER) && defined(SIMD_NEON_ENABLE))
+#else
+
+#error This platform is unsupported!
+
+#endif
 
 #if defined(SIMD_SSE2_ENABLE)
 
-#if defined(_MSC_VER)
+#if defined(SIMD_INIT_AS_CHAR)
 
 #define SIMD_MM_SET1_EPI8(a) \
     {SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \
@@ -148,7 +167,7 @@ namespace Simd
 #define SIMD_MM_SETR_EPI64(a0, a1) \
     {SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1)}
 
-#elif defined(__GNUC__)
+#elif defined(SIMD_INIT_AS_LONGLONG)
 
 #define SIMD_MM_SET1_EPI8(a) \
     {SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a)}
@@ -192,7 +211,7 @@ namespace Simd
 
 #if defined(SIMD_AVX2_ENABLE)
 
-#if defined(_MSC_VER)
+#if defined(SIMD_INIT_AS_CHAR)
 
 #define SIMD_MM256_SET1_EPI8(a) \
 	{SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \
@@ -263,7 +282,7 @@ namespace Simd
 #define SIMD_MM256_SETR_EPI64(a0, a1, a2, a3) \
     {SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1), SIMD_AS_8CHARS(a2), SIMD_AS_8CHARS(a3)}
 
-#elif defined(__GNUC__)
+#elif defined(SIMD_INIT_AS_LONGLONG)
 
 #define SIMD_MM256_SET1_EPI8(a) \
     {SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a), \
@@ -310,7 +329,7 @@ namespace Simd
 #define SIMD_MM256_SETR_EPI64(a0, a1, a2, a3) \
     {a0, a1, a2, a3}
 
-#endif// defined(_MSC_VER) || defined(__GNUC__)
+#endif
 
 #endif// SIMD_AVX2_ENABLE
 
diff --git a/3rdparty/simdlib/Simd/SimdLib.cpp b/3rdparty/simdlib/Simd/SimdLib.cpp
old mode 100644
new mode 100755
index eb181ec376..b1cac8b1ba
--- a/3rdparty/simdlib/Simd/SimdLib.cpp
+++ b/3rdparty/simdlib/Simd/SimdLib.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
 *               2014-2018 Antonenka Mikhail,
 *               2018-2018 Radchenko Andrey,
 *               2019-2019 Facundo Galan.
@@ -55,18 +55,18 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved)
 #include "Simd/SimdLib.h"
 
 #include "Simd/SimdMemory.h"
+#include "Simd/SimdCpu.h"
 #include "Simd/SimdEnable.h"
+#include "Simd/SimdAlignment.h"
 #include "Simd/SimdConst.h"
-#include "Simd/SimdCpu.h"
 #include "Simd/SimdLog.h"
 
 #include "Simd/SimdResizer.h"
 #include "Simd/SimdGaussianBlur.h"
 
 #include "Simd/SimdBase.h"
-#include "Simd/SimdSse1.h"
 #include "Simd/SimdSse2.h"
-#include "Simd/SimdSsse3.h"
+#include "Simd/SimdSse41.h"
 #include "Simd/SimdAvx1.h"
 #include "Simd/SimdAvx2.h"
 #include "Simd/SimdNeon.h"
@@ -75,6 +75,11 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved)
 #include "Simd/SimdVersion.h"
 #endif
 
+namespace Simd
+{
+    const size_t ALIGNMENT = GetAlignment();
+}
+
 SIMD_API const char * SimdVersion()
 {
     return SIMD_VERSION;
@@ -118,9 +123,9 @@ SIMD_API void SimdRelease(void * context)
 
 SIMD_API SimdBool SimdGetFastMode()
 {
-#ifdef SIMD_SSE_ENABLE
-    if (Sse::Enable)
-        return Sse::GetFastMode();
+#ifdef SIMD_SSE2_ENABLE
+    if (Sse2::Enable)
+        return Sse2::GetFastMode();
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
@@ -133,9 +138,9 @@ SIMD_API SimdBool SimdGetFastMode()
 
 SIMD_API void SimdSetFastMode(SimdBool value)
 {
-#ifdef SIMD_SSE_ENABLE
-    if (Sse::Enable)
-        Sse::SetFastMode(value);
+#ifdef SIMD_SSE2_ENABLE
+    if (Sse2::Enable)
+        Sse2::SetFastMode(value);
 #endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable)
@@ -145,9 +150,9 @@ SIMD_API void SimdSetFastMode(SimdBool value)
 
 SIMD_API void SimdBgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
 {
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && width >= Sse41::A)
+        Sse41::BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
@@ -178,84 +183,69 @@ SIMD_API void SimdBgraToGray(const uint8_t *bgra, size_t width, size_t height, s
         Base::BgraToGray(bgra, width, height, bgraStride, gray, grayStride);
 }
 
-SIMD_API void SimdRgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride)
+SIMD_API void SimdBgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
 {
 #ifdef SIMD_AVX2_ENABLE
-    if(Avx2::Enable && width >= Avx2::A)
-        Avx2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+    if (Avx2::Enable && width >= Avx2::A)
+        Avx2::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride);
     else
 #endif
-#ifdef SIMD_SSE2_ENABLE
-    if(Sse2::Enable && width >= Sse2::A)
-        Sse2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
-    if (Neon::Enable && width >= Neon::HA)
-        Neon::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+    if (Neon::Enable && width >= Neon::A)
+        Neon::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride);
     else
 #endif
-        Base::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+        Base::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride);
 }
 
-SIMD_API void SimdBgrToBgra(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha)
+SIMD_API void SimdBgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
 {
-#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR)
-    if(Avx2::Enable && width >= Avx2::A)
-        Avx2::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
+#ifdef SIMD_AVX2_ENABLE
+    if (Avx2::Enable && width >= Avx2::A)
+        Avx2::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable && width >= Neon::A)
-        Neon::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
+        Neon::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
     else
 #endif
-        Base::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
+        Base::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
 }
 
-SIMD_API void SimdBgrToRgba(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *rgba, size_t rgbaStride, uint8_t alpha)
+SIMD_API void SimdBgrToBgra(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha)
 {
 #if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR)
     if(Avx2::Enable && width >= Avx2::A)
-        Avx2::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-    else
-#endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-    else
-#endif
-#ifdef SIMD_NEON_ENABLE
-    if (Neon::Enable && width >= Neon::A)
-        Neon::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
+        Avx2::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
     else
 #endif
-        Base::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-}
-
-SIMD_API void SimdBgraToRgba(const uint8_t *bgra, size_t width, size_t height, size_t bgraStride, uint8_t *rgba, size_t rgbaStride)
-{
-#if defined(SIMD_AVX2_ENABLE)
-    if(Avx2::Enable && width >= Avx2::A)
-        Avx2::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && width >= Sse41::A)
+        Sse41::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
+#ifdef SIMD_VMX_ENABLE
+    if(Vmx::Enable && width >= Vmx::A)
+        Vmx::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable && width >= Neon::A)
-        Neon::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
+        Neon::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
     else
 #endif
-        Base::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
+        Base::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
 }
 
 SIMD_API void SimdBgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
@@ -286,9 +276,9 @@ SIMD_API void SimdBgrToGray(const uint8_t *bgr, size_t width, size_t height, siz
         Avx2::BgrToGray(bgr, width, height, bgrStride, gray, grayStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::BgrToGray(bgr, width, height, bgrStride, gray, grayStride);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && width >= Sse41::A)
+        Sse41::BgrToGray(bgr, width, height, bgrStride, gray, grayStride);
     else
 #endif
 #ifdef SIMD_SSE2_ENABLE
@@ -304,49 +294,29 @@ SIMD_API void SimdBgrToGray(const uint8_t *bgr, size_t width, size_t height, siz
         Base::BgrToGray(bgr, width, height, bgrStride, gray, grayStride);
 }
 
-SIMD_API void SimdRgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride)
+SIMD_API void SimdBgrToRgb(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride)
 {
-#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR)
-    if (Avx2::Enable && width >= Avx2::A)
-        Avx2::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
-    else
-#endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
-    else
-#endif
-#ifdef SIMD_SSE2_ENABLE
-    if (Sse2::Enable && width >= Sse2::A)
-        Sse2::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
+#ifdef SIMD_AVX512BW_ENABLE
+    if (Avx512bw::Enable)
+        Avx512bw::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride);
     else
 #endif
-#ifdef SIMD_NEON_ENABLE
-    if (Neon::Enable && width >= Neon::A)
-      Neon::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
-    else
-#endif
-    Base::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
-}
-
-SIMD_API void SimdBgrToRgb(const uint8_t *bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
-{
 #ifdef SIMD_AVX2_ENABLE
     if (Avx2::Enable && width >= Avx2::A)
-        Avx2::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride);
+        Avx2::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable && width >= Neon::A)
-        Neon::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride);
+        Neon::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride);
     else
 #endif
-        Base::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride);
+        Base::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride);
 }
 
 SIMD_API void SimdCopy(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, uint8_t * dst, size_t dstStride)
@@ -368,9 +338,9 @@ SIMD_API void SimdDeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t
         Avx2::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
@@ -389,9 +359,9 @@ SIMD_API void SimdDeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size
         Avx2::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
@@ -410,9 +380,9 @@ SIMD_API void SimdGaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t
         Avx2::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && (width - 1)*channelCount >= Ssse3::A)
-        Ssse3::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && (width - 1)*channelCount >= Sse41::A)
+        Sse41::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride);
     else
 #endif
 #ifdef SIMD_SSE2_ENABLE
@@ -448,9 +418,9 @@ SIMD_API void SimdGrayToBgr(const uint8_t * gray, size_t width, size_t height, s
         Avx2::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && width >= Sse41::A)
+        Sse41::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
@@ -489,9 +459,9 @@ SIMD_API void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t
         Avx2::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
@@ -510,9 +480,9 @@ SIMD_API void SimdInterleaveBgra(const uint8_t * b, size_t bStride, const uint8_
         Avx2::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
@@ -552,9 +522,9 @@ SIMD_API void SimdReduceColor2x2(const uint8_t *src, size_t srcWidth, size_t src
         Avx2::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && srcWidth >= Ssse3::DA)
-        Ssse3::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && srcWidth >= Sse41::DA)
+        Sse41::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
     else
 #endif
 #ifdef SIMD_SSE2_ENABLE
@@ -578,9 +548,9 @@ SIMD_API void SimdReduceGray2x2(const uint8_t *src, size_t srcWidth, size_t srcH
         Avx2::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && srcWidth >= Ssse3::DA)
-        Ssse3::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && srcWidth >= Sse41::DA)
+        Sse41::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
     else
 #endif
 #ifdef SIMD_SSE2_ENABLE
@@ -625,9 +595,9 @@ SIMD_API void SimdReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcH
         Avx2::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && srcWidth > Ssse3::A)
-        Ssse3::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && srcWidth > Sse41::A)
+        Sse41::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
     else
 #endif
 #ifdef SIMD_SSE2_ENABLE
@@ -672,9 +642,9 @@ SIMD_API void SimdResizeBilinear(const uint8_t *src, size_t srcWidth, size_t src
         Avx2::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && dstWidth >= Ssse3::A)
-        Ssse3::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && dstWidth >= Sse41::A)
+        Sse41::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
     else
 #endif
 #ifdef SIMD_SSE2_ENABLE
@@ -707,21 +677,11 @@ SIMD_API void * SimdResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t ds
         return Sse41::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable)
-        return Ssse3::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
-    else
-#endif
 #ifdef SIMD_SSE2_ENABLE
     if (Sse2::Enable)
         return Sse2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
     else
 #endif
-#ifdef SIMD_SSE_ENABLE
-    if (Sse::Enable)
-        return Sse::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
-    else
-#endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable)
         return Neon::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
@@ -735,6 +695,66 @@ SIMD_API void SimdResizerRun(const void * resizer, const uint8_t * src, size_t s
     ((Resizer*)resizer)->Run(src, srcStride, dst, dstStride);
 }
 
+SIMD_API void SimdRgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+{
+#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR)
+    if (Avx2::Enable && width >= Avx2::A)
+        Avx2::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+    else
+#endif
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+    else
+#endif
+#ifdef SIMD_NEON_ENABLE
+    if (Neon::Enable && width >= Neon::A)
+        Neon::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+    else
+#endif
+        Base::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+}
+
+SIMD_API void SimdRgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+{
+#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR)
+    if (Avx2::Enable && width >= Avx2::A)
+        Avx2::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
+    else
+#endif
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
+    else
+#endif
+#ifdef SIMD_NEON_ENABLE
+    if (Neon::Enable && width >= Neon::A)
+        Neon::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
+    else
+#endif
+        Base::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
+}
+
+SIMD_API void SimdRgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+{
+#if defined(SIMD_AVX2_ENABLE)
+    if (Avx2::Enable && width >= Avx2::A)
+        Avx2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+    else
+#endif
+#ifdef SIMD_SSE2_ENABLE
+    if (Sse2::Enable && width >= Sse2::A)
+        Sse2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+    else
+#endif
+#ifdef SIMD_NEON_ENABLE
+    if (Neon::Enable && width >= Neon::A)
+        Neon::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+    else
+#endif
+        Base::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+}
+
 SIMD_API void SimdStretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
                     uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
 {
@@ -842,6 +862,7 @@ SIMD_API void SimdMatTranspose(const double * mat, size_t rows, size_t cols, dou
 
 SIMD_API void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff)
 {
+  //TODO:
 #ifdef SIMD_SSSE3_ENABLE
     if (Ssse3::Enable && size >= Ssse3::A)
         Ssse3::SimdImageDifference(img1,img2, size, imgDiff);
diff --git a/3rdparty/simdlib/Simd/SimdLib.h b/3rdparty/simdlib/Simd/SimdLib.h
old mode 100644
new mode 100755
index c3862f19f1..4838b82261
--- a/3rdparty/simdlib/Simd/SimdLib.h
+++ b/3rdparty/simdlib/Simd/SimdLib.h
@@ -1,8 +1,8 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
-*               2014-2016 Antonenka Mikhail,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
+*               2014-2019 Antonenka Mikhail,
 *               2019-2019 Facundo Galan.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -27,8 +27,6 @@
 #ifndef __SimdLib_h__
 #define __SimdLib_h__
 
-#include "Simd/SimdConfig.h"
-
 #include <stddef.h>
 
 #if defined(_MSC_VER) || defined(__CODEGEARC__)
@@ -107,12 +105,8 @@ typedef enum
     SimdCpuInfoCacheL1, /*!< A size of level 1 data cache. */
     SimdCpuInfoCacheL2, /*!< A size of level 2 cache. */
     SimdCpuInfoCacheL3, /*!< A size of level 3 cache. */
-    SimdCpuInfoSse, /*!< Availability of SSE (x86). */
     SimdCpuInfoSse2, /*!< Availability of SSE2 (x86). */
-    SimdCpuInfoSse3, /*!< Availability of SSE3 (x86). */
-    SimdCpuInfoSsse3, /*!< Availability of SSSE3 (x86). */
     SimdCpuInfoSse41, /*!< Availability of SSE4.1 (x86). */
-    SimdCpuInfoSse42, /*!< Availability of SSE4.2 (x86). */
     SimdCpuInfoAvx, /*!< Availability of AVX (x86). */
     SimdCpuInfoAvx2, /*!< Availability of AVX2 (x86). */
     SimdCpuInfoAvx512f, /*!< Availability of AVX-512F (x86). */
@@ -120,7 +114,6 @@ typedef enum
     SimdCpuInfoVmx, /*!< Availability of VMX or Altivec (PowerPC). */
     SimdCpuInfoVsx, /*!< Availability of VSX (PowerPC). */
     SimdCpuInfoNeon, /*!< Availability of NEON (ARM). */
-    SimdCpuInfoMsa, /*!< Availability of MSA (MIPS). */
 } SimdCpuInfoType;
 
 /*! @ingroup c_types
@@ -188,6 +181,8 @@ typedef enum
     SimdPixelFormatHsl24,
     /*! A 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */
     SimdPixelFormatRgb24,
+    /*! A 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. */
+    SimdPixelFormatRgba32,
 } SimdPixelFormatType;
 
 /*! @ingroup c_types
@@ -208,12 +203,14 @@ typedef enum
 {
     /*! 8-bit integer channel type.  */
     SimdResizeChannelByte,
+    /*! 16-bit integer channel type.  */
+    SimdResizeChannelShort,
     /*! 32-bit float channel type.  */
     SimdResizeChannelFloat,
 } SimdResizeChannelType;
 
 /*! @ingroup resizing
-    Describes methods used in oreder to resize image.
+    Describes methods used in order to resize image.
 */
 typedef enum
 {
@@ -223,6 +220,10 @@ typedef enum
     SimdResizeMethodCaffeInterp,
     /*! Area method. */
     SimdResizeMethodArea,
+    /*! InferenceEngine::Extension::Cpu::Interp compatible method. */
+    SimdResizeMethodInferenceEngineInterp,
+    /*! Nearest pixel method. */
+    SimdResizeMethodNearest,
 } SimdResizeMethodType;
 
 // ViSP custom SIMD code
@@ -317,7 +318,7 @@ extern "C"
 
         \fn size_t SimdAlignment();
 
-        \short Gets alignment required for the most productive work of the Simd Library.
+        \short Gets alignment required for the most productive work of Simd Library.
 
         \return a required alignment.
     */
@@ -359,17 +360,18 @@ extern "C"
 
         \fn void SimdBgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride);
 
-        \short Converts 32-bit BGRA image to 24-bit BGR image.
+        \short Converts 32-bit BGRA image to 24-bit BGR image. Also it can be used for 32-bit RGBA to 24-bit RGB conversion.
 
         All images must have the same width and height.
 
-        \note This function has a C++ wrapper Simd::BgraToBgr(const View<A>& bgra, View<A>& bgr).
+        \note This function has C++ wrappers: Simd::BgraToBgr(const View<A>& bgra, View<A>& bgr)
+            and Simd::RgbaToRgb(const View<A>& rgba, View<A>& rgb).
 
-        \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image.
+        \param [in] bgra - a pointer to pixels data of input 32-bit BGRA (or 32-bit RGBA) image.
         \param [in] width - an image width.
         \param [in] height - an image height.
         \param [in] bgraStride - a row size of the bgra image.
-        \param [out] bgr - a pointer to pixels data of output 24-bit BGR image.
+        \param [out] bgr - a pointer to pixels data of output 24-bit BGR (or 24-bit RGB) image.
         \param [in] bgrStride - a row size of the bgr image.
     */
     SIMD_API void SimdBgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride);
@@ -395,76 +397,63 @@ extern "C"
 
     /*! @ingroup bgra_conversion
 
-        \fn void SimdRgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride);
+        \fn void SimdBgraToRgb(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgb, size_t rgbStride);
 
-        \short Converts 32-bit RGBA image to 8-bit gray image.
+        \short Converts 32-bit BGRA image to 24-bit RGB image. Also it can be used for 32-bit RGBA to 24-bit BGR conversion.
 
         All images must have the same width and height.
 
-        \param [in] rgba - a pointer to pixels data of input 32-bit RGBA image.
+        \note This function has C++ wrappers: Simd::BgraToRgb(const View<A>& bgra, View<A>& rgb)
+            and Simd::RgbaToBgr(const View<A>& rgba, View<A>& bgr).
+
+        \param [in] bgra - a pointer to pixels data of input 32-bit BGRA (or 32-bit RGBA) image.
         \param [in] width - an image width.
         \param [in] height - an image height.
-        \param [in] rgbaStride - a row size of the rgba image.
-        \param [out] gray - a pointer to pixels data of output 8-bit gray image.
-        \param [in] grayStride - a row size of the gray image.
+        \param [in] bgraStride - a row size of the bgra image.
+        \param [out] rgb - a pointer to pixels data of output 24-bit RGB (or 24-bit BGR) image.
+        \param [in] rgbStride - a row size of the rgb image.
     */
-    SIMD_API void SimdRgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride);
+    SIMD_API void SimdBgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride);
 
-    /*! @ingroup bgr_conversion
+    /*! @ingroup bgra_conversion
 
-        \fn void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
+        \fn void SimdBgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride);
 
-        \short Converts 24-bit BGR image to 32-bit BGRA image.
+        \short Converts 32-bit BGRA image to 32-bit RGBA image. Also it can be used for 32-bit RGBA to 32-bit BGRA conversion.
 
         All images must have the same width and height.
 
-        \note This function has a C++ wrapper Simd::BgrToBgra(const View<A>& bgr, View<A>& bgra, uint8_t alpha).
+        \note This function has C++ wrappers: Simd::BgraToRgba(const View<A>& bgra, View<A>& rgba)
+            and Simd::RgbaToBgra(const View<A>& rgba, View<A>& bgra).
 
-        \param [in] bgr - a pointer to pixels data of input 24-bit BGR image.
+        \param [in] bgra - a pointer to pixels data of input 32-bit BGRA (or 32-bit RGBA) image.
         \param [in] width - an image width.
         \param [in] height - an image height.
-        \param [in] bgrStride - a row size of the bgr image.
-        \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image.
         \param [in] bgraStride - a row size of the bgra image.
-        \param [in] alpha - a value of alpha channel.
+        \param [out] rgba - a pointer to pixels data of output 32-bit RGBA (or 32-bit BGRA) image.
+        \param [in] rgbaStride - a row size of the rgb image.
     */
-    SIMD_API void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
+    SIMD_API void SimdBgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride);
 
     /*! @ingroup bgr_conversion
 
-        \fn void SimdBgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha);
+        \fn void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
-        \short Converts 24-bit BGR image to 32-bit RGBA image.
+        \short Converts 24-bit BGR image to 32-bit BGRA image.
 
         All images must have the same width and height.
 
+        \note This function has a C++ wrapper Simd::BgrToBgra(const View<A>& bgr, View<A>& bgra, uint8_t alpha).
+
         \param [in] bgr - a pointer to pixels data of input 24-bit BGR image.
         \param [in] width - an image width.
         \param [in] height - an image height.
         \param [in] bgrStride - a row size of the bgr image.
-        \param [out] rgba - a pointer to pixels data of output 32-bit BGRA image.
-        \param [in] rgbaStride - a row size of the bgra image.
-        \param [in] alpha - a value of alpha channel.
-    */
-    SIMD_API void SimdBgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha);
-
-    /*! @ingroup bgr_conversion
-
-        \fn void SimdBgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride);
-
-        \short Converts 32-bit BGRA image to 32-bit RGBA image.
-
-        All images must have the same width and height.
-
-        \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image.
-        \param [in] width - an image width.
-        \param [in] height - an image height.
+        \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image.
         \param [in] bgraStride - a row size of the bgra image.
-        \param [out] rgba - a pointer to pixels data of output 32-bit RGBA image.
-        \param [in] rgbaStride - a row size of the rgba image.
         \param [in] alpha - a value of alpha channel.
     */
-    SIMD_API void SimdBgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride);
+    SIMD_API void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
     /*! @ingroup other_conversion
 
@@ -512,39 +501,23 @@ extern "C"
 
     /*! @ingroup bgr_conversion
 
-        \fn void SimdRgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride);
+        \fn void SimdBgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride);
 
-        \short Converts 24-bit RGB image to 8-bit gray image.
+        \short Converts 24-bit BGR image to 24-bit RGB image. Also it can be used for 24-bit RGB to 24-bit BGR conversion.
 
         All images must have the same width and height.
 
-        \param [in] rgb - a pointer to pixels data of input 24-bit BGR image.
-        \param [in] width - an image width.
-        \param [in] height - an image height.
-        \param [in] rgbStride - a row size of the bgr image.
-        \param [out] gray - a pointer to pixels data of output 8-bit gray image.
-        \param [in] grayStride - a row size of the gray image.
-    */
-    SIMD_API void SimdRgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride);
-
-    /*! @ingroup bgr_conversion
-
-        \fn void SimdBgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
-
-        \short Converts 24-bit BGR image to 24-bit RGB image (also it performs backward conversion).
-
-        All images must have the same width and height.
+        \note This function has C++ wrappers: Simd::BgrToRgb(const View<A> & bgr, View<A> & rgb) 
+            and Simd::RgbToBgr(const View<A>& rgb, View<A>& bgr).
 
-        \note This function has a C++ wrapper Simd::BgrToRgb(const View<A> & bgr, View<A> & rgb).
-
-        \param [in] bgr - a pointer to pixels data of input 24-bit BGR image.
-        \param [in] bgrStride - a row size of the bgr image.
+        \param [in] bgr - a pointer to pixels data of input 24-bit BGR image (or 24-bit RGB image).
         \param [in] width - an image width.
         \param [in] height - an image height.
-        \param [out] rgb - a pointer to pixels data of output 24-bit RGB image.
+        \param [in] bgrStride - a row size of the bgr image.
+        \param [out] rgb - a pointer to pixels data of output 24-bit RGB image (or 24-bit BGR image).
         \param [in] rgbStride - a row size of the rgb image.
     */
-    SIMD_API void SimdBgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
+    SIMD_API void SimdBgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride);
 
     /*! @ingroup copying
 
@@ -591,7 +564,7 @@ extern "C"
     SIMD_API void SimdCopyFrame(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize,
         size_t frameLeft, size_t frameTop, size_t frameRight, size_t frameBottom, uint8_t * dst, size_t dstStride);
 
-    /*! @ingroup other_conversion
+    /*! @ingroup deinterleave_conversion
 
         \fn void SimdDeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride);
 
@@ -599,7 +572,9 @@ extern "C"
 
         All images must have the same width and height.
 
-        \note This function has a C++ wrapper Simd::DeinterleaveBgr(const View<A>& bgr, View<A>& b, View<A>& g, View<A>& r).
+        \note This function has C++ wrappers:
+            Simd::DeinterleaveBgr(const View<A>& bgr, View<A>& b, View<A>& g, View<A>& r),
+            Simd::DeinterleaveRgb(const View<A>& rgb, View<A>& r, View<A>& g, View<A>& b).
 
         \param [in] bgr - a pointer to pixels data of input 24-bit BGR interleaved image.
         \param [in] bgrStride - a row size of the bgr image.
@@ -615,7 +590,7 @@ extern "C"
     SIMD_API void SimdDeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height,
         uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride);
 
-    /*! @ingroup other_conversion
+    /*! @ingroup deinterleave_conversion
 
         \fn void SimdDeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride);
 
@@ -623,7 +598,11 @@ extern "C"
 
         All images must have the same width and height.
 
-        \note This function has a C++ wrapper Simd::DeinterleaveBgra(const View<A>& bgra, View<A>& b, View<A>& g, View<A>& r, View<A>& a).
+        \note This function has C++ wrappers:
+            Simd::DeinterleaveBgra(const View<A>& bgra, View<A>& b, View<A>& g, View<A>& r, View<A>& a),
+            Simd::DeinterleaveBgra(const View<A>& bgra, View<A>& b, View<A>& g, View<A>& r),
+            Simd::DeinterleaveRgba(const View<A>& rgba, View<A>& r, View<A>& g, View<A>& b, View<A>& a),
+            Simd::DeinterleaveRgba(const View<A>& rgba, View<A>& r, View<A>& g, View<A>& b).
 
         \param [in] bgra - a pointer to pixels data of input 32-bit BGRA interleaved image.
         \param [in] bgraStride - a row size of the bgra image.
@@ -635,7 +614,7 @@ extern "C"
         \param [in] gStride - a row size of the g image.
         \param [out] r - a pointer to pixels data of 8-bit Red planar image.
         \param [in] rStride - a row size of the r image.
-        \param [out] a - a pointer to pixels data of 8-bit Alpha planar image.
+        \param [out] a - a pointer to pixels data of 8-bit Alpha planar image. It can be NULL.
         \param [in] aStride - a row size of the a image.
     */
     SIMD_API void SimdDeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height,
@@ -670,20 +649,27 @@ extern "C"
         size_t channelCount, uint8_t * dst, size_t dstStride);
 
     /*! @ingroup gaussian_filter
+
         \fn void * SimdGaussianBlurInit(size_t width, size_t height, size_t channels, const float * sigma, const float* epsilon);
+
         \short Creates Gaussian blur filter context.
+
         In particular calculates Gaussian blur coefficients:
         \verbatim
         half = floor(sqrt(log(1/epsilon)) * sigma);
         weight[2*half + 1];
+
         for(x = -half; x <= half; ++x)
             weight[x + half] = exp(-sqr(x / sigma) / 2);
+
         sum = 0;
         for (x = -half; x <= half; ++x)
             sum += weight[x + half];
+
         for (x = -half; x <= half; ++x)
             weight[x + half] /= sum;
         \endverbatim
+
         \param [in] width - a width of input and output image.
         \param [in] height - a height of input and output image.
         \param [in] channels - a channel number of input and output image. Its value must be in range [1..4].
@@ -697,8 +683,11 @@ extern "C"
     SIMD_API void* SimdGaussianBlurInit(size_t width, size_t height, size_t channels, const float * sigma, const float* epsilon);
 
     /*! @ingroup gaussian_filter
+
         \fn void SimdGaussianBlurRun(const void* filter, const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride);
+
         \short Performs image Gaussian bluring.
+
         Bluring algorithm for every point:
         \verbatim
         sum = 0;
@@ -713,6 +702,7 @@ extern "C"
         }
         dst[dx, dy] = sum;
         \endverbatim
+
         \param [in] filter - a filter context. It must be created by function ::SimdGaussianBlurInit and released by function ::SimdRelease.
         \param [in] src - a pointer to pixels data of the original input image.
         \param [in] srcStride - a row size (in bytes) of the input image.
@@ -725,17 +715,18 @@ extern "C"
 
         \fn void SimdGrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride);
 
-        \short Converts 8-bit gray image to 24-bit BGR image.
+        \short Converts 8-bit gray image to 24-bit BGR image. Also it can be used for 8-bit gray to 24-bit RGB conversion.
 
         All images must have the same width and height.
 
-        \note This function has a C++ wrapper Simd::GrayToBgr(const View<A>& gray, View<A>& bgr).
+        \note This function has C++ wrappers: Simd::GrayToBgr(const View<A>& gray, View<A>& bgr) 
+            and Simd::GrayToRgb(const View<A>& gray, View<A>& rgb).
 
         \param [in] gray - a pointer to pixels data of input 8-bit gray image.
         \param [in] width - an image width.
         \param [in] height - an image height.
         \param [in] grayStride - a row size of the gray image.
-        \param [out] bgr - a pointer to pixels data of output 24-bit BGR image.
+        \param [out] bgr - a pointer to pixels data of output 24-bit BGR (or 24-bit RGB) image.
         \param [in] bgrStride - a row size of the bgr image.
     */
     SIMD_API void SimdGrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride);
@@ -744,17 +735,18 @@ extern "C"
 
         \fn void SimdGrayToBgra(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
-        \short Converts 8-bit gray image to 32-bit BGRA image.
+        \short Converts 8-bit gray image to 32-bit BGRA image. Also it can be used for 8-bit gray to 32-bit RGBA conversion.
 
         All images must have the same width and height.
 
-        \note This function has a C++ wrapper Simd::GrayToBgra(const View<A>& gray, View<A>& bgra, uint8_t alpha).
+        \note This function has C++ wrappers: Simd::GrayToBgra(const View<A>& gray, View<A>& bgra, uint8_t alpha) 
+            and Simd::GrayToRgba(const View<A>& gray, View<A>& rgba, uint8_t alpha).
 
         \param [in] gray - a pointer to pixels data of input 8-bit gray image.
         \param [in] width - an image width.
         \param [in] height - an image height.
         \param [in] grayStride - a row size of the gray image.
-        \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image.
+        \param [out] bgra - a pointer to pixels data of output 32-bit BGRA (or 32-bit RGBA) image.
         \param [in] bgraStride - a row size of the bgra image.
         \param [in] alpha - a value of alpha channel.
     */
@@ -785,7 +777,7 @@ extern "C"
     SIMD_API void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride,
         size_t width, size_t height, uint8_t * bgr, size_t bgrStride);
 
-    /*! @ingroup other_conversion
+    /*! @ingroup interleave_conversion
 
         \fn void SimdInterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride);
 
@@ -1125,6 +1117,16 @@ extern "C"
 
         \short Creates resize context.
 
+        An using example (resize of RGBA64 image):
+        \verbatim
+        void * resizer = SimdResizerInit(srcX, srcY, dstX, dstY, 4, SimdResizeChannelShort, SimdResizeMethodBilinear);
+        if (resizer)
+        {
+             SimdResizerRun(resizer, (uint8_t*)src, srcStride, (uint8_t*)dst, dstStride);
+             SimdRelease(resizer);
+        }
+        \endverbatim
+
         \param [in] srcX - a width of the input image.
         \param [in] srcY - a height of the input image.
         \param [in] dstX - a width of the output image.
@@ -1152,6 +1154,65 @@ extern "C"
     */
     SIMD_API void SimdResizerRun(const void * resizer, const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
 
+    /*! @ingroup rgb_conversion
+
+        \fn void SimdRgbToBgra(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
+
+        \short Converts 24-bit RGB image to 32-bit BGRA image. Also it can be used for 24-bit BGR to 32-bit RGBA conversion.
+
+        All images must have the same width and height.
+
+        \note This function has C++ wrappers: Simd::RgbToBgra(const View<A>& rgb, View<A>& bgra, uint8_t alpha)
+            and Simd::BgrToRgba(const View<A>& bgr, View<A>& rgba, uint8_t alpha).
+
+        \param [in] rgb - a pointer to pixels data of input 24-bit RGB (or 24-bit BGR) image.
+        \param [in] width - an image width.
+        \param [in] height - an image height.
+        \param [in] rgbStride - a row size of the rgb image.
+        \param [out] bgra - a pointer to pixels data of output 32-bit BGRA (or 32-bit RGBA) image.
+        \param [in] bgraStride - a row size of the bgra image.
+        \param [in] alpha - a value of alpha channel.
+    */
+    SIMD_API void SimdRgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+
+    /*! @ingroup rgb_conversion
+
+        \fn void SimdRgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride);
+
+        \short Converts 24-bit RGB image to 8-bit gray image.
+
+        All images must have the same width and height.
+
+        \note This function has a C++ wrapper Simd::RgbToGray(const View<A>& rgb, View<A>& gray).
+
+        \param [in] rgb - a pointer to pixels data of input 24-bit RGB image.
+        \param [in] width - an image width.
+        \param [in] height - an image height.
+        \param [in] rgbStride - a row size of the rgb image.
+        \param [out] gray - a pointer to pixels data of output 8-bit gray image.
+        \param [in] grayStride - a row size of the gray image.
+    */
+    SIMD_API void SimdRgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride);
+
+    /*! @ingroup rgba_conversion
+
+        \fn void SimdRgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride);
+
+        \short Converts 32-bit RGBA image to 8-bit gray image.
+
+        All images must have the same width and height.
+
+        \note This function has a C++ wrapper Simd::RgbaToGray(const View<A>& rgba, View<A>& gray).
+
+        \param [in] rgba - a pointer to pixels data of input 32-bit RGBA image.
+        \param [in] width - an image width.
+        \param [in] height - an image height.
+        \param [in] rgbaStride - a row size of the rgba image.
+        \param [out] gray - a pointer to pixels data of output 8-bit gray image.
+        \param [in] grayStride - a row size of the gray image.
+    */
+    SIMD_API void SimdRgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride);
+
     /*! @ingroup resizing
 
         \fn void SimdStretchGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
diff --git a/3rdparty/simdlib/Simd/SimdLib.hpp b/3rdparty/simdlib/Simd/SimdLib.hpp
old mode 100644
new mode 100755
index 7f7e6745d5..aaedc571e2
--- a/3rdparty/simdlib/Simd/SimdLib.hpp
+++ b/3rdparty/simdlib/Simd/SimdLib.hpp
@@ -1,8 +1,8 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
-*               2014-2016 Antonenka Mikhail,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
+*               2014-2019 Antonenka Mikhail,
 *               2019-2019 Facundo Galan.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -31,7 +31,9 @@
 #ifndef __SimdLib_hpp__
 #define __SimdLib_hpp__
 
-/*! \namespace Simd */
+/*! @ingroup functions
+    Simd API C++ wrappers.
+*/
 namespace Simd
 {
     /*! @ingroup bgra_conversion
@@ -74,6 +76,46 @@ namespace Simd
         SimdBgraToGray(bgra.data, bgra.width, bgra.height, bgra.stride, gray.data, gray.stride);
     }
 
+    /*! @ingroup bgra_conversion
+
+        \fn void BgraToRgb(const View<A>& bgra, View<A>& rgb)
+
+        \short Converts 32-bit BGRA image to 24-bit RGB image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdBgraToRgb.
+
+        \param [in] bgra - an input 32-bit BGRA image.
+        \param [out] rgb - an output 24-bit RGB image.
+    */
+    template<template<class> class A> SIMD_INLINE void BgraToRgb(const View<A>& bgra, View<A>& rgb)
+    {
+        assert(EqualSize(bgra, rgb) && bgra.format == View<A>::Bgra32 && rgb.format == View<A>::Rgb24);
+
+        SimdBgraToRgb(bgra.data, bgra.width, bgra.height, bgra.stride, rgb.data, rgb.stride);
+    }
+
+    /*! @ingroup bgra_conversion
+
+        \fn void BgraToRgba(const View<A>& bgra, View<A>& rgba)
+
+        \short Converts 32-bit BGRA image to 32-bit RGBA image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdBgraToRgba.
+
+        \param [in] bgra - an input 32-bit BGRA image.
+        \param [out] rgba - an output 32-bit RGBA image.
+    */
+    template<template<class> class A> SIMD_INLINE void BgraToRgba(const View<A>& bgra, View<A>& rgba)
+    {
+        assert(EqualSize(bgra, rgba) && bgra.format == View<A>::Bgra32 && rgba.format == View<A>::Rgba32);
+
+        SimdBgraToRgba(bgra.data, bgra.width, bgra.height, bgra.stride, rgba.data, rgba.stride);
+    }
+
     /*! @ingroup bgr_conversion
 
         \fn void BgrToBgra(const View<A>& bgr, View<A>& bgra, uint8_t alpha = 0xFF)
@@ -142,7 +184,7 @@ namespace Simd
 
         \fn void BgrToRgb(const View<A> & bgr, View<A> & rgb)
 
-        \short Converts 24-bit BGR image to 24-bit RGB image (also it performs backward conversion).
+        \short Converts 24-bit BGR image to 24-bit RGB image.
 
         All images must have the same width and height.
 
@@ -153,9 +195,30 @@ namespace Simd
     */
     template<template<class> class A> SIMD_INLINE void BgrToRgb(const View<A> & bgr, View<A> & rgb)
     {
-        assert(EqualSize(bgr, rgb) && bgr.PixelSize() == 3 && rgb.PixelSize() == 3);
+        assert(EqualSize(bgr, rgb) && bgr.format == View<A>::Bgr24 && rgb.format == View<A>::Rgb24);
 
-        SimdBgrToRgb(bgr.data, bgr.stride, bgr.width, bgr.height, rgb.data, rgb.stride);
+        SimdBgrToRgb(bgr.data, bgr.width, bgr.height, bgr.stride, rgb.data, rgb.stride);
+    }
+
+    /*! @ingroup bgr_conversion
+
+        \fn void BgrToRgba(const View<A>& bgr, View<A>& rgba, uint8_t alpha = 0xFF)
+
+        \short Converts 24-bit BGR image to 32-bit RGBA image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdRgbToBgra.
+
+        \param [in] bgr - an input 24-bit BGR image.
+        \param [out] rgba - an output 32-bit RGBA image.
+        \param [in] alpha - a value of alpha channel. It is equal to 256 by default.
+    */
+    template<template<class> class A> SIMD_INLINE void BgrToRgba(const View<A>& bgr, View<A>& rgba, uint8_t alpha = 0xFF)
+    {
+        assert(EqualSize(bgr, rgba) && rgba.format == View<A>::Rgba32 && bgr.format == View<A>::Bgr24);
+
+        SimdRgbToBgra(bgr.data, bgr.width, bgr.height, bgr.stride, rgba.data, rgba.stride, alpha);
     }
 
     /*! @ingroup copying
@@ -204,7 +267,7 @@ namespace Simd
             frame.left, frame.top, frame.right, frame.bottom, dst.data, dst.stride);
     }
 
-    /*! @ingroup other_conversion
+    /*! @ingroup deinterleave_conversion
 
         \fn void DeinterleaveBgr(const View<A>& bgr, View<A>& b, View<A>& g, View<A>& r)
 
@@ -226,7 +289,7 @@ namespace Simd
         SimdDeinterleaveBgr(bgr.data, bgr.stride, bgr.width, bgr.height, b.data, b.stride, g.data, g.stride, r.data, r.stride);
     }
 
-    /*! @ingroup other_conversion
+    /*! @ingroup deinterleave_conversion
 
         \fn void DeinterleaveBgra(const View<A>& bgra, View<A>& b, View<A>& g, View<A>& r, View<A>& a)
 
@@ -249,6 +312,95 @@ namespace Simd
         SimdDeinterleaveBgra(bgra.data, bgra.stride, bgra.width, bgra.height, b.data, b.stride, g.data, g.stride, r.data, r.stride, a.data, a.stride);
     }
 
+    /*! @ingroup deinterleave_conversion
+
+        \fn void DeinterleaveBgra(const View<A>& bgra, View<A>& b, View<A>& g, View<A>& r)
+
+        \short Deinterleaves 32-bit BGRA interleaved image into separated 8-bit Blue, Green and Red planar images (Alpha channel is ignored).
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdDeinterleaveBgra.
+
+        \param [in] bgra - an input 32-bit BGRA interleaved image.
+        \param [out] b - an output 8-bit Blue planar image.
+        \param [out] g - an output 8-bit Green planar image.
+        \param [out] r - an output 8-bit Red planar image.
+    */
+    template<template<class> class A> SIMD_INLINE void DeinterleaveBgra(const View<A>& bgra, View<A>& b, View<A>& g, View<A>& r)
+    {
+        assert(EqualSize(bgra, b) && Compatible(b, g, r) && bgra.format == View<A>::Bgra32 && b.format == View<A>::Gray8);
+
+        SimdDeinterleaveBgra(bgra.data, bgra.stride, bgra.width, bgra.height, b.data, b.stride, g.data, g.stride, r.data, r.stride, NULL, 0);
+    }
+
+    /*! @ingroup deinterleave_conversion
+
+        \fn void DeinterleaveRgb(const View<A>& rgb, View<A>& r, View<A>& g, View<A>& b)
+
+        \short Deinterleaves 24-bit RGB interleaved image into separated 8-bit Red, Green and Blue planar images.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdDeinterleaveBgr.
+
+        \param [in] rgb - an input 24-bit RGB interleaved image.
+        \param [out] r - an output 8-bit Red planar image.
+        \param [out] g - an output 8-bit Green planar image.
+        \param [out] b - an output 8-bit Blue planar image.
+        */
+    template<template<class> class A> SIMD_INLINE void DeinterleaveRgb(const View<A>& rgb, View<A>& r, View<A>& g, View<A>& b)
+    {
+        assert(EqualSize(rgb, b) && Compatible(b, g, r) && rgb.format == View<A>::Rgb24 && b.format == View<A>::Gray8);
+
+        SimdDeinterleaveBgr(rgb.data, rgb.stride, rgb.width, rgb.height, r.data, r.stride, g.data, g.stride, b.data, b.stride);
+    }
+
+    /*! @ingroup deinterleave_conversion
+
+        \fn void DeinterleaveRgba(const View<A>& rgba, View<A>& r, View<A>& g, View<A>& b, View<A>& a)
+
+        \short Deinterleaves 32-bit RGBA interleaved image into separated 8-bit Red, Green, Blue and Alpha planar images.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdDeinterleaveBgra.
+
+        \param [in] rgba - an input 32-bit RGBA interleaved image.
+        \param [out] r - an output 8-bit Red planar image.
+        \param [out] g - an output 8-bit Green planar image.
+        \param [out] b - an output 8-bit Blue planar image.
+        \param [out] a - an output 8-bit Alpha planar image.
+    */
+    template<template<class> class A> SIMD_INLINE void DeinterleaveRgba(const View<A>& rgba, View<A>& r, View<A>& g, View<A>& b, View<A>& a)
+    {
+        assert(EqualSize(rgba, b) && Compatible(b, g, r, a) && rgba.format == View<A>::Rgba32 && b.format == View<A>::Gray8);
+
+        SimdDeinterleaveBgra(rgba.data, rgba.stride, rgba.width, rgba.height, r.data, r.stride, g.data, g.stride, b.data, b.stride, a.data, a.stride);
+    }
+
+    /*! @ingroup deinterleave_conversion
+
+        \fn void DeinterleaveRgba(const View<A>& rgba, View<A>& r, View<A>& g, View<A>& b)
+
+        \short Deinterleaves 32-bit RGBA interleaved image into separated 8-bit Red, Green and Blue planar images (Alpha channel is ignored).
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdDeinterleaveBgra.
+
+        \param [in] rgba - an input 32-bit RGBA interleaved image.
+        \param [out] r - an output 8-bit Red planar image.
+        \param [out] g - an output 8-bit Green planar image.
+        \param [out] b - an output 8-bit Blue planar image.
+    */
+    template<template<class> class A> SIMD_INLINE void DeinterleaveRgba(const View<A>& rgba, View<A>& r, View<A>& g, View<A>& b)
+    {
+        assert(EqualSize(rgba, b) && Compatible(b, g, r) && rgba.format == View<A>::Rgba32 && b.format == View<A>::Gray8);
+
+        SimdDeinterleaveBgra(rgba.data, rgba.stride, rgba.width, rgba.height, r.data, r.stride, g.data, g.stride, b.data, b.stride, NULL, 0);
+    }
+
     /*! @ingroup other_filter
 
         \fn void GaussianBlur3x3(const View<A>& src, View<A>& dst)
@@ -295,6 +447,26 @@ namespace Simd
         SimdGrayToBgr(gray.data, gray.width, gray.height, gray.stride, bgr.data, bgr.stride);
     }
 
+    /*! @ingroup gray_conversion
+
+        \fn void GrayToRgb(const View<A>& gray, View<A>& rgb)
+
+        \short Converts 8-bit gray image to 24-bit RGB image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdGrayToBgr.
+
+        \param [in] gray - an input 8-bit gray image.
+        \param [out] rgb - an output 24-bit RGB image.
+    */
+    template<template<class> class A> SIMD_INLINE void GrayToRgb(const View<A>& gray, View<A>& rgb)
+    {
+        assert(EqualSize(gray, rgb) && rgb.format == View<A>::Rgb24 && gray.format == View<A>::Gray8);
+
+        SimdGrayToBgr(gray.data, gray.width, gray.height, gray.stride, rgb.data, rgb.stride);
+    }
+
     /*! @ingroup gray_conversion
 
         \fn void GrayToBgra(const View<A>& gray, View<A>& bgra, uint8_t alpha = 0xFF)
@@ -316,6 +488,27 @@ namespace Simd
         SimdGrayToBgra(gray.data, gray.width, gray.height, gray.stride, bgra.data, bgra.stride, alpha);
     }
 
+    /*! @ingroup gray_conversion
+
+        \fn void GrayToRgba(const View<A>& gray, View<A>& rgba, uint8_t alpha = 0xFF)
+
+        \short Converts 8-bit gray image to 32-bit RGBA image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdGrayToBgra.
+
+        \param [in] gray - an input 8-bit gray image.
+        \param [out] rgba - an output 32-bit RGBA image.
+        \param [in] alpha - a value of alpha channel. It is equal to 255 by default.
+    */
+    template<template<class> class A> SIMD_INLINE void GrayToRgba(const View<A>& gray, View<A>& rgba, uint8_t alpha = 0xFF)
+    {
+        assert(EqualSize(gray, rgba) && rgba.format == View<A>::Rgba32 && gray.format == View<A>::Gray8);
+
+        SimdGrayToBgra(gray.data, gray.width, gray.height, gray.stride, rgba.data, rgba.stride, alpha);
+    }
+
     /*! @ingroup other_conversion
 
         \fn void InterleaveBgr(const View<A> & b, const View<A> & g, const View<A> & r, View<A> & bgr)
@@ -338,7 +531,7 @@ namespace Simd
         SimdInterleaveBgr(b.data, b.stride, g.data, g.stride, r.data, r.stride, bgr.width, bgr.height, bgr.data, bgr.stride);
     }
 
-    /*! @ingroup other_conversion
+    /*! @ingroup interleave_conversion
 
         \fn void InterleaveBgra(const View<A>& b, const View<A>& g, const View<A>& r, const View<A>& a, View<A>& bgra)
 
@@ -798,6 +991,200 @@ namespace Simd
         }
     }
 
+    /*! @ingroup resizing
+
+        \fn void Resize(const View<A> & src, View<A> & dst, const Point<ptrdiff_t> & size, ::SimdResizeMethodType method = ::SimdResizeMethodBilinear)
+
+        \short Performs resizing of image.
+
+        \param [in] src - an original input image.
+        \param [out] dst - a resized output image. The input image can be the output.
+        \param [in] size - a size of output image.
+        \param [in] method - a resizing method. By default it is equal to ::SimdResizeMethodBilinear.
+    */
+    template<template<class> class A> SIMD_INLINE void Resize(const View<A>& src, View<A>& dst, const Point<ptrdiff_t> & size, ::SimdResizeMethodType method = ::SimdResizeMethodBilinear)
+    {
+        assert(src.format == View<A>::Float || src.ChannelSize() == 1);
+
+        if (&src == &dst)
+        {
+            if (src.Size() != size)
+            {
+                View<A> tmp(size, src.format);
+                Resize(src, tmp, method);
+                dst.Swap(tmp);
+            }
+        }
+        else
+        {
+            if (dst.Size() != size)
+                dst.Recreate(size, src.format);
+            Resize(src, dst, method);
+        }
+    }
+
+    /*! @ingroup rgb_conversion
+
+        \fn void RgbToBgr(const View<A> & rgb, View<A> & bgr)
+
+        \short Converts 24-bit RGB image to 24-bit BGR image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdBgrToRgb.
+
+        \param [in] rgb - an input 24-bit RGB image.
+        \param [out] bgr - an output 24-bit BGR image.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbToBgr(const View<A>& rgb, View<A>& bgr)
+    {
+        assert(EqualSize(bgr, rgb) && rgb.format == View<A>::Rgb24 || bgr.format == View<A>::Bgr24);
+
+        SimdBgrToRgb(rgb.data, rgb.width, rgb.height, rgb.stride, bgr.data, bgr.stride);
+    }
+
+    /*! @ingroup rgb_conversion
+
+        \fn void RgbToBgra(const View<A>& rgb, View<A>& bgra, uint8_t alpha = 0xFF)
+
+        \short Converts 24-bit RGB image to 32-bit BGRA image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdRgbToBgra.
+
+        \param [in] rgb - an input 24-bit RGB image.
+        \param [out] bgra - an output 32-bit BGRA image.
+        \param [in] alpha - a value of alpha channel. It is equal to 256 by default.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbToBgra(const View<A>& rgb, View<A>& bgra, uint8_t alpha = 0xFF)
+    {
+        assert(EqualSize(rgb, bgra) && bgra.format == View<A>::Bgra32 && rgb.format == View<A>::Rgb24);
+
+        SimdRgbToBgra(rgb.data, rgb.width, rgb.height, rgb.stride, bgra.data, bgra.stride, alpha);
+    }
+
+    /*! @ingroup rgb_conversion
+
+        \fn void RgbToGray(const View<A>& rgb, View<A>& gray)
+
+        \short Converts 24-bit RGB image to 8-bit gray image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdRgbToGray.
+
+        \param [in] rgb - an input 24-bit RGB image.
+        \param [out] gray - an output 8-bit gray image.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbToGray(const View<A>& rgb, View<A>& gray)
+    {
+        assert(EqualSize(rgb, gray) && rgb.format == View<A>::Rgb24 && gray.format == View<A>::Gray8);
+
+        SimdRgbToGray(rgb.data, rgb.width, rgb.height, rgb.stride, gray.data, gray.stride);
+    }
+
+    /*! @ingroup rgb_conversion
+
+        \fn void RgbToRgba(const View<A>& rgb, View<A>& rgba, uint8_t alpha = 0xFF)
+
+        \short Converts 24-bit RGB image to 32-bit RGBA image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdBgrToBgra.
+
+        \param [in] rgb - an input 24-bit RGB image.
+        \param [out] rgba - an output 32-bit RGBA image.
+        \param [in] alpha - a value of alpha channel. It is equal to 256 by default.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbToRgba(const View<A>& rgb, View<A>& rgba, uint8_t alpha = 0xFF)
+    {
+        assert(EqualSize(rgb, rgba) && rgba.format == View<A>::Rgba32 && rgb.format == View<A>::Rgb24);
+
+        SimdBgrToBgra(rgb.data, rgb.width, rgb.height, rgb.stride, rgba.data, rgba.stride, alpha);
+    }
+
+    /*! @ingroup rgba_conversion
+
+        \fn void RgbaToBgr(const View<A>& rgba, View<A>& bgr)
+
+        \short Converts 32-bit RGBA image to 24-bit BGR image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdBgraToRgb.
+
+        \param [in] rgba - an input 32-bit RGBA image.
+        \param [out] bgr - an output 24-bit RGB image.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbaToBgr(const View<A>& rgba, View<A>& bgr)
+    {
+        assert(EqualSize(rgba, bgr) && rgba.format == View<A>::Rgba32 && bgr.format == View<A>::Bgr24);
+
+        SimdBgraToRgb(rgba.data, rgba.width, rgba.height, rgba.stride, bgr.data, bgr.stride);
+    }
+
+    /*! @ingroup rgba_conversion
+
+        \fn void RgbaToBgra(const View<A>& rgba, View<A>& bgra)
+
+        \short Converts 32-bit RGBA image to 32-bit BGRA image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdBgraToRgba.
+
+        \param [in] rgba - an input 32-bit RGBA image.
+        \param [out] bgra - an output 32-bit BGRA image.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbaToBgra(const View<A>& rgba, View<A>& bgra)
+    {
+        assert(EqualSize(bgra, rgba) && bgra.format == View<A>::Bgra32 && rgba.format == View<A>::Rgba32);
+
+        SimdBgraToRgba(rgba.data, rgba.width, rgba.height, rgba.stride, bgra.data, bgra.stride);
+    }
+
+    /*! @ingroup rgba_conversion
+
+        \fn void RgbaToGray(const View<A>& rgba, View<A>& gray)
+
+        \short Converts 32-bit RGBA image to 8-bit gray image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdRgbaToGray.
+
+        \param [in] rgba - an input 32-bit RGBA image.
+        \param [out] gray - an output 8-bit gray image.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbaToGray(const View<A>& rgba, View<A>& gray)
+    {
+        assert(EqualSize(rgba, gray) && rgba.format == View<A>::Rgba32 && gray.format == View<A>::Gray8);
+
+        SimdRgbaToGray(rgba.data, rgba.width, rgba.height, rgba.stride, gray.data, gray.stride);
+    }
+
+    /*! @ingroup rgba_conversion
+
+        \fn void RgbaToRgb(const View<A>& rgba, View<A>& rgb)
+
+        \short Converts 32-bit RGBA image to 24-bit RGB image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdBgraToBgr.
+
+        \param [in] rgba - an input 32-bit RGBA image.
+        \param [out] rgb - an output 24-bit RGB image.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbaToRgb(const View<A>& rgba, View<A>& rgb)
+    {
+        assert(EqualSize(rgba, rgb) && rgba.format == View<A>::Rgba32 && rgb.format == View<A>::Rgb24);
+
+        SimdBgraToBgr(rgba.data, rgba.width, rgba.height, rgba.stride, rgb.data, rgb.stride);
+    }
+
     /*! @ingroup resizing
 
         \fn void StretchGray2x2(const View<A>& src, View<A>& dst)
@@ -825,7 +1212,7 @@ namespace Simd
 
         The input and output images must have the same width and height.
 
-        \note This function supports conversion between Gray8, Bgr24 and Bgra32 image formats.
+        \note This function supports conversion between View::Gray8, View::Bgr24, View::Bgra32, View::Rgb24 and View::Rgba32 image formats.
 
         \param [in] src - an input image.
         \param [out] dst - an output image.
@@ -848,9 +1235,15 @@ namespace Simd
             case View<A>::Bgra32:
                 GrayToBgra(src, dst);
                 break;
+            case View<A>::Rgba32:
+                GrayToRgba(src, dst);
+                break;
             case View<A>::Bgr24:
                 GrayToBgr(src, dst);
                 break;
+            case View<A>::Rgb24:
+                GrayToRgb(src, dst);
+                break;
             default:
                 assert(0);
             }
@@ -865,6 +1258,32 @@ namespace Simd
             case View<A>::Gray8:
                 BgrToGray(src, dst);
                 break;
+            case View<A>::Rgb24:
+                BgrToRgb(src, dst);
+                break;
+            case View<A>::Rgba32:
+                BgrToRgba(src, dst);
+                break;
+            default:
+                assert(0);
+            }
+            break;
+
+        case View<A>::Rgb24:
+            switch (dst.format)
+            {
+            case View<A>::Bgra32:
+                RgbToBgra(src, dst);
+                break;
+            case View<A>::Bgr24:
+                RgbToBgr(src, dst);
+                break;
+            case View<A>::Gray8:
+                RgbToGray(src, dst);
+                break;
+            case View<A>::Rgba32:
+                RgbToRgba(src, dst);
+                break;
             default:
                 assert(0);
             }
@@ -879,6 +1298,32 @@ namespace Simd
             case View<A>::Gray8:
                 BgraToGray(src, dst);
                 break;
+            case View<A>::Rgb24:
+                BgraToRgb(src, dst);
+                break;
+            case View<A>::Rgba32:
+                BgraToRgba(src, dst);
+                break;
+            default:
+                assert(0);
+            }
+            break;
+
+        case View<A>::Rgba32:
+            switch (dst.format)
+            {
+            case View<A>::Bgra32:
+                RgbaToBgra(src, dst);
+                break;
+            case View<A>::Bgr24:
+                RgbaToBgr(src, dst);
+                break;
+            case View<A>::Gray8:
+                RgbaToGray(src, dst);
+                break;
+            case View<A>::Rgb24:
+                RgbaToRgb(src, dst);
+                break;
             default:
                 assert(0);
             }
diff --git a/3rdparty/simdlib/Simd/SimdLoad.h b/3rdparty/simdlib/Simd/SimdLoad.h
old mode 100644
new mode 100755
index 97d7af7098..243858ca1b
--- a/3rdparty/simdlib/Simd/SimdLoad.h
+++ b/3rdparty/simdlib/Simd/SimdLoad.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -28,16 +28,8 @@
 
 namespace Simd
 {
-    enum PadType
-    {
-        PadNose1,
-        PadNone,
-        PadTail1,
-        PadTail2,
-    };
-
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         template <bool align> SIMD_INLINE __m128 Load(const float * p);
 
@@ -56,7 +48,7 @@ namespace Simd
             return _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)p0), (__m64*)p1);
         }
 
-        SIMD_INLINE __m128 LoadPadZeroNose1(const float * p)
+        SIMD_INLINE __m128 LoadPadZeroNose1(const float* p)
         {
             SIMD_ALIGNED(16) const int32_t m[F] = { 0, -1, -1, -1 };
             __m128 a = _mm_loadu_ps(p + 1);
@@ -64,7 +56,7 @@ namespace Simd
             return _mm_and_ps(b, _mm_load_ps((float*)m));
         }
 
-        SIMD_INLINE __m128 LoadPadZeroTail1(const float * p)
+        SIMD_INLINE __m128 LoadPadZeroTail1(const float* p)
         {
             SIMD_ALIGNED(16) const int32_t m[F] = { -1, -1, -1, 0 };
             __m128 a = _mm_loadu_ps(p - 1);
@@ -72,20 +64,15 @@ namespace Simd
             return _mm_and_ps(b, _mm_load_ps((float*)m));
         }
 
-        SIMD_INLINE __m128 LoadPadZeroTail2(const float * p)
+        SIMD_INLINE __m128 LoadPadZeroTail2(const float* p)
         {
             SIMD_ALIGNED(16) const int32_t m[F] = { -1, -1, 0, 0 };
             __m128 a = _mm_loadu_ps(p - 2);
             __m128 b = _mm_shuffle_ps(a, a, 0xFE);
             return _mm_and_ps(b, _mm_load_ps((float*)m));
         }
-    }
-#endif//SIMD_SSE_ENABLE
 
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
-        using namespace Sse;
+        //---------------------------------------------------------------------
 
         template <bool align> SIMD_INLINE __m128i Load(const __m128i * p);
 
@@ -99,6 +86,11 @@ namespace Simd
             return _mm_load_si128(p);
         }
 
+        SIMD_INLINE __m128i Load(const __m128i* p0, const __m128i* p1)
+        {
+            return _mm_castps_si128(_mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)p0), (__m64*)p1));
+        }
+
         template <bool align> SIMD_INLINE __m128i LoadMaskI8(const __m128i * p, __m128i index)
         {
             return _mm_cmpeq_epi8(Load<align>(p), index);
@@ -113,90 +105,13 @@ namespace Simd
         {
             return _mm_or_si128(_mm_srli_si128(last, count), _mm_and_si128(last, _mm_slli_si128(K_INV_ZERO, A - count)));
         }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadNose3(const uint8_t * p, __m128i a[3])
-        {
-            a[1] = Load<align>((__m128i*)p);
-            a[0] = LoadBeforeFirst<step>(a[1]);
-            a[2] = _mm_loadu_si128((__m128i*)(p + step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadBody3(const uint8_t * p, __m128i a[3])
-        {
-            a[0] = _mm_loadu_si128((__m128i*)(p - step));
-            a[1] = Load<align>((__m128i*)p);
-            a[2] = _mm_loadu_si128((__m128i*)(p + step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadTail3(const uint8_t * p, __m128i a[3])
-        {
-            a[0] = _mm_loadu_si128((__m128i*)(p - step));
-            a[1] = Load<align>((__m128i*)p);
-            a[2] = LoadAfterLast<step>(a[1]);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadNose5(const uint8_t * p, __m128i a[5])
-        {
-            a[2] = Load<align>((__m128i*)p);
-            a[1] = LoadBeforeFirst<step>(a[2]);
-            a[0] = LoadBeforeFirst<step>(a[1]);
-            a[3] = _mm_loadu_si128((__m128i*)(p + step));
-            a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadBody5(const uint8_t * p, __m128i a[5])
-        {
-            a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step));
-            a[1] = _mm_loadu_si128((__m128i*)(p - step));
-            a[2] = Load<align>((__m128i*)p);
-            a[3] = _mm_loadu_si128((__m128i*)(p + step));
-            a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadTail5(const uint8_t * p, __m128i a[5])
-        {
-            a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step));
-            a[1] = _mm_loadu_si128((__m128i*)(p - step));
-            a[2] = Load<align>((__m128i*)p);
-            a[3] = LoadAfterLast<step>(a[2]);
-            a[4] = LoadAfterLast<step>(a[3]);
-        }
-
-        SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m128i a[3])
-        {
-            a[0] = LoadBeforeFirst<1>(_mm_loadu_si128((__m128i*)p));
-            a[2] = _mm_loadu_si128((__m128i*)(p + 1));
-        }
-
-        SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m128i a[3])
-        {
-            a[0] = _mm_loadu_si128((__m128i*)(p - 1));
-            a[2] = _mm_loadu_si128((__m128i*)(p + 1));
-        }
-
-        SIMD_INLINE void LoadTailDx(const uint8_t * p, __m128i a[3])
-        {
-            a[0] = _mm_loadu_si128((__m128i*)(p - 1));
-            a[2] = LoadAfterLast<1>(_mm_loadu_si128((__m128i*)p));
-        }
     }
 #endif//SIMD_SSE2_ENABLE
 
-#ifdef SIMD_SSE3_ENABLE
-    namespace Sse3
-    {
-#if defined(_MSC_VER) && _MSC_VER >= 1700  && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug      
-        using Sse::Load;
-        using Sse2::Load;
-#endif
-    }
-#endif
-
 #ifdef SIMD_SSE41_ENABLE
     namespace Sse41
     {
 #if defined(_MSC_VER) && _MSC_VER >= 1700  && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug      
-        using Sse::Load;
         using Sse2::Load;
 #endif
     }
@@ -219,12 +134,17 @@ namespace Simd
 
         template<bool align> SIMD_INLINE __m256 Load(const float * p0, const float * p1)
         {
-            return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse::Load<align>(p0)), Sse::Load<align>(p1), 1);
+            return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse2::Load<align>(p0)), Sse2::Load<align>(p1), 1);
         }
 
         SIMD_INLINE __m256 Load(const float * p0, const float * p1, const float * p2, const float * p3)
         {
-            return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse::Load(p0, p1)), Sse::Load(p2, p3), 1);
+            return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse2::Load(p0, p1)), Sse2::Load(p2, p3), 1);
+        }
+
+        SIMD_INLINE __m256 Load(const float * ptr, __m256i mask)
+        {
+            return _mm256_maskload_ps(ptr, mask);
         }
     }
 #endif//SIMD_AVX_ENABLE
@@ -333,86 +253,6 @@ namespace Simd
             __m128i secondHi = LoadHalfAfterLast<step>(firstHi);
             second = _mm256_inserti128_si256(_mm256_castsi128_si256(secondLo), secondHi, 0x1);
         }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadNose3(const uint8_t * p, __m256i a[3])
-        {
-            a[0] = LoadBeforeFirst<align, step>(p);
-            a[1] = Load<align>((__m256i*)p);
-            a[2] = _mm256_loadu_si256((__m256i*)(p + step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadBody3(const uint8_t * p, __m256i a[3])
-        {
-            a[0] = _mm256_loadu_si256((__m256i*)(p - step));
-            a[1] = Load<align>((__m256i*)p);
-            a[2] = _mm256_loadu_si256((__m256i*)(p + step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadTail3(const uint8_t * p, __m256i a[3])
-        {
-            a[0] = _mm256_loadu_si256((__m256i*)(p - step));
-            a[1] = Load<align>((__m256i*)p);
-            a[2] = LoadAfterLast<align, step>(p);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadNose5(const uint8_t * p, __m256i a[5])
-        {
-            LoadBeforeFirst<align, step>(p, a[1], a[0]);
-            a[2] = Load<align>((__m256i*)p);
-            a[3] = _mm256_loadu_si256((__m256i*)(p + step));
-            a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadBody5(const uint8_t * p, __m256i a[5])
-        {
-            a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step));
-            a[1] = _mm256_loadu_si256((__m256i*)(p - step));
-            a[2] = Load<align>((__m256i*)p);
-            a[3] = _mm256_loadu_si256((__m256i*)(p + step));
-            a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadTail5(const uint8_t * p, __m256i a[5])
-        {
-            a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step));
-            a[1] = _mm256_loadu_si256((__m256i*)(p - step));
-            a[2] = Load<align>((__m256i*)p);
-            LoadAfterLast<align, step>(p, a[3], a[4]);
-        }
-
-        SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m256i a[3])
-        {
-            a[0] = LoadBeforeFirst<false, 1>(p);
-            a[2] = _mm256_loadu_si256((__m256i*)(p + 1));
-        }
-
-        SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m256i a[3])
-        {
-            a[0] = _mm256_loadu_si256((__m256i*)(p - 1));
-            a[2] = _mm256_loadu_si256((__m256i*)(p + 1));
-        }
-
-        SIMD_INLINE void LoadTailDx(const uint8_t * p, __m256i a[3])
-        {
-            a[0] = _mm256_loadu_si256((__m256i*)(p - 1));
-            a[2] = LoadAfterLast<false, 1>(p);
-        }
-
-        template <bool align> SIMD_INLINE __m256 Load(const float * p);
-
-        template <> SIMD_INLINE __m256 Load<false>(const float * p)
-        {
-            return _mm256_loadu_ps(p);
-        }
-
-        template <> SIMD_INLINE __m256 Load<true>(const float * p)
-        {
-#ifdef _MSC_VER
-            return _mm256_castsi256_ps(_mm256_load_si256((__m256i*)p));
-#else
-            return _mm256_load_ps(p);
-#endif
-        }
     }
 #endif//SIMD_AVX2_ENABLE
 
@@ -456,12 +296,12 @@ namespace Simd
 
         template <bool align> SIMD_INLINE int32x4_t Load(const int32_t * p)
         {
-            return (int32x4_t)Load<align>((const uint8_t*)p);
+            return vreinterpretq_s32_u8(Load<align>((const uint8_t*)p));
         }
 
         template <bool align> SIMD_INLINE uint32x4_t Load(const uint32_t * p)
         {
-            return (uint32x4_t)Load<align>((const uint8_t*)p);
+            return vreinterpretq_u32_u8(Load<align>((const uint8_t*)p));
         }
 
         template <bool align> SIMD_INLINE float32x4_t Load(const float * p);
@@ -829,81 +669,6 @@ namespace Simd
             return vextq_u8(last, vextq_u8(last, last, 16 - count), count);
         }
 
-        template <bool align, size_t step> SIMD_INLINE void LoadNose3(const uint8_t * p, uint8x16_t a[3])
-        {
-            a[1] = Load<align>(p);
-            a[0] = LoadBeforeFirst<step>(a[1]);
-            a[2] = vld1q_u8(p + step);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadBody3(const uint8_t * p, uint8x16_t a[3])
-        {
-#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            a[0] = vld1q_u8(p - step);
-            a[1] = Load<align>(p);
-            a[2] = vld1q_u8(p + step);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadTail3(const uint8_t * p, uint8x16_t a[3])
-        {
-            a[0] = vld1q_u8(p - step);
-            a[1] = Load<align>(p);
-            a[2] = LoadAfterLast<step>(a[1]);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadNose5(const uint8_t * p, uint8x16_t a[5])
-        {
-            a[2] = Load<align>(p);
-            a[1] = LoadBeforeFirst<step>(a[2]);
-            a[0] = LoadBeforeFirst<step>(a[1]);
-            a[3] = vld1q_u8(p + step);
-            a[4] = vld1q_u8(p + 2 * step);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadBody5(const uint8_t * p, uint8x16_t a[5])
-        {
-#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            a[0] = vld1q_u8(p - 2 * step);
-            a[1] = vld1q_u8(p - step);
-            a[2] = Load<align>(p);
-            a[3] = vld1q_u8(p + step);
-            a[4] = vld1q_u8(p + 2 * step);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadTail5(const uint8_t * p, uint8x16_t a[5])
-        {
-            a[0] = vld1q_u8(p - 2 * step);
-            a[1] = vld1q_u8(p - step);
-            a[2] = Load<align>(p);
-            a[3] = LoadAfterLast<step>(a[2]);
-            a[4] = LoadAfterLast<step>(a[3]);
-        }
-
-        SIMD_INLINE void LoadNoseDx(const uint8_t * p, uint8x16_t a[3])
-        {
-            a[0] = LoadBeforeFirst<1>(vld1q_u8(p));
-            a[2] = vld1q_u8(p + 1);
-        }
-
-        SIMD_INLINE void LoadBodyDx(const uint8_t * p, uint8x16_t a[3])
-        {
-#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            a[0] = vld1q_u8(p - 1);
-            a[2] = vld1q_u8(p + 1);
-        }
-
-        SIMD_INLINE void LoadTailDx(const uint8_t * p, uint8x16_t a[3])
-        {
-            a[0] = vld1q_u8(p - 1);
-            a[2] = LoadAfterLast<1>(vld1q_u8(p));
-        }
-
         template <size_t count> SIMD_INLINE uint8x8_t LoadBeforeFirst(uint8x8_t first)
         {
             return vext_u8(vext_u8(first, first, count), first, 8 - count);
diff --git a/3rdparty/simdlib/Simd/SimdLoadBlock.h b/3rdparty/simdlib/Simd/SimdLoadBlock.h
new file mode 100755
index 0000000000..8a46e07687
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdLoadBlock.h
@@ -0,0 +1,251 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdLoadBlock_h__
+#define __SimdLoadBlock_h__
+
+#include "Simd/SimdLoad.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
+    {
+        template <bool align, size_t step> SIMD_INLINE void LoadNose3(const uint8_t * p, __m128i a[3])
+        {
+            a[1] = Load<align>((__m128i*)p);
+            a[0] = LoadBeforeFirst<step>(a[1]);
+            a[2] = _mm_loadu_si128((__m128i*)(p + step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadBody3(const uint8_t * p, __m128i a[3])
+        {
+            a[0] = _mm_loadu_si128((__m128i*)(p - step));
+            a[1] = Load<align>((__m128i*)p);
+            a[2] = _mm_loadu_si128((__m128i*)(p + step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadTail3(const uint8_t * p, __m128i a[3])
+        {
+            a[0] = _mm_loadu_si128((__m128i*)(p - step));
+            a[1] = Load<align>((__m128i*)p);
+            a[2] = LoadAfterLast<step>(a[1]);
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadNose5(const uint8_t * p, __m128i a[5])
+        {
+            a[2] = Load<align>((__m128i*)p);
+            a[1] = LoadBeforeFirst<step>(a[2]);
+            a[0] = LoadBeforeFirst<step>(a[1]);
+            a[3] = _mm_loadu_si128((__m128i*)(p + step));
+            a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadBody5(const uint8_t * p, __m128i a[5])
+        {
+            a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step));
+            a[1] = _mm_loadu_si128((__m128i*)(p - step));
+            a[2] = Load<align>((__m128i*)p);
+            a[3] = _mm_loadu_si128((__m128i*)(p + step));
+            a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadTail5(const uint8_t * p, __m128i a[5])
+        {
+            a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step));
+            a[1] = _mm_loadu_si128((__m128i*)(p - step));
+            a[2] = Load<align>((__m128i*)p);
+            a[3] = LoadAfterLast<step>(a[2]);
+            a[4] = LoadAfterLast<step>(a[3]);
+        }
+
+        SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m128i a[3])
+        {
+            a[0] = LoadBeforeFirst<1>(_mm_loadu_si128((__m128i*)p));
+            a[2] = _mm_loadu_si128((__m128i*)(p + 1));
+        }
+
+        SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m128i a[3])
+        {
+            a[0] = _mm_loadu_si128((__m128i*)(p - 1));
+            a[2] = _mm_loadu_si128((__m128i*)(p + 1));
+        }
+
+        SIMD_INLINE void LoadTailDx(const uint8_t * p, __m128i a[3])
+        {
+            a[0] = _mm_loadu_si128((__m128i*)(p - 1));
+            a[2] = LoadAfterLast<1>(_mm_loadu_si128((__m128i*)p));
+        }
+    }
+#endif//SIMD_SSE2_ENABLE
+
+#ifdef SIMD_AVX2_ENABLE
+    namespace Avx2
+    {
+        template <bool align, size_t step> SIMD_INLINE void LoadNose3(const uint8_t * p, __m256i a[3])
+        {
+            a[0] = LoadBeforeFirst<align, step>(p);
+            a[1] = Load<align>((__m256i*)p);
+            a[2] = _mm256_loadu_si256((__m256i*)(p + step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadBody3(const uint8_t * p, __m256i a[3])
+        {
+            a[0] = _mm256_loadu_si256((__m256i*)(p - step));
+            a[1] = Load<align>((__m256i*)p);
+            a[2] = _mm256_loadu_si256((__m256i*)(p + step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadTail3(const uint8_t * p, __m256i a[3])
+        {
+            a[0] = _mm256_loadu_si256((__m256i*)(p - step));
+            a[1] = Load<align>((__m256i*)p);
+            a[2] = LoadAfterLast<align, step>(p);
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadNose5(const uint8_t * p, __m256i a[5])
+        {
+            LoadBeforeFirst<align, step>(p, a[1], a[0]);
+            a[2] = Load<align>((__m256i*)p);
+            a[3] = _mm256_loadu_si256((__m256i*)(p + step));
+            a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadBody5(const uint8_t * p, __m256i a[5])
+        {
+            a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step));
+            a[1] = _mm256_loadu_si256((__m256i*)(p - step));
+            a[2] = Load<align>((__m256i*)p);
+            a[3] = _mm256_loadu_si256((__m256i*)(p + step));
+            a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadTail5(const uint8_t * p, __m256i a[5])
+        {
+            a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step));
+            a[1] = _mm256_loadu_si256((__m256i*)(p - step));
+            a[2] = Load<align>((__m256i*)p);
+            LoadAfterLast<align, step>(p, a[3], a[4]);
+        }
+
+        SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m256i a[3])
+        {
+            a[0] = LoadBeforeFirst<false, 1>(p);
+            a[2] = _mm256_loadu_si256((__m256i*)(p + 1));
+        }
+
+        SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m256i a[3])
+        {
+            a[0] = _mm256_loadu_si256((__m256i*)(p - 1));
+            a[2] = _mm256_loadu_si256((__m256i*)(p + 1));
+        }
+
+        SIMD_INLINE void LoadTailDx(const uint8_t * p, __m256i a[3])
+        {
+            a[0] = _mm256_loadu_si256((__m256i*)(p - 1));
+            a[2] = LoadAfterLast<false, 1>(p);
+        }
+    }
+#endif//SIMD_AVX2_ENABLE
+
+#ifdef SIMD_NEON_ENABLE
+    namespace Neon
+    {
+        template <bool align, size_t step> SIMD_INLINE void LoadNose3(const uint8_t * p, uint8x16_t a[3])
+        {
+            a[1] = Load<align>(p);
+            a[0] = LoadBeforeFirst<step>(a[1]);
+            a[2] = vld1q_u8(p + step);
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadBody3(const uint8_t * p, uint8x16_t a[3])
+        {
+#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
+            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
+#endif
+            a[0] = vld1q_u8(p - step);
+            a[1] = Load<align>(p);
+            a[2] = vld1q_u8(p + step);
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadTail3(const uint8_t * p, uint8x16_t a[3])
+        {
+            a[0] = vld1q_u8(p - step);
+            a[1] = Load<align>(p);
+            a[2] = LoadAfterLast<step>(a[1]);
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadNose5(const uint8_t * p, uint8x16_t a[5])
+        {
+            a[2] = Load<align>(p);
+            a[1] = LoadBeforeFirst<step>(a[2]);
+            a[0] = LoadBeforeFirst<step>(a[1]);
+            a[3] = vld1q_u8(p + step);
+            a[4] = vld1q_u8(p + 2 * step);
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadBody5(const uint8_t * p, uint8x16_t a[5])
+        {
+#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
+            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
+#endif
+            a[0] = vld1q_u8(p - 2 * step);
+            a[1] = vld1q_u8(p - step);
+            a[2] = Load<align>(p);
+            a[3] = vld1q_u8(p + step);
+            a[4] = vld1q_u8(p + 2 * step);
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadTail5(const uint8_t * p, uint8x16_t a[5])
+        {
+            a[0] = vld1q_u8(p - 2 * step);
+            a[1] = vld1q_u8(p - step);
+            a[2] = Load<align>(p);
+            a[3] = LoadAfterLast<step>(a[2]);
+            a[4] = LoadAfterLast<step>(a[3]);
+        }
+
+        SIMD_INLINE void LoadNoseDx(const uint8_t * p, uint8x16_t a[3])
+        {
+            a[0] = LoadBeforeFirst<1>(vld1q_u8(p));
+            a[2] = vld1q_u8(p + 1);
+        }
+
+        SIMD_INLINE void LoadBodyDx(const uint8_t * p, uint8x16_t a[3])
+        {
+#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
+            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
+#endif
+            a[0] = vld1q_u8(p - 1);
+            a[2] = vld1q_u8(p + 1);
+        }
+
+        SIMD_INLINE void LoadTailDx(const uint8_t * p, uint8x16_t a[3])
+        {
+            a[0] = vld1q_u8(p - 1);
+            a[2] = LoadAfterLast<1>(vld1q_u8(p));
+        }
+    }
+#endif//SIMD_NEON_ENABLE
+}
+#endif//__SimdLoadBlock_h__
diff --git a/3rdparty/simdlib/Simd/SimdLog.h b/3rdparty/simdlib/Simd/SimdLog.h
old mode 100644
new mode 100755
index 45ba3f3be5..923a16dc70
--- a/3rdparty/simdlib/Simd/SimdLog.h
+++ b/3rdparty/simdlib/Simd/SimdLog.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -58,8 +58,8 @@ namespace Simd
         Log<T>(array.data, array.size, name);
     }
 
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         SIMD_INLINE void Log(const __m128 & value, const std::string & name)
         {
@@ -67,12 +67,7 @@ namespace Simd
             _mm_storeu_ps(buffer, value);
             Simd::Log<float>(buffer, F, name);
         }
-    }
-#endif //SIMD_SSE_ENABLE
 
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
         template<class T> SIMD_INLINE void Log(const __m128i & value, const std::string & name)
         {
             const size_t n = sizeof(__m128i) / sizeof(T);
@@ -86,7 +81,7 @@ namespace Simd
 #ifdef SIMD_SSE41_ENABLE
     namespace Sse41
     {
-        using namespace Sse;
+        using namespace Sse2;
     }
 #endif //SIMD_SSE41_ENABLE
 
@@ -173,14 +168,15 @@ namespace Simd
 #define SIMD_LOG2(value) Log<int16_t>(value, #value)
 #define SIMD_LOG4(value) Log<int32_t>(value, #value)
 
-#define SIMD_LOG_SS(message) \
+#define SIMD_LOG_ERROR(message) \
 { \
-    std::cout << __FUNCTION__  << " : " << message << std::endl; \
-    std::cout.flush(); \
+    std::stringstream ss; \
+    ss << std::endl << " In function " << SIMD_FUNCTION << ":" << std::endl; \
+    ss << " In file " << __FILE__ << ":" << __LINE__ << ":" << std::endl; \
+    ss << " Error: " << message << std::endl << std::endl; \
+    std::cerr << ss.str() << std::flush; \
 }
 
-#define SIMD_LOG_LINE() std::cout << __FUNCTION__  << " : " << __LINE__ << std::endl << std::flush; 
-
 #else//SIMD_LOG_ENABLE
 
 #define SIMD_LOG(value)
@@ -188,9 +184,7 @@ namespace Simd
 #define SIMD_LOG2(value)
 #define SIMD_LOG4(value)
 
-#define SIMD_LOG_SS(message)
-
-#define SIMD_LOG_LINE()
+#define SIMD_LOG_ERROR(message)
 
 #endif//SIMD_LOG_ENABLE 
 
diff --git a/3rdparty/simdlib/Simd/SimdMath.h b/3rdparty/simdlib/Simd/SimdMath.h
old mode 100644
new mode 100755
index 4b674ea512..0f7425f76e
--- a/3rdparty/simdlib/Simd/SimdMath.h
+++ b/3rdparty/simdlib/Simd/SimdMath.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
 *               2018-2019 Radchenko Andrey.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -66,11 +66,21 @@ namespace Simd
 #define SIMD_ROUND
     SIMD_INLINE int Round(double value)
     {
-#if defined(SIMD_SSE2_ENABLE) && ((defined(_MSC_VER) && defined(_M_X64)) || (defined(__GNUC__) && defined(__x86_64__)))
-        __m128d t = _mm_set_sd(value);
-        return _mm_cvtsd_si32(t);
+#if defined(SIMD_X64_ENABLE) && !defined(SIMD_SSE2_DISABLE)
+        __m128d _value = _mm_set_sd(value);
+        return _mm_cvtsd_si32(_value);
 #else
-        return (int)(value + (value >= 0 ? 0.5 : -0.5));
+        return (int)(value + (value >= 0.0 ? 0.5 : -0.5));
+#endif
+    }
+
+    SIMD_INLINE int Round(float value)
+    {
+#if defined(SIMD_X64_ENABLE) && !defined(SIMD_SSE2_DISABLE)
+        __m128 _value = _mm_set_ss(value);
+        return _mm_cvtss_si32(_value);
+#else
+        return (int)(value + (value >= 0.0f ? 0.5f : -0.5f));
 #endif
     }
 #endif
@@ -263,8 +273,8 @@ namespace Simd
         }
     }
 
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         SIMD_INLINE __m128 Square(__m128 value)
         {
@@ -330,12 +340,7 @@ namespace Simd
             __m128 m = _mm_max_ps(s0, s1);
             return _mm_store_ss(dst, _mm_max_ss(m, _mm_shuffle_ps(m, m, 1)));
         }
-    }
-#endif//SIMD_SSE_ENABLE
 
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
         SIMD_INLINE __m128i SaturateI16ToU8(__m128i value)
         {
             return _mm_min_epi16(K16_00FF, _mm_max_epi16(value, K_ZERO));
@@ -508,17 +513,8 @@ namespace Simd
     }
 #endif// SIMD_SSE2_ENABLE
 
-#ifdef SIMD_SSE3_ENABLE
-    namespace Sse3
-    {
-#if defined(_MSC_VER) && _MSC_VER >= 1700  && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug      
-        using Sse::RightNotZero;
-#endif
-    }
-#endif//SIMD_SSE3_ENABLE
-
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
     {
         using namespace Sse2;
 
@@ -538,12 +534,7 @@ namespace Simd
         {
             return _mm_maddubs_epi16(UnpackU8<part>(a, b), K8_01_FF);
         }
-    }
-#endif// SIMD_SSSE3_ENABLE
 
-#ifdef SIMD_SSE41_ENABLE
-    namespace Sse41
-    {
 #if defined(_MSC_VER) && _MSC_VER >= 1700  && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug     
         using Sse::RightNotZero;
 #endif
diff --git a/3rdparty/simdlib/Simd/SimdMemory.h b/3rdparty/simdlib/Simd/SimdMemory.h
old mode 100644
new mode 100755
index de45abb291..d7772ffa3c
--- a/3rdparty/simdlib/Simd/SimdMemory.h
+++ b/3rdparty/simdlib/Simd/SimdMemory.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2018 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *               2016-2016 Sintegrial Technologies.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -28,9 +28,10 @@
 #include "Simd/SimdDefs.h"
 #include "Simd/SimdMath.h"
 
-#if defined(__GNUC__) && defined(SIMD_ALLOCATE_ERROR_MESSAGE)
+#if defined(SIMD_ALLOCATE_ERROR_MESSAGE)
 #include <iostream>
 #endif
+#include <memory>
 
 namespace Simd
 {
@@ -88,17 +89,18 @@ namespace Simd
         align = AlignHi(align, sizeof(void *));
         size = AlignHi(size, align);
         int result = ::posix_memalign(&ptr, align, size);
-#ifdef SIMD_ALLOCATE_ERROR_MESSAGE
         if (result != 0)
+            ptr = NULL;
+#else
+        ptr = malloc(size);
+#endif
+#ifdef SIMD_ALLOCATE_ERROR_MESSAGE
+        if (ptr == NULL)
             std::cout << "The function posix_memalign can't allocate " << size << " bytes with align " << align << " !" << std::endl << std::flush;
 #endif
 #ifdef SIMD_ALLOCATE_ASSERT
-        assert(result == 0);
-#endif
-#else
-        ptr = malloc(size);
+        assert(ptr);
 #endif
-
 #ifdef SIMD_NO_MANS_LAND
         if (ptr)
             ptr = (char*)ptr + SIMD_NO_MANS_LAND;
@@ -121,60 +123,86 @@ namespace Simd
 #endif
     }
 
+    //---------------------------------------------------------------------------------------------
+
     struct Deletable
     {
         virtual ~Deletable() {}
     };
 
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+    //---------------------------------------------------------------------------------------------
+
+#if defined(SIMD_CPP_2011_ENABLE)
+    template<class T> using Holder = std::unique_ptr<T>;
+#else
+    template <class T> class Holder
     {
-        SIMD_INLINE bool Aligned(size_t size, size_t align = sizeof(__m128))
+        T* _ptr;
+
+    public:
+        Holder(T* ptr)
+            : _ptr(ptr)
         {
-            return Simd::Aligned(size, align);
         }
 
-        SIMD_INLINE bool Aligned(const void * ptr, size_t align = sizeof(__m128))
+        ~Holder()
         {
-            return Simd::Aligned(ptr, align);
+            if (_ptr)
+                delete _ptr;
+        }
+
+        T& operator * ()
+        {
+            return *_ptr;
+        }
+
+        const T& operator * () const
+        {
+            return *_ptr;
+        }
+
+        T* operator -> ()
+        {
+            return _ptr;
         }
-    }
-#endif// SIMD_SSE_ENABLE
+
+        const T* operator -> () const
+        {
+            return _ptr;
+        }
+
+        operator bool() const 
+        {
+            return _ptr != NULL;
+        }
+    };
+#endif
+
+    //---------------------------------------------------------------------------------------------
+
 
 #ifdef SIMD_SSE2_ENABLE
     namespace Sse2
     {
-        using Sse::Aligned;
-    }
-#endif// SIMD_SSE2_ENABLE
-
-#ifdef SIMD_SSE3_ENABLE
-    namespace Sse3
-    {
-        using Sse::Aligned;
-    }
-#endif// SIMD_SSE3_ENABLE
+        SIMD_INLINE bool Aligned(size_t size, size_t align = sizeof(__m128))
+        {
+            return Simd::Aligned(size, align);
+        }
 
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        using Sse::Aligned;
+        SIMD_INLINE bool Aligned(const void * ptr, size_t align = sizeof(__m128))
+        {
+            return Simd::Aligned(ptr, align);
+        }        
     }
-#endif// SIMD_SSSE3_ENABLE
+#endif// SIMD_SSE2_ENABLE
 
 #ifdef SIMD_SSE41_ENABLE
     namespace Sse41
     {
-        using Sse::Aligned;
+        using Sse2::Aligned;
     }
 #endif// SIMD_SSE41_ENABLE
 
-#ifdef SIMD_SSE42_ENABLE
-    namespace Sse42
-    {
-    }
-#endif// SIMD_SSE42_ENABLE
-
 #ifdef SIMD_AVX_ENABLE
     namespace Avx
     {
diff --git a/3rdparty/simdlib/Simd/SimdNeon.h b/3rdparty/simdlib/Simd/SimdNeon.h
old mode 100644
new mode 100755
index 54373b506e..bf2b98be69
--- a/3rdparty/simdlib/Simd/SimdNeon.h
+++ b/3rdparty/simdlib/Simd/SimdNeon.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
 *               2018-2018 Radchenko Andrey.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -36,22 +36,18 @@ namespace Simd
 
         void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride);
 
-        void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride);
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride);
 
-        void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
-
-        void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha);
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride);
 
-        void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride);
+        void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
         void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
             const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
         void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride);
 
-        void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride);
-
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
+        void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride);
 
         void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height,
             uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride);
@@ -93,6 +89,12 @@ namespace Simd
         void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
 
+        void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride);
+
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride);
+
         void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
     }
diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp
old mode 100644
new mode 100755
index bb25c0c6e8..98a360b0e6
--- a/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -74,6 +74,8 @@ namespace Simd
                 BgrToBgra<false>(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
         }
 
+        //---------------------------------------------------------------------
+
         template <bool align> SIMD_INLINE void Bgr48pToBgra32(uint8_t * bgra,
             const uint8_t * blue, const uint8_t * green, const uint8_t * red, size_t offset, const uint8x16_t & alpha)
         {
@@ -128,6 +130,47 @@ namespace Simd
             else
                 Bgr48pToBgra32<false>(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha);
         }
+
+        //---------------------------------------------------------------------
+
+        template <bool align> SIMD_INLINE void RgbToBgra(const uint8_t* rgb, uint8_t* bgra, uint8x16_t alpha)
+        {
+            uint8x16x3_t _rgb = Load3<align>(rgb);
+            uint8x16x4_t _bgra;
+            _bgra.val[0] = _rgb.val[2];
+            _bgra.val[1] = _rgb.val[1];
+            _bgra.val[2] = _rgb.val[0];
+            _bgra.val[3] = alpha;
+            Store4<align>(bgra, _bgra);
+        }
+
+        template <bool align> void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            uint8x16_t _alpha = vdupq_n_u8(alpha);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0, colBgra = 0, colRgb = 0; col < alignedWidth; col += A, colBgra += A4, colRgb += A3)
+                    RgbToBgra<align>(rgb + colRgb, bgra + colBgra, _alpha);
+                if (width != alignedWidth)
+                    RgbToBgra<false>(rgb + 3 * (width - A), bgra + 4 * (width - A), _alpha);
+                rgb += rgbStride;
+                bgra += bgraStride;
+            }
+        }
+
+        void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride))
+                RgbToBgra<true>(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+            else
+                RgbToBgra<false>(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+        }
     }
 #endif// SIMD_NEON_ENABLE
 }
diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp
old mode 100644
new mode 100755
index 0b9fdeaedf..57cf19f18d
--- a/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -30,29 +30,31 @@ namespace Simd
 #ifdef SIMD_NEON_ENABLE    
     namespace Neon
     {
-        SIMD_INLINE uint8x8_t BgrToGray(uint8x8x3_t bgr)
+        SIMD_INLINE uint8x16_t BgrToGray(uint8x16x3_t bgr)
         {
-            return vmovn_u16(BgrToGray(vmovl_u8(bgr.val[0]), vmovl_u8(bgr.val[1]), vmovl_u8(bgr.val[2])));
+            uint8x8_t lo = vmovn_u16(BgrToGray(UnpackU8<0>(bgr.val[0]), UnpackU8<0>(bgr.val[1]), UnpackU8<0>(bgr.val[2])));
+            uint8x8_t hi = vmovn_u16(BgrToGray(UnpackU8<1>(bgr.val[0]), UnpackU8<1>(bgr.val[1]), UnpackU8<1>(bgr.val[2])));
+            return vcombine_u8(lo, hi);
         }
 
-        template <bool align> void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride)
+        template <bool align> void BgrToGray(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* gray, size_t grayStride)
         {
-            assert(width >= HA);
+            assert(width >= A);
             if (align)
                 assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride));
 
-            size_t alignedWidth = AlignLo(width, HA);
+            size_t alignedWidth = AlignLo(width, A);
             for (size_t row = 0; row < height; ++row)
             {
-                for (size_t col = 0; col < alignedWidth; col += HA)
+                for (size_t col = 0; col < alignedWidth; col += A)
                 {
-                    uint8x8x3_t _bgr = LoadHalf3<align>(bgr + 3 * col);
+                    uint8x16x3_t _bgr = Load3<align>(bgr + 3 * col);
                     Store<align>(gray + col, BgrToGray(_bgr));
                 }
                 if (alignedWidth != width)
                 {
-                    uint8x8x3_t _bgr = LoadHalf3<false>(bgr + 3 * (width - HA));
-                    Store<false>(gray + width - HA, BgrToGray(_bgr));
+                    uint8x16x3_t _bgr = Load3<false>(bgr + 3 * (width - A));
+                    Store<false>(gray + width - A, BgrToGray(_bgr));
                 }
                 bgr += bgrStride;
                 gray += grayStride;
@@ -66,6 +68,47 @@ namespace Simd
             else
                 BgrToGray<false>(bgr, width, height, bgrStride, gray, grayStride);
         }
+
+        //---------------------------------------------------------------------
+
+        SIMD_INLINE uint8x16_t RgbToGray(uint8x16x3_t rgb)
+        {
+            uint8x8_t lo = vmovn_u16(BgrToGray(UnpackU8<0>(rgb.val[2]), UnpackU8<0>(rgb.val[1]), UnpackU8<0>(rgb.val[0])));
+            uint8x8_t hi = vmovn_u16(BgrToGray(UnpackU8<1>(rgb.val[2]), UnpackU8<1>(rgb.val[1]), UnpackU8<1>(rgb.val[0])));
+            return vcombine_u8(lo, hi);
+        }
+
+        template <bool align> void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(rgb) && Aligned(rgbStride) && Aligned(gray) && Aligned(grayStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                {
+                    uint8x16x3_t _rgb = Load3<align>(rgb + 3 * col);
+                    Store<align>(gray + col, RgbToGray(_rgb));
+                }
+                if (alignedWidth != width)
+                {
+                    uint8x16x3_t _rgb = Load3<false>(rgb + 3 * (width - A));
+                    Store<false>(gray + width - A, RgbToGray(_rgb));
+                }
+                rgb += rgbStride;
+                gray += grayStride;
+            }
+        }
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+        {
+            if (Aligned(rgb) && Aligned(gray) && Aligned(rgbStride) && Aligned(grayStride))
+                RgbToGray<true>(rgb, width, height, rgbStride, gray, grayStride);
+            else
+                RgbToGray<false>(rgb, width, height, rgbStride, gray, grayStride);
+        }
     }
 #endif// SIMD_NEON_ENABLE
 }
diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp
old mode 100644
new mode 100755
index fb69a04b5f..b1e69cc3aa
--- a/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -38,7 +38,7 @@ namespace Simd
             Store3<align>(rgb, _bgr);
         }
 
-        template <bool align> void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
+        template <bool align> void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride)
         {
             assert(width >= A);
             if (align)
@@ -59,12 +59,12 @@ namespace Simd
             }
         }
 
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
+        void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride)
         {
             if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride))
-                BgrToRgb<true>(bgr, bgrStride, width, height, rgb, rgbStride);
+                BgrToRgb<true>(bgr, width, height, bgrStride, rgb, rgbStride);
             else
-                BgrToRgb<false>(bgr, bgrStride, width, height, rgb, rgbStride);
+                BgrToRgb<false>(bgr, width, height, bgrStride, rgb, rgbStride);
         }
     }
 #endif//SIMD_NEON_ENABLE
diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp
deleted file mode 100644
index b2950c7da1..0000000000
--- a/3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE  
-    namespace Neon
-    {
-        const size_t A3 = A * 3;
-        const size_t A4 = A * 4;
-
-        union Bgra
-        {
-            uint8x16x4_t bgra;
-            uint8x16x3_t bgr;
-        };
-
-        template <bool align> SIMD_INLINE void BgrToRgba(const uint8_t * bgr, uint8_t * rgba, Bgra & _bgra)
-        {
-            _bgra.bgr = Load3<align>(bgr);
-            uint8x16_t tmp = _bgra.bgr.val[0];
-            _bgra.bgr.val[0] = _bgra.bgr.val[2];
-            _bgra.bgr.val[2] = tmp;
-            Store4<align>(rgba, _bgra.bgra);
-        }
-
-        template <bool align> void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            Bgra _bgra;
-            _bgra.bgra.val[3] = vdupq_n_u8(alpha);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0, colRgba = 0, colBgr = 0; col < alignedWidth; col += A, colRgba += A4, colBgr += A3)
-                    BgrToRgba<align>(bgr + colBgr, rgba + colRgba, _bgra);
-                if (width != alignedWidth)
-                    BgrToRgba<false>(bgr + 3 * (width - A), rgba + 4 * (width - A), _bgra);
-                bgr += bgrStride;
-                rgba += rgbaStride;
-            }
-        }
-
-        void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha)
-        {
-            if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgrToRgba<true>(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-            else
-                BgrToRgba<false>(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp
old mode 100644
new mode 100755
index f95e1a9118..944fe5b45e
--- a/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -66,6 +66,87 @@ namespace Simd
             else
                 BgraToBgr<false>(bgra, width, height, bgraStride, bgr, bgrStride);
         }
+
+        //---------------------------------------------------------------------
+
+        template <bool align> SIMD_INLINE void BgraToRgb(const uint8_t* bgra, uint8_t* rgb)
+        {
+            uint8x16x4_t _bgra = Load4<align>(bgra);
+            uint8x16x3_t _rgb;
+            _rgb.val[0] = _bgra.val[2];
+            _rgb.val[1] = _bgra.val[1];
+            _rgb.val[2] = _bgra.val[0];
+            Store3<align>(rgb, _rgb);
+        }
+
+        template <bool align> void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            if (width == alignedWidth)
+                alignedWidth -= A;
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0, colBgra = 0, colRgb = 0; col < alignedWidth; col += A, colBgra += A4, colRgb += A3)
+                    BgraToRgb<align>(bgra + colBgra, rgb + colRgb);
+                if (width != alignedWidth)
+                    BgraToRgb<false>(bgra + 4 * (width - A), rgb + 3 * (width - A));
+                bgra += bgraStride;
+                rgb += rgbStride;
+            }
+        }
+
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride))
+                BgraToRgb<true>(bgra, width, height, bgraStride, rgb, rgbStride);
+            else
+                BgraToRgb<false>(bgra, width, height, bgraStride, rgb, rgbStride);
+        }
+
+        //---------------------------------------------------------------------
+
+        template <bool align> SIMD_INLINE void BgraToRgba(const uint8_t* bgra, uint8_t* rgba)
+        {
+            uint8x16x4_t _bgra = Load4<align>(bgra);
+            uint8x16_t tmp = _bgra.val[0];
+            _bgra.val[0] = _bgra.val[2];
+            _bgra.val[2] = tmp;
+            Store4<align>(rgba, _bgra);
+        }
+
+        template <bool align> void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            if (width == alignedWidth)
+                alignedWidth -= A;
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0, colBgra = 0, colRgba = 0; col < alignedWidth; col += A, colBgra += A4, colRgba += A4)
+                    BgraToRgba<align>(bgra + colBgra, rgba + colRgba);
+                if (width != alignedWidth)
+                    BgraToRgba<false>(bgra + 4 * (width - A), rgba + 4 * (width - A));
+                bgra += bgraStride;
+                rgba += rgbaStride;
+            }
+        }
+
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride))
+                BgraToRgba<true>(bgra, width, height, bgraStride, rgba, rgbaStride);
+            else
+                BgraToRgba<false>(bgra, width, height, bgraStride, rgba, rgbaStride);
+        }
     }
 #endif// SIMD_NEON_ENABLE
 }
diff --git a/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp
old mode 100644
new mode 100755
index 24fc228560..6b2eb4de48
--- a/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -66,6 +66,45 @@ namespace Simd
             else
                 BgraToGray<false>(bgra, width, height, bgraStride, gray, grayStride);
         }
+
+        //---------------------------------------------------------------------
+
+        SIMD_INLINE uint8x8_t RgbaToGray(uint8x8x4_t rgba)
+        {
+            return vmovn_u16(BgrToGray(vmovl_u8(rgba.val[2]), vmovl_u8(rgba.val[1]), vmovl_u8(rgba.val[0])));
+        }
+
+        template <bool align> void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+        {
+            assert(width >= HA);
+            if (align)
+                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride));
+
+            size_t alignedWidth = AlignLo(width, HA);
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += HA)
+                {
+                    uint8x8x4_t _rgba = LoadHalf4<align>(rgba + 4 * col);
+                    Store<align>(gray + col, RgbaToGray(_rgba));
+                }
+                if (alignedWidth != width)
+                {
+                    uint8x8x4_t _rgba = LoadHalf4<false>(rgba + 4 * (width - HA));
+                    Store<false>(gray + width - HA, RgbaToGray(_rgba));
+                }
+                rgba += rgbaStride;
+                gray += grayStride;
+            }
+        }
+
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+        {
+            if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride))
+                RgbaToGray<true>(rgba, width, height, rgbaStride, gray, grayStride);
+            else
+                RgbaToGray<false>(rgba, width, height, rgbaStride, gray, grayStride);
+        }
     }
 #endif// SIMD_NEON_ENABLE
 }
diff --git a/3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp b/3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp
deleted file mode 100644
index d1873eddcb..0000000000
--- a/3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE  
-    namespace Neon
-    {
-        const size_t A4 = A * 4;
-
-        union Bgra
-        {
-            uint8x16x4_t bgra;
-        };
-
-        template <bool align> SIMD_INLINE void BgraToRgba(const uint8_t * bgra, uint8_t * rgba, Bgra & _bgra)
-        {
-            _bgra.bgra = Load4<align>(bgra);
-            uint8x16_t tmp = _bgra.bgra.val[0];
-            _bgra.bgra.val[0] = _bgra.bgra.val[2];
-            _bgra.bgra.val[2] = tmp;
-            Store4<align>(rgba, _bgra.bgra);
-        }
-
-        template <bool align> void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            Bgra _bgra;
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0, colRgba = 0; col < alignedWidth; col += A, colRgba += A4)
-                    BgraToRgba<align>(bgra + colRgba, rgba + colRgba, _bgra);
-                if (width != alignedWidth)
-                    BgraToRgba<false>(bgra + 4 * (width - A), rgba + 4 * (width - A), _bgra);
-                bgra += bgraStride;
-                rgba += rgbaStride;
-            }
-        }
-
-        void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride)
-        {
-            if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride))
-                BgraToRgba<true>(bgra, width, height, bgraStride, rgba, rgbaStride);
-            else
-                BgraToRgba<false>(bgra, width, height, bgraStride, rgba, rgbaStride);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp b/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp
old mode 100644
new mode 100755
index 53530a788d..36a623efb5
--- a/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -71,6 +71,8 @@ namespace Simd
                 DeinterleaveUv<false>(uv, uvStride, width, height, u, uStride, v, vStride);
         }
 
+        //---------------------------------------------------------------------
+
         template <bool align> void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height,
             uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride)
         {
@@ -118,6 +120,8 @@ namespace Simd
                 DeinterleaveBgr<false>(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride);
         }
 
+        //---------------------------------------------------------------------
+
         template <bool align> void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height,
             uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride)
         {
@@ -125,36 +129,65 @@ namespace Simd
             if (align)
             {
                 assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride));
-                assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride));
+                assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL));
             }
 
             size_t bodyWidth = AlignLo(width, A);
             size_t tail = width - bodyWidth;
-            for (size_t row = 0; row < height; ++row)
+            if (a)
             {
-                for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += QA)
+                for (size_t row = 0; row < height; ++row)
                 {
-                    uint8x16x4_t _bgra = Load4<align>(bgra + offset);
-                    Store<align>(b + col, _bgra.val[0]);
-                    Store<align>(g + col, _bgra.val[1]);
-                    Store<align>(r + col, _bgra.val[2]);
-                    Store<align>(a + col, _bgra.val[3]);
+                    for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += QA)
+                    {
+                        uint8x16x4_t _bgra = Load4<align>(bgra + offset);
+                        Store<align>(b + col, _bgra.val[0]);
+                        Store<align>(g + col, _bgra.val[1]);
+                        Store<align>(r + col, _bgra.val[2]);
+                        Store<align>(a + col, _bgra.val[3]);
+                    }
+                    if (tail)
+                    {
+                        size_t col = width - A;
+                        size_t offset = 4 * col;
+                        uint8x16x4_t _bgra = Load4<false>(bgra + offset);
+                        Store<false>(b + col, _bgra.val[0]);
+                        Store<false>(g + col, _bgra.val[1]);
+                        Store<false>(r + col, _bgra.val[2]);
+                        Store<false>(a + col, _bgra.val[3]);
+                    }
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
+                    a += aStride;
                 }
-                if (tail)
+            }
+            else
+            {
+                for (size_t row = 0; row < height; ++row)
                 {
-                    size_t col = width - A;
-                    size_t offset = 4 * col;
-                    uint8x16x4_t _bgra = Load4<false>(bgra + offset);
-                    Store<false>(b + col, _bgra.val[0]);
-                    Store<false>(g + col, _bgra.val[1]);
-                    Store<false>(r + col, _bgra.val[2]);
-                    Store<false>(a + col, _bgra.val[3]);
+                    for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += QA)
+                    {
+                        uint8x16x4_t _bgra = Load4<align>(bgra + offset);
+                        Store<align>(b + col, _bgra.val[0]);
+                        Store<align>(g + col, _bgra.val[1]);
+                        Store<align>(r + col, _bgra.val[2]);
+                    }
+                    if (tail)
+                    {
+                        size_t col = width - A;
+                        size_t offset = 4 * col;
+                        uint8x16x4_t _bgra = Load4<false>(bgra + offset);
+                        Store<false>(b + col, _bgra.val[0]);
+                        Store<false>(g + col, _bgra.val[1]);
+                        Store<false>(r + col, _bgra.val[2]);
+                    }
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
                 }
-                bgra += bgraStride;
-                b += bStride;
-                g += gStride;
-                r += rStride;
-                a += aStride;
             }
         }
 
@@ -162,7 +195,7 @@ namespace Simd
             uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride)
         {
             if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) &&
-                Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride))
+                Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL))
                 DeinterleaveBgra<true>(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
             else
                 DeinterleaveBgra<false>(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
diff --git a/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp b/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp
old mode 100644
new mode 100755
index 752778be2a..1d63a6510b
--- a/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp
@@ -22,6 +22,7 @@
 * SOFTWARE.
 */
 #include "Simd/SimdMemory.h"
+#include "Simd/SimdLoadBlock.h"
 #include "Simd/SimdStore.h"
 #include "Simd/SimdGaussianBlur.h"
 #include "Simd/SimdLog.h"
diff --git a/3rdparty/simdlib/Simd/SimdNeonResizer.cpp b/3rdparty/simdlib/Simd/SimdNeonResizer.cpp
old mode 100644
new mode 100755
index b2e965200e..d11a0e29a8
--- a/3rdparty/simdlib/Simd/SimdNeonResizer.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonResizer.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -578,11 +578,11 @@ namespace Simd
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
         {
             ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(float32x4_t));
-            if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && dstX >= A)
+            if (param.IsByteBilinear() && dstX >= A)
                 return new ResizerByteBilinear(param);
-            else if (type == SimdResizeChannelByte && method == SimdResizeMethodArea)
+            else if (param.IsByteArea())
                 return new ResizerByteArea(param);
-            else if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp))
+            else if (param.IsFloatBilinear())
                 return new ResizerFloatBilinear(param);
             else
                 return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
diff --git a/3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp
deleted file mode 100644
index 37b288b277..0000000000
--- a/3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdConversion.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE    
-    namespace Neon
-    {
-        SIMD_INLINE uint8x8_t RgbToGray(uint8x8x3_t rgb)
-        {
-            return vmovn_u16(BgrToGray(vmovl_u8(rgb.val[2]), vmovl_u8(rgb.val[1]), vmovl_u8(rgb.val[0])));
-        }
-
-        template <bool align> void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride)
-        {
-            assert(width >= HA);
-            if (align)
-                assert(Aligned(rgb) && Aligned(rgbStride) && Aligned(gray) && Aligned(grayStride));
-
-            size_t alignedWidth = AlignLo(width, HA);
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += HA)
-                {
-                    uint8x8x3_t _rgb = LoadHalf3<align>(rgb + 3 * col);
-                    Store<align>(gray + col, RgbToGray(_rgb));
-                }
-                if (alignedWidth != width)
-                {
-                    uint8x8x3_t _rgb = LoadHalf3<false>(rgb + 3 * (width - HA));
-                    Store<false>(gray + width - HA, RgbToGray(_rgb));
-                }
-                rgb += rgbStride;
-                gray += grayStride;
-            }
-        }
-
-        void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride)
-        {
-            if (Aligned(rgb) && Aligned(gray) && Aligned(rgbStride) && Aligned(grayStride))
-                RgbToGray<true>(rgb, width, height, rgbStride, gray, grayStride);
-            else
-                RgbToGray<false>(rgb, width, height, rgbStride, gray, grayStride);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp
deleted file mode 100644
index 377d6fcb42..0000000000
--- a/3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdConversion.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE
-    namespace Neon
-    {
-        SIMD_INLINE uint8x8_t RgbaToGray(uint8x8x4_t rgba)
-        {
-            return vmovn_u16(BgrToGray(vmovl_u8(rgba.val[2]), vmovl_u8(rgba.val[1]), vmovl_u8(rgba.val[0])));
-        }
-
-        template <bool align> void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride)
-        {
-            assert(width >= HA);
-            if (align)
-                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride));
-
-            size_t alignedWidth = AlignLo(width, HA);
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += HA)
-                {
-                    uint8x8x4_t _rgba = LoadHalf4<align>(rgba + 4 * col);
-                    Store<align>(gray + col, RgbaToGray(_rgba));
-                }
-                if (alignedWidth != width)
-                {
-                    uint8x8x4_t _rgba = LoadHalf4<false>(rgba + 4 * (width - HA));
-                    Store<false>(gray + width - HA, RgbaToGray(_rgba));
-                }
-                rgba += rgbaStride;
-                gray += grayStride;
-            }
-        }
-
-        void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride)
-        {
-            if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride))
-                RgbaToGray<true>(rgba, width, height, rgbaStride, gray, grayStride);
-            else
-                RgbaToGray<false>(rgba, width, height, rgbaStride, gray, grayStride);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdPixel.hpp b/3rdparty/simdlib/Simd/SimdPixel.hpp
old mode 100644
new mode 100755
index 109c18ec1d..f95ce46ee6
--- a/3rdparty/simdlib/Simd/SimdPixel.hpp
+++ b/3rdparty/simdlib/Simd/SimdPixel.hpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -41,6 +41,7 @@ namespace Simd
         struct Hsv24;
         struct Hsl24;
         struct Rgb24;
+        struct Rgba32;
 
         //-------------------------------------------------------------------------
 
@@ -86,6 +87,13 @@ namespace Simd
             */
             Bgr24(const Rgb24 & p);
 
+            /*!
+                Creates a new 24-bit BGR pixel structure on the base of 32-bit RGBA pixel.
+
+                \param [in] p - 32-bit RGBA pixel.
+            */
+            Bgr24(const Rgba32& p);
+
             /*!
                 Creates a copy of 24-bit BGR pixel structure.
 
@@ -165,6 +173,13 @@ namespace Simd
             */
             Bgra32(const Rgb24 & p, const uint8_t & a = uint8_t(255));
 
+            /*!
+                Creates a new 32-bit BGRA pixel structure on the base of 32-bit RGBA pixel.
+
+                \param [in] p - 32-bit RGBA pixel.
+            */
+            Bgra32(const Rgba32& p);
+
             /*!
                 Creates a copy of 32-bit BGRA pixel structure.
 
@@ -360,6 +375,13 @@ namespace Simd
             */
             Rgb24(const Bgr24 & p);
 
+            /*!
+                Creates a new 24-bit RGB pixel structure on the base of 32-bit RGBA pixel.
+
+                \param [in] p - 32-bit RGBA pixel.
+            */
+            Rgb24(const Rgba32& p);
+
             /*!
                 Creates a copy of 24-bit RGB pixel structure.
 
@@ -392,6 +414,92 @@ namespace Simd
             template <template<class> class A> static Rgb24 & At(View<A> & view, ptrdiff_t col, ptrdiff_t row);
         };
 
+        /*! @ingroup cpp_pixels
+
+            \short 32-bit RGBA pixel.
+
+            Provides manipulation of 32-bit RGBA (Red, Blue, Green, Alpha) pixels of the View struct.
+        */
+        struct Rgba32
+        {
+            uint8_t red; /*!< \brief 8-bit red channel 32-bit BGRA pixel. */
+            uint8_t green; /*!< \brief 8-bit green channel 32-bit BGRA pixel. */
+            uint8_t blue; /*!< \brief 8-bit blue channel 32-bit BGRA pixel. */
+            uint8_t alpha; /*!< \brief 8-bit alpha channel 32-bit RGBA pixel. */
+
+            /*!
+                Creates a new 32-bit RGBA pixel structure with specified channel values.
+
+                \param [in] gray - initial value for all channels. It is equal to 0 by default.
+                \param [in] a - initial value for alpha channel. It is equal to 255 by default.
+            */
+            Rgba32(const uint8_t& gray = uint8_t(0), const uint8_t& a = uint8_t(255));
+
+            /*!
+                Creates a new 32-bit RGBA pixel structure with specified channel values.
+
+                \param [in] r - initial value for red channel.
+                \param [in] g - initial value for green channel.
+                \param [in] b - initial value for blue channel.
+                \param [in] a - initial value for alpha channel. It is equal to 255 by default.
+            */
+            Rgba32(const uint8_t& r, const uint8_t& g, const uint8_t& b, const uint8_t& a = uint8_t(255));
+
+            /*!
+                Creates a new 32-bit RGBA pixel structure on the base of 32-bit BGRA pixel.
+
+                \param [in] p - 32-bit BGRA pixel.
+            */
+            Rgba32(const Bgra32& p);
+
+            /*!
+                Creates a new 32-bit RGBA pixel structure on the base of 24-bit BGR pixel.
+
+                \param [in] p - 24-bit BGR pixel.
+                \param [in] a - initial value for alpha channel. It is equal to 255 by default.
+            */
+            Rgba32(const Bgr24& p, const uint8_t& a = uint8_t(255));
+
+            /*!
+                Creates a new of 32-bit RGBA pixel structure on the base of 24-bit RGB pixel.
+
+                \param [in] p - 24-bit RGB pixel.
+                \param [in] a - initial value for alpha channel. It is equal to 255 by default.
+            */
+            Rgba32(const Rgb24& p, const uint8_t& a = uint8_t(255));
+
+            /*!
+                Creates a copy of 32-bit RGBA pixel structure.
+
+                \param [in] p - 32-bit RGBA pixel.
+            */
+            Rgba32(const Rgba32& p);
+
+            /*!
+                \fn template <template<class> class A> static const Rgba32 & At(const View<A> & view, ptrdiff_t col, ptrdiff_t row);
+
+                Gets constant reference to the pixel with specific coordinates at the image view.
+
+                \param [in] view - an image view of 32-bit RGBA pixel format.
+                \param [in] col - x-coordinate of the pixel.
+                \param [in] row - y-coordinate of the pixel.
+                \return a constant reference to the pixel.
+            */
+            template <template<class> class A> static const Rgba32& At(const View<A>& view, ptrdiff_t col, ptrdiff_t row);
+
+            /*!
+                \fn template <template<class> class A> static Rgba32 & At(View<A> & view, ptrdiff_t col, ptrdiff_t row);
+
+                Gets reference to the pixel with specific coordinates at the image view.
+
+                \param [in] view - an image view of 32-bit RGBA pixel format.
+                \param [in] col - x-coordinate of the pixel.
+                \param [in] row - y-coordinate of the pixel.
+                \return a reference to the pixel.
+            */
+            template <template<class> class A> static Rgba32& At(View<A>& view, ptrdiff_t col, ptrdiff_t row);
+        };
+
         //-------------------------------------------------------------------------
 
         // struct Bgr24 implementation:
@@ -417,14 +525,21 @@ namespace Simd
         {
         }
 
-        SIMD_INLINE Bgr24::Bgr24(const Bgr24 & p)
+        SIMD_INLINE Bgr24::Bgr24(const Rgb24 & p)
             : blue(p.blue)
             , green(p.green)
             , red(p.red)
         {
         }
 
-        SIMD_INLINE Bgr24::Bgr24(const Rgb24 & p)
+        SIMD_INLINE Bgr24::Bgr24(const Rgba32& p)
+            : blue(p.blue)
+            , green(p.green)
+            , red(p.red)
+        {
+        }
+
+        SIMD_INLINE Bgr24::Bgr24(const Bgr24 & p)
             : blue(p.blue)
             , green(p.green)
             , red(p.red)
@@ -479,6 +594,14 @@ namespace Simd
         {
         }
 
+        SIMD_INLINE Bgra32::Bgra32(const Rgba32& p)
+            : blue(p.blue)
+            , green(p.green)
+            , red(p.red)
+            , alpha(p.alpha)
+        {
+        }
+
         SIMD_INLINE Bgra32::Bgra32(const Bgra32 & p)
             : blue(p.blue)
             , green(p.green)
@@ -605,6 +728,13 @@ namespace Simd
         {
         }
 
+        SIMD_INLINE Rgb24::Rgb24(const Rgba32& p)
+            : red(p.red)
+            , green(p.green)
+            , blue(p.blue)
+        {
+        }
+
         SIMD_INLINE Rgb24::Rgb24(const Rgb24 & p)
             : red(p.red)
             , green(p.green)
@@ -625,6 +755,70 @@ namespace Simd
 
             return Simd::At<A, Rgb24>(view, col, row);
         }
+
+        // struct Rgba32 implementation:
+
+        SIMD_INLINE Rgba32::Rgba32(const uint8_t& gray, const uint8_t& a)
+            : red(gray)
+            , green(gray)
+            , blue(gray)
+            , alpha(a)
+        {
+        }
+
+        SIMD_INLINE Rgba32::Rgba32(const uint8_t& r, const uint8_t& g, const uint8_t& b, const uint8_t& a)
+            : red(r)
+            , green(g)
+            , blue(b)
+            , alpha(a)
+        {
+        }
+
+        SIMD_INLINE Rgba32::Rgba32(const Bgra32& p)
+            : red(p.red)
+            , green(p.green)
+            , blue(p.blue)
+            , alpha(p.alpha)
+        {
+        }        
+        
+        SIMD_INLINE Rgba32::Rgba32(const Bgr24& p, const uint8_t& a)
+            : red(p.red)
+            , green(p.green)
+            , blue(p.blue)
+            , alpha(a)
+        {
+        }
+
+        SIMD_INLINE Rgba32::Rgba32(const Rgb24& p, const uint8_t& a)
+            : red(p.red)
+            , green(p.green)
+            , blue(p.blue)
+            , alpha(a)
+        {
+        }
+
+        SIMD_INLINE Rgba32::Rgba32(const Rgba32& p)
+            : red(p.red)
+            , green(p.green)
+            , blue(p.blue)
+            , alpha(p.alpha)
+        {
+        }
+
+        template <template<class> class A> SIMD_INLINE const Rgba32& Rgba32::At(const View<A>& view, ptrdiff_t col, ptrdiff_t row)
+        {
+            assert(view.format == View<A>::Rgba32);
+
+            return Simd::At<A, Rgba32>(view, col, row);
+        }
+
+        template <template<class> class A> SIMD_INLINE Rgba32& Rgba32::At(View<A>& view, ptrdiff_t col, ptrdiff_t row)
+        {
+            assert(view.format == View<A>::Rgba32);
+
+            return Simd::At<A, Rgba32>(view, col, row);
+        }
     }
 }
 
diff --git a/3rdparty/simdlib/Simd/SimdPow.h b/3rdparty/simdlib/Simd/SimdPow.h
old mode 100644
new mode 100755
index 309e3104f0..ca0db18eb5
--- a/3rdparty/simdlib/Simd/SimdPow.h
+++ b/3rdparty/simdlib/Simd/SimdPow.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
diff --git a/3rdparty/simdlib/Simd/SimdResizer.h b/3rdparty/simdlib/Simd/SimdResizer.h
old mode 100644
new mode 100755
index 0a70ee0ad6..15dacfcd0c
--- a/3rdparty/simdlib/Simd/SimdResizer.h
+++ b/3rdparty/simdlib/Simd/SimdResizer.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -46,6 +46,43 @@ namespace Simd
             this->channels = channels;
             this->align = align;
         }
+
+        bool IsByteBilinear() const
+        {
+            return type == SimdResizeChannelByte && method == SimdResizeMethodBilinear;
+        }
+
+        bool IsByteArea() const
+        {
+            return type == SimdResizeChannelByte && method == SimdResizeMethodArea;
+        }
+
+        bool IsShortBilinear() const
+        {
+            return type == SimdResizeChannelShort && method == SimdResizeMethodBilinear;
+        }
+
+        bool IsFloatBilinear() const
+        {
+            return type == SimdResizeChannelFloat && 
+                (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp || method == SimdResizeMethodInferenceEngineInterp);
+        }
+
+        bool IsNearest() const
+        {
+            return method == SimdResizeMethodNearest;
+        }
+
+        size_t ChannelSize() const
+        {
+            static const size_t sizes[3] = { 1, 2, 4 };
+            return sizes[(int)type];
+        }
+
+        size_t PixelSize() const
+        {
+            return ChannelSize() * channels;
+        }
     };
 
     class Resizer : Deletable
@@ -94,13 +131,32 @@ namespace Simd
             virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
         };
 
+        class ResizerShortBilinear : public Resizer
+        {
+        protected:
+            Array32i _ix, _iy;
+            Array32f _ax, _ay, _bx[2];
+
+            void EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t* indices, float* alphas);
+
+            template<size_t N> void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+            template<size_t N> void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+
+            virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+
+        public:
+            ResizerShortBilinear(const ResParam& param);
+
+            virtual void Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride);
+        };
+
         class ResizerFloatBilinear : public Resizer
         {
         protected:
             Array32i _ix, _iy;
             Array32f _ax, _ay, _bx[2];
 
-            void EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, bool caffeInterp, int32_t * indices, float * alphas);
+            void EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t * indices, float * alphas);
 
             virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride);
 
@@ -110,22 +166,23 @@ namespace Simd
             virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
         };
 
-        void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method);
-    }
-
-#ifdef SIMD_SSE_ENABLE    
-    namespace Sse
-    {
-        class ResizerFloatBilinear : public Base::ResizerFloatBilinear
+        class ResizerNearest : public Resizer
         {
-            virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride);
+            void Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride);
+            template<size_t N> void Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride);
+        protected:
+            size_t _pixelSize;
+            Array32i _ix, _iy;
+
+            void EstimateIndex(size_t srcSize, size_t dstSize, size_t pixelSize, int32_t* indices);
         public:
-            ResizerFloatBilinear(const ResParam & param);
-        };
+            ResizerNearest(const ResParam& param);
 
+            virtual void Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride);
+        };        
+        
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method);
     }
-#endif //SIMD_SSE_ENABLE 
 
 #ifdef SIMD_SSE2_ENABLE    
     namespace Sse2
@@ -156,12 +213,19 @@ namespace Simd
             virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
         };
 
+        class ResizerFloatBilinear : public Base::ResizerFloatBilinear
+        {
+            virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride);
+        public:
+            ResizerFloatBilinear(const ResParam & param);
+        };
+
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method);
     }
 #endif //SIMD_SSE2_ENABLE 
 
-#ifdef SIMD_SSSE3_ENABLE    
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
     {
         class ResizerByteBilinear : public Sse2::ResizerByteBilinear
         {
@@ -183,15 +247,8 @@ namespace Simd
             ResizerByteBilinear(const ResParam & param);
 
             virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
-        };
-
-        void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method);
-    }
-#endif //SIMD_SSSE3_ENABLE 
-
-#ifdef SIMD_SSE41_ENABLE    
-    namespace Sse41
-    {
+        };        
+        
         class ResizerByteArea : public Sse2::ResizerByteArea
         {
         protected:
@@ -202,6 +259,17 @@ namespace Simd
             virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
         };
 
+        class ResizerShortBilinear : public Base::ResizerShortBilinear
+        {
+        protected:
+            template<size_t N> void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+            template<size_t N> void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+
+            virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+        public:
+            ResizerShortBilinear(const ResParam& param);
+        };
+
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method);
     }
 #endif //SIMD_SSE41_ENABLE
@@ -223,15 +291,7 @@ namespace Simd
 #ifdef SIMD_AVX2_ENABLE    
     namespace Avx2
     {
-        template <class Idx> SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t * src, const Idx & index, const uint8_t * alpha, uint8_t * dst)
-        {
-            __m256i _src = _mm256_loadu_si256((__m256i*)(src + index.src));
-            __m256i _shuffle = _mm256_loadu_si256((__m256i*)&index.shuffle);
-            __m256i _alpha = _mm256_loadu_si256((__m256i*)(alpha + index.dst));
-            _mm256_storeu_si256((__m256i*)(dst + index.dst), _mm256_maddubs_epi16(Avx2::Shuffle(_src, _shuffle), _alpha));
-        }
-
-        class ResizerByteBilinear : public Ssse3::ResizerByteBilinear
+        class ResizerByteBilinear : public Sse41::ResizerByteBilinear
         {
         protected:
             struct Idx
@@ -260,6 +320,17 @@ namespace Simd
             virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
         };
 
+        class ResizerShortBilinear : public Sse41::ResizerShortBilinear
+        {
+        protected:
+            template<size_t N> void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+            template<size_t N> void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+
+            virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+        public:
+            ResizerShortBilinear(const ResParam& param);
+        };
+
         class ResizerFloatBilinear : public Base::ResizerFloatBilinear
         {
             virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride);
@@ -308,6 +379,17 @@ namespace Simd
             virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
         };
 
+        class ResizerShortBilinear : public Base::ResizerShortBilinear
+        {
+        protected:
+            template<size_t N> void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+            template<size_t N> void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+
+            virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+        public:
+            ResizerShortBilinear(const ResParam& param);
+        };
+
         class ResizerFloatBilinear : public Base::ResizerFloatBilinear
         {
             virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride);
diff --git a/3rdparty/simdlib/Simd/SimdResizerCommon.h b/3rdparty/simdlib/Simd/SimdResizerCommon.h
new file mode 100755
index 0000000000..3e6ab00ffa
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdResizerCommon.h
@@ -0,0 +1,97 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdResizerCommon_h__
+#define __SimdResizerCommon_h__
+
+#include "Simd/SimdLoad.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        const __m128i RSB_1_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x4, 0x5, -1, -1, 0x8, 0x9, -1, -1, 0xC, 0xD, -1, -1);
+        const __m128i RSB_1_1 = SIMD_MM_SETR_EPI8(0x2, 0x3, -1, -1, 0x6, 0x7, -1, -1, 0xA, 0xB, -1, -1, 0xE, 0xF, -1, -1);
+
+        SIMD_INLINE __m128 BilColS1(const uint16_t* src, const int32_t* idx, __m128 fx0, __m128 fx1)
+        {
+            __m128i s = _mm_setr_epi32(
+                *(uint32_t*)(src + idx[0]), *(uint32_t*)(src + idx[1]),
+                *(uint32_t*)(src + idx[2]), *(uint32_t*)(src + idx[3]));
+            __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_1_0)));
+            __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_1_1)));
+            return _mm_add_ps(m0, m1);
+        }
+
+        const __m128i RSB_2_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x2, 0x3, -1, -1, 0x8, 0x9, -1, -1, 0xA, 0xB, -1, -1);
+        const __m128i RSB_2_1 = SIMD_MM_SETR_EPI8(0x4, 0x5, -1, -1, 0x6, 0x7, -1, -1, 0xC, 0xD, -1, -1, 0xE, 0xF, -1, -1);
+
+        SIMD_INLINE __m128 BilColS2(const uint16_t* src, const int32_t* idx, __m128 fx0, __m128 fx1)
+        {
+            __m128i s = Sse2::Load((__m128i*)(src + idx[0]), (__m128i*)(src + idx[2]));
+            __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_2_0)));
+            __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_2_1)));
+            return _mm_add_ps(m0, m1);
+        }
+
+        const __m128i RSB_3_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x2, 0x3, -1, -1, 0x4, 0x5, -1, -1, -1, -1, -1, -1);
+        const __m128i RSB_3_1 = SIMD_MM_SETR_EPI8(0x6, 0x7, -1, -1, 0x8, 0x9, -1, -1, 0xA, 0xB, -1, -1, -1, -1, -1, -1);
+
+        SIMD_INLINE __m128 BilColS3(const uint16_t* src, __m128 fx0, __m128 fx1)
+        {
+            __m128i s = _mm_loadu_si128((__m128i*)src);
+            __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_3_0)));
+            __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_3_1)));
+            return _mm_add_ps(m0, m1);
+        }
+
+        const __m128i RSB_4_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x2, 0x3, -1, -1, 0x4, 0x5, -1, -1, 0x6, 0x7, -1, -1);
+        const __m128i RSB_4_1 = SIMD_MM_SETR_EPI8(0x8, 0x9, -1, -1, 0xA, 0xB, -1, -1, 0xC, 0xD, -1, -1, 0xE, 0xF, -1, -1);
+
+        SIMD_INLINE __m128 BilColS4(const uint16_t* src, __m128 fx0, __m128 fx1)
+        {
+            __m128i s = _mm_loadu_si128((__m128i*)src);
+            __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_4_0)));
+            __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_4_1)));
+            return _mm_add_ps(m0, m1);
+        }
+
+        const __m128i RSB_3_P = SIMD_MM_SETR_EPI8(0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, -1, -1, -1, -1);
+    }
+#endif //SIMD_SSE41_ENABLE
+
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        template <class Idx> SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t * src, const Idx & index, const uint8_t * alpha, uint8_t * dst)
+        {
+            __m256i _src = _mm256_loadu_si256((__m256i*)(src + index.src));
+            __m256i _shuffle = _mm256_loadu_si256((__m256i*)&index.shuffle);
+            __m256i _alpha = _mm256_loadu_si256((__m256i*)(alpha + index.dst));
+            _mm256_storeu_si256((__m256i*)(dst + index.dst), _mm256_maddubs_epi16(Avx2::Shuffle(_src, _shuffle), _alpha));
+        }
+    }
+#endif //SIMD_AVX2_ENABLE 
+}
+#endif//__SimdResizerCommon_h__
diff --git a/3rdparty/simdlib/Simd/SimdRuntime.h b/3rdparty/simdlib/Simd/SimdRuntime.h
old mode 100644
new mode 100755
index 5fb82ebd00..de098cdb94
--- a/3rdparty/simdlib/Simd/SimdRuntime.h
+++ b/3rdparty/simdlib/Simd/SimdRuntime.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -54,10 +54,13 @@ namespace Simd
             if (!_info.empty())
             {
                 std::sort(_candidates.begin(), _candidates.end(), [](const Candidate & a, const Candidate & b) { return a.Mean() < b.Mean(); });
-                std::cout << std::setprecision(3) << std::fixed;
                 std::cout << "Simd::Runtime " << _info << " : ";
+                int64_t f = TimeFrequency();
                 for (size_t i = 0; i < _candidates.size(); ++i)
-                    std::cout << _candidates[i].func.Name() << ": " << _candidates[i].Mean()*1000.0 << "  ";
+                {
+                    int64_t t = _candidates[i].Mean();
+                    std::cout << _candidates[i].func.Name() << ": " << t * 1000 / f << "." << (t * 1000000 / f) % 1000 << "  ";
+                }
                 std::cout << std::endl;
             }
 #endif
@@ -104,18 +107,18 @@ namespace Simd
         {
             Func func;
             size_t count;
-            double sum, min, max;
+            int64_t sum, min, max;
 
             SIMD_INLINE Candidate(const Func & f)
                 : func(f)
                 , count(0)
                 , sum(0)
-                , min(std::numeric_limits<double>::max())
-                , max(std::numeric_limits<double>::min())
+                , min(std::numeric_limits<int64_t>::max())
+                , max(0)
             {
             }
 
-            SIMD_INLINE void Update(const double & value)
+            SIMD_INLINE void Update(int64_t value)
             {
                 count += 1;
                 sum += value;
@@ -123,9 +126,14 @@ namespace Simd
                 max = std::max(max, value);
             }
 
-            SIMD_INLINE double Mean() const
+            SIMD_INLINE int64_t Mean() const
             {
-                return (sum - min - max) / (count - 2);
+                if( count > 2)
+                    return (sum - min - max) / (count - 2);
+                else if (count > 0)
+                    return sum / count;
+                else
+                    return sum;
             }
         };
         typedef std::vector<Candidate> Candidates;
@@ -144,9 +152,9 @@ namespace Simd
                 if (_info.empty())
                     _info = current->func.Info(args);
 #endif
-                double start = Simd::Time();
+                int64_t start = Simd::TimeCounter();
                 current->func.Run(args);
-                current->Update(Simd::Time() - start);
+                current->Update(Simd::TimeCounter() - start);
             }
             else
             {
@@ -173,10 +181,10 @@ namespace Simd
         SIMD_INLINE Candidate * Best()
         {
             Candidate * best = &_candidates[0];
-            double min = best->Mean();
+            int64_t min = best->Mean();
             for (size_t i = 1; i < _candidates.size(); ++i)
             {
-                double mean = _candidates[i].Mean();
+                int64_t mean = _candidates[i].Mean();
                 if (mean < min)
                 {
                     min = mean;
diff --git a/3rdparty/simdlib/Simd/SimdSet.h b/3rdparty/simdlib/Simd/SimdSet.h
old mode 100644
new mode 100755
index ae1bb6066a..22b5622e73
--- a/3rdparty/simdlib/Simd/SimdSet.h
+++ b/3rdparty/simdlib/Simd/SimdSet.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2018 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -109,6 +109,12 @@ namespace Simd
             const float a[4] = { a0, a1, a2, a3 };
             return vld1q_f32(a);
         }
+
+        SIMD_INLINE int32x4_t SetI32(int32_t a0, int32_t a1, int32_t a2, int32_t a3)
+        {
+            const int32_t a[4] = { a0, a1, a2, a3 };
+            return vld1q_s32(a);
+        }
     }
 #endif// SIMD_NEON_ENABLE
 }
diff --git a/3rdparty/simdlib/Simd/SimdSse1Resizer.cpp b/3rdparty/simdlib/Simd/SimdSse1Resizer.cpp
deleted file mode 100644
index 405ee03f4f..0000000000
--- a/3rdparty/simdlib/Simd/SimdSse1Resizer.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdResizer.h"
-#include "Simd/SimdStore.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSE_ENABLE 
-    namespace Sse
-    {
-        ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param)
-            : Base::ResizerFloatBilinear(param)
-        {
-        }
-
-        void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride)
-        {
-            size_t cn = _param.channels;
-            size_t rs = _param.dstW * cn;
-            float * pbx[2] = { _bx[0].data, _bx[1].data };
-            int32_t prev = -2;
-            size_t rsa = AlignLo(rs, Sse::F);
-            for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
-            {
-                float fy1 = _ay[dy];
-                float fy0 = 1.0f - fy1;
-                int32_t sy = _iy[dy];
-                int32_t k = 0;
-
-                if (sy == prev)
-                    k = 2;
-                else if (sy == prev + 1)
-                {
-                    Swap(pbx[0], pbx[1]);
-                    k = 1;
-                }
-
-                prev = sy;
-
-                for (; k < 2; k++)
-                {
-                    float * pb = pbx[k];
-                    const float * ps = src + (sy + k)*srcStride;
-                    size_t dx = 0;
-                    if (cn == 1)
-                    {
-                        __m128 _1 = _mm_set1_ps(1.0f);
-                        for (; dx < rsa; dx += Sse::F)
-                        {
-                            __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]);
-                            __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); 
-                            __m128 fx1 = _mm_load_ps(_ax.data + dx);
-                            __m128 fx0 = _mm_sub_ps(_1, fx1);
-                            __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88));
-                            __m128 m1 = _mm_mul_ps(fx1, _mm_shuffle_ps(s01, s23, 0xDD));
-                            _mm_store_ps(pb + dx, _mm_add_ps(m0, m1));
-                        }
-                    }
-                    if (cn == 3 && rs > 3)
-                    {
-                        __m128 _1 = _mm_set1_ps(1.0f);
-                        size_t rs3 = rs - 3;
-                        for (; dx < rs3; dx += 3)
-                        {
-                            __m128 s0 = _mm_loadu_ps(ps + _ix[dx] + 0);
-                            __m128 s1 = _mm_loadu_ps(ps + _ix[dx] + 3);
-                            __m128 fx1 = _mm_set1_ps(_ax.data[dx]);
-                            __m128 fx0 = _mm_sub_ps(_1, fx1);
-                            _mm_storeu_ps(pb + dx, _mm_add_ps(_mm_mul_ps(fx0, s0), _mm_mul_ps(fx1, s1)));
-                        }
-                    }
-                    for (; dx < rs; dx++)
-                    {
-                        int32_t sx = _ix[dx];
-                        float fx = _ax[dx];
-                        pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + cn] * fx;
-                    }
-                }  
-
-                size_t dx = 0;
-                __m128 _fy0 = _mm_set1_ps(fy0);
-                __m128 _fy1 = _mm_set1_ps(fy1);
-                for (; dx < rsa; dx += Sse::F)
-                {
-                    __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _fy0);
-                    __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _fy1);
-                    _mm_storeu_ps(dst + dx, _mm_add_ps(m0, m1));
-                }
-                for (; dx < rs; dx++)
-                    dst[dx] = pbx[0][dx] * fy0 + pbx[1][dx] * fy1;
-            }
-        }
-
-        //---------------------------------------------------------------------
-
-        void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
-        {
-            ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m128));
-            if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp))
-                return new ResizerFloatBilinear(param);
-            else
-                return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
-        }
-    }
-#endif //SIMD_SSE_ENABLE 
-}
-
diff --git a/3rdparty/simdlib/Simd/SimdSse2.h b/3rdparty/simdlib/Simd/SimdSse2.h
old mode 100644
new mode 100755
index ce304774f5..66a0d22700
--- a/3rdparty/simdlib/Simd/SimdSse2.h
+++ b/3rdparty/simdlib/Simd/SimdSse2.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -33,15 +33,11 @@ namespace Simd
     {
         void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride);
 
-        void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride);
-
         void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
             const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
         void BgrToGray(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *gray, size_t grayStride);
 
-        void RgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride);
-
         void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height,
             size_t channelCount, uint8_t * dst, size_t dstStride);
 
@@ -68,6 +64,8 @@ namespace Simd
         void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
 
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride);
+
         void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
 
diff --git a/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp b/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp
old mode 100644
new mode 100755
index c150220b82..b818225858
--- a/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -88,6 +88,58 @@ namespace Simd
             else
                 BgraToGray<false>(bgra, width, height, bgraStride, gray, grayStride);
         }
+
+        //---------------------------------------------------------------------
+
+        const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
+
+        SIMD_INLINE __m128i RgbaToGray32(__m128i rgba)
+        {
+            const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF);
+            const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF);
+            const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_0000), _mm_madd_epi16(r0b0, K16_RED_BLUE));
+            return _mm_srli_epi32(_mm_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT);
+        }
+
+        SIMD_INLINE __m128i RgbaToGray(__m128i rgba[4])
+        {
+            const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
+            const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
+            return _mm_packus_epi16(lo, hi);
+        }
+
+        template <bool align> void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            __m128i a[4];
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                {
+                    Load<align>(rgba + 4 * col, a);
+                    Store<align>((__m128i*)(gray + col), RgbaToGray(a));
+                }
+                if (alignedWidth != width)
+                {
+                    Load<false>(rgba + 4 * (width - A), a);
+                    Store<false>((__m128i*)(gray + width - A), RgbaToGray(a));
+                }
+                rgba += rgbaStride;
+                gray += grayStride;
+            }
+        }
+
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+        {
+            if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride))
+                RgbaToGray<true>(rgba, width, height, rgbaStride, gray, grayStride);
+            else
+                RgbaToGray<false>(rgba, width, height, rgbaStride, gray, grayStride);
+        }
     }
 #else
     // Work arround to avoid warning: libvisp_simdlib.a(SimdSse2BgraToGray.cpp.o) has no symbols
diff --git a/3rdparty/simdlib/Simd/SimdBaseBgraToRgba.cpp b/3rdparty/simdlib/Simd/SimdSse2Cpu.cpp
similarity index 62%
rename from 3rdparty/simdlib/Simd/SimdBaseBgraToRgba.cpp
rename to 3rdparty/simdlib/Simd/SimdSse2Cpu.cpp
index 8ada2f6a2c..3d1dfe00fb 100644
--- a/3rdparty/simdlib/Simd/SimdBaseBgraToRgba.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse2Cpu.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,30 +21,44 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "Simd/SimdDefs.h"
-#include <algorithm>
+#include "Simd/SimdEnable.h"
+#include "Simd/SimdCpu.h"
+
+#if defined(_MSC_VER)
+#include <windows.h>
+#endif
 
 namespace Simd
 {
-    namespace Base
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
-        void BgraToRgba(const uint8_t *bgra, size_t size, uint8_t *rgba)
+        SIMD_INLINE bool SupportedByCPU()
         {
-            for (size_t i = 0; i < size; ++i, bgra += 4, rgba += 4)
-            {
-                *(int32_t*)rgba = (*(int32_t*)bgra);
-                std::swap(rgba[0], rgba[2]);
-            }
+            return Base::CheckBit(Cpuid::Ordinary, Cpuid::Edx, Cpuid::SSE2);
         }
 
-        void BgraToRgba(const uint8_t *bgra, size_t width, size_t height, size_t bgraStride, uint8_t *rgba, size_t rgbaStride)
+        SIMD_INLINE bool SupportedByOS()
         {
-            for (size_t row = 0; row < height; ++row)
+#if defined(_MSC_VER)
+            __try
+            {
+                __m128d value = _mm_set1_pd(1.0);// try to execute of SSE2 instructions;
+                return true;
+            }
+            __except (EXCEPTION_EXECUTE_HANDLER)
             {
-                BgraToRgba(bgra, width, rgba);
-                bgra += bgraStride;
-                rgba += rgbaStride;
+                return false;
             }
+#else
+            return true;
+#endif
+        }
+
+        bool GetEnable()
+        {
+            return SupportedByCPU() && SupportedByOS();
         }
     }
+#endif
 }
diff --git a/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp b/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp
old mode 100644
new mode 100755
index 394488a804..70e4f139ea
--- a/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -22,6 +22,7 @@
 * SOFTWARE.
 */
 #include "Simd/SimdMemory.h"
+#include "Simd/SimdLoadBlock.h"
 #include "Simd/SimdStore.h"
 
 namespace Simd
diff --git a/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp b/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp
old mode 100644
new mode 100755
index f29d96eeb1..c289ab7f75
--- a/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -297,12 +297,12 @@ namespace Simd
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
         {
             ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m128i));
-            if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && (channels == 1 || channels == 2) && dstX >= A)
+            if (param.IsByteBilinear() && (channels == 1 || channels == 2) && dstX >= A)
                 return new ResizerByteBilinear(param);
-            else if (type == SimdResizeChannelByte && method == SimdResizeMethodArea)
+            else if (param.IsByteArea())
                 return new ResizerByteArea(param);
             else
-                return Sse::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
+                return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
         }
     }
 #else
diff --git a/3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp b/3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp
deleted file mode 100644
index 927dde0dae..0000000000
--- a/3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdBase.h"
-#include "Simd/SimdSse2.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
-        namespace
-        {
-            struct Buffer
-            {
-                Buffer(size_t width)
-                {
-                    _p = Allocate(sizeof(uint8_t) * 4 * width);
-                    rgba = (uint8_t*)_p;
-                }
-
-                ~Buffer()
-                {
-                    Free(_p);
-                }
-
-                uint8_t * rgba;
-            private:
-                void *_p;
-            };
-        }
-
-        void RgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride)
-        {
-            assert(width >= A);
-
-            Buffer buffer(width);
-
-            for (size_t row = 1; row < height; ++row)
-            {
-                Base::BgrToBgra(rgb, width, buffer.rgba, false, false, 0xFF);
-                Sse2::RgbaToGray(buffer.rgba, width, 1, 4 * width, gray, width);
-                rgb += rgbStride;
-                gray += grayStride;
-            }
-            Base::BgrToBgra(rgb, width, buffer.rgba, false, true, 0xFF);
-            Sse2::RgbaToGray(buffer.rgba, width, 1, 4 * width, gray, width);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSse2RgbToGray.cpp.o) has no symbols
-    void dummy_SimdSse2RgbToGray(){};
-#endif//SIMD_SSE2_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp
deleted file mode 100644
index 884f09924b..0000000000
--- a/3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
-        const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
-        const __m128i K16_GREEN_0000 = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, 0x0000);
-        const __m128i K32_ROUND_TERM = SIMD_MM_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM);
-
-        SIMD_INLINE __m128i RgbaToGray32(__m128i rgba)
-        {
-            const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF);
-            const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF);
-            const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_0000), _mm_madd_epi16(r0b0, K16_RED_BLUE));
-            return _mm_srli_epi32(_mm_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT);
-        }
-
-        SIMD_INLINE __m128i RgbaToGray(__m128i rgba[4])
-        {
-            const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
-            const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
-            return _mm_packus_epi16(lo, hi);
-        }
-
-        template <bool align> SIMD_INLINE void Load(const uint8_t* p, __m128i a[4])
-        {
-            a[0] = Load<align>((__m128i*)p + 0);
-            a[1] = Load<align>((__m128i*)p + 1);
-            a[2] = Load<align>((__m128i*)p + 2);
-            a[3] = Load<align>((__m128i*)p + 3);
-        }
-
-        template <bool align> void RgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-            __m128i a[4];
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                {
-                    Load<align>(rgba + 4 * col, a);
-                    Store<align>((__m128i*)(gray + col), RgbaToGray(a));
-                }
-                if (alignedWidth != width)
-                {
-                    Load<false>(rgba + 4 * (width - A), a);
-                    Store<false>((__m128i*)(gray + width - A), RgbaToGray(a));
-                }
-                rgba += rgbaStride;
-                gray += grayStride;
-            }
-        }
-
-        void RgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride)
-        {
-            if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride))
-                RgbaToGray<true>(rgba, width, height, rgbaStride, gray, grayStride);
-            else
-                RgbaToGray<false>(rgba, width, height, rgbaStride, gray, grayStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSse2RgbaToGray.cpp.o) has no symbols
-    void dummy_SimdSse2RgbaToGray(){};
-#endif// SIMD_SSE2_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdSse41.h b/3rdparty/simdlib/Simd/SimdSse41.h
new file mode 100755
index 0000000000..958fc11bc5
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41.h
@@ -0,0 +1,76 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdSse41_h__
+#define __SimdSse41_h__
+
+#include "Simd/SimdDefs.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
+    {
+        void BgraToBgr(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* bgr, size_t bgrStride);
+
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride);
+
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride);
+
+        void BgrToBgra(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+
+        void BgrToGray(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* gray, size_t grayStride);
+
+        void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride);
+
+        void DeinterleaveBgr(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride);
+
+        void DeinterleaveBgra(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height, uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride);
+
+        void GaussianBlur3x3(const uint8_t* src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t* dst, size_t dstStride);
+
+        void GrayToBgr(const uint8_t* gray, size_t width, size_t height, size_t grayStride, uint8_t* bgr, size_t bgrStride);
+
+        void InterleaveBgr(const uint8_t* b, size_t bStride, const uint8_t* g, size_t gStride, const uint8_t* r, size_t rStride, size_t width, size_t height, uint8_t* bgr, size_t bgrStride);
+
+        void InterleaveBgra(const uint8_t* b, size_t bStride, const uint8_t* g, size_t gStride, const uint8_t* r, size_t rStride, const uint8_t* a, size_t aStride, size_t width, size_t height, uint8_t* bgra, size_t bgraStride);
+
+        void ReduceColor2x2(const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
+
+        void ReduceGray2x2(const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
+
+        void ReduceGray4x4(const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
+
+        void ResizeBilinear(const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
+
+        void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride);
+    }
+#endif// SIMD_SSE41_ENABLE
+}
+#endif//__SimdSse41_h__
diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdSse41BgrToBgra.cpp
old mode 100644
new mode 100755
similarity index 57%
rename from 3rdparty/simdlib/Simd/SimdSsse3BgrToBgra.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41BgrToBgra.cpp
index 2c7f277758..65787e1a45
--- a/3rdparty/simdlib/Simd/SimdSsse3BgrToBgra.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41BgrToBgra.cpp
@@ -1,74 +1,111 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        template <bool align> SIMD_INLINE void BgrToBgra(const uint8_t * bgr, uint8_t * bgra, __m128i alpha, __m128i shuffle)
-        {
-            Store<align>((__m128i*)bgra + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<align>((__m128i*)(bgr + 0)), shuffle)));
-            Store<align>((__m128i*)bgra + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 12)), shuffle)));
-            Store<align>((__m128i*)bgra + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 24)), shuffle)));
-            Store<align>((__m128i*)bgra + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(bgr + 32)), 4), shuffle)));
-        }
-
-        template <bool align> void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3);
-            __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    BgrToBgra<align>(bgr + 3 * col, bgra + 4 * col, _alpha, _shuffle);
-                if (width != alignedWidth)
-                    BgrToBgra<false>(bgr + 3 * (width - A), bgra + 4 * (width - A), _alpha, _shuffle);
-                bgr += bgrStride;
-                bgra += bgraStride;
-            }
-        }
-
-        void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
-        {
-            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgrToBgra<true>(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
-            else
-                BgrToBgra<false>(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToBgra.cpp.o) has no symbols
-    void dummy_SimdSsse3BgrToBgra(){};
-#endif// SIMD_SSSE3_ENABLE
-}
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdStore.h"
+#include "Simd/SimdMemory.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE  
+    namespace Sse41
+    {
+        template <bool align> SIMD_INLINE void BgrToBgra(const uint8_t * bgr, uint8_t * bgra, __m128i alpha, __m128i shuffle)
+        {
+            Store<align>((__m128i*)bgra + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<align>((__m128i*)(bgr + 0)), shuffle)));
+            Store<align>((__m128i*)bgra + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 12)), shuffle)));
+            Store<align>((__m128i*)bgra + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 24)), shuffle)));
+            Store<align>((__m128i*)bgra + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(bgr + 32)), 4), shuffle)));
+        }
+
+        template <bool align> void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+
+            __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3);
+            __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    BgrToBgra<align>(bgr + 3 * col, bgra + 4 * col, _alpha, _shuffle);
+                if (width != alignedWidth)
+                    BgrToBgra<false>(bgr + 3 * (width - A), bgra + 4 * (width - A), _alpha, _shuffle);
+                bgr += bgrStride;
+                bgra += bgraStride;
+            }
+        }
+
+        void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride))
+                BgrToBgra<true>(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
+            else
+                BgrToBgra<false>(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
+        }
+
+        //---------------------------------------------------------------------
+
+        template <bool align> SIMD_INLINE void RgbToBgra(const uint8_t* rgb, uint8_t* bgra, __m128i alpha, __m128i shuffle)
+        {
+            Store<align>((__m128i*)bgra + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<align>((__m128i*)(rgb + 0)), shuffle)));
+            Store<align>((__m128i*)bgra + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(rgb + 12)), shuffle)));
+            Store<align>((__m128i*)bgra + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(rgb + 24)), shuffle)));
+            Store<align>((__m128i*)bgra + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(rgb + 32)), 4), shuffle)));
+        }
+
+        template <bool align> void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+
+            __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3);
+            __m128i _shuffle = _mm_setr_epi8(0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    RgbToBgra<align>(rgb + 3 * col, bgra + 4 * col, _alpha, _shuffle);
+                if (width != alignedWidth)
+                    RgbToBgra<false>(rgb + 3 * (width - A), bgra + 4 * (width - A), _alpha, _shuffle);
+                 rgb += rgbStride;
+                bgra += bgraStride;
+            }
+        }
+
+        void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride))
+                RgbToBgra<true>(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+            else
+                RgbToBgra<false>(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToGray.cpp b/3rdparty/simdlib/Simd/SimdSse41BgrToGray.cpp
old mode 100644
new mode 100755
similarity index 56%
rename from 3rdparty/simdlib/Simd/SimdSsse3BgrToGray.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41BgrToGray.cpp
index 224a87bbce..b089e35631
--- a/3rdparty/simdlib/Simd/SimdSsse3BgrToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41BgrToGray.cpp
@@ -1,93 +1,148 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        const __m128i K16_BLUE_RED = SIMD_MM_SET2_EPI16(Base::BLUE_TO_GRAY_WEIGHT, Base::RED_TO_GRAY_WEIGHT);
-        const __m128i K16_GREEN_ROUND = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM);
-
-        SIMD_INLINE __m128i BgraToGray32(__m128i bgra)
-        {
-            const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(bgra, 1), K16_00FF);
-            const __m128i b0r0 = _mm_and_si128(bgra, K16_00FF);
-            const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(b0r0, K16_BLUE_RED));
-            return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT);
-        }
-
-        SIMD_INLINE __m128i BgraToGray(__m128i bgra[4])
-        {
-            const __m128i lo = _mm_packs_epi32(BgraToGray32(bgra[0]), BgraToGray32(bgra[1]));
-            const __m128i hi = _mm_packs_epi32(BgraToGray32(bgra[2]), BgraToGray32(bgra[3]));
-            return _mm_packus_epi16(lo, hi);
-        }
-
-        template <bool align> SIMD_INLINE __m128i BgrToGray(const uint8_t * bgr, __m128i shuffle)
-        {
-            __m128i bgra[4];
-            bgra[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<align>((__m128i*)(bgr + 0)), shuffle));
-            bgra[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 12)), shuffle));
-            bgra[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 24)), shuffle));
-            bgra[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(bgr + 32)), 4), shuffle));
-            return BgraToGray(bgra);
-        }
-
-        template <bool align> void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    Store<align>((__m128i*)(gray + col), BgrToGray<align>(bgr + 3 * col, _shuffle));
-                if (width != alignedWidth)
-                    Store<false>((__m128i*)(gray + width - A), BgrToGray<false>(bgr + 3 * (width - A), _shuffle));
-                bgr += bgrStride;
-                gray += grayStride;
-            }
-        }
-
-        void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride)
-        {
-            if (Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgrToGray<true>(bgr, width, height, bgrStride, gray, grayStride);
-            else
-                BgrToGray<false>(bgr, width, height, bgrStride, gray, grayStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToGray.cpp.o) has no symbols
-    void dummy_SimdSsse3BgrToGray(){};
-#endif// SIMD_SSSE3_ENABLE
-}
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdStore.h"
+#include "Simd/SimdMemory.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE  
+    namespace Sse41
+    {
+        const __m128i K16_BLUE_RED = SIMD_MM_SET2_EPI16(Base::BLUE_TO_GRAY_WEIGHT, Base::RED_TO_GRAY_WEIGHT);
+        const __m128i K16_GREEN_ROUND = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM);
+
+        SIMD_INLINE __m128i BgraToGray32(__m128i bgra)
+        {
+            const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(bgra, 1), K16_00FF);
+            const __m128i b0r0 = _mm_and_si128(bgra, K16_00FF);
+            const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(b0r0, K16_BLUE_RED));
+            return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT);
+        }
+
+        SIMD_INLINE __m128i BgraToGray(__m128i bgra[4])
+        {
+            const __m128i lo = _mm_packs_epi32(BgraToGray32(bgra[0]), BgraToGray32(bgra[1]));
+            const __m128i hi = _mm_packs_epi32(BgraToGray32(bgra[2]), BgraToGray32(bgra[3]));
+            return _mm_packus_epi16(lo, hi);
+        }
+
+        template <bool align> SIMD_INLINE __m128i BgrToGray(const uint8_t * bgr, __m128i shuffle)
+        {
+            __m128i bgra[4];
+            bgra[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<align>((__m128i*)(bgr + 0)), shuffle));
+            bgra[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 12)), shuffle));
+            bgra[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 24)), shuffle));
+            bgra[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(bgr + 32)), 4), shuffle));
+            return BgraToGray(bgra);
+        }
+
+        template <bool align> void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+
+            __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    Store<align>((__m128i*)(gray + col), BgrToGray<align>(bgr + 3 * col, _shuffle));
+                if (width != alignedWidth)
+                    Store<false>((__m128i*)(gray + width - A), BgrToGray<false>(bgr + 3 * (width - A), _shuffle));
+                bgr += bgrStride;
+                gray += grayStride;
+            }
+        }
+
+        void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride)
+        {
+            if (Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride))
+                BgrToGray<true>(bgr, width, height, bgrStride, gray, grayStride);
+            else
+                BgrToGray<false>(bgr, width, height, bgrStride, gray, grayStride);
+        }
+
+        //---------------------------------------------------------------------
+
+        const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
+
+        SIMD_INLINE __m128i RgbaToGray32(__m128i rgba)
+        {
+            const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF);
+            const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF);
+            const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(r0b0, K16_RED_BLUE));
+            return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT);
+        }
+
+        SIMD_INLINE __m128i RgbaToGray(__m128i rgba[4])
+        {
+            const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
+            const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
+            return _mm_packus_epi16(lo, hi);
+        }
+
+        template <bool align> SIMD_INLINE __m128i RgbToGray(const uint8_t* rgb, __m128i shuffle)
+        {
+            __m128i rgba[4];
+            rgba[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<align>((__m128i*)(rgb + 0)), shuffle));
+            rgba[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(rgb + 12)), shuffle));
+            rgba[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(rgb + 24)), shuffle));
+            rgba[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(rgb + 32)), 4), shuffle));
+            return RgbaToGray(rgba);
+        }
+
+        template <bool align> void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+
+            __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    Store<align>((__m128i*)(gray + col), RgbToGray<align>(rgb + 3 * col, _shuffle));
+                if (width != alignedWidth)
+                    Store<false>((__m128i*)(gray + width - A), RgbToGray<false>(rgb + 3 * (width - A), _shuffle));
+                rgb += rgbStride;
+                gray += grayStride;
+            }
+        }
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+        {
+            if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride))
+                RgbToGray<true>(rgb, width, height, rgbStride, gray, grayStride);
+            else
+                RgbToGray<false>(rgb, width, height, rgbStride, gray, grayStride);
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdSse41BgrToRgb.cpp
old mode 100644
new mode 100755
similarity index 84%
rename from 3rdparty/simdlib/Simd/SimdSsse3BgrToRgb.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41BgrToRgb.cpp
index 0f74b41b91..14a351a5c9
--- a/3rdparty/simdlib/Simd/SimdSsse3BgrToRgb.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41BgrToRgb.cpp
@@ -1,83 +1,80 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        const __m128i K8_CVT_00 = SIMD_MM_SETR_EPI8(0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1);
-        const __m128i K8_CVT_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1);
-        const __m128i K8_CVT_10 = SIMD_MM_SETR_EPI8(-1, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-        const __m128i K8_CVT_11 = SIMD_MM_SETR_EPI8(0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF);
-        const __m128i K8_CVT_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, -1);
-        const __m128i K8_CVT_21 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-        const __m128i K8_CVT_22 = SIMD_MM_SETR_EPI8(-1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD);
-
-        template <bool align> SIMD_INLINE void BgrToRgb(const uint8_t * src, uint8_t * dst)
-        {
-            __m128i s0 = Load<align>((__m128i*)src + 0);
-            __m128i s1 = Load<align>((__m128i*)src + 1);
-            __m128i s2 = Load<align>((__m128i*)src + 2);
-            Store<align>((__m128i*)dst + 0, _mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_00), _mm_shuffle_epi8(s1, K8_CVT_01)));
-            Store<align>((__m128i*)dst + 1, _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_10), _mm_shuffle_epi8(s1, K8_CVT_11)), _mm_shuffle_epi8(s2, K8_CVT_12)));
-            Store<align>((__m128i*)dst + 2, _mm_or_si128(_mm_shuffle_epi8(s1, K8_CVT_21), _mm_shuffle_epi8(s2, K8_CVT_22)));
-        }
-
-        template <bool align> void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride));
-
-            const size_t A3 = A * 3;
-            size_t size = width * 3;
-            size_t aligned = AlignLo(width, A) * 3;
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t i = 0; i < aligned; i += A3)
-                    BgrToRgb<align>(bgr + i, rgb + i);
-                if (aligned < size)
-                    BgrToRgb<false>(bgr + size - A3, rgb + size - A3);
-                bgr += bgrStride;
-                rgb += rgbStride;
-            }
-        }
-
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
-        {
-            if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride))
-                BgrToRgb<true>(bgr, bgrStride, width, height, rgb, rgbStride);
-            else
-                BgrToRgb<false>(bgr, bgrStride, width, height, rgb, rgbStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToRgb.cpp.o) has no symbols
-    void dummy_SimdSsse3BgrToRgb(){};
-#endif//SIMD_SSSE3_ENABLE
-}
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdStore.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        const __m128i K8_CVT_00 = SIMD_MM_SETR_EPI8(0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1);
+        const __m128i K8_CVT_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1);
+        const __m128i K8_CVT_10 = SIMD_MM_SETR_EPI8(-1, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        const __m128i K8_CVT_11 = SIMD_MM_SETR_EPI8(0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF);
+        const __m128i K8_CVT_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, -1);
+        const __m128i K8_CVT_21 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        const __m128i K8_CVT_22 = SIMD_MM_SETR_EPI8(-1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD);
+
+        template <bool align> SIMD_INLINE void BgrToRgb(const uint8_t * src, uint8_t * dst)
+        {
+            __m128i s0 = Load<align>((__m128i*)src + 0);
+            __m128i s1 = Load<align>((__m128i*)src + 1);
+            __m128i s2 = Load<align>((__m128i*)src + 2);
+            Store<align>((__m128i*)dst + 0, _mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_00), _mm_shuffle_epi8(s1, K8_CVT_01)));
+            Store<align>((__m128i*)dst + 1, _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_10), _mm_shuffle_epi8(s1, K8_CVT_11)), _mm_shuffle_epi8(s2, K8_CVT_12)));
+            Store<align>((__m128i*)dst + 2, _mm_or_si128(_mm_shuffle_epi8(s1, K8_CVT_21), _mm_shuffle_epi8(s2, K8_CVT_22)));
+        }
+
+        template <bool align> void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            const size_t A3 = A * 3;
+            size_t size = width * 3;
+            size_t aligned = AlignLo(width, A) * 3;
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t i = 0; i < aligned; i += A3)
+                    BgrToRgb<align>(bgr + i, rgb + i);
+                if (aligned < size)
+                    BgrToRgb<false>(bgr + size - A3, rgb + size - A3);
+                bgr += bgrStride;
+                rgb += rgbStride;
+            }
+        }
+
+        void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride)
+        {
+            if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride))
+                BgrToRgb<true>(bgr, width, height, bgrStride, rgb, rgbStride);
+            else
+                BgrToRgb<false>(bgr, width, height, bgrStride, rgb, rgbStride);
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdSse41BgraToBgr.cpp
old mode 100644
new mode 100755
similarity index 53%
rename from 3rdparty/simdlib/Simd/SimdSsse3BgraToBgr.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41BgraToBgr.cpp
index ccf4c51c97..a3000972e6
--- a/3rdparty/simdlib/Simd/SimdSsse3BgraToBgr.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41BgraToBgr.cpp
@@ -1,92 +1,165 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        template <bool align> SIMD_INLINE void BgraToBgrBody(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2])
-        {
-            Store<align>((__m128i*)(bgr + 0), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 0), k[0][0]));
-            Store<false>((__m128i*)(bgr + 12), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 1), k[0][0]));
-            Store<false>((__m128i*)(bgr + 24), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 2), k[0][0]));
-            Store<false>((__m128i*)(bgr + 36), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 3), k[0][0]));
-        }
-
-        template <bool align> SIMD_INLINE void BgraToBgr(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2])
-        {
-            __m128i bgra0 = Load<align>((__m128i*)bgra + 0);
-            __m128i bgra1 = Load<align>((__m128i*)bgra + 1);
-            __m128i bgra2 = Load<align>((__m128i*)bgra + 2);
-            __m128i bgra3 = Load<align>((__m128i*)bgra + 3);
-            Store<align>((__m128i*)bgr + 0, _mm_or_si128(_mm_shuffle_epi8(bgra0, k[0][0]), _mm_shuffle_epi8(bgra1, k[0][1])));
-            Store<align>((__m128i*)bgr + 1, _mm_or_si128(_mm_shuffle_epi8(bgra1, k[1][0]), _mm_shuffle_epi8(bgra2, k[1][1])));
-            Store<align>((__m128i*)bgr + 2, _mm_or_si128(_mm_shuffle_epi8(bgra2, k[2][0]), _mm_shuffle_epi8(bgra3, k[2][1])));
-        }
-
-        template <bool align> void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-            if (width == alignedWidth)
-                alignedWidth -= A;
-
-            __m128i k[3][2];
-            k[0][0] = _mm_setr_epi8(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1);
-            k[0][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4);
-            k[1][0] = _mm_setr_epi8(0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1);
-            k[1][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9);
-            k[2][0] = _mm_setr_epi8(0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-            k[2][1] = _mm_setr_epi8(-1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    BgraToBgrBody<align>(bgra + 4 * col, bgr + 3 * col, k);
-                if (width != alignedWidth)
-                    BgraToBgr<false>(bgra + 4 * (width - A), bgr + 3 * (width - A), k);
-                bgra += bgraStride;
-                bgr += bgrStride;
-            }
-        }
-
-        void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
-        {
-            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgraToBgr<true>(bgra, width, height, bgraStride, bgr, bgrStride);
-            else
-                BgraToBgr<false>(bgra, width, height, bgraStride, bgr, bgrStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgraToBgr.cpp.o) has no symbols
-    void dummy_SimdSsse3BgraToBgr(){};
-#endif// SIMD_SSSE3_ENABLE
-}
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdStore.h"
+#include "Simd/SimdMemory.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE  
+    namespace Sse41
+    {
+        template <bool align> SIMD_INLINE void BgraToBgrBody(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2])
+        {
+            Store<align>((__m128i*)(bgr + 0), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 0), k[0][0]));
+            Store<false>((__m128i*)(bgr + 12), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 1), k[0][0]));
+            Store<false>((__m128i*)(bgr + 24), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 2), k[0][0]));
+            Store<false>((__m128i*)(bgr + 36), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 3), k[0][0]));
+        }
+
+        template <bool align> SIMD_INLINE void BgraToBgr(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2])
+        {
+            __m128i bgra0 = Load<align>((__m128i*)bgra + 0);
+            __m128i bgra1 = Load<align>((__m128i*)bgra + 1);
+            __m128i bgra2 = Load<align>((__m128i*)bgra + 2);
+            __m128i bgra3 = Load<align>((__m128i*)bgra + 3);
+            Store<align>((__m128i*)bgr + 0, _mm_or_si128(_mm_shuffle_epi8(bgra0, k[0][0]), _mm_shuffle_epi8(bgra1, k[0][1])));
+            Store<align>((__m128i*)bgr + 1, _mm_or_si128(_mm_shuffle_epi8(bgra1, k[1][0]), _mm_shuffle_epi8(bgra2, k[1][1])));
+            Store<align>((__m128i*)bgr + 2, _mm_or_si128(_mm_shuffle_epi8(bgra2, k[2][0]), _mm_shuffle_epi8(bgra3, k[2][1])));
+        }
+
+        template <bool align> void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            if (width == alignedWidth)
+                alignedWidth -= A;
+
+            __m128i k[3][2];
+            k[0][0] = _mm_setr_epi8(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1);
+            k[0][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4);
+            k[1][0] = _mm_setr_epi8(0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1);
+            k[1][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9);
+            k[2][0] = _mm_setr_epi8(0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+            k[2][1] = _mm_setr_epi8(-1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    BgraToBgrBody<align>(bgra + 4 * col, bgr + 3 * col, k);
+                if (width != alignedWidth)
+                    BgraToBgr<false>(bgra + 4 * (width - A), bgr + 3 * (width - A), k);
+                bgra += bgraStride;
+                bgr += bgrStride;
+            }
+        }
+
+        void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride))
+                BgraToBgr<true>(bgra, width, height, bgraStride, bgr, bgrStride);
+            else
+                BgraToBgr<false>(bgra, width, height, bgraStride, bgr, bgrStride);
+        }
+
+        //---------------------------------------------------------------------
+
+        template <bool align> void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            if (width == alignedWidth)
+                alignedWidth -= A;
+
+            __m128i k[3][2];
+            k[0][0] = _mm_setr_epi8(0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1);
+            k[0][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x1, 0x0, 0x6);
+            k[1][0] = _mm_setr_epi8(0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1, -1, -1, -1, -1);
+            k[1][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9);
+            k[2][0] = _mm_setr_epi8(0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+            k[2][1] = _mm_setr_epi8(-1, -1, -1, -1, 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    BgraToBgrBody<align>(bgra + 4 * col, rgb + 3 * col, k);
+                if (width != alignedWidth)
+                    BgraToBgr<false>(bgra + 4 * (width - A), rgb + 3 * (width - A), k);
+                bgra += bgraStride;
+                rgb += rgbStride;
+            }
+        }
+
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride))
+                BgraToRgb<true>(bgra, width, height, bgraStride, rgb, rgbStride);
+            else
+                BgraToRgb<false>(bgra, width, height, bgraStride, rgb, rgbStride);
+        }
+
+        //---------------------------------------------------------------------
+
+        const __m128i K8_BGRA_TO_RGBA = SIMD_MM_SETR_EPI8(0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF);
+
+        template <bool align> SIMD_INLINE void BgraToRgba(const uint8_t* bgra, uint8_t* rgba)
+        {
+            Store<align>((__m128i*)rgba, _mm_shuffle_epi8(Load<align>((__m128i*)bgra), K8_BGRA_TO_RGBA));
+        }
+
+        template <bool align> void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride));
+
+            size_t size = width * 4;
+            size_t sizeA = AlignLo(size, A);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t i = 0; i < size; i += A)
+                    BgraToRgba<align>(bgra + i, rgba + i);
+                if (size != sizeA)
+                    BgraToRgba<false>(bgra + size - sizeA, rgba + size - sizeA);
+                bgra += bgraStride;
+                rgba += rgbaStride;
+            }
+        }
+
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride))
+                BgraToRgba<true>(bgra, width, height, bgraStride, rgba, rgbaStride);
+            else
+                BgraToRgba<false>(bgra, width, height, bgraStride, rgba, rgbaStride);
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdBaseRgbToGray.cpp b/3rdparty/simdlib/Simd/SimdSse41Cpu.cpp
similarity index 54%
rename from 3rdparty/simdlib/Simd/SimdBaseRgbToGray.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41Cpu.cpp
index 6ac7f88791..9b5719ce97 100644
--- a/3rdparty/simdlib/Simd/SimdBaseRgbToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41Cpu.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,23 +21,47 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "Simd/SimdConversion.h"
+#include "Simd/SimdEnable.h"
+#include "Simd/SimdCpu.h"
+
+#if defined(_MSC_VER)
+#include <windows.h>
+#endif
 
 namespace Simd
 {
-    namespace Base
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
     {
-        void RgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride)
+        SIMD_INLINE bool SupportedByCPU()
         {
-            for (size_t row = 0; row < height; ++row)
+            return 
+                Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE41) &&
+                Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE42);
+        }
+
+        SIMD_INLINE bool SupportedByOS()
+        {
+#if defined(_MSC_VER)
+            __try
             {
-                const uint8_t * pRgb = rgb + row*rgbStride;
-                uint8_t * pGray = gray + row*grayStride;
-                for (const uint8_t *pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgb += 3)
-                {
-                    *pGray = RgbToGray(pRgb[0], pRgb[1], pRgb[2]);
-                }
+                int value = _mm_testz_si128(_mm_set1_epi8(0), _mm_set1_epi8(-1)); // try to execute of SSE41 instructions;
+                uint32_t crc = _mm_crc32_u8(0, 1); // try to execute of SSE42 instructions;
+                return true;
             }
+            __except (EXCEPTION_EXECUTE_HANDLER)
+            {
+                return false;
+            }
+#else
+            return true;
+#endif
+        }
+
+        bool GetEnable()
+        {
+            return SupportedByCPU() && SupportedByOS();
         }
     }
+#endif
 }
diff --git a/3rdparty/simdlib/Simd/SimdSsse3Deinterleave.cpp b/3rdparty/simdlib/Simd/SimdSse41Deinterleave.cpp
similarity index 74%
rename from 3rdparty/simdlib/Simd/SimdSsse3Deinterleave.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41Deinterleave.cpp
index 45ff364d03..68ae14efc5 100644
--- a/3rdparty/simdlib/Simd/SimdSsse3Deinterleave.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41Deinterleave.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -27,8 +27,8 @@
 
 namespace Simd
 {
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
     {
         template <bool align> SIMD_INLINE void DeinterleaveBgr(const uint8_t * bgr, uint8_t * b, uint8_t * g, uint8_t * r, size_t offset)
         {
@@ -69,9 +69,11 @@ namespace Simd
                 DeinterleaveBgr<false>(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride);
         }
 
+        //---------------------------------------------------------------------
+
         const __m128i K8_SHUFFLE_BGRA = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF);
 
-        template <bool align> SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset)
+        template <bool align, bool alpha> SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset)
         {
             __m128i _bgra[4];
             _bgra[0] = _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 0), K8_SHUFFLE_BGRA);
@@ -89,7 +91,8 @@ namespace Simd
             __m128i rraa1 = _mm_unpackhi_epi32(_bgra[2], _bgra[3]);
 
             Store<align>((__m128i*)(r + offset), _mm_unpacklo_epi64(rraa0, rraa1));
-            Store<align>((__m128i*)(a + offset), _mm_unpackhi_epi64(rraa0, rraa1));
+            if(alpha)
+                Store<align>((__m128i*)(a + offset), _mm_unpackhi_epi64(rraa0, rraa1));
         }
 
         template <bool align> void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height,
@@ -99,36 +102,51 @@ namespace Simd
             if (align)
             {
                 assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride));
-                assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride));
+                assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL));
             }
 
             size_t alignedWidth = AlignLo(width, A);
 
-            for (size_t row = 0; row < height; ++row)
+            if (a)
             {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    DeinterleaveBgra<align>(bgra + col * 4, b, g, r, a, col);
-                if (width != alignedWidth)
-                    DeinterleaveBgra<false>(bgra + 4 * (width - A), b, g, r, a, width - A);
-                bgra += bgraStride;
-                b += bStride;
-                g += gStride;
-                r += rStride;
-                a += aStride;
+                for (size_t row = 0; row < height; ++row)
+                {
+                    for (size_t col = 0; col < alignedWidth; col += A)
+                        DeinterleaveBgra<align, true>(bgra + col * 4, b, g, r, a, col);
+                    if (width != alignedWidth)
+                        DeinterleaveBgra<false, true>(bgra + 4 * (width - A), b, g, r, a, width - A);
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
+                    a += aStride;
+                }
+            }
+            else
+            {
+                for (size_t row = 0; row < height; ++row)
+                {
+                    for (size_t col = 0; col < alignedWidth; col += A)
+                        DeinterleaveBgra<align, false>(bgra + col * 4, b, g, r, NULL, col);
+                    if (width != alignedWidth)
+                        DeinterleaveBgra<false, false>(bgra + 4 * (width - A), b, g, r, NULL, width - A);
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
+                }
             }
         }
 
         void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height,
             uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride)
         {
-            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride))
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && 
+                Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL))
                 DeinterleaveBgra<true>(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
             else
                 DeinterleaveBgra<false>(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
         }
     }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Deinterleave.cpp.o) has no symbols
-    void dummy_SimdSsse3Deinterleave(){};
-#endif// SIMD_SSSE3_ENABLE
+#endif
 }
diff --git a/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp
old mode 100644
new mode 100755
index bacd2f7d91..73334c635d
--- a/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -22,6 +22,7 @@
 * SOFTWARE.
 */
 #include "Simd/SimdMemory.h"
+#include "Simd/SimdLoadBlock.h"
 #include "Simd/SimdStore.h"
 #include "Simd/SimdGaussianBlur.h"
 
diff --git a/3rdparty/simdlib/Simd/SimdSsse3GaussianBlur3x3.cpp b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur3x3.cpp
similarity index 95%
rename from 3rdparty/simdlib/Simd/SimdSsse3GaussianBlur3x3.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41GaussianBlur3x3.cpp
index 74ff76aa8a..11573a696b 100644
--- a/3rdparty/simdlib/Simd/SimdSsse3GaussianBlur3x3.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur3x3.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -22,12 +22,13 @@
 * SOFTWARE.
 */
 #include "Simd/SimdMemory.h"
+#include "Simd/SimdLoadBlock.h"
 #include "Simd/SimdStore.h"
 
 namespace Simd
 {
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
     {
         namespace
         {
@@ -154,8 +155,5 @@ namespace Simd
                 GaussianBlur3x3<false>(src, srcStride, width, height, channelCount, dst, dstStride);
         }
     }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3GaussianBlur3x3.cpp.o) has no symbols
-    void dummy_SimdSsse3GaussianBlur3x3(){};
-#endif// SIMD_SSSE3_ENABLE
+#endif
 }
diff --git a/3rdparty/simdlib/Simd/SimdSsse3GrayToBgr.cpp b/3rdparty/simdlib/Simd/SimdSse41GrayToBgr.cpp
old mode 100644
new mode 100755
similarity index 92%
rename from 3rdparty/simdlib/Simd/SimdSsse3GrayToBgr.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41GrayToBgr.cpp
index 8106f6451a..db79b3e4f0
--- a/3rdparty/simdlib/Simd/SimdSsse3GrayToBgr.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41GrayToBgr.cpp
@@ -1,75 +1,72 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        template <bool align> SIMD_INLINE void GrayToBgr(uint8_t * bgr, __m128i gray)
-        {
-            Store<align>((__m128i*)bgr + 0, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR0));
-            Store<align>((__m128i*)bgr + 1, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR1));
-            Store<align>((__m128i*)bgr + 2, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR2));
-        }
-
-        template <bool align> void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                {
-                    __m128i _gray = Load<align>((__m128i*)(gray + col));
-                    GrayToBgr<align>(bgr + 3 * col, _gray);
-                }
-                if (alignedWidth != width)
-                {
-                    __m128i _gray = Load<false>((__m128i*)(gray + width - A));
-                    GrayToBgr<false>(bgr + 3 * (width - A), _gray);
-                }
-                gray += grayStride;
-                bgr += bgrStride;
-            }
-        }
-
-        void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride)
-        {
-            if (Aligned(bgr) && Aligned(gray) && Aligned(bgrStride) && Aligned(grayStride))
-                GrayToBgr<true>(gray, width, height, grayStride, bgr, bgrStride);
-            else
-                GrayToBgr<false>(gray, width, height, grayStride, bgr, bgrStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3GrayToBgr.cpp.o) has no symbols
-    void dummy_SimdSsse3GrayToBgr(){};
-#endif// SIMD_SSSE3_ENABLE
-}
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdStore.h"
+#include "Simd/SimdMemory.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        template <bool align> SIMD_INLINE void GrayToBgr(uint8_t * bgr, __m128i gray)
+        {
+            Store<align>((__m128i*)bgr + 0, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR0));
+            Store<align>((__m128i*)bgr + 1, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR1));
+            Store<align>((__m128i*)bgr + 2, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR2));
+        }
+
+        template <bool align> void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                {
+                    __m128i _gray = Load<align>((__m128i*)(gray + col));
+                    GrayToBgr<align>(bgr + 3 * col, _gray);
+                }
+                if (alignedWidth != width)
+                {
+                    __m128i _gray = Load<false>((__m128i*)(gray + width - A));
+                    GrayToBgr<false>(bgr + 3 * (width - A), _gray);
+                }
+                gray += grayStride;
+                bgr += bgrStride;
+            }
+        }
+
+        void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride)
+        {
+            if (Aligned(bgr) && Aligned(gray) && Aligned(bgrStride) && Aligned(grayStride))
+                GrayToBgr<true>(gray, width, height, grayStride, bgr, bgrStride);
+            else
+                GrayToBgr<false>(gray, width, height, grayStride, bgr, bgrStride);
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3Interleave.cpp b/3rdparty/simdlib/Simd/SimdSse41Interleave.cpp
similarity index 96%
rename from 3rdparty/simdlib/Simd/SimdSsse3Interleave.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41Interleave.cpp
index c7213577fd..bb6354405e 100644
--- a/3rdparty/simdlib/Simd/SimdSsse3Interleave.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41Interleave.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -27,8 +27,8 @@
 
 namespace Simd
 {
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
     {
         template <bool align> SIMD_INLINE void InterleaveBgr(const uint8_t * b, const uint8_t * g, const uint8_t * r, size_t offset, uint8_t * bgr)
         {
@@ -124,8 +124,5 @@ namespace Simd
                 InterleaveBgra<false>(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride);
         }
     }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Interleave.cpp.o) has no symbols
-    void dummy_SimdSsse3Interleave(){};
-#endif// SIMD_SSSE3_ENABLE
+#endif
 }
diff --git a/3rdparty/simdlib/Simd/SimdSsse3Reduce.cpp b/3rdparty/simdlib/Simd/SimdSse41Reduce.cpp
old mode 100644
new mode 100755
similarity index 96%
rename from 3rdparty/simdlib/Simd/SimdSsse3Reduce.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41Reduce.cpp
index faded50ec7..9905a6f171
--- a/3rdparty/simdlib/Simd/SimdSsse3Reduce.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41Reduce.cpp
@@ -1,202 +1,199 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2018 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1)
-        {
-            return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2);
-        }
-
-        SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
-        {
-            return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11));
-        }
-
-        template <size_t channelCount> __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11);
-
-        template<> SIMD_INLINE __m128i Average8<1>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
-        {
-            return Average8(s00, s01, s10, s11);
-        }
-
-        const __m128i K8_RC2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF);
-
-        template<> SIMD_INLINE __m128i Average8<2>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
-        {
-            return Average8(_mm_shuffle_epi8(s00, K8_RC2), _mm_shuffle_epi8(s01, K8_RC2), _mm_shuffle_epi8(s10, K8_RC2), _mm_shuffle_epi8(s11, K8_RC2));
-        }
-
-        const __m128i K8_RC4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF);
-
-        template<> SIMD_INLINE __m128i Average8<4>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
-        {
-            return Average8(_mm_shuffle_epi8(s00, K8_RC4), _mm_shuffle_epi8(s01, K8_RC4), _mm_shuffle_epi8(s10, K8_RC4), _mm_shuffle_epi8(s11, K8_RC4));
-        }
-
-        template <size_t channelCount, bool align> SIMD_INLINE void ReduceColor2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst)
-        {
-            __m128i s00 = Load<align>((__m128i*)src0 + 0);
-            __m128i s01 = Load<align>((__m128i*)src0 + 1);
-            __m128i s10 = Load<align>((__m128i*)src1 + 0);
-            __m128i s11 = Load<align>((__m128i*)src1 + 1);
-            Store<align>((__m128i*)dst, Average8<channelCount>(s00, s01, s10, s11));
-        }
-
-        template <size_t channelCount, bool align> void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride)
-        {
-            size_t evenWidth = AlignLo(srcWidth, 2);
-            size_t evenSize = evenWidth * channelCount;
-            size_t alignedSize = AlignLo(evenSize, DA);
-            for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2)
-            {
-                const uint8_t *src0 = src;
-                const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride);
-                size_t srcOffset = 0, dstOffset = 0;
-                for (; srcOffset < alignedSize; srcOffset += DA, dstOffset += A)
-                    ReduceColor2x2<channelCount, align>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
-                if (alignedSize != evenSize)
-                {
-                    srcOffset = evenSize - DA;
-                    dstOffset = srcOffset / 2;
-                    ReduceColor2x2<channelCount, false>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
-                }
-                if (evenWidth != srcWidth)
-                {
-                    for (size_t c = 0; c < channelCount; ++c)
-                        dst[evenSize/2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]);
-                }
-                src += 2 * srcStride;
-                dst += dstStride;
-            }
-        }
-
-        const __m128i K8_BGR0 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1);
-        const __m128i K8_BGR1 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0);
-        const __m128i K8_BGR2 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-        const __m128i K8_BGR3 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1);
-        const __m128i K8_BGR4 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1);
-        const __m128i K8_BGR5 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-        const __m128i K8_BGR6 = SIMD_MM_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF);
-
-        template <bool align> SIMD_INLINE void ReduceBgr2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst)
-        {
-            __m128i s00 = Load<align>((__m128i*)src0 + 0);
-            __m128i s01 = Load<align>((__m128i*)src0 + 1);
-            __m128i s02 = Load<align>((__m128i*)src0 + 2);
-            __m128i s10 = Load<align>((__m128i*)src1 + 0);
-            __m128i s11 = Load<align>((__m128i*)src1 + 1);
-            __m128i s12 = Load<align>((__m128i*)src1 + 2);
-            __m128i m00 = _mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR0), _mm_shuffle_epi8(s01, K8_BGR1));
-            __m128i m01 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR2), _mm_shuffle_epi8(s01, K8_BGR3)), _mm_shuffle_epi8(s02, K8_BGR4));
-            __m128i m10 = _mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR0), _mm_shuffle_epi8(s11, K8_BGR1));
-            __m128i m11 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR2), _mm_shuffle_epi8(s11, K8_BGR3)), _mm_shuffle_epi8(s12, K8_BGR4));
-            Store<align>((__m128i*)dst + 0, Average8(m00, m01, m10, m11));
-            __m128i s03 = Load<align>((__m128i*)src0 + 3);
-            __m128i s04 = Load<align>((__m128i*)src0 + 4);
-            __m128i s13 = Load<align>((__m128i*)src1 + 3);
-            __m128i s14 = Load<align>((__m128i*)src1 + 4);
-            __m128i m02 = _mm_or_si128(_mm_shuffle_epi8(s01, K8_BGR5), _mm_shuffle_epi8(s02, K8_BGR6));
-            __m128i m03 = _mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR0), _mm_shuffle_epi8(s04, K8_BGR1));
-            __m128i m12 = _mm_or_si128(_mm_shuffle_epi8(s11, K8_BGR5), _mm_shuffle_epi8(s12, K8_BGR6));
-            __m128i m13 = _mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR0), _mm_shuffle_epi8(s14, K8_BGR1));
-            Store<align>((__m128i*)dst + 1, Average8(m02, m03, m12, m13));
-            __m128i s05 = Load<align>((__m128i*)src0 + 5);
-            __m128i s15 = Load<align>((__m128i*)src1 + 5);
-            __m128i m04 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR2), _mm_shuffle_epi8(s04, K8_BGR3)), _mm_shuffle_epi8(s05, K8_BGR4));
-            __m128i m05 = _mm_or_si128(_mm_shuffle_epi8(s04, K8_BGR5), _mm_shuffle_epi8(s05, K8_BGR6));
-            __m128i m14 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR2), _mm_shuffle_epi8(s14, K8_BGR3)), _mm_shuffle_epi8(s15, K8_BGR4));
-            __m128i m15 = _mm_or_si128(_mm_shuffle_epi8(s14, K8_BGR5), _mm_shuffle_epi8(s15, K8_BGR6));
-            Store<align>((__m128i*)dst + 2, Average8(m04, m05, m14, m15));
-        }
-
-        template <bool align> void ReduceBgr2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride)
-        {
-            size_t evenWidth = AlignLo(srcWidth, 2);
-            size_t alignedWidth = AlignLo(srcWidth, DA);
-            size_t evenSize = evenWidth * 3;
-            size_t alignedSize = alignedWidth*3;
-            size_t srcStep = DA * 3, dstStep = A*3;
-            for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2)
-            {
-                const uint8_t *src0 = src;
-                const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride);
-                size_t srcOffset = 0, dstOffset = 0;
-                for (; srcOffset < alignedSize; srcOffset += srcStep, dstOffset += dstStep)
-                    ReduceBgr2x2<align>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
-                if (alignedSize != evenSize)
-                {
-                    srcOffset = evenSize - srcStep;
-                    dstOffset = srcOffset / 2;
-                    ReduceBgr2x2<false>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
-                }
-                if (evenWidth != srcWidth)
-                {
-                    for (size_t c = 0; c < 3; ++c)
-                        dst[evenSize / 2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]);
-                }
-                src += 2 * srcStride;
-                dst += dstStride;
-            }
-        }
-
-        template <bool align> void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount)
-        {
-            assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA);
-            if (align)
-            {
-                assert(Aligned(src) && Aligned(srcStride));
-                assert(Aligned(dst) && Aligned(dstStride));
-            }
-
-            switch (channelCount)
-            {
-            case 1: ReduceColor2x2<1, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
-            case 2: ReduceColor2x2<2, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
-            case 3: ReduceBgr2x2<align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
-            case 4: ReduceColor2x2<4, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
-            default: assert(0);
-            }
-        }
-
-        void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount)
-        {
-            if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride))
-                ReduceColor2x2<true>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
-            else
-                ReduceColor2x2<false>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Reduce.cpp.o) has no symbols
-    void dummy_SimdSsse3Reduce(){};
-#endif// SIMD_SSSE3_ENABLE
-}
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdStore.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1)
+        {
+            return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2);
+        }
+
+        SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
+        {
+            return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11));
+        }
+
+        template <size_t channelCount> __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11);
+
+        template<> SIMD_INLINE __m128i Average8<1>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
+        {
+            return Average8(s00, s01, s10, s11);
+        }
+
+        const __m128i K8_RC2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF);
+
+        template<> SIMD_INLINE __m128i Average8<2>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
+        {
+            return Average8(_mm_shuffle_epi8(s00, K8_RC2), _mm_shuffle_epi8(s01, K8_RC2), _mm_shuffle_epi8(s10, K8_RC2), _mm_shuffle_epi8(s11, K8_RC2));
+        }
+
+        const __m128i K8_RC4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF);
+
+        template<> SIMD_INLINE __m128i Average8<4>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
+        {
+            return Average8(_mm_shuffle_epi8(s00, K8_RC4), _mm_shuffle_epi8(s01, K8_RC4), _mm_shuffle_epi8(s10, K8_RC4), _mm_shuffle_epi8(s11, K8_RC4));
+        }
+
+        template <size_t channelCount, bool align> SIMD_INLINE void ReduceColor2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst)
+        {
+            __m128i s00 = Load<align>((__m128i*)src0 + 0);
+            __m128i s01 = Load<align>((__m128i*)src0 + 1);
+            __m128i s10 = Load<align>((__m128i*)src1 + 0);
+            __m128i s11 = Load<align>((__m128i*)src1 + 1);
+            Store<align>((__m128i*)dst, Average8<channelCount>(s00, s01, s10, s11));
+        }
+
+        template <size_t channelCount, bool align> void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride)
+        {
+            size_t evenWidth = AlignLo(srcWidth, 2);
+            size_t evenSize = evenWidth * channelCount;
+            size_t alignedSize = AlignLo(evenSize, DA);
+            for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2)
+            {
+                const uint8_t *src0 = src;
+                const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride);
+                size_t srcOffset = 0, dstOffset = 0;
+                for (; srcOffset < alignedSize; srcOffset += DA, dstOffset += A)
+                    ReduceColor2x2<channelCount, align>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
+                if (alignedSize != evenSize)
+                {
+                    srcOffset = evenSize - DA;
+                    dstOffset = srcOffset / 2;
+                    ReduceColor2x2<channelCount, false>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
+                }
+                if (evenWidth != srcWidth)
+                {
+                    for (size_t c = 0; c < channelCount; ++c)
+                        dst[evenSize/2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]);
+                }                
+                src += 2 * srcStride;
+                dst += dstStride;
+            }
+        }
+
+        const __m128i K8_BGR0 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1);
+        const __m128i K8_BGR1 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0);
+        const __m128i K8_BGR2 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        const __m128i K8_BGR3 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1);
+        const __m128i K8_BGR4 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1);
+        const __m128i K8_BGR5 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        const __m128i K8_BGR6 = SIMD_MM_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF);
+
+        template <bool align> SIMD_INLINE void ReduceBgr2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst)
+        {
+            __m128i s00 = Load<align>((__m128i*)src0 + 0);
+            __m128i s01 = Load<align>((__m128i*)src0 + 1);
+            __m128i s02 = Load<align>((__m128i*)src0 + 2);
+            __m128i s10 = Load<align>((__m128i*)src1 + 0);
+            __m128i s11 = Load<align>((__m128i*)src1 + 1);
+            __m128i s12 = Load<align>((__m128i*)src1 + 2);
+            __m128i m00 = _mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR0), _mm_shuffle_epi8(s01, K8_BGR1));
+            __m128i m01 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR2), _mm_shuffle_epi8(s01, K8_BGR3)), _mm_shuffle_epi8(s02, K8_BGR4));
+            __m128i m10 = _mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR0), _mm_shuffle_epi8(s11, K8_BGR1));
+            __m128i m11 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR2), _mm_shuffle_epi8(s11, K8_BGR3)), _mm_shuffle_epi8(s12, K8_BGR4));
+            Store<align>((__m128i*)dst + 0, Average8(m00, m01, m10, m11));
+            __m128i s03 = Load<align>((__m128i*)src0 + 3);
+            __m128i s04 = Load<align>((__m128i*)src0 + 4); 
+            __m128i s13 = Load<align>((__m128i*)src1 + 3);
+            __m128i s14 = Load<align>((__m128i*)src1 + 4);
+            __m128i m02 = _mm_or_si128(_mm_shuffle_epi8(s01, K8_BGR5), _mm_shuffle_epi8(s02, K8_BGR6));
+            __m128i m03 = _mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR0), _mm_shuffle_epi8(s04, K8_BGR1));
+            __m128i m12 = _mm_or_si128(_mm_shuffle_epi8(s11, K8_BGR5), _mm_shuffle_epi8(s12, K8_BGR6));
+            __m128i m13 = _mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR0), _mm_shuffle_epi8(s14, K8_BGR1));
+            Store<align>((__m128i*)dst + 1, Average8(m02, m03, m12, m13));
+            __m128i s05 = Load<align>((__m128i*)src0 + 5);
+            __m128i s15 = Load<align>((__m128i*)src1 + 5);
+            __m128i m04 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR2), _mm_shuffle_epi8(s04, K8_BGR3)), _mm_shuffle_epi8(s05, K8_BGR4));
+            __m128i m05 = _mm_or_si128(_mm_shuffle_epi8(s04, K8_BGR5), _mm_shuffle_epi8(s05, K8_BGR6));
+            __m128i m14 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR2), _mm_shuffle_epi8(s14, K8_BGR3)), _mm_shuffle_epi8(s15, K8_BGR4));
+            __m128i m15 = _mm_or_si128(_mm_shuffle_epi8(s14, K8_BGR5), _mm_shuffle_epi8(s15, K8_BGR6));
+            Store<align>((__m128i*)dst + 2, Average8(m04, m05, m14, m15));
+        }
+
+        template <bool align> void ReduceBgr2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride)
+        {
+            size_t evenWidth = AlignLo(srcWidth, 2);
+            size_t alignedWidth = AlignLo(srcWidth, DA);
+            size_t evenSize = evenWidth * 3;
+            size_t alignedSize = alignedWidth*3;
+            size_t srcStep = DA * 3, dstStep = A*3;
+            for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2)
+            {
+                const uint8_t *src0 = src;
+                const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride);
+                size_t srcOffset = 0, dstOffset = 0;
+                for (; srcOffset < alignedSize; srcOffset += srcStep, dstOffset += dstStep)
+                    ReduceBgr2x2<align>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
+                if (alignedSize != evenSize)
+                {
+                    srcOffset = evenSize - srcStep;
+                    dstOffset = srcOffset / 2;
+                    ReduceBgr2x2<false>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
+                }
+                if (evenWidth != srcWidth)
+                {
+                    for (size_t c = 0; c < 3; ++c)
+                        dst[evenSize / 2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]);
+                }
+                src += 2 * srcStride;
+                dst += dstStride;
+            }
+        }
+
+        template <bool align> void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount)
+        {
+            assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA);
+            if (align)
+            {
+                assert(Aligned(src) && Aligned(srcStride));
+                assert(Aligned(dst) && Aligned(dstStride));
+            }
+
+            switch (channelCount)
+            {
+            case 1: ReduceColor2x2<1, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
+            case 2: ReduceColor2x2<2, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
+            case 3: ReduceBgr2x2<align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
+            case 4: ReduceColor2x2<4, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
+            default: assert(0);
+            }
+        }
+
+        void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount)
+        {
+            if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride))
+                ReduceColor2x2<true>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
+            else
+                ReduceColor2x2<false>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray2x2.cpp b/3rdparty/simdlib/Simd/SimdSse41ReduceGray2x2.cpp
old mode 100644
new mode 100755
similarity index 94%
rename from 3rdparty/simdlib/Simd/SimdSsse3ReduceGray2x2.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41ReduceGray2x2.cpp
index 24d071182d..dd8bd5b0e3
--- a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray2x2.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41ReduceGray2x2.cpp
@@ -1,96 +1,93 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1)
-        {
-            return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2);
-        }
-
-        SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
-        {
-            return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11));
-        }
-
-        template <bool align> void ReduceGray2x2(
-            const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
-        {
-            assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA);
-            if (align)
-            {
-                assert(Aligned(src) && Aligned(srcStride));
-                assert(Aligned(dst) && Aligned(dstStride) && Aligned(dstWidth));
-            }
-
-            size_t alignedWidth = AlignLo(srcWidth, DA);
-            size_t evenWidth = AlignLo(srcWidth, 2);
-            for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2)
-            {
-                const uint8_t *src0 = src;
-                const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride);
-                size_t srcOffset = 0, dstOffset = 0;
-                for (; srcOffset < alignedWidth; srcOffset += DA, dstOffset += A)
-                {
-                    Store<align>((__m128i*)(dst + dstOffset), Average8(
-                        Load<align>((__m128i*)(src0 + srcOffset)), Load<align>((__m128i*)(src0 + srcOffset + A)),
-                        Load<align>((__m128i*)(src1 + srcOffset)), Load<align>((__m128i*)(src1 + srcOffset + A))));
-                }
-                if (alignedWidth != srcWidth)
-                {
-                    dstOffset = dstWidth - A - (evenWidth != srcWidth ? 1 : 0);
-                    srcOffset = evenWidth - DA;
-                    Store<align>((__m128i*)(dst + dstOffset), Average8(
-                        Load<align>((__m128i*)(src0 + srcOffset)), Load<align>((__m128i*)(src0 + srcOffset + A)),
-                        Load<align>((__m128i*)(src1 + srcOffset)), Load<align>((__m128i*)(src1 + srcOffset + A))));
-                    if (evenWidth != srcWidth)
-                    {
-                        dst[dstWidth - 1] = Base::Average(src0[evenWidth], src1[evenWidth]);
-                    }
-                }
-                src += 2 * srcStride;
-                dst += dstStride;
-            }
-        }
-
-        void ReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
-        {
-            if (Aligned(src) && Aligned(srcWidth) && Aligned(srcStride) && Aligned(dst) && Aligned(dstWidth) && Aligned(dstStride))
-                ReduceGray2x2<true>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
-            else
-                ReduceGray2x2<false>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Reduce2x2.cpp.o) has no symbols
-    void dummy_SimdSsse3Reduce2x2(){};
-#endif// SIMD_SSSE3_ENABLE
-}
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdStore.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1)
+        {
+            return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2);
+        }
+
+        SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
+        {
+            return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11));
+        }
+
+        template <bool align> void ReduceGray2x2(
+            const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
+        {
+            assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA);
+            if (align)
+            {
+                assert(Aligned(src) && Aligned(srcStride));
+                assert(Aligned(dst) && Aligned(dstStride) && Aligned(dstWidth));
+            }
+
+            size_t alignedWidth = AlignLo(srcWidth, DA);
+            size_t evenWidth = AlignLo(srcWidth, 2);
+            for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2)
+            {
+                const uint8_t *src0 = src;
+                const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride);
+                size_t srcOffset = 0, dstOffset = 0;
+                for (; srcOffset < alignedWidth; srcOffset += DA, dstOffset += A)
+                {
+                    Store<align>((__m128i*)(dst + dstOffset), Average8(
+                        Load<align>((__m128i*)(src0 + srcOffset)), Load<align>((__m128i*)(src0 + srcOffset + A)),
+                        Load<align>((__m128i*)(src1 + srcOffset)), Load<align>((__m128i*)(src1 + srcOffset + A))));
+                }
+                if (alignedWidth != srcWidth)
+                {
+                    dstOffset = dstWidth - A - (evenWidth != srcWidth ? 1 : 0);
+                    srcOffset = evenWidth - DA;
+                    Store<align>((__m128i*)(dst + dstOffset), Average8(
+                        Load<align>((__m128i*)(src0 + srcOffset)), Load<align>((__m128i*)(src0 + srcOffset + A)),
+                        Load<align>((__m128i*)(src1 + srcOffset)), Load<align>((__m128i*)(src1 + srcOffset + A))));
+                    if (evenWidth != srcWidth)
+                    {
+                        dst[dstWidth - 1] = Base::Average(src0[evenWidth], src1[evenWidth]);
+                    }
+                }
+                src += 2 * srcStride;
+                dst += dstStride;
+            }
+        }
+
+        void ReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
+        {
+            if (Aligned(src) && Aligned(srcWidth) && Aligned(srcStride) && Aligned(dst) && Aligned(dstWidth) && Aligned(dstStride))
+                ReduceGray2x2<true>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
+            else
+                ReduceGray2x2<false>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray4x4.cpp b/3rdparty/simdlib/Simd/SimdSse41ReduceGray4x4.cpp
old mode 100644
new mode 100755
similarity index 96%
rename from 3rdparty/simdlib/Simd/SimdSsse3ReduceGray4x4.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41ReduceGray4x4.cpp
index 261e84c918..7754b290ba
--- a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray4x4.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41ReduceGray4x4.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -26,8 +26,8 @@
 
 namespace Simd
 {
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
     {
         namespace
         {
@@ -170,8 +170,5 @@ namespace Simd
                 ReduceGray4x4<false>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
         }
     }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Reduce4x4.cpp.o) has no symbols
-    void dummy_SimdSsse3Reduce4x4(){};
-#endif// SIMD_SSSE3_ENABLE
+#endif
 }
diff --git a/3rdparty/simdlib/Simd/SimdSsse3ResizeBilinear.cpp b/3rdparty/simdlib/Simd/SimdSse41ResizeBilinear.cpp
old mode 100644
new mode 100755
similarity index 98%
rename from 3rdparty/simdlib/Simd/SimdSsse3ResizeBilinear.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41ResizeBilinear.cpp
index b39f619005..50a708aa20
--- a/3rdparty/simdlib/Simd/SimdSsse3ResizeBilinear.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41ResizeBilinear.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -27,8 +27,8 @@
 
 namespace Simd
 {
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
     {
         namespace
         {
@@ -401,9 +401,6 @@ namespace Simd
             }
         }
     }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3ResizeBilinear.cpp.o) has no symbols
-    void dummy_SimdSsse3ResizeBilinear(){};
 #endif
 }
 
diff --git a/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp b/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp
old mode 100644
new mode 100755
index b766a8a209..e3e8e7b360
--- a/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -32,6 +32,309 @@ namespace Simd
 #ifdef SIMD_SSE41_ENABLE
     namespace Sse41
     {
+        ResizerByteBilinear::ResizerByteBilinear(const ResParam& param)
+            : Sse2::ResizerByteBilinear(param)
+            , _blocks(0)
+        {
+        }
+
+        size_t ResizerByteBilinear::BlockCountMax(size_t align)
+        {
+            return (size_t)Simd::Max(::ceil(float(_param.srcW) / (align - 1)), ::ceil(float(_param.dstW) * 2.0f / align));
+        }
+
+        void ResizerByteBilinear::EstimateParams()
+        {
+            if (_ax.data)
+                return;
+            if (_param.channels == 1 && _param.srcW < 4 * _param.dstW)
+                _blocks = BlockCountMax(A);
+            float scale = (float)_param.srcW / _param.dstW;
+            _ax.Resize(AlignHi(_param.dstW, A) * _param.channels * 2, false, _param.align);
+            uint8_t* alphas = _ax.data;
+            if (_blocks)
+            {
+                _ixg.Resize(_blocks);
+                int block = 0;
+                _ixg[0].src = 0;
+                _ixg[0].dst = 0;
+                for (int dstIndex = 0; dstIndex < (int)_param.dstW; ++dstIndex)
+                {
+                    float alpha = (float)((dstIndex + 0.5) * scale - 0.5);
+                    int srcIndex = (int)::floor(alpha);
+                    alpha -= srcIndex;
+
+                    if (srcIndex < 0)
+                    {
+                        srcIndex = 0;
+                        alpha = 0;
+                    }
+
+                    if (srcIndex > (int)_param.srcW - 2)
+                    {
+                        srcIndex = (int)_param.srcW - 2;
+                        alpha = 1;
+                    }
+
+                    int dst = 2 * dstIndex - _ixg[block].dst;
+                    int src = srcIndex - _ixg[block].src;
+                    if (src >= A - 1 || dst >= A)
+                    {
+                        block++;
+                        _ixg[block].src = Simd::Min(srcIndex, int(_param.srcW - A));
+                        _ixg[block].dst = 2 * dstIndex;
+                        dst = 0;
+                        src = srcIndex - _ixg[block].src;
+                    }
+                    _ixg[block].shuffle[dst] = src;
+                    _ixg[block].shuffle[dst + 1] = src + 1;
+
+                    alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5);
+                    alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]);
+                    alphas += 2;
+                }
+                _blocks = block + 1;
+            }
+            else
+            {
+                _ix.Resize(_param.dstW);
+                for (size_t i = 0; i < _param.dstW; ++i)
+                {
+                    float alpha = (float)((i + 0.5) * scale - 0.5);
+                    ptrdiff_t index = (ptrdiff_t)::floor(alpha);
+                    alpha -= index;
+
+                    if (index < 0)
+                    {
+                        index = 0;
+                        alpha = 0;
+                    }
+
+                    if (index > (ptrdiff_t)_param.srcW - 2)
+                    {
+                        index = _param.srcW - 2;
+                        alpha = 1;
+                    }
+
+                    _ix[i] = (int)index;
+                    alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5);
+                    alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]);
+                    for (size_t channel = 1; channel < _param.channels; channel++)
+                        ((uint16_t*)alphas)[channel] = *(uint16_t*)alphas;
+                    alphas += 2 * _param.channels;
+                }
+            }
+            size_t size = AlignHi(_param.dstW, _param.align) * _param.channels * 2;
+            _bx[0].Resize(size, false, _param.align);
+            _bx[1].Resize(size, false, _param.align);
+        }
+
+        template <size_t N> void ResizerByteBilinearInterpolateX(const __m128i* alpha, __m128i* buffer);
+
+        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<1>(const __m128i* alpha, __m128i* buffer)
+        {
+            _mm_store_si128(buffer, _mm_maddubs_epi16(_mm_load_si128(buffer), _mm_load_si128(alpha)));
+        }
+
+        const __m128i K8_SHUFFLE_X2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF);
+
+        SIMD_INLINE void ResizerByteBilinearInterpolateX2(const __m128i* alpha, __m128i* buffer)
+        {
+            __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X2);
+            _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha)));
+        }
+
+        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<2>(const __m128i* alpha, __m128i* buffer)
+        {
+            ResizerByteBilinearInterpolateX2(alpha + 0, buffer + 0);
+            ResizerByteBilinearInterpolateX2(alpha + 1, buffer + 1);
+        }
+
+        const __m128i K8_SHUFFLE_X3_00 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1);
+        const __m128i K8_SHUFFLE_X3_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0);
+        const __m128i K8_SHUFFLE_X3_10 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        const __m128i K8_SHUFFLE_X3_11 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1);
+        const __m128i K8_SHUFFLE_X3_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1);
+        const __m128i K8_SHUFFLE_X3_21 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        const __m128i K8_SHUFFLE_X3_22 = SIMD_MM_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF);
+
+        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<3>(const __m128i* alpha, __m128i* buffer)
+        {
+            __m128i src[3], shuffled[3];
+            src[0] = _mm_load_si128(buffer + 0);
+            src[1] = _mm_load_si128(buffer + 1);
+            src[2] = _mm_load_si128(buffer + 2);
+            shuffled[0] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_00);
+            shuffled[0] = _mm_or_si128(shuffled[0], _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_01));
+            _mm_store_si128(buffer + 0, _mm_maddubs_epi16(shuffled[0], _mm_load_si128(alpha + 0)));
+            shuffled[1] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_10);
+            shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_11));
+            shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_12));
+            _mm_store_si128(buffer + 1, _mm_maddubs_epi16(shuffled[1], _mm_load_si128(alpha + 1)));
+            shuffled[2] = _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_21);
+            shuffled[2] = _mm_or_si128(shuffled[2], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_22));
+            _mm_store_si128(buffer + 2, _mm_maddubs_epi16(shuffled[2], _mm_load_si128(alpha + 2)));
+        }
+
+        const __m128i K8_SHUFFLE_X4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF);
+
+        SIMD_INLINE void ResizerByteBilinearInterpolateX4(const __m128i* alpha, __m128i* buffer)
+        {
+            __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X4);
+            _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha)));
+        }
+
+        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<4>(const __m128i* alpha, __m128i* buffer)
+        {
+            ResizerByteBilinearInterpolateX4(alpha + 0, buffer + 0);
+            ResizerByteBilinearInterpolateX4(alpha + 1, buffer + 1);
+            ResizerByteBilinearInterpolateX4(alpha + 2, buffer + 2);
+            ResizerByteBilinearInterpolateX4(alpha + 3, buffer + 3);
+        }
+
+        const __m128i K16_FRACTION_ROUND_TERM = SIMD_MM_SET1_EPI16(Base::BILINEAR_ROUND_TERM);
+
+        template<bool align> SIMD_INLINE __m128i ResizerByteBilinearInterpolateY(const __m128i* pbx0, const __m128i* pbx1, __m128i alpha[2])
+        {
+            __m128i sum = _mm_add_epi16(_mm_mullo_epi16(Load<align>(pbx0), alpha[0]), _mm_mullo_epi16(Load<align>(pbx1), alpha[1]));
+            return _mm_srli_epi16(_mm_add_epi16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT);
+        }
+
+        template<bool align> SIMD_INLINE void ResizerByteBilinearInterpolateY(const uint8_t* bx0, const uint8_t* bx1, __m128i alpha[2], uint8_t* dst)
+        {
+            __m128i lo = ResizerByteBilinearInterpolateY<align>((__m128i*)bx0 + 0, (__m128i*)bx1 + 0, alpha);
+            __m128i hi = ResizerByteBilinearInterpolateY<align>((__m128i*)bx0 + 1, (__m128i*)bx1 + 1, alpha);
+            Store<false>((__m128i*)dst, _mm_packus_epi16(lo, hi));
+        }
+
+        template<size_t N> void ResizerByteBilinear::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
+        {
+            struct One { uint8_t val[N * 1]; };
+            struct Two { uint8_t val[N * 2]; };
+
+            size_t size = 2 * _param.dstW * N;
+            size_t aligned = AlignHi(size, DA) - DA;
+            const size_t step = A * N;
+            ptrdiff_t previous = -2;
+            __m128i a[2];
+            uint8_t* bx[2] = { _bx[0].data, _bx[1].data };
+            const uint8_t* ax = _ax.data;
+            const int32_t* ix = _ix.data;
+            size_t dstW = _param.dstW;
+
+            for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride)
+            {
+                a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst]));
+                a[1] = _mm_set1_epi16(int16_t(_ay[yDst]));
+
+                ptrdiff_t sy = _iy[yDst];
+                int k = 0;
+
+                if (sy == previous)
+                    k = 2;
+                else if (sy == previous + 1)
+                {
+                    Swap(bx[0], bx[1]);
+                    k = 1;
+                }
+
+                previous = sy;
+
+                for (; k < 2; k++)
+                {
+                    Two* pb = (Two*)bx[k];
+                    const One* psrc = (const One*)(src + (sy + k) * srcStride);
+                    for (size_t x = 0; x < dstW; x++)
+                        pb[x] = *(Two*)(psrc + ix[x]);
+
+                    uint8_t* pbx = bx[k];
+                    for (size_t i = 0; i < size; i += step)
+                        ResizerByteBilinearInterpolateX<N>((__m128i*)(ax + i), (__m128i*)(pbx + i));
+                }
+
+                for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A)
+                    ResizerByteBilinearInterpolateY<true>(bx[0] + ib, bx[1] + ib, a, dst + id);
+                size_t i = size - DA;
+                ResizerByteBilinearInterpolateY<false>(bx[0] + i, bx[1] + i, a, dst + i / 2);
+            }
+        }
+
+        template <class Idx> SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t* src, const Idx& index, const uint8_t* alpha, uint8_t* dst)
+        {
+            __m128i _src = _mm_loadu_si128((__m128i*)(src + index.src));
+            __m128i _shuffle = _mm_loadu_si128((__m128i*) & index.shuffle);
+            __m128i _alpha = _mm_loadu_si128((__m128i*)(alpha + index.dst));
+            _mm_storeu_si128((__m128i*)(dst + index.dst), _mm_maddubs_epi16(_mm_shuffle_epi8(_src, _shuffle), _alpha));
+        }
+
+        void ResizerByteBilinear::RunG(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
+        {
+            size_t bufW = AlignHi(_param.dstW, A) * 2;
+            size_t size = 2 * _param.dstW;
+            size_t aligned = AlignHi(size, DA) - DA;
+            size_t blocks = _blocks;
+            ptrdiff_t previous = -2;
+            __m128i a[2];
+            uint8_t* bx[2] = { _bx[0].data, _bx[1].data };
+            const uint8_t* ax = _ax.data;
+            const Idx* ixg = _ixg.data;
+
+            for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride)
+            {
+                a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst]));
+                a[1] = _mm_set1_epi16(int16_t(_ay[yDst]));
+
+                ptrdiff_t sy = _iy[yDst];
+                int k = 0;
+
+                if (sy == previous)
+                    k = 2;
+                else if (sy == previous + 1)
+                {
+                    Swap(bx[0], bx[1]);
+                    k = 1;
+                }
+
+                previous = sy;
+
+                for (; k < 2; k++)
+                {
+                    const uint8_t* psrc = src + (sy + k) * srcStride;
+                    uint8_t* pdst = bx[k];
+                    for (size_t i = 0; i < blocks; ++i)
+                        ResizerByteBilinearLoadGrayInterpolated(psrc, ixg[i], ax, pdst);
+                }
+
+                for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A)
+                    ResizerByteBilinearInterpolateY<true>(bx[0] + ib, bx[1] + ib, a, dst + id);
+                size_t i = size - DA;
+                ResizerByteBilinearInterpolateY<false>(bx[0] + i, bx[1] + i, a, dst + i / 2);
+            }
+        }
+
+        void ResizerByteBilinear::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
+        {
+            assert(_param.dstW >= A);
+
+            EstimateParams();
+            switch (_param.channels)
+            {
+            case 1:
+                if (_blocks)
+                    RunG(src, srcStride, dst, dstStride);
+                else
+                    Run<1>(src, srcStride, dst, dstStride);
+                break;
+            case 2: Run<2>(src, srcStride, dst, dstStride); break;
+            case 3: Run<3>(src, srcStride, dst, dstStride); break;
+            case 4: Run<4>(src, srcStride, dst, dstStride); break;
+            default:
+                assert(0);
+            }
+        }
+
+        //---------------------------------------------------------------------
+
         ResizerByteArea::ResizerByteArea(const ResParam & param)
             : Sse2::ResizerByteArea(param)
         {
@@ -200,10 +503,12 @@ namespace Simd
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
         {
             ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m128i));
-            if (type == SimdResizeChannelByte && method == SimdResizeMethodArea)
+            if (param.IsByteBilinear() && dstX >= A)
+                return new ResizerByteBilinear(param);
+            else if (param.IsByteArea())
                 return new ResizerByteArea(param);
             else
-                return Ssse3::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
+                return Sse2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
         }
     }
 #else
diff --git a/3rdparty/simdlib/Simd/SimdSsse3.h b/3rdparty/simdlib/Simd/SimdSsse3.h
deleted file mode 100644
index ed7849f39d..0000000000
--- a/3rdparty/simdlib/Simd/SimdSsse3.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#ifndef __SimdSsse3_h__
-#define __SimdSsse3_h__
-
-#include "Simd/SimdDefs.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride);
-
-        void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
-
-        void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha);
-
-        void BgraToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride);
-
-        void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride);
-
-        void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride);
-
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
-
-        void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride);
-
-        void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride);
-
-        void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride);
-
-        void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride);
-
-        void InterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride);
-
-        void InterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride);
-
-        void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
-
-        void ReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
-
-        void ReduceGray4x4(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
-
-        void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
-
-        // ViSP custom SIMD code
-        void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff);
-    }
-#endif// SIMD_SSSE3_ENABLE
-}
-#endif//__SimdSsse3_h__
diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp b/3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp
deleted file mode 100644
index bb01107812..0000000000
--- a/3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        template <bool align> SIMD_INLINE void BgrToRgba(const uint8_t * bgr, uint8_t * rgba, __m128i alpha, __m128i shuffle)
-        {
-            Store<align>((__m128i*)rgba + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<align>((__m128i*)(bgr + 0)), shuffle)));
-            Store<align>((__m128i*)rgba + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 12)), shuffle)));
-            Store<align>((__m128i*)rgba + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 24)), shuffle)));
-            Store<align>((__m128i*)rgba + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(bgr + 32)), 4), shuffle)));
-        }
-
-        template <bool align> void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3);
-            __m128i _shuffle = _mm_setr_epi8(0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    BgrToRgba<align>(bgr + 3 * col, rgba + 4 * col, _alpha, _shuffle);
-                if (width != alignedWidth)
-                    BgrToRgba<false>(bgr + 3 * (width - A), rgba + 4 * (width - A), _alpha, _shuffle);
-                bgr += bgrStride;
-                rgba += rgbaStride;
-            }
-        }
-
-        void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha)
-        {
-            if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgrToRgba<true>(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-            else
-                BgrToRgba<false>(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToRGBa.cpp.o) has no symbols
-    void dummy_SimdSsse3BgrToRGBa(){};
-#endif// SIMD_SSSE3_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp b/3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp
deleted file mode 100644
index d455781ed3..0000000000
--- a/3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        template <bool align> SIMD_INLINE void BgraToRgba(const uint8_t * bgra, uint8_t * rgba, __m128i shuffle)
-        {
-            Store<align>((__m128i*)rgba + 0, _mm_shuffle_epi8(Load<align>((__m128i*)(bgra + 0)), shuffle));
-            Store<align>((__m128i*)rgba + 1, _mm_shuffle_epi8(Load<align>((__m128i*)(bgra + 16)), shuffle));
-            Store<align>((__m128i*)rgba + 2, _mm_shuffle_epi8(Load<align>((__m128i*)(bgra + 32)), shuffle));
-            Store<align>((__m128i*)rgba + 3, _mm_shuffle_epi8(Load<align>((__m128i*)(bgra + 48)), shuffle));
-        }
-
-        template <bool align> void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            __m128i _shuffle = _mm_setr_epi8(0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    BgraToRgba<align>(bgra + 4 * col, rgba + 4 * col, _shuffle);
-                if (width != alignedWidth)
-                    BgraToRgba<false>(bgra + 4 * (width - A), rgba + 4 * (width - A), _shuffle);
-                bgra += bgraStride;
-                rgba += rgbaStride;
-            }
-        }
-
-        void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride)
-        {
-            if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride))
-                BgraToRgba<true>(bgra, width, height, bgraStride, rgba, rgbaStride);
-            else
-                BgraToRgba<false>(bgra, width, height, bgraStride, rgba, rgbaStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToRGBa.cpp.o) has no symbols
-    void dummy_SimdSsse3BgraToRGBa(){};
-#endif// SIMD_SSSE3_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp b/3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp
deleted file mode 100644
index 985a772d47..0000000000
--- a/3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdBase.h"
-#include "Simd/SimdStore.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff)
-        {
-            const __m128i mask1 = _mm_set_epi8(-1, 14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0);
-            const __m128i mask2 = _mm_set_epi8(-1, 15, -1, 13, -1, 11, -1, 9, -1, 7, -1, 5, -1, 3, -1, 1);
-            const __m128i mask_out2 = _mm_set_epi8(14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0, -1);
-
-            size_t i = 0;
-            for (; i <= size-16; i+= 16) {
-                const __m128i vdata1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(img1 + i));
-                const __m128i vdata2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(img2 + i));
-
-                __m128i vdata1_reorg = _mm_shuffle_epi8(vdata1, mask1);
-                __m128i vdata2_reorg = _mm_shuffle_epi8(vdata2, mask1);
-
-                const __m128i vshift = _mm_set1_epi16(128);
-                __m128i vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift);
-
-                const __m128i v255 = _mm_set1_epi16(255);
-                const __m128i vzero = _mm_setzero_si128();
-                const __m128i vdata_diff_min_max1 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero);
-
-                vdata1_reorg = _mm_shuffle_epi8(vdata1, mask2);
-                vdata2_reorg = _mm_shuffle_epi8(vdata2, mask2);
-
-                vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift);
-                const __m128i vdata_diff_min_max2 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero);
-
-                _mm_storeu_si128(reinterpret_cast<__m128i *>(imgDiff + i), _mm_or_si128(_mm_shuffle_epi8(vdata_diff_min_max1, mask1),
-                                                                                        _mm_shuffle_epi8(vdata_diff_min_max2, mask_out2)));
-            }
-
-            if (i < size) {
-                Base::SimdImageDifference(img1 + i, img2 + i, size - i, imgDiff + i);
-            }
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3CustomFunctions.cpp.o) has no symbols
-    void dummy_SimdSsse3CustomFunctions(){};
-#endif// SIMD_SSSE3_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp b/3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp
deleted file mode 100644
index 37f2eca6c1..0000000000
--- a/3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp
+++ /dev/null
@@ -1,350 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-#include "Simd/SimdResizer.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        ResizerByteBilinear::ResizerByteBilinear(const ResParam & param)
-            : Sse2::ResizerByteBilinear(param)
-            , _blocks(0)
-        {
-        }
-
-        size_t ResizerByteBilinear::BlockCountMax(size_t align)
-        {
-            return (size_t)Simd::Max(::ceil(float(_param.srcW) / (align - 1)), ::ceil(float(_param.dstW) * 2.0f / align ));
-        }
-
-        void ResizerByteBilinear::EstimateParams()
-        {
-            if (_ax.data)
-                return;
-            if (_param.channels == 1 && _param.srcW < 4 * _param.dstW)
-                _blocks = BlockCountMax(A);
-            float scale = (float)_param.srcW / _param.dstW;
-            _ax.Resize(AlignHi(_param.dstW, A) * _param.channels * 2, false, _param.align);
-            uint8_t * alphas = _ax.data;
-            if (_blocks)
-            {
-                _ixg.Resize(_blocks);
-                int block = 0;
-                _ixg[0].src = 0;
-                _ixg[0].dst = 0;
-                for (int dstIndex = 0; dstIndex < (int)_param.dstW; ++dstIndex)
-                {
-                    float alpha = (float)((dstIndex + 0.5)*scale - 0.5);
-                    int srcIndex = (int)::floor(alpha);
-                    alpha -= srcIndex;
-
-                    if (srcIndex < 0)
-                    {
-                        srcIndex = 0;
-                        alpha = 0;
-                    }
-
-                    if (srcIndex > (int)_param.srcW - 2)
-                    {
-                        srcIndex = (int)_param.srcW - 2;
-                        alpha = 1;
-                    }
-
-                    int dst = 2 * dstIndex - _ixg[block].dst;
-                    int src = srcIndex - _ixg[block].src;
-                    if (src >= A - 1 || dst >= A)
-                    {
-                        block++;
-                        _ixg[block].src = Simd::Min(srcIndex, int(_param.srcW - A));
-                        _ixg[block].dst = 2 * dstIndex;
-                        dst = 0;
-                        src = srcIndex - _ixg[block].src;
-                    }
-                    _ixg[block].shuffle[dst] = src;
-                    _ixg[block].shuffle[dst + 1] = src + 1;
-
-                    alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5);
-                    alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]);
-                    alphas += 2;
-                }
-                _blocks = block + 1;
-            }
-            else
-            {
-                _ix.Resize(_param.dstW);
-                for (size_t i = 0; i < _param.dstW; ++i)
-                {
-                    float alpha = (float)((i + 0.5)*scale - 0.5);
-                    ptrdiff_t index = (ptrdiff_t)::floor(alpha);
-                    alpha -= index;
-
-                    if (index < 0)
-                    {
-                        index = 0;
-                        alpha = 0;
-                    }
-
-                    if (index >(ptrdiff_t)_param.srcW - 2)
-                    {
-                        index = _param.srcW - 2;
-                        alpha = 1;
-                    }
-
-                    _ix[i] = (int)index;
-                    alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5);
-                    alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]);
-                    for (size_t channel = 1; channel < _param.channels; channel++)
-                        ((uint16_t*)alphas)[channel] = *(uint16_t*)alphas;
-                    alphas += 2 * _param.channels;
-                }
-            }
-            size_t size = AlignHi(_param.dstW, _param.align)*_param.channels * 2;
-            _bx[0].Resize(size, false, _param.align);
-            _bx[1].Resize(size, false, _param.align);
-        }
-
-        template <size_t N> void ResizerByteBilinearInterpolateX(const __m128i * alpha, __m128i * buffer);
-
-        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<1>(const __m128i * alpha, __m128i * buffer)
-        {
-            _mm_store_si128(buffer, _mm_maddubs_epi16(_mm_load_si128(buffer), _mm_load_si128(alpha)));
-        }
-
-        const __m128i K8_SHUFFLE_X2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF);
-
-        SIMD_INLINE void ResizerByteBilinearInterpolateX2(const __m128i * alpha, __m128i * buffer)
-        {
-            __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X2);
-            _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha)));
-        }
-
-        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<2>(const __m128i * alpha, __m128i * buffer)
-        {
-            ResizerByteBilinearInterpolateX2(alpha + 0, buffer + 0);
-            ResizerByteBilinearInterpolateX2(alpha + 1, buffer + 1);
-        }
-
-        const __m128i K8_SHUFFLE_X3_00 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1);
-        const __m128i K8_SHUFFLE_X3_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0);
-        const __m128i K8_SHUFFLE_X3_10 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-        const __m128i K8_SHUFFLE_X3_11 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1);
-        const __m128i K8_SHUFFLE_X3_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1);
-        const __m128i K8_SHUFFLE_X3_21 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-        const __m128i K8_SHUFFLE_X3_22 = SIMD_MM_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF);
-
-        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<3>(const __m128i * alpha, __m128i * buffer)
-        {
-            __m128i src[3], shuffled[3];
-            src[0] = _mm_load_si128(buffer + 0);
-            src[1] = _mm_load_si128(buffer + 1);
-            src[2] = _mm_load_si128(buffer + 2);
-            shuffled[0] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_00);
-            shuffled[0] = _mm_or_si128(shuffled[0], _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_01));
-            _mm_store_si128(buffer + 0, _mm_maddubs_epi16(shuffled[0], _mm_load_si128(alpha + 0)));
-            shuffled[1] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_10);
-            shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_11));
-            shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_12));
-            _mm_store_si128(buffer + 1, _mm_maddubs_epi16(shuffled[1], _mm_load_si128(alpha + 1)));
-            shuffled[2] = _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_21);
-            shuffled[2] = _mm_or_si128(shuffled[2], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_22));
-            _mm_store_si128(buffer + 2, _mm_maddubs_epi16(shuffled[2], _mm_load_si128(alpha + 2)));
-        }
-
-        const __m128i K8_SHUFFLE_X4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF);
-
-        SIMD_INLINE void ResizerByteBilinearInterpolateX4(const __m128i * alpha, __m128i * buffer)
-        {
-            __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X4);
-            _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha)));
-        }
-
-        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<4>(const __m128i * alpha, __m128i * buffer)
-        {
-            ResizerByteBilinearInterpolateX4(alpha + 0, buffer + 0);
-            ResizerByteBilinearInterpolateX4(alpha + 1, buffer + 1);
-            ResizerByteBilinearInterpolateX4(alpha + 2, buffer + 2);
-            ResizerByteBilinearInterpolateX4(alpha + 3, buffer + 3);
-        }
-
-        const __m128i K16_FRACTION_ROUND_TERM = SIMD_MM_SET1_EPI16(Base::BILINEAR_ROUND_TERM);
-
-        template<bool align> SIMD_INLINE __m128i ResizerByteBilinearInterpolateY(const __m128i * pbx0, const __m128i * pbx1, __m128i alpha[2])
-        {
-            __m128i sum = _mm_add_epi16(_mm_mullo_epi16(Load<align>(pbx0), alpha[0]), _mm_mullo_epi16(Load<align>(pbx1), alpha[1]));
-            return _mm_srli_epi16(_mm_add_epi16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT);
-        }
-
-        template<bool align> SIMD_INLINE void ResizerByteBilinearInterpolateY(const uint8_t * bx0, const uint8_t * bx1, __m128i alpha[2], uint8_t * dst)
-        {
-            __m128i lo = ResizerByteBilinearInterpolateY<align>((__m128i*)bx0 + 0, (__m128i*)bx1 + 0, alpha);
-            __m128i hi = ResizerByteBilinearInterpolateY<align>((__m128i*)bx0 + 1, (__m128i*)bx1 + 1, alpha);
-            Store<false>((__m128i*)dst, _mm_packus_epi16(lo, hi));
-        }
-
-        template<size_t N> void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride)
-        {
-            struct One { uint8_t val[N * 1]; };
-            struct Two { uint8_t val[N * 2]; };
-
-            size_t size = 2 * _param.dstW*N;
-            size_t aligned = AlignHi(size, DA) - DA;
-            const size_t step = A * N;
-            ptrdiff_t previous = -2;
-            __m128i a[2];
-            uint8_t * bx[2] = { _bx[0].data, _bx[1].data };
-            const uint8_t * ax = _ax.data;
-            const int32_t * ix = _ix.data;
-            size_t dstW = _param.dstW;
-
-            for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride)
-            {
-                a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst]));
-                a[1] = _mm_set1_epi16(int16_t(_ay[yDst]));
-
-                ptrdiff_t sy = _iy[yDst];
-                int k = 0;
-
-                if (sy == previous)
-                    k = 2;
-                else if (sy == previous + 1)
-                {
-                    Swap(bx[0], bx[1]);
-                    k = 1;
-                }
-
-                previous = sy;
-
-                for (; k < 2; k++)
-                {
-                    Two * pb = (Two *)bx[k];
-                    const One * psrc = (const One *)(src + (sy + k)*srcStride);
-                    for (size_t x = 0; x < dstW; x++)
-                        pb[x] = *(Two *)(psrc + ix[x]);
-
-                    uint8_t * pbx = bx[k];
-                    for (size_t i = 0; i < size; i += step)
-                        ResizerByteBilinearInterpolateX<N>((__m128i*)(ax + i), (__m128i*)(pbx + i));
-                }
-
-                for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A)
-                    ResizerByteBilinearInterpolateY<true>(bx[0] + ib, bx[1] + ib, a, dst + id);
-                size_t i = size - DA;
-                ResizerByteBilinearInterpolateY<false>(bx[0] + i, bx[1] + i, a, dst + i / 2);
-            }
-        }
-
-        template <class Idx> SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t * src, const Idx & index, const uint8_t * alpha, uint8_t * dst)
-        {
-            __m128i _src = _mm_loadu_si128((__m128i*)(src + index.src));
-            __m128i _shuffle = _mm_loadu_si128((__m128i*)&index.shuffle);
-            __m128i _alpha = _mm_loadu_si128((__m128i*)(alpha + index.dst));
-            _mm_storeu_si128((__m128i*)(dst + index.dst), _mm_maddubs_epi16(_mm_shuffle_epi8(_src, _shuffle), _alpha));
-        }
-
-        void ResizerByteBilinear::RunG(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride)
-        {
-            size_t bufW = AlignHi(_param.dstW, A) * 2;
-            size_t size = 2 * _param.dstW;
-            size_t aligned = AlignHi(size, DA) - DA;
-            size_t blocks = _blocks;
-            ptrdiff_t previous = -2;
-            __m128i a[2];
-            uint8_t * bx[2] = { _bx[0].data, _bx[1].data };
-            const uint8_t * ax = _ax.data;
-            const Idx * ixg = _ixg.data;
-
-            for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride)
-            {
-                a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst]));
-                a[1] = _mm_set1_epi16(int16_t(_ay[yDst]));
-
-                ptrdiff_t sy = _iy[yDst];
-                int k = 0;
-
-                if (sy == previous)
-                    k = 2;
-                else if (sy == previous + 1)
-                {
-                    Swap(bx[0], bx[1]);
-                    k = 1;
-                }
-
-                previous = sy;
-
-                for (; k < 2; k++)
-                {
-                    const uint8_t * psrc = src + (sy + k)*srcStride;
-                    uint8_t * pdst = bx[k];
-                    for (size_t i = 0; i < blocks; ++i)
-                        ResizerByteBilinearLoadGrayInterpolated(psrc, ixg[i], ax, pdst);
-                }
-
-                for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A)
-                    ResizerByteBilinearInterpolateY<true>(bx[0] + ib, bx[1] + ib, a, dst + id);
-                size_t i = size - DA;
-                ResizerByteBilinearInterpolateY<false>(bx[0] + i, bx[1] + i, a, dst + i / 2);
-            }
-        }
-
-        void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride)
-        {
-            assert(_param.dstW >= A);
-
-            EstimateParams();
-            switch (_param.channels)
-            {
-            case 1:
-                if(_blocks)
-                    RunG(src, srcStride, dst, dstStride);
-                else
-                    Run<1>(src, srcStride, dst, dstStride);
-                break;
-            case 2: Run<2>(src, srcStride, dst, dstStride); break;
-            case 3: Run<3>(src, srcStride, dst, dstStride); break;
-            case 4: Run<4>(src, srcStride, dst, dstStride); break;
-            default:
-                assert(0);
-            }
-        }
-
-        //---------------------------------------------------------------------
-
-        void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
-        {
-            ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m128i));
-            if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && dstX >= A)
-                return new ResizerByteBilinear(param);
-            else
-                return Sse2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Resizer.cpp.o) has no symbols
-    void dummy_SimdSsse3Resizer(){};
-#endif//SIMD_SSSE3_ENABLE
-}
-
diff --git a/3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp b/3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp
deleted file mode 100644
index cf79dd55bd..0000000000
--- a/3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
-        const __m128i K16_GREEN_ROUND = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM);
-
-        SIMD_INLINE __m128i RgbaToGray32(__m128i rgba)
-        {
-            const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF);
-            const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF);
-            const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(r0b0, K16_RED_BLUE));
-            return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT);
-        }
-
-        SIMD_INLINE __m128i RgbToGray(__m128i rgba[4])
-        {
-            const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
-            const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
-            return _mm_packus_epi16(lo, hi);
-        }
-
-        template <bool align> SIMD_INLINE __m128i RgbToGray(const uint8_t * rgb, __m128i shuffle)
-        {
-            __m128i rgba[4];
-            rgba[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<align>((__m128i*)(rgb + 0)), shuffle));
-            rgba[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(rgb + 12)), shuffle));
-            rgba[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(rgb + 24)), shuffle));
-            rgba[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(rgb + 32)), 4), shuffle));
-            return RgbToGray(rgba);
-        }
-
-        template <bool align> void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    Store<align>((__m128i*)(gray + col), RgbToGray<align>(rgb + 3 * col, _shuffle));
-                if (width != alignedWidth)
-                    Store<false>((__m128i*)(gray + width - A), RgbToGray<false>(rgb + 3 * (width - A), _shuffle));
-                rgb += rgbStride;
-                gray += grayStride;
-            }
-        }
-
-        void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride)
-        {
-            if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride))
-                RgbToGray<true>(rgb, width, height, rgbStride, gray, grayStride);
-            else
-                RgbToGray<false>(rgb, width, height, rgbStride, gray, grayStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3RgbToGray.cpp.o) has no symbols
-    void dummy_SimdSsse3RgbToGray(){};
-#endif// SIMD_SSSE3_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdStore.h b/3rdparty/simdlib/Simd/SimdStore.h
old mode 100644
new mode 100755
index 11ae3f7815..2b22a9616d
--- a/3rdparty/simdlib/Simd/SimdStore.h
+++ b/3rdparty/simdlib/Simd/SimdStore.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -31,8 +31,8 @@
 
 namespace Simd
 {
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         template <bool align> SIMD_INLINE void Store(float  * p, __m128 a);
 
@@ -63,13 +63,6 @@ namespace Simd
             __m128 old = Load<align>(p);
             Store<align>(p, Combine(mask, value, old));
         }
-    }
-#endif//SIMD_SSE_ENABLE
-
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
-        using namespace Sse;
 
         template <bool align> SIMD_INLINE void Store(__m128i * p, __m128i a);
 
@@ -83,6 +76,11 @@ namespace Simd
             _mm_store_si128(p, a);
         }
 
+        template <int part> SIMD_INLINE void StoreHalf(__m128i* p, __m128i a)
+        {
+            StoreHalf<part>((float*)p, _mm_castsi128_ps(a));
+        }
+
         template <bool align> SIMD_INLINE void StoreMasked(__m128i * p, __m128i value, __m128i mask)
         {
             __m128i old = Load<align>(p);
@@ -95,7 +93,6 @@ namespace Simd
     namespace Sse41
     {
 #if defined(_MSC_VER) && _MSC_VER >= 1800  && _MSC_VER < 1900 // Visual Studio 2013 compiler bug       
-        using Sse::Store;
         using Sse2::Store;
 #endif
     }
@@ -118,8 +115,8 @@ namespace Simd
 
         template <bool align> SIMD_INLINE void Store(float * p0, float * p1, __m256 a)
         {
-            Sse::Store<align>(p0, _mm256_extractf128_ps(a, 0));
-            Sse::Store<align>(p1, _mm256_extractf128_ps(a, 1));
+            Sse2::Store<align>(p0, _mm256_extractf128_ps(a, 0));
+            Sse2::Store<align>(p1, _mm256_extractf128_ps(a, 1));
         }
 
         template <bool align> SIMD_INLINE void StoreMasked(float * p, __m256 value, __m256 mask)
@@ -163,11 +160,6 @@ namespace Simd
             return _mm256_permute4x64_epi64(_mm256_packus_epi16(lo, hi), 0xD8);
         }
 
-        SIMD_INLINE __m256i PackU16ToU8(__m256i lo, __m256i hi)
-        {
-            return _mm256_permute4x64_epi64(_mm256_packus_epi16(lo, hi), 0xD8);
-        }
-
         SIMD_INLINE __m256i PackI32ToI16(__m256i lo, __m256i hi)
         {
             return _mm256_permute4x64_epi64(_mm256_packs_epi32(lo, hi), 0xD8);
@@ -184,6 +176,12 @@ namespace Simd
             lo = _mm256_permute2x128_si256(lo, hi, 0x20);
             hi = _mm256_permute2x128_si256(_lo, hi, 0x31);
         }
+
+        template <bool align> SIMD_INLINE void Store24(uint8_t * p, __m256i a)
+        {
+            Sse2::Store<align>((__m128i*)p, _mm256_extractf128_si256(a, 0));
+            Sse2::StoreHalf<0>((__m128i*)p + 1, _mm256_extractf128_si256(a, 1));
+        }
     }
 #endif//SIMD_SAVX2_ENABLE
 
@@ -230,27 +228,27 @@ namespace Simd
 
         template <bool align> SIMD_INLINE void Store(uint16_t * p, uint16x8_t a)
         {
-            Store<align>((uint8_t*)p, (uint8x16_t)a);
+            Store<align>((uint8_t*)p, vreinterpretq_u8_u16(a));
         }
 
         template <bool align> SIMD_INLINE void Store(uint16_t * p, uint16x4_t a)
         {
-            Store<align>((uint8_t*)p, (uint8x8_t)a);
+            Store<align>((uint8_t*)p, vreinterpret_u8_u16(a));
         }
 
         template <bool align> SIMD_INLINE void Store(int16_t * p, int16x8_t a)
         {
-            Store<align>((uint8_t*)p, (uint8x16_t)a);
+            Store<align>((uint8_t*)p, vreinterpretq_u8_s16(a));
         }
 
         template <bool align> SIMD_INLINE void Store(uint32_t * p, uint32x4_t a)
         {
-            Store<align>((uint8_t*)p, (uint8x16_t)a);
+            Store<align>((uint8_t*)p, vreinterpretq_u8_u32(a));
         }
 
         template <bool align> SIMD_INLINE void Store(int32_t * p, int32x4_t a)
         {
-            Store<align>((uint8_t*)p, (uint8x16_t)a);
+            Store<align>((uint8_t*)p, vreinterpretq_u8_s32(a));
         }
 
         template <bool align> SIMD_INLINE void Store2(uint8_t * p, uint8x16x2_t a);
@@ -310,7 +308,6 @@ namespace Simd
 #endif
         }
 
-
         template <bool align> SIMD_INLINE void Store3(uint8_t * p, uint8x16x3_t a);
 
         template <> SIMD_INLINE void Store3<false>(uint8_t * p, uint8x16x3_t a)
diff --git a/3rdparty/simdlib/Simd/SimdStream.h b/3rdparty/simdlib/Simd/SimdStream.h
old mode 100644
new mode 100755
index b6399bd1f1..6abf65cf68
--- a/3rdparty/simdlib/Simd/SimdStream.h
+++ b/3rdparty/simdlib/Simd/SimdStream.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -30,36 +30,31 @@ namespace Simd
 {
     const size_t STREAM_SIZE_MIN = 0x00100000;
 
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
-        template <bool align, bool stream> SIMD_INLINE void Stream(float  * p, __m128 a);
+        template <bool align, bool stream> SIMD_INLINE void Stream(float* p, __m128 a);
 
-        template <> SIMD_INLINE void Stream<false, false>(float  * p, __m128 a)
+        template <> SIMD_INLINE void Stream<false, false>(float* p, __m128 a)
         {
             _mm_storeu_ps(p, a);
         }
 
-        template <> SIMD_INLINE void Stream<false, true>(float  * p, __m128 a)
+        template <> SIMD_INLINE void Stream<false, true>(float* p, __m128 a)
         {
             _mm_storeu_ps(p, a);
         }
 
-        template <> SIMD_INLINE void Stream<true, false>(float  * p, __m128 a)
+        template <> SIMD_INLINE void Stream<true, false>(float* p, __m128 a)
         {
             _mm_store_ps(p, a);
         }
 
-        template <> SIMD_INLINE void Stream<true, true>(float  * p, __m128 a)
+        template <> SIMD_INLINE void Stream<true, true>(float* p, __m128 a)
         {
             _mm_stream_ps(p, a);
         }
-    }
-#endif//SIMD_SSE_ENABLE
 
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
         template <bool align, bool stream> SIMD_INLINE void Stream(__m128i  * p, __m128i a);
 
         template <> SIMD_INLINE void Stream<false, false>(__m128i   * p, __m128i a)
diff --git a/3rdparty/simdlib/Simd/SimdUpdate.h b/3rdparty/simdlib/Simd/SimdUpdate.h
old mode 100644
new mode 100755
index 47e9b22dc2..4c4d64b1c0
--- a/3rdparty/simdlib/Simd/SimdUpdate.h
+++ b/3rdparty/simdlib/Simd/SimdUpdate.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -47,8 +47,8 @@ namespace Simd
         }
     }
 
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         template <UpdateType update, bool align> SIMD_INLINE void Update(float  * p, __m128 a)
         {
@@ -63,13 +63,10 @@ namespace Simd
         template <> SIMD_INLINE void Update<UpdateAdd, true>(float  * p, __m128 a)
         {
             Store<true>(p, _mm_add_ps(Load<true>(p), a));
-        }
-    }
-#endif//SIMD_SSE_ENABLE
+        }   
 
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
+        //-----------------------------------------------------------------------------------------
+        
         template <UpdateType update, bool align> SIMD_INLINE void Update(int32_t  * p, __m128i a)
         {
             Store<align>((__m128i*)p, a);
@@ -160,6 +157,6 @@ namespace Simd
             Store<true>(p, vaddq_f32(Load<true>(p), a));
         }
     }
-#endif//SIMD_SSE_ENABLE
+#endif//SIMD_NEON_ENABLE
 }
 #endif//__SimdUpdate_h__
diff --git a/3rdparty/simdlib/Simd/SimdVersion.h b/3rdparty/simdlib/Simd/SimdVersion.h
index 72ae751ade..09efd5de91 100644
--- a/3rdparty/simdlib/Simd/SimdVersion.h
+++ b/3rdparty/simdlib/Simd/SimdVersion.h
@@ -34,7 +34,7 @@
 #ifndef __SimdVersion_h__
 #define __SimdVersion_h__
 
-#define SIMD_VERSION "4.4.82"
+#define SIMD_VERSION "4.9.107"
 
 #endif//__SimdVersion_h__
 
diff --git a/3rdparty/simdlib/Simd/SimdView.hpp b/3rdparty/simdlib/Simd/SimdView.hpp
old mode 100644
new mode 100755
index c9a51c5f61..0c61a0e6e8
--- a/3rdparty/simdlib/Simd/SimdView.hpp
+++ b/3rdparty/simdlib/Simd/SimdView.hpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
 *               2014-2019 Antonenka Mikhail,
 *               2018-2019 Dmitry Fedorov,
 *               2019-2019 Artur Voronkov.
@@ -95,7 +95,9 @@ namespace Simd
             /*! A single channel 64-bit float point pixel format. */
             Double,
             /*! A 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */
-            Rgb24
+            Rgb24,
+            /*! A 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. */
+            Rgba32,
         };
 
         /*!
diff --git a/modules/core/src/image/vpImageConvert.cpp b/modules/core/src/image/vpImageConvert.cpp
index 7d7ef289e9..b3a95e4372 100644
--- a/modules/core/src/image/vpImageConvert.cpp
+++ b/modules/core/src/image/vpImageConvert.cpp
@@ -673,7 +673,7 @@ dest.resize((unsigned int)src.rows, (unsigned int)src.cols);
       }
   } else if (src.type() == CV_8UC3) {
     if (src.isContinuous() && !flip) {
-      SimdBgrToRgba(src.data, src.cols, src.rows, src.step[0], reinterpret_cast<uint8_t*>(dest.bitmap),
+      SimdRgbToBgra(src.data, src.cols, src.rows, src.step[0], reinterpret_cast<uint8_t*>(dest.bitmap),
                     dest.getWidth() * sizeof(vpRGBa), vpRGBa::alpha_default);
     } else {
       vpRGBa rgbaVal;
@@ -3519,7 +3519,7 @@ void vpImageConvert::BGRToRGBa(unsigned char *bgr, unsigned char *rgba, unsigned
                                bool flip)
 {
   if (!flip) {
-    SimdBgrToRgba(bgr, width, height, width*3, rgba, width * sizeof(vpRGBa), vpRGBa::alpha_default);
+    SimdRgbToBgra(bgr, width, height, width*3, rgba, width * sizeof(vpRGBa), vpRGBa::alpha_default);
   } else {
     // if we have to flip the image, we start from the end last scanline so the
     // step is negative

From 04044b64040143b5887237b15dd2f1239c5218a6 Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Wed, 3 Nov 2021 10:17:07 +0100
Subject: [PATCH 02/18] Add missing file.

---
 3rdparty/simdlib/Simd/SimdNeonCpu.cpp | 59 +++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 3rdparty/simdlib/Simd/SimdNeonCpu.cpp

diff --git a/3rdparty/simdlib/Simd/SimdNeonCpu.cpp b/3rdparty/simdlib/Simd/SimdNeonCpu.cpp
new file mode 100644
index 0000000000..8b644c04f6
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdNeonCpu.cpp
@@ -0,0 +1,59 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdEnable.h"
+#include "Simd/SimdCpu.h"
+
+#if defined(__GNUC__) && (defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE))
+#include <fcntl.h>
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+#endif
+
+namespace Simd
+{
+#ifdef SIMD_NEON_ENABLE
+    namespace Neon
+    {
+        SIMD_INLINE bool SupportedByCPU()
+        {
+#if defined(_MSC_VER)
+            return true;
+#elif defined(__GNUC__)
+#if defined(SIMD_ARM64_ENABLE)
+            return true;
+#else
+            return Base::CheckBit(AT_HWCAP, HWCAP_NEON);
+#endif
+#else
+#error Do not know how to detect NEON support!
+#endif
+        }
+
+        bool GetEnable()
+        {
+            return SupportedByCPU();
+        }
+    }
+#endif
+}

From df0461608768ea77f71ef27fd360c8e6594240d2 Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Wed, 3 Nov 2021 11:18:25 +0100
Subject: [PATCH 03/18] Remove not used SSE flags. Add missing SSE 4.1
 implementation.

---
 3rdparty/simdlib/CMakeLists.txt               | 64 ++---------------
 3rdparty/simdlib/Simd/SimdLib.cpp             |  7 +-
 3rdparty/simdlib/Simd/SimdSse41.h             |  3 +
 .../simdlib/Simd/SimdSse41CustomFunctions.cpp | 69 +++++++++++++++++++
 modules/io/src/image/vpImageIo.cpp            |  2 +-
 5 files changed, 83 insertions(+), 62 deletions(-)
 create mode 100644 3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp

diff --git a/3rdparty/simdlib/CMakeLists.txt b/3rdparty/simdlib/CMakeLists.txt
index dc6d111aae..1acb1341be 100644
--- a/3rdparty/simdlib/CMakeLists.txt
+++ b/3rdparty/simdlib/CMakeLists.txt
@@ -20,46 +20,31 @@ file(GLOB_RECURSE SIMD_BASE_HDR ${CMAKE_CURRENT_SOURCE_DIR}/Simd/*.h ${CMAKE_CUR
 if(X86 OR X86_64)
 
     # Flags check
-    set(SSE_FLAG    "")
     set(SSE2_FLAG   "")
-    set(SSE3_FLAG   "")
-    set(SSSE3_FLAG  "")
-    set(SSE4_1_FLAG "")
     set(SSE4_2_FLAG "")
     set(AVX_FLAG    "")
     set(AVX2_FLAG   "")
 
     if(MSVC)
         if(NOT MSVC64)
-            vp_check_compiler_flag(CXX "/arch:SSE"    HAVE_SSE_FLAG     "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse.cpp")
             vp_check_compiler_flag(CXX "/arch:SSE2"   HAVE_SSE2_FLAG    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp")
         endif()
 
         vp_check_compiler_flag(CXX "/arch:AVX"    HAVE_AVX_FLAG     "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx.cpp")
         vp_check_compiler_flag(CXX "/arch:AVX2"   HAVE_AVX2_FLAG    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp")
 
-        if(HAVE_SSE_FLAG)
-            set(SSE_FLAG "/arch:SSE")
-        endif()
         if(HAVE_SSE2_FLAG)
             set(SSE2_FLAG "/arch:SSE2")
         endif()
         if(HAVE_AVX_FLAG)
             set(AVX_FLAG    "/arch:AVX")
             set(SSE4_2_FLAG "/arch:AVX")
-            set(SSE4_1_FLAG "/arch:AVX")
-            set(SSSE3_FLAG  "/arch:AVX")
-            set(SSE3_FLAG   "/arch:AVX")
         endif()
         if(HAVE_AVX2_FLAG)
             set(AVX2_FLAG "/arch:AVX2")
         endif()
     else()
-        vp_check_compiler_flag(CXX "-msse"    HAVE_SSE_FLAG     "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse.cpp")
         vp_check_compiler_flag(CXX "-msse2"   HAVE_SSE2_FLAG    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp")
-        vp_check_compiler_flag(CXX "-msse3"   HAVE_SSE3_FLAG    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse3.cpp")
-        vp_check_compiler_flag(CXX "-mssse3"  HAVE_SSSE3_FLAG   "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_ssse3.cpp")
-        vp_check_compiler_flag(CXX "-msse4.1" HAVE_SSE4_1_FLAG  "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse41.cpp")
         vp_check_compiler_flag(CXX "-msse4.2" HAVE_SSE4_2_FLAG  "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse42.cpp")
         vp_check_compiler_flag(CXX "-mavx"    HAVE_AVX_FLAG     "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx.cpp")
         vp_check_compiler_flag(CXX "-mavx2"   HAVE_AVX2_FLAG    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp")
@@ -68,23 +53,11 @@ if(X86 OR X86_64)
         vp_check_compiler_flag(CXX "-Wno-sign-compare"       HAVE_NO_SIGN_COMPARE_FLAG     "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_warning.cpp")
         vp_check_compiler_flag(CXX "-Wno-ignored-qualifiers" HAVE_NO_IGNORED_QUALIFIERS    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_warning.cpp")
 
-        if(HAVE_SSE_FLAG)
-            set(SSE_FLAG "-msse")
-        endif()
         if(HAVE_SSE2_FLAG)
-            set(SSE2_FLAG "-msse2")
-        endif()
-        if(HAVE_SSE3_FLAG)
-            set(SSE3_FLAG "-msse3")
-        endif()
-        if(HAVE_SSSE3_FLAG)
-            set(SSSE3_FLAG "-mssse3")
-        endif()
-        if(HAVE_SSE4_1_FLAG)
-            set(SSE4_1_FLAG "-msse4.1")
+            set(SSE2_FLAG "-msse -msse2")
         endif()
         if(HAVE_SSE4_2_FLAG)
-            set(SSE4_2_FLAG "-msse4.2")
+            set(SSE4_2_FLAG "-msse3 -mssse3 -msse4.1 -msse4.2")
         endif()
         if(HAVE_AVX_FLAG)
             set(AVX_FLAG "-mavx")
@@ -110,10 +83,10 @@ if(X86 OR X86_64)
     set_source_files_properties(${SIMD_BASE_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS}")
 
     file(GLOB_RECURSE SIMD_SSE2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse2*.cpp)
-    set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG} ${SSE2_FLAG}")
+    set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE2_FLAG}")
 
     file(GLOB_RECURSE SIMD_SSE41_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse41*.cpp)
-    set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG} ${SSSE3_FLAG} ${SSE4_1_FLAG} ${SSE4_2_FLAG}")
+    set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}")
 
     file(GLOB_RECURSE SIMD_AVX1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx1*.cpp)
     set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}")
@@ -126,7 +99,7 @@ if(X86 OR X86_64)
     endif()
 
     set(SIMD_LIB_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}")
-    set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE1_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE3_SRC} ${SIMD_SSSE3_SRC} ${SIMD_SSE41_SRC} ${SIMD_SSE42_SRC} ${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC})
+    set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE41_SRC} ${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC})
 
     file(GLOB_RECURSE SIMD_LIB_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdLib.cpp)
     set_source_files_properties(${SIMD_LIB_SRC} PROPERTIES COMPILE_FLAGS "${SIMD_LIB_FLAGS}")
@@ -171,32 +144,21 @@ elseif(WINRT)
        add_library(${SIMD_LIBRARY} STATIC ${SIMD_LIB_SRC} ${SIMD_BASE_SRC} ${SIMD_NEON_SRC} ${SIMD_BASE_HDR})
     else()
         # Flags check
-        set(SSE_FLAG    "")
         set(SSE2_FLAG   "")
-        set(SSE3_FLAG   "")
-        set(SSSE3_FLAG  "")
-        set(SSE4_1_FLAG "")
         set(SSE4_2_FLAG "")
         set(AVX_FLAG    "")
         set(AVX2_FLAG   "")
 
-        vp_check_compiler_flag(CXX "/arch:SSE"    HAVE_SSE_FLAG     "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse.cpp")
         vp_check_compiler_flag(CXX "/arch:SSE2"   HAVE_SSE2_FLAG    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp")
         vp_check_compiler_flag(CXX "/arch:AVX"    HAVE_AVX_FLAG     "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx.cpp")
         vp_check_compiler_flag(CXX "/arch:AVX2"   HAVE_AVX2_FLAG    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp")
 
-        if(HAVE_SSE_FLAG)
-            set(SSE_FLAG "/arch:SSE")
-        endif()
         if(HAVE_SSE2_FLAG)
             set(SSE2_FLAG "/arch:SSE2")
         endif()
         if(HAVE_AVX_FLAG)
             set(AVX_FLAG    "/arch:AVX")
             set(SSE4_2_FLAG "/arch:AVX")
-            set(SSE4_1_FLAG "/arch:AVX")
-            set(SSSE3_FLAG  "/arch:AVX")
-            set(SSE3_FLAG   "/arch:AVX")
         endif()
         if(HAVE_AVX2_FLAG)
             set(AVX2_FLAG "/arch:AVX2")
@@ -205,23 +167,11 @@ elseif(WINRT)
         file(GLOB_RECURSE SIMD_BASE_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdBase*.cpp)
         set_source_files_properties(${SIMD_BASE_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS}")
 
-        file(GLOB_RECURSE SIMD_SSE1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse1*.cpp)
-        set_source_files_properties(${SIMD_SSE1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG}")
-
         file(GLOB_RECURSE SIMD_SSE2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse2*.cpp)
         set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE2_FLAG}")
 
-        file(GLOB_RECURSE SIMD_SSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse3*.cpp)
-        set_source_files_properties(${SIMD_SSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG}")
-
-        file(GLOB_RECURSE SIMD_SSSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSsse3*.cpp)
-        set_source_files_properties(${SIMD_SSSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSSE3_FLAG}")
-
         file(GLOB_RECURSE SIMD_SSE41_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse41*.cpp)
-        set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_1_FLAG}")
-
-        file(GLOB_RECURSE SIMD_SSE42_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse42*.cpp)
-        set_source_files_properties(${SIMD_SSE42_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}")
+        set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}")
 
         file(GLOB_RECURSE SIMD_AVX1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx1*.cpp)
         set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}")
@@ -230,7 +180,7 @@ elseif(WINRT)
         set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}")
 
         set(SIMD_LIB_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}")
-        set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE1_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE3_SRC} ${SIMD_SSSE3_SRC} ${SIMD_SSE41_SRC} ${SIMD_SSE42_SRC} ${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC})
+        set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE41_SRC} ${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC})
 
         file(GLOB_RECURSE SIMD_LIB_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdLib.cpp)
         set_source_files_properties(${SIMD_LIB_SRC} PROPERTIES COMPILE_FLAGS "${SIMD_LIB_FLAGS}")
diff --git a/3rdparty/simdlib/Simd/SimdLib.cpp b/3rdparty/simdlib/Simd/SimdLib.cpp
index b1cac8b1ba..89718bb80e 100755
--- a/3rdparty/simdlib/Simd/SimdLib.cpp
+++ b/3rdparty/simdlib/Simd/SimdLib.cpp
@@ -862,10 +862,9 @@ SIMD_API void SimdMatTranspose(const double * mat, size_t rows, size_t cols, dou
 
 SIMD_API void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff)
 {
-  //TODO:
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && size >= Ssse3::A)
-        Ssse3::SimdImageDifference(img1,img2, size, imgDiff);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && size >= Sse41::A)
+        Sse41::SimdImageDifference(img1,img2, size, imgDiff);
     else
 #endif
         Base::SimdImageDifference(img1, img2, size, imgDiff);
diff --git a/3rdparty/simdlib/Simd/SimdSse41.h b/3rdparty/simdlib/Simd/SimdSse41.h
index 958fc11bc5..7a4bb04ad8 100755
--- a/3rdparty/simdlib/Simd/SimdSse41.h
+++ b/3rdparty/simdlib/Simd/SimdSse41.h
@@ -70,6 +70,9 @@ namespace Simd
         void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
 
         void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride);
+
+        // ViSP custom SIMD code
+        void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff);
     }
 #endif// SIMD_SSE41_ENABLE
 }
diff --git a/3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp b/3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp
new file mode 100644
index 0000000000..f34a29329d
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp
@@ -0,0 +1,69 @@
+/*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdBase.h"
+#include "Simd/SimdStore.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
+    {
+        void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff)
+        {
+            const __m128i mask1 = _mm_set_epi8(-1, 14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0);
+            const __m128i mask2 = _mm_set_epi8(-1, 15, -1, 13, -1, 11, -1, 9, -1, 7, -1, 5, -1, 3, -1, 1);
+            const __m128i mask_out2 = _mm_set_epi8(14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0, -1);
+
+            size_t i = 0;
+            for (; i <= size-16; i+= 16) {
+                const __m128i vdata1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(img1 + i));
+                const __m128i vdata2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(img2 + i));
+
+                __m128i vdata1_reorg = _mm_shuffle_epi8(vdata1, mask1);
+                __m128i vdata2_reorg = _mm_shuffle_epi8(vdata2, mask1);
+
+                const __m128i vshift = _mm_set1_epi16(128);
+                __m128i vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift);
+
+                const __m128i v255 = _mm_set1_epi16(255);
+                const __m128i vzero = _mm_setzero_si128();
+                const __m128i vdata_diff_min_max1 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero);
+
+                vdata1_reorg = _mm_shuffle_epi8(vdata1, mask2);
+                vdata2_reorg = _mm_shuffle_epi8(vdata2, mask2);
+
+                vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift);
+                const __m128i vdata_diff_min_max2 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero);
+
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(imgDiff + i), _mm_or_si128(_mm_shuffle_epi8(vdata_diff_min_max1, mask1),
+                                                                                        _mm_shuffle_epi8(vdata_diff_min_max2, mask_out2)));
+            }
+
+            if (i < size) {
+                Base::SimdImageDifference(img1 + i, img2 + i, size - i, imgDiff + i);
+            }
+        }
+    }
+#else
+    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3CustomFunctions.cpp.o) has no symbols
+    void dummy_SimdSse41CustomFunctions(){};
+#endif// SIMD_SSE41_ENABLE
+}
diff --git a/modules/io/src/image/vpImageIo.cpp b/modules/io/src/image/vpImageIo.cpp
index 633503389c..ab290fa5f7 100644
--- a/modules/io/src/image/vpImageIo.cpp
+++ b/modules/io/src/image/vpImageIo.cpp
@@ -102,7 +102,7 @@ void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const st
   while (cpt_elt != nb_elt) {
     // Skip empty lines or lines starting with # (comment)
     while (std::getline(fd, line) && (line.compare(0, 1, "#") == 0 || line.size() == 0)) {
-    };
+    }
 
     if (fd.eof()) {
       fd.close();

From 9c193041c09bdf2fc7f20568d3e5677904191168 Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Wed, 3 Nov 2021 18:07:41 +0100
Subject: [PATCH 04/18] WIP code to add and test image loading/saving using
 Simd and for JPEG and PNG image format.

---
 3rdparty/simdlib/CMakeLists.txt               |    4 +-
 3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp   |  158 ++
 3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp   |  138 +
 .../simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp    |  351 +++
 .../simdlib/Simd/SimdAvx2ImageSavePng.cpp     |  369 +++
 3rdparty/simdlib/Simd/SimdBase.h              |    4 +
 3rdparty/simdlib/Simd/SimdBaseCrc32.cpp       |  978 +++++++
 3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp   |  371 +++
 .../simdlib/Simd/SimdBaseImageLoadJpeg.cpp    | 2456 +++++++++++++++++
 .../simdlib/Simd/SimdBaseImageLoadPng.cpp     | 1317 +++++++++
 3rdparty/simdlib/Simd/SimdBaseImageSave.cpp   |  340 +++
 .../simdlib/Simd/SimdBaseImageSaveJpeg.cpp    |  451 +++
 .../simdlib/Simd/SimdBaseImageSavePng.cpp     |  379 +++
 3rdparty/simdlib/Simd/SimdImageLoad.h         |  396 +++
 3rdparty/simdlib/Simd/SimdImageSave.h         |  386 +++
 3rdparty/simdlib/Simd/SimdImageSaveJpeg.h     |  649 +++++
 3rdparty/simdlib/Simd/SimdImageSavePng.h      |  235 ++
 3rdparty/simdlib/Simd/SimdLib.cpp             |   32 +-
 3rdparty/simdlib/Simd/SimdLib.h               |  109 +-
 3rdparty/simdlib/Simd/SimdMath.h              |    5 +
 3rdparty/simdlib/Simd/SimdMemory.h            |   19 +
 3rdparty/simdlib/Simd/SimdMemoryStream.h      |  510 ++++
 3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp   |  154 ++
 3rdparty/simdlib/Simd/SimdNeonImageSave.cpp   |  134 +
 3rdparty/simdlib/Simd/SimdPerformance.h       |  197 ++
 3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp  |  159 ++
 .../simdlib/Simd/SimdSse41ImageLoadPng.cpp    | 1805 ++++++++++++
 3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp  |  139 +
 .../simdlib/Simd/SimdSse41ImageSaveJpeg.cpp   |  431 +++
 .../simdlib/Simd/SimdSse41ImageSavePng.cpp    |  370 +++
 3rdparty/simdlib/Simd/SimdView.hpp            |  209 +-
 CMakeLists.txt                                |    2 +
 modules/io/CMakeLists.txt                     |   14 +-
 modules/io/include/visp3/io/vpImageIo.h       |    8 +
 modules/io/src/image/vpImageIo.cpp            |   63 +
 modules/io/test/perfImageLoadSave.cpp         |  461 ++++
 36 files changed, 13646 insertions(+), 157 deletions(-)
 create mode 100644 3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseCrc32.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageSave.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdImageLoad.h
 create mode 100644 3rdparty/simdlib/Simd/SimdImageSave.h
 create mode 100644 3rdparty/simdlib/Simd/SimdImageSaveJpeg.h
 create mode 100644 3rdparty/simdlib/Simd/SimdImageSavePng.h
 create mode 100644 3rdparty/simdlib/Simd/SimdMemoryStream.h
 create mode 100644 3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdNeonImageSave.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdPerformance.h
 create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp
 create mode 100644 modules/io/test/perfImageLoadSave.cpp

diff --git a/3rdparty/simdlib/CMakeLists.txt b/3rdparty/simdlib/CMakeLists.txt
index 1acb1341be..95b3358ad2 100644
--- a/3rdparty/simdlib/CMakeLists.txt
+++ b/3rdparty/simdlib/CMakeLists.txt
@@ -93,9 +93,9 @@ if(X86 OR X86_64)
 
     file(GLOB_RECURSE SIMD_AVX2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx2*.cpp)
     if(MSVC)
-        set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}")
+        set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store")
     else()
-        set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mfma")
+        set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mfma -mbmi -mbmi2 -mlzcnt -fabi-version=4 -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store")
     endif()
 
     set(SIMD_LIB_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}")
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp
new file mode 100644
index 0000000000..aad4785761
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp
@@ -0,0 +1,158 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdAvx2.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param)
+            : Sse41::ImagePgmTxtLoader(param)
+        {
+        }
+
+        void ImagePgmTxtLoader::SetConverters()
+        {
+            Sse41::ImagePgmTxtLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Avx2::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Avx2::GrayToBgra; break;
+                case SimdPixelFormatRgb24: _toAny = Avx2::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Avx2::GrayToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param)
+            : Sse41::ImagePgmBinLoader(param)
+        {
+        }
+
+        void ImagePgmBinLoader::SetConverters()
+        {
+            Sse41::ImagePgmBinLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Avx2::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Avx2::GrayToBgra; break;
+                case SimdPixelFormatRgb24: _toAny = Avx2::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Avx2::GrayToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param)
+            : Sse41::ImagePpmTxtLoader(param)
+        {
+        }
+
+        void ImagePpmTxtLoader::SetConverters()
+        {
+            Sse41::ImagePpmTxtLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Avx2::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Avx2::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Avx2::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Avx2::BgrToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param)
+            : Sse41::ImagePpmBinLoader(param)
+        {
+        }
+
+        void ImagePpmBinLoader::SetConverters()
+        {
+            Sse41::ImagePpmBinLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Avx2::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Avx2::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Avx2::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Avx2::BgrToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageLoader* CreateImageLoader(const ImageLoaderParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtLoader(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinLoader(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtLoader(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinLoader(param);
+            case SimdImageFilePng: return new Sse41::ImagePngLoader(param);
+            case SimdImageFileJpeg: return new Base::ImageJpegLoader(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+        {
+            ImageLoaderParam param(data, size, *format);
+            if (param.Validate())
+            {
+                Holder<ImageLoader> loader(CreateImageLoader(param));
+                if (loader)
+                {
+                    if (loader->FromStream())
+                        return loader->Release(stride, width, height, format);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp
new file mode 100644
index 0000000000..bd7e057092
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp
@@ -0,0 +1,138 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdAvx2.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param)
+            : Sse41::ImagePgmTxtSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Avx2::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Avx2::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Avx2::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Avx2::RgbaToGray; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param)
+            : Sse41::ImagePgmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Avx2::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Avx2::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Avx2::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Avx2::RgbaToGray; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param)
+            : Sse41::ImagePpmTxtSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Avx2::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Avx2::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Avx2::BgraToRgb; break;
+                case SimdPixelFormatRgba32: _convert = Avx2::BgraToBgr; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param)
+            : Sse41::ImagePpmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Avx2::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Avx2::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Avx2::BgraToRgb; break;
+                case SimdPixelFormatRgba32: _convert = Avx2::BgraToBgr; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageSaver* CreateImageSaver(const ImageSaverParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinSaver(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinSaver(param);
+            case SimdImageFilePng: return new ImagePngSaver(param);
+            case SimdImageFileJpeg: return new ImageJpegSaver(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size)
+        {
+            ImageSaverParam param(width, height, format, file, quality);
+            if (param.Validate())
+            {
+                Holder<ImageSaver> saver(CreateImageSaver(param));
+                if (saver)
+                {
+                    if (saver->ToStream(src, stride))
+                        return saver->Release(size);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp
new file mode 100644
index 0000000000..2ff51e4dc1
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp
@@ -0,0 +1,351 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSaveJpeg.h"
+#include "Simd/SimdLoad.h"
+#include "Simd/SimdAvx2.h"
+
+namespace Simd
+{
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        const uint32_t JpegZigZagTi32[64] = {
+            0, 8, 1, 2, 9, 16, 24, 17,
+            10, 3, 4, 11, 18, 25, 32, 40,
+            33, 26, 19, 12, 5, 6, 13, 20,
+            27, 34, 41, 48, 56, 49, 42, 35,
+            28, 21, 14, 7, 15, 22, 29, 36,
+            43, 50, 57, 58, 51, 44, 37, 30,
+            23, 31, 38, 45, 52, 59, 60, 53,
+            46, 39, 47, 54, 61, 62, 55, 63 };
+
+        //---------------------------------------------------------------------
+
+        static int JpegProcessDu(Base::BitBuf& bitBuf, float* CDU, int stride, const float* fdtbl, int DC, const uint16_t HTDC[256][2], const uint16_t HTAC[256][2])
+        {
+            SIMD_ALIGNED(32) int DUO[64], DU[64];
+            JpegDct(CDU, stride, fdtbl, DUO);
+            union
+            {
+                uint64_t u64[1];
+                uint32_t u32[2];
+                uint8_t u8[8];
+            } dum;
+            for (int i = 0, j = 0; i < 64; i += 8, j++)
+            {
+                __m256i du = _mm256_i32gather_epi32(DUO, _mm256_loadu_si256((__m256i*)(JpegZigZagTi32 + i)), 4);
+                dum.u8[j] = ~_mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpeq_epi32(du, Avx2::K_ZERO)));
+                _mm256_storeu_si256((__m256i*)(DU + i), du);
+            }
+            int diff = DU[0] - DC;
+            if (diff == 0)
+                bitBuf.Push(HTDC[0]);
+            else
+            {
+                uint16_t bits[2];
+                Base::JpegCalcBits(diff, bits);
+                bitBuf.Push(HTDC[bits[1]]);
+                bitBuf.Push(bits);
+            }
+#if defined(SIMD_X64_ENABLE)
+            if (dum.u64[0] == 0)
+            {
+                bitBuf.Push(HTAC[0x00]);
+                return DU[0];
+            }
+            dum.u64[0] >>= 1;
+            int i = 1;
+            for (; dum.u64[0]; ++i, dum.u64[0] >>= 1)
+            {
+                int nrzeroes = (int)_tzcnt_u64(dum.u64[0]);
+                i += nrzeroes;
+                dum.u64[0] >>= nrzeroes;
+                if (nrzeroes >= 16)
+                {
+                    for (int nrmarker = 16; nrmarker <= nrzeroes; nrmarker += 16)
+                        bitBuf.Push(HTAC[0xF0]);
+                    nrzeroes &= 15;
+                }
+                uint16_t bits[2];
+                Base::JpegCalcBits(DU[i], bits);
+                bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]);
+                bitBuf.Push(bits);
+            }
+            if (i < 64)
+                bitBuf.Push(HTAC[0x00]);
+#else
+            int end0pos = 64;
+            do
+            {
+                end0pos -= 8;
+                int mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_loadu_si256((__m256i*)(DU + end0pos)), Avx2::K_ZERO));
+                if (mask)
+                {
+                    end0pos += 7 - _lzcnt_u32(mask) / 4;
+                    break;
+                }
+            } 
+            while (end0pos > 0);
+            if (end0pos == 0)
+            {
+                bitBuf.Push(HTAC[0x00]);
+                return DU[0];
+            }
+            for (int i = 1; i <= end0pos; ++i)
+            {
+                int startpos = i;
+                for (; DU[i] == 0 && i <= end0pos; ++i);
+                int nrzeroes = i - startpos;
+                if (nrzeroes >= 16)
+                {
+                    int lng = nrzeroes >> 4;
+                    int nrmarker;
+                    for (nrmarker = 1; nrmarker <= lng; ++nrmarker)
+                        bitBuf.Push(HTAC[0xF0]);
+                    nrzeroes &= 15;
+                }
+                uint16_t bits[2];
+                Base::JpegCalcBits(DU[i], bits);
+                bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]);
+                bitBuf.Push(bits);
+            }
+            if (end0pos != 63)
+                bitBuf.Push(HTAC[0x00]);
+#endif
+            return DU[0];
+        }
+
+        SIMD_INLINE void RgbToYuvInit(__m256 k[10])
+        {
+            k[0] = _mm256_set1_ps(+0.29900f);
+            k[1] = _mm256_set1_ps(+0.58700f);
+            k[2] = _mm256_set1_ps(+0.11400f);
+            k[3] = _mm256_set1_ps(-128.000f);
+            k[4] = _mm256_set1_ps(-0.16874f);
+            k[5] = _mm256_set1_ps(-0.33126f);
+            k[6] = _mm256_set1_ps(+0.50000f);
+            k[7] = _mm256_set1_ps(+0.50000f);
+            k[8] = _mm256_set1_ps(-0.41869f);
+            k[9] = _mm256_set1_ps(-0.08131f);
+        }
+
+        SIMD_INLINE void RgbToYuv(const uint8_t* r, const uint8_t* g, const uint8_t* b, int stride, int height, 
+            const __m256 k[10], float* y, float* u, float* v, int size)
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 8)
+                {
+                    __m256 _r = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(r + col))));
+                    __m256 _g = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(g + col))));
+                    __m256 _b = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(b + col))));
+                    _mm256_storeu_ps(y + col, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, k[0]), _mm256_mul_ps(_g, k[1])), _mm256_mul_ps(_b, k[2])), k[3]));
+                    //_mm256_storeu_ps(y + col, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, _yr), _mm256_mul_ps(_g, _yg)), _mm256_add_ps(_mm256_mul_ps(_b, _yb), _yt)));
+                    _mm256_storeu_ps(u + col, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, k[4]), _mm256_mul_ps(_g, k[5])), _mm256_mul_ps(_b, k[6])));
+                    _mm256_storeu_ps(v + col, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, k[7]), _mm256_mul_ps(_g, k[8])), _mm256_mul_ps(_b, k[9])));
+                }
+                if(++row < height)
+                    r += stride, g += stride, b += stride;
+                y += size, u += size, v += size;
+            }
+        }
+
+        SIMD_INLINE void GrayToY(const uint8_t* g, int stride, int height, const __m256 k[10], float* y, int size)
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 8)
+                {
+                    __m256 _g = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(g + col))));
+                    _mm256_storeu_ps(y + col, _mm256_add_ps(_g, k[3]));
+                }
+                if (++row < height)
+                    g += stride;
+                y += size;
+            }
+        }
+
+        SIMD_INLINE void SubUv(const float * src, float * dst)
+        {
+            __m256 _0_25 = _mm256_set1_ps(0.25f), s0, s1;
+            for (int yy = 0; yy < 8; yy += 1)
+            {
+                s0 = _mm256_add_ps(_mm256_loadu_ps(src + 0), _mm256_loadu_ps(src + 16));
+                s1 = _mm256_add_ps(_mm256_loadu_ps(src + 8), _mm256_loadu_ps(src + 24));
+                _mm256_storeu_ps(dst + 0, _mm256_mul_ps(PermutedHorizontalAdd(s0, s1), _0_25));
+                src += 32;
+                dst += 8;
+            }
+        }
+
+        void JpegWriteBlockSubs(OutputMemoryStream& stream, int width, int height, const uint8_t* red,
+            const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3])
+        {
+            __m256 k[10];
+            RgbToYuvInit(k);
+            int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            int width16 = width & (~15);
+            bool gray = red == green && red == blue;
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 16)
+            {
+                int x = 0;
+                SIMD_ALIGNED(16) float Y[256], U[256], V[256];
+                SIMD_ALIGNED(16) float subU[64], subV[64];
+                for (; x < width16; x += 16)
+                {
+                    if (gray)
+                        GrayToY(red + x, stride, height - y, k, Y, 16);
+                    else
+                        RgbToYuv(red + x, green + x, blue + x, stride, height - y, k, Y, U, V, 16);
+                    DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        SubUv(U, subU);
+                        SubUv(V, subV);
+                        DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+                for (; x < width; x += 16)
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 16);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 16);
+                    DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        SubUv(U, subU);
+                        SubUv(V, subV);
+                        DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                }
+            }
+            Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+            bitBuf.Clear();
+        }
+
+        void JpegWriteBlockFull(OutputMemoryStream& stream, int width, int height, const uint8_t* red,
+            const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3])
+        {
+            __m256 k[10];
+            RgbToYuvInit(k);
+            int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            int width8 = width & (~7);
+            bool gray = red == green && red == blue;
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 8)
+            {
+                int x = 0;
+                SIMD_ALIGNED(16) float Y[64], U[64], V[64];
+                for (; x < width8; x += 8)
+                {
+                    if (gray)
+                        GrayToY(red + x, stride, height - y, k, Y, 8);
+                    else
+                        RgbToYuv(red + x, green + x, blue + x, stride, height - y, k, Y, U, V, 8);
+                    DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+                for (; x < width; x += 8)
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 8);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 8);
+                    DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                }
+                Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                bitBuf.Clear();
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageJpegSaver::ImageJpegSaver(const ImageSaverParam& param)
+            : Sse41::ImageJpegSaver(param)
+        {
+        }
+
+        void ImageJpegSaver::Init()
+        {
+            Sse41::ImageJpegSaver::Init();
+            if (_param.width >= 32)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24:
+                case SimdPixelFormatRgb24:
+                    _deintBgr = Avx2::DeinterleaveBgr;
+                    break;
+                case SimdPixelFormatBgra32:
+                case SimdPixelFormatRgba32:
+                    _deintBgra = Avx2::DeinterleaveBgra;
+                    break;
+                default: 
+                    break;
+                }
+            }
+            _writeBlock = _subSample ? JpegWriteBlockSubs : JpegWriteBlockFull;
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp
new file mode 100644
index 0000000000..3cfa79fc62
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp
@@ -0,0 +1,369 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSavePng.h"
+#include "Simd/SimdAvx2.h"
+#include "Simd/SimdExtract.h"
+
+namespace Simd
+{        
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        static uint32_t ZlibAdler32(uint8_t* data, int size)
+        {
+            __m256i _i0 = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7), _8 = _mm256_set1_epi32(8);
+            uint32_t lo = 1, hi = 0;
+            for (int b = 0, n = (int)(size % 5552); b < size;)
+            {
+                int n8 = n & (~7), i = 0;
+                __m256i _i = _mm256_add_epi32(_i0, _mm256_set1_epi32(n));
+                __m256i _l = _mm256_setzero_si256(), _h = _mm256_setzero_si256();
+                for (; i < n8; i += 8)
+                {
+                    __m256i d = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(data + b + i)));
+                    _l = _mm256_add_epi32(_l, d);
+                    _h = _mm256_add_epi32(_h, _mm256_mullo_epi32(d, _i));
+                    _i = _mm256_sub_epi32(_i, _8);
+                }
+                int l = Avx2::ExtractSum<uint32_t>(_l), h = Avx2::ExtractSum<uint32_t>(_h);
+                for (; i < n; ++i)
+                {
+                    l += data[b + i];
+                    h += data[b + i]*(n - i);
+                }
+                hi = (hi + h + lo*n) % 65521;
+                lo = (lo + l) % 65521;
+                b += n;
+                n = 5552;
+            }
+            return (hi << 16) | lo;
+        }
+
+        void ZlibCompress(uint8_t* data, int size, int quality, OutputMemoryStream& stream)
+        {
+            const int ZHASH = 16384;
+            if (quality < 5)
+                quality = 5;
+            const int basket = quality * 2;
+            Array32i hashTable(ZHASH * basket);
+            memset(hashTable.data, -1, hashTable.RawSize());
+
+            stream.Write(uint8_t(0x78));
+            stream.Write(uint8_t(0x5e));
+            stream.WriteBits(1, 1);
+            stream.WriteBits(1, 2);
+
+            int i = 0, j;
+            while (i < size - 3)
+            {
+                int h = Base::ZlibHash(data + i) & (ZHASH - 1), best = 3;
+                uint8_t* bestLoc = 0;
+                int* hList = hashTable.data + h * basket;
+                for (j = 0; hList[j] != -1 && j < basket; ++j)
+                {
+                    if (hList[j] > i - 32768)
+                    {
+                        int d = Avx2::ZlibCount(data + hList[j], data + i, size - i);
+                        if (d >= best)
+                        {
+                            best = d;
+                            bestLoc = data + hList[j];
+                        }
+                    }
+                }
+                if (j == basket)
+                {
+                    memcpy(hList, hList + quality, quality * sizeof(int));
+                    memset(hList + quality, -1, quality * sizeof(int));
+                    j = quality;
+                }
+                hList[j] = i;
+
+                if (bestLoc)
+                {
+                    h = Base::ZlibHash(data + i + 1) & (ZHASH - 1);
+                    int* hList = hashTable.data + h * basket;
+                    for (j = 0; hList[j] != -1 && j < basket; ++j)
+                    {
+                        if (hList[j] > i - 32767)
+                        {
+                            int e = Avx2::ZlibCount(data + hList[j], data + i + 1, size - i - 1);
+                            if (e > best)
+                            {
+                                bestLoc = NULL;
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                if (bestLoc)
+                {
+                    int d = (int)(data + i - bestLoc);
+                    assert(d <= 32767 && best <= 258);
+                    for (j = 0; best > Base::ZlibLenC[j + 1] - 1; ++j);
+                    Base::ZlibHuff(j + 257, stream);
+                    if (Base::ZlibLenEb[j])
+                        stream.WriteBits(best - Base::ZlibLenC[j], Base::ZlibLenEb[j]);
+                    for (j = 0; d > Base::ZlibDistC[j + 1] - 1; ++j);
+                    stream.WriteBits(Base::ZlibBitRev(j, 5), 5);
+                    if (Base::ZlibDistEb[j])
+                        stream.WriteBits(d - Base::ZlibDistC[j], Base::ZlibDistEb[j]);
+                    i += best;
+                }
+                else
+                {
+                    Base::ZlibHuffB(data[i], stream);
+                    ++i;
+                }
+            }
+            for (; i < size; ++i)
+                Base::ZlibHuffB(data[i], stream);
+            Base::ZlibHuff(256, stream);
+            stream.FlushBits();
+            stream.WriteBe32u(ZlibAdler32(data, size));
+        }
+
+        uint32_t EncodeLine0(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size, A);
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src = _mm256_loadu_si256((__m256i*)(src + i));
+                _mm256_storeu_si256((__m256i*)(dst + i), _src);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_src)));
+            }
+            uint32_t sum = Avx2::ExtractSum<uint32_t>(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine1(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n));
+                __m256i _dst = _mm256_sub_epi8(_src0, _src1);
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum<uint32_t>(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine2(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - stride));
+                __m256i _dst = _mm256_sub_epi8(_src0, _src1);
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum<uint32_t>(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine3(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i] - (src[i - stride] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n));
+                __m256i _src2 = _mm256_loadu_si256((__m256i*)(src + i - stride));
+                __m256i lo = _mm256_srli_epi16(_mm256_add_epi16(UnpackU8<0>(_src1), UnpackU8<0>(_src2)), 1);
+                __m256i hi = _mm256_srli_epi16(_mm256_add_epi16(UnpackU8<1>(_src1), UnpackU8<1>(_src2)), 1);
+                __m256i _dst = _mm256_sub_epi8(_src0, _mm256_packus_epi16(lo, hi));
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum<uint32_t>(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - ((src[i - n] + src[i - stride]) >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        SIMD_INLINE __m256i Paeth(__m256i a, __m256i b, __m256i c)
+        {
+            __m256i p = _mm256_sub_epi16(_mm256_add_epi16(a, b), c);
+            __m256i pa = _mm256_abs_epi16(_mm256_sub_epi16(p, a));
+            __m256i pb = _mm256_abs_epi16(_mm256_sub_epi16(p, b));
+            __m256i pc = _mm256_abs_epi16(_mm256_sub_epi16(p, c));
+            __m256i mbc = _mm256_or_si256(_mm256_cmpgt_epi16(pa, pb), _mm256_cmpgt_epi16(pa, pc));
+            __m256i mc = _mm256_cmpgt_epi16(pb, pc);
+            return _mm256_blendv_epi8(a, _mm256_blendv_epi8(b, c, mc), mbc);
+        }
+
+        uint32_t EncodeLine4(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = (int8_t)(src[i] - src[i - stride]);
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n));
+                __m256i _src2 = _mm256_loadu_si256((__m256i*)(src + i - stride));
+                __m256i _src3 = _mm256_loadu_si256((__m256i*)(src + i - stride - n));
+                __m256i lo = Paeth(UnpackU8<0>(_src1), UnpackU8<0>(_src2), UnpackU8<0>(_src3));
+                __m256i hi = Paeth(UnpackU8<1>(_src1), UnpackU8<1>(_src2), UnpackU8<1>(_src3));
+                __m256i _dst = _mm256_sub_epi8(_src0, _mm256_packus_epi16(lo, hi));
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum<uint32_t>(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - Base::Paeth(src[i - n], src[i - stride], src[i - stride - n]);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine5(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n));
+                __m256i lo = _mm256_srli_epi16(UnpackU8<0>(_src1), 1);
+                __m256i hi = _mm256_srli_epi16(UnpackU8<1>(_src1), 1);
+                __m256i _dst = _mm256_sub_epi8(_src0, _mm256_packus_epi16(lo, hi));
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum<uint32_t>(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - (src[i - n] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine6(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n));
+                __m256i _dst = _mm256_sub_epi8(_src0, _src1);
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum<uint32_t>(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        ImagePngSaver::ImagePngSaver(const ImageSaverParam& param)
+            : Sse41::ImagePngSaver(param)
+        {
+            if (_param.format == SimdPixelFormatBgr24)
+                _convert = Avx2::BgrToRgb;
+            else if (_param.format == SimdPixelFormatBgra32)
+                _convert = Avx2::BgraToRgba;
+            _encode[0] = Avx2::EncodeLine0;
+            _encode[1] = Avx2::EncodeLine1;
+            _encode[2] = Avx2::EncodeLine2;
+            _encode[3] = Avx2::EncodeLine3;
+            _encode[4] = Avx2::EncodeLine4;
+            _encode[5] = Avx2::EncodeLine5;
+            _encode[6] = Avx2::EncodeLine6;
+            _compress = Avx2::ZlibCompress;
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdBase.h b/3rdparty/simdlib/Simd/SimdBase.h
index 998a7b7cbe..3ad6e60d96 100755
--- a/3rdparty/simdlib/Simd/SimdBase.h
+++ b/3rdparty/simdlib/Simd/SimdBase.h
@@ -32,6 +32,10 @@ namespace Simd
 {
     namespace Base
     {
+        uint32_t Crc32(const void* src, size_t size);
+
+        uint32_t Crc32c(const void * src, size_t size);
+
         void BgraToBgr(const uint8_t * bgra, size_t size, uint8_t * bgr, bool lastRow);
 
         void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride);
diff --git a/3rdparty/simdlib/Simd/SimdBaseCrc32.cpp b/3rdparty/simdlib/Simd/SimdBaseCrc32.cpp
new file mode 100644
index 0000000000..4008b0f0d8
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseCrc32.cpp
@@ -0,0 +1,978 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdDefs.h"
+
+namespace Simd
+{
+    namespace Base
+    {
+        static SIMD_INLINE uint32_t Reorder32(uint32_t x)
+        {
+#if defined(__GNUC__) || defined(__clang__)
+            return __builtin_bswap32(x);
+#else
+            return (x >> 24) |
+                ((x >> 8) & 0x0000FF00) |
+                ((x << 8) & 0x00FF0000) |
+                (x << 24);
+#endif
+        }
+
+        // Precalculated CRC32c lookup table for polynomial 0xEDB88320.
+        static const uint32_t Crc32Table[16][256] =
+        {
+            {
+                0x00000000,0x77073096,0xEE0E612C,0x990951BA,0x076DC419,0x706AF48F,0xE963A535,0x9E6495A3,
+                0x0EDB8832,0x79DCB8A4,0xE0D5E91E,0x97D2D988,0x09B64C2B,0x7EB17CBD,0xE7B82D07,0x90BF1D91,
+                0x1DB71064,0x6AB020F2,0xF3B97148,0x84BE41DE,0x1ADAD47D,0x6DDDE4EB,0xF4D4B551,0x83D385C7,
+                0x136C9856,0x646BA8C0,0xFD62F97A,0x8A65C9EC,0x14015C4F,0x63066CD9,0xFA0F3D63,0x8D080DF5,
+                0x3B6E20C8,0x4C69105E,0xD56041E4,0xA2677172,0x3C03E4D1,0x4B04D447,0xD20D85FD,0xA50AB56B,
+                0x35B5A8FA,0x42B2986C,0xDBBBC9D6,0xACBCF940,0x32D86CE3,0x45DF5C75,0xDCD60DCF,0xABD13D59,
+                0x26D930AC,0x51DE003A,0xC8D75180,0xBFD06116,0x21B4F4B5,0x56B3C423,0xCFBA9599,0xB8BDA50F,
+                0x2802B89E,0x5F058808,0xC60CD9B2,0xB10BE924,0x2F6F7C87,0x58684C11,0xC1611DAB,0xB6662D3D,
+                0x76DC4190,0x01DB7106,0x98D220BC,0xEFD5102A,0x71B18589,0x06B6B51F,0x9FBFE4A5,0xE8B8D433,
+                0x7807C9A2,0x0F00F934,0x9609A88E,0xE10E9818,0x7F6A0DBB,0x086D3D2D,0x91646C97,0xE6635C01,
+                0x6B6B51F4,0x1C6C6162,0x856530D8,0xF262004E,0x6C0695ED,0x1B01A57B,0x8208F4C1,0xF50FC457,
+                0x65B0D9C6,0x12B7E950,0x8BBEB8EA,0xFCB9887C,0x62DD1DDF,0x15DA2D49,0x8CD37CF3,0xFBD44C65,
+                0x4DB26158,0x3AB551CE,0xA3BC0074,0xD4BB30E2,0x4ADFA541,0x3DD895D7,0xA4D1C46D,0xD3D6F4FB,
+                0x4369E96A,0x346ED9FC,0xAD678846,0xDA60B8D0,0x44042D73,0x33031DE5,0xAA0A4C5F,0xDD0D7CC9,
+                0x5005713C,0x270241AA,0xBE0B1010,0xC90C2086,0x5768B525,0x206F85B3,0xB966D409,0xCE61E49F,
+                0x5EDEF90E,0x29D9C998,0xB0D09822,0xC7D7A8B4,0x59B33D17,0x2EB40D81,0xB7BD5C3B,0xC0BA6CAD,
+                0xEDB88320,0x9ABFB3B6,0x03B6E20C,0x74B1D29A,0xEAD54739,0x9DD277AF,0x04DB2615,0x73DC1683,
+                0xE3630B12,0x94643B84,0x0D6D6A3E,0x7A6A5AA8,0xE40ECF0B,0x9309FF9D,0x0A00AE27,0x7D079EB1,
+                0xF00F9344,0x8708A3D2,0x1E01F268,0x6906C2FE,0xF762575D,0x806567CB,0x196C3671,0x6E6B06E7,
+                0xFED41B76,0x89D32BE0,0x10DA7A5A,0x67DD4ACC,0xF9B9DF6F,0x8EBEEFF9,0x17B7BE43,0x60B08ED5,
+                0xD6D6A3E8,0xA1D1937E,0x38D8C2C4,0x4FDFF252,0xD1BB67F1,0xA6BC5767,0x3FB506DD,0x48B2364B,
+                0xD80D2BDA,0xAF0A1B4C,0x36034AF6,0x41047A60,0xDF60EFC3,0xA867DF55,0x316E8EEF,0x4669BE79,
+                0xCB61B38C,0xBC66831A,0x256FD2A0,0x5268E236,0xCC0C7795,0xBB0B4703,0x220216B9,0x5505262F,
+                0xC5BA3BBE,0xB2BD0B28,0x2BB45A92,0x5CB36A04,0xC2D7FFA7,0xB5D0CF31,0x2CD99E8B,0x5BDEAE1D,
+                0x9B64C2B0,0xEC63F226,0x756AA39C,0x026D930A,0x9C0906A9,0xEB0E363F,0x72076785,0x05005713,
+                0x95BF4A82,0xE2B87A14,0x7BB12BAE,0x0CB61B38,0x92D28E9B,0xE5D5BE0D,0x7CDCEFB7,0x0BDBDF21,
+                0x86D3D2D4,0xF1D4E242,0x68DDB3F8,0x1FDA836E,0x81BE16CD,0xF6B9265B,0x6FB077E1,0x18B74777,
+                0x88085AE6,0xFF0F6A70,0x66063BCA,0x11010B5C,0x8F659EFF,0xF862AE69,0x616BFFD3,0x166CCF45,
+                0xA00AE278,0xD70DD2EE,0x4E048354,0x3903B3C2,0xA7672661,0xD06016F7,0x4969474D,0x3E6E77DB,
+                0xAED16A4A,0xD9D65ADC,0x40DF0B66,0x37D83BF0,0xA9BCAE53,0xDEBB9EC5,0x47B2CF7F,0x30B5FFE9,
+                0xBDBDF21C,0xCABAC28A,0x53B39330,0x24B4A3A6,0xBAD03605,0xCDD70693,0x54DE5729,0x23D967BF,
+                0xB3667A2E,0xC4614AB8,0x5D681B02,0x2A6F2B94,0xB40BBE37,0xC30C8EA1,0x5A05DF1B,0x2D02EF8D,
+            },
+            {
+                0x00000000,0x191B3141,0x32366282,0x2B2D53C3,0x646CC504,0x7D77F445,0x565AA786,0x4F4196C7,
+                0xC8D98A08,0xD1C2BB49,0xFAEFE88A,0xE3F4D9CB,0xACB54F0C,0xB5AE7E4D,0x9E832D8E,0x87981CCF,
+                0x4AC21251,0x53D92310,0x78F470D3,0x61EF4192,0x2EAED755,0x37B5E614,0x1C98B5D7,0x05838496,
+                0x821B9859,0x9B00A918,0xB02DFADB,0xA936CB9A,0xE6775D5D,0xFF6C6C1C,0xD4413FDF,0xCD5A0E9E,
+                0x958424A2,0x8C9F15E3,0xA7B24620,0xBEA97761,0xF1E8E1A6,0xE8F3D0E7,0xC3DE8324,0xDAC5B265,
+                0x5D5DAEAA,0x44469FEB,0x6F6BCC28,0x7670FD69,0x39316BAE,0x202A5AEF,0x0B07092C,0x121C386D,
+                0xDF4636F3,0xC65D07B2,0xED705471,0xF46B6530,0xBB2AF3F7,0xA231C2B6,0x891C9175,0x9007A034,
+                0x179FBCFB,0x0E848DBA,0x25A9DE79,0x3CB2EF38,0x73F379FF,0x6AE848BE,0x41C51B7D,0x58DE2A3C,
+                0xF0794F05,0xE9627E44,0xC24F2D87,0xDB541CC6,0x94158A01,0x8D0EBB40,0xA623E883,0xBF38D9C2,
+                0x38A0C50D,0x21BBF44C,0x0A96A78F,0x138D96CE,0x5CCC0009,0x45D73148,0x6EFA628B,0x77E153CA,
+                0xBABB5D54,0xA3A06C15,0x888D3FD6,0x91960E97,0xDED79850,0xC7CCA911,0xECE1FAD2,0xF5FACB93,
+                0x7262D75C,0x6B79E61D,0x4054B5DE,0x594F849F,0x160E1258,0x0F152319,0x243870DA,0x3D23419B,
+                0x65FD6BA7,0x7CE65AE6,0x57CB0925,0x4ED03864,0x0191AEA3,0x188A9FE2,0x33A7CC21,0x2ABCFD60,
+                0xAD24E1AF,0xB43FD0EE,0x9F12832D,0x8609B26C,0xC94824AB,0xD05315EA,0xFB7E4629,0xE2657768,
+                0x2F3F79F6,0x362448B7,0x1D091B74,0x04122A35,0x4B53BCF2,0x52488DB3,0x7965DE70,0x607EEF31,
+                0xE7E6F3FE,0xFEFDC2BF,0xD5D0917C,0xCCCBA03D,0x838A36FA,0x9A9107BB,0xB1BC5478,0xA8A76539,
+                0x3B83984B,0x2298A90A,0x09B5FAC9,0x10AECB88,0x5FEF5D4F,0x46F46C0E,0x6DD93FCD,0x74C20E8C,
+                0xF35A1243,0xEA412302,0xC16C70C1,0xD8774180,0x9736D747,0x8E2DE606,0xA500B5C5,0xBC1B8484,
+                0x71418A1A,0x685ABB5B,0x4377E898,0x5A6CD9D9,0x152D4F1E,0x0C367E5F,0x271B2D9C,0x3E001CDD,
+                0xB9980012,0xA0833153,0x8BAE6290,0x92B553D1,0xDDF4C516,0xC4EFF457,0xEFC2A794,0xF6D996D5,
+                0xAE07BCE9,0xB71C8DA8,0x9C31DE6B,0x852AEF2A,0xCA6B79ED,0xD37048AC,0xF85D1B6F,0xE1462A2E,
+                0x66DE36E1,0x7FC507A0,0x54E85463,0x4DF36522,0x02B2F3E5,0x1BA9C2A4,0x30849167,0x299FA026,
+                0xE4C5AEB8,0xFDDE9FF9,0xD6F3CC3A,0xCFE8FD7B,0x80A96BBC,0x99B25AFD,0xB29F093E,0xAB84387F,
+                0x2C1C24B0,0x350715F1,0x1E2A4632,0x07317773,0x4870E1B4,0x516BD0F5,0x7A468336,0x635DB277,
+                0xCBFAD74E,0xD2E1E60F,0xF9CCB5CC,0xE0D7848D,0xAF96124A,0xB68D230B,0x9DA070C8,0x84BB4189,
+                0x03235D46,0x1A386C07,0x31153FC4,0x280E0E85,0x674F9842,0x7E54A903,0x5579FAC0,0x4C62CB81,
+                0x8138C51F,0x9823F45E,0xB30EA79D,0xAA1596DC,0xE554001B,0xFC4F315A,0xD7626299,0xCE7953D8,
+                0x49E14F17,0x50FA7E56,0x7BD72D95,0x62CC1CD4,0x2D8D8A13,0x3496BB52,0x1FBBE891,0x06A0D9D0,
+                0x5E7EF3EC,0x4765C2AD,0x6C48916E,0x7553A02F,0x3A1236E8,0x230907A9,0x0824546A,0x113F652B,
+                0x96A779E4,0x8FBC48A5,0xA4911B66,0xBD8A2A27,0xF2CBBCE0,0xEBD08DA1,0xC0FDDE62,0xD9E6EF23,
+                0x14BCE1BD,0x0DA7D0FC,0x268A833F,0x3F91B27E,0x70D024B9,0x69CB15F8,0x42E6463B,0x5BFD777A,
+                0xDC656BB5,0xC57E5AF4,0xEE530937,0xF7483876,0xB809AEB1,0xA1129FF0,0x8A3FCC33,0x9324FD72,
+            },
+            {
+                0x00000000,0x01C26A37,0x0384D46E,0x0246BE59,0x0709A8DC,0x06CBC2EB,0x048D7CB2,0x054F1685,
+                0x0E1351B8,0x0FD13B8F,0x0D9785D6,0x0C55EFE1,0x091AF964,0x08D89353,0x0A9E2D0A,0x0B5C473D,
+                0x1C26A370,0x1DE4C947,0x1FA2771E,0x1E601D29,0x1B2F0BAC,0x1AED619B,0x18ABDFC2,0x1969B5F5,
+                0x1235F2C8,0x13F798FF,0x11B126A6,0x10734C91,0x153C5A14,0x14FE3023,0x16B88E7A,0x177AE44D,
+                0x384D46E0,0x398F2CD7,0x3BC9928E,0x3A0BF8B9,0x3F44EE3C,0x3E86840B,0x3CC03A52,0x3D025065,
+                0x365E1758,0x379C7D6F,0x35DAC336,0x3418A901,0x3157BF84,0x3095D5B3,0x32D36BEA,0x331101DD,
+                0x246BE590,0x25A98FA7,0x27EF31FE,0x262D5BC9,0x23624D4C,0x22A0277B,0x20E69922,0x2124F315,
+                0x2A78B428,0x2BBADE1F,0x29FC6046,0x283E0A71,0x2D711CF4,0x2CB376C3,0x2EF5C89A,0x2F37A2AD,
+                0x709A8DC0,0x7158E7F7,0x731E59AE,0x72DC3399,0x7793251C,0x76514F2B,0x7417F172,0x75D59B45,
+                0x7E89DC78,0x7F4BB64F,0x7D0D0816,0x7CCF6221,0x798074A4,0x78421E93,0x7A04A0CA,0x7BC6CAFD,
+                0x6CBC2EB0,0x6D7E4487,0x6F38FADE,0x6EFA90E9,0x6BB5866C,0x6A77EC5B,0x68315202,0x69F33835,
+                0x62AF7F08,0x636D153F,0x612BAB66,0x60E9C151,0x65A6D7D4,0x6464BDE3,0x662203BA,0x67E0698D,
+                0x48D7CB20,0x4915A117,0x4B531F4E,0x4A917579,0x4FDE63FC,0x4E1C09CB,0x4C5AB792,0x4D98DDA5,
+                0x46C49A98,0x4706F0AF,0x45404EF6,0x448224C1,0x41CD3244,0x400F5873,0x4249E62A,0x438B8C1D,
+                0x54F16850,0x55330267,0x5775BC3E,0x56B7D609,0x53F8C08C,0x523AAABB,0x507C14E2,0x51BE7ED5,
+                0x5AE239E8,0x5B2053DF,0x5966ED86,0x58A487B1,0x5DEB9134,0x5C29FB03,0x5E6F455A,0x5FAD2F6D,
+                0xE1351B80,0xE0F771B7,0xE2B1CFEE,0xE373A5D9,0xE63CB35C,0xE7FED96B,0xE5B86732,0xE47A0D05,
+                0xEF264A38,0xEEE4200F,0xECA29E56,0xED60F461,0xE82FE2E4,0xE9ED88D3,0xEBAB368A,0xEA695CBD,
+                0xFD13B8F0,0xFCD1D2C7,0xFE976C9E,0xFF5506A9,0xFA1A102C,0xFBD87A1B,0xF99EC442,0xF85CAE75,
+                0xF300E948,0xF2C2837F,0xF0843D26,0xF1465711,0xF4094194,0xF5CB2BA3,0xF78D95FA,0xF64FFFCD,
+                0xD9785D60,0xD8BA3757,0xDAFC890E,0xDB3EE339,0xDE71F5BC,0xDFB39F8B,0xDDF521D2,0xDC374BE5,
+                0xD76B0CD8,0xD6A966EF,0xD4EFD8B6,0xD52DB281,0xD062A404,0xD1A0CE33,0xD3E6706A,0xD2241A5D,
+                0xC55EFE10,0xC49C9427,0xC6DA2A7E,0xC7184049,0xC25756CC,0xC3953CFB,0xC1D382A2,0xC011E895,
+                0xCB4DAFA8,0xCA8FC59F,0xC8C97BC6,0xC90B11F1,0xCC440774,0xCD866D43,0xCFC0D31A,0xCE02B92D,
+                0x91AF9640,0x906DFC77,0x922B422E,0x93E92819,0x96A63E9C,0x976454AB,0x9522EAF2,0x94E080C5,
+                0x9FBCC7F8,0x9E7EADCF,0x9C381396,0x9DFA79A1,0x98B56F24,0x99770513,0x9B31BB4A,0x9AF3D17D,
+                0x8D893530,0x8C4B5F07,0x8E0DE15E,0x8FCF8B69,0x8A809DEC,0x8B42F7DB,0x89044982,0x88C623B5,
+                0x839A6488,0x82580EBF,0x801EB0E6,0x81DCDAD1,0x8493CC54,0x8551A663,0x8717183A,0x86D5720D,
+                0xA9E2D0A0,0xA820BA97,0xAA6604CE,0xABA46EF9,0xAEEB787C,0xAF29124B,0xAD6FAC12,0xACADC625,
+                0xA7F18118,0xA633EB2F,0xA4755576,0xA5B73F41,0xA0F829C4,0xA13A43F3,0xA37CFDAA,0xA2BE979D,
+                0xB5C473D0,0xB40619E7,0xB640A7BE,0xB782CD89,0xB2CDDB0C,0xB30FB13B,0xB1490F62,0xB08B6555,
+                0xBBD72268,0xBA15485F,0xB853F606,0xB9919C31,0xBCDE8AB4,0xBD1CE083,0xBF5A5EDA,0xBE9834ED,
+            },
+            {
+                0x00000000,0xB8BC6765,0xAA09C88B,0x12B5AFEE,0x8F629757,0x37DEF032,0x256B5FDC,0x9DD738B9,
+                0xC5B428EF,0x7D084F8A,0x6FBDE064,0xD7018701,0x4AD6BFB8,0xF26AD8DD,0xE0DF7733,0x58631056,
+                0x5019579F,0xE8A530FA,0xFA109F14,0x42ACF871,0xDF7BC0C8,0x67C7A7AD,0x75720843,0xCDCE6F26,
+                0x95AD7F70,0x2D111815,0x3FA4B7FB,0x8718D09E,0x1ACFE827,0xA2738F42,0xB0C620AC,0x087A47C9,
+                0xA032AF3E,0x188EC85B,0x0A3B67B5,0xB28700D0,0x2F503869,0x97EC5F0C,0x8559F0E2,0x3DE59787,
+                0x658687D1,0xDD3AE0B4,0xCF8F4F5A,0x7733283F,0xEAE41086,0x525877E3,0x40EDD80D,0xF851BF68,
+                0xF02BF8A1,0x48979FC4,0x5A22302A,0xE29E574F,0x7F496FF6,0xC7F50893,0xD540A77D,0x6DFCC018,
+                0x359FD04E,0x8D23B72B,0x9F9618C5,0x272A7FA0,0xBAFD4719,0x0241207C,0x10F48F92,0xA848E8F7,
+                0x9B14583D,0x23A83F58,0x311D90B6,0x89A1F7D3,0x1476CF6A,0xACCAA80F,0xBE7F07E1,0x06C36084,
+                0x5EA070D2,0xE61C17B7,0xF4A9B859,0x4C15DF3C,0xD1C2E785,0x697E80E0,0x7BCB2F0E,0xC377486B,
+                0xCB0D0FA2,0x73B168C7,0x6104C729,0xD9B8A04C,0x446F98F5,0xFCD3FF90,0xEE66507E,0x56DA371B,
+                0x0EB9274D,0xB6054028,0xA4B0EFC6,0x1C0C88A3,0x81DBB01A,0x3967D77F,0x2BD27891,0x936E1FF4,
+                0x3B26F703,0x839A9066,0x912F3F88,0x299358ED,0xB4446054,0x0CF80731,0x1E4DA8DF,0xA6F1CFBA,
+                0xFE92DFEC,0x462EB889,0x549B1767,0xEC277002,0x71F048BB,0xC94C2FDE,0xDBF98030,0x6345E755,
+                0x6B3FA09C,0xD383C7F9,0xC1366817,0x798A0F72,0xE45D37CB,0x5CE150AE,0x4E54FF40,0xF6E89825,
+                0xAE8B8873,0x1637EF16,0x048240F8,0xBC3E279D,0x21E91F24,0x99557841,0x8BE0D7AF,0x335CB0CA,
+                0xED59B63B,0x55E5D15E,0x47507EB0,0xFFEC19D5,0x623B216C,0xDA874609,0xC832E9E7,0x708E8E82,
+                0x28ED9ED4,0x9051F9B1,0x82E4565F,0x3A58313A,0xA78F0983,0x1F336EE6,0x0D86C108,0xB53AA66D,
+                0xBD40E1A4,0x05FC86C1,0x1749292F,0xAFF54E4A,0x322276F3,0x8A9E1196,0x982BBE78,0x2097D91D,
+                0x78F4C94B,0xC048AE2E,0xD2FD01C0,0x6A4166A5,0xF7965E1C,0x4F2A3979,0x5D9F9697,0xE523F1F2,
+                0x4D6B1905,0xF5D77E60,0xE762D18E,0x5FDEB6EB,0xC2098E52,0x7AB5E937,0x680046D9,0xD0BC21BC,
+                0x88DF31EA,0x3063568F,0x22D6F961,0x9A6A9E04,0x07BDA6BD,0xBF01C1D8,0xADB46E36,0x15080953,
+                0x1D724E9A,0xA5CE29FF,0xB77B8611,0x0FC7E174,0x9210D9CD,0x2AACBEA8,0x38191146,0x80A57623,
+                0xD8C66675,0x607A0110,0x72CFAEFE,0xCA73C99B,0x57A4F122,0xEF189647,0xFDAD39A9,0x45115ECC,
+                0x764DEE06,0xCEF18963,0xDC44268D,0x64F841E8,0xF92F7951,0x41931E34,0x5326B1DA,0xEB9AD6BF,
+                0xB3F9C6E9,0x0B45A18C,0x19F00E62,0xA14C6907,0x3C9B51BE,0x842736DB,0x96929935,0x2E2EFE50,
+                0x2654B999,0x9EE8DEFC,0x8C5D7112,0x34E11677,0xA9362ECE,0x118A49AB,0x033FE645,0xBB838120,
+                0xE3E09176,0x5B5CF613,0x49E959FD,0xF1553E98,0x6C820621,0xD43E6144,0xC68BCEAA,0x7E37A9CF,
+                0xD67F4138,0x6EC3265D,0x7C7689B3,0xC4CAEED6,0x591DD66F,0xE1A1B10A,0xF3141EE4,0x4BA87981,
+                0x13CB69D7,0xAB770EB2,0xB9C2A15C,0x017EC639,0x9CA9FE80,0x241599E5,0x36A0360B,0x8E1C516E,
+                0x866616A7,0x3EDA71C2,0x2C6FDE2C,0x94D3B949,0x090481F0,0xB1B8E695,0xA30D497B,0x1BB12E1E,
+                0x43D23E48,0xFB6E592D,0xE9DBF6C3,0x516791A6,0xCCB0A91F,0x740CCE7A,0x66B96194,0xDE0506F1,
+            },
+            {
+                0x00000000,0x3D6029B0,0x7AC05360,0x47A07AD0,0xF580A6C0,0xC8E08F70,0x8F40F5A0,0xB220DC10,
+                0x30704BC1,0x0D106271,0x4AB018A1,0x77D03111,0xC5F0ED01,0xF890C4B1,0xBF30BE61,0x825097D1,
+                0x60E09782,0x5D80BE32,0x1A20C4E2,0x2740ED52,0x95603142,0xA80018F2,0xEFA06222,0xD2C04B92,
+                0x5090DC43,0x6DF0F5F3,0x2A508F23,0x1730A693,0xA5107A83,0x98705333,0xDFD029E3,0xE2B00053,
+                0xC1C12F04,0xFCA106B4,0xBB017C64,0x866155D4,0x344189C4,0x0921A074,0x4E81DAA4,0x73E1F314,
+                0xF1B164C5,0xCCD14D75,0x8B7137A5,0xB6111E15,0x0431C205,0x3951EBB5,0x7EF19165,0x4391B8D5,
+                0xA121B886,0x9C419136,0xDBE1EBE6,0xE681C256,0x54A11E46,0x69C137F6,0x2E614D26,0x13016496,
+                0x9151F347,0xAC31DAF7,0xEB91A027,0xD6F18997,0x64D15587,0x59B17C37,0x1E1106E7,0x23712F57,
+                0x58F35849,0x659371F9,0x22330B29,0x1F532299,0xAD73FE89,0x9013D739,0xD7B3ADE9,0xEAD38459,
+                0x68831388,0x55E33A38,0x124340E8,0x2F236958,0x9D03B548,0xA0639CF8,0xE7C3E628,0xDAA3CF98,
+                0x3813CFCB,0x0573E67B,0x42D39CAB,0x7FB3B51B,0xCD93690B,0xF0F340BB,0xB7533A6B,0x8A3313DB,
+                0x0863840A,0x3503ADBA,0x72A3D76A,0x4FC3FEDA,0xFDE322CA,0xC0830B7A,0x872371AA,0xBA43581A,
+                0x9932774D,0xA4525EFD,0xE3F2242D,0xDE920D9D,0x6CB2D18D,0x51D2F83D,0x167282ED,0x2B12AB5D,
+                0xA9423C8C,0x9422153C,0xD3826FEC,0xEEE2465C,0x5CC29A4C,0x61A2B3FC,0x2602C92C,0x1B62E09C,
+                0xF9D2E0CF,0xC4B2C97F,0x8312B3AF,0xBE729A1F,0x0C52460F,0x31326FBF,0x7692156F,0x4BF23CDF,
+                0xC9A2AB0E,0xF4C282BE,0xB362F86E,0x8E02D1DE,0x3C220DCE,0x0142247E,0x46E25EAE,0x7B82771E,
+                0xB1E6B092,0x8C869922,0xCB26E3F2,0xF646CA42,0x44661652,0x79063FE2,0x3EA64532,0x03C66C82,
+                0x8196FB53,0xBCF6D2E3,0xFB56A833,0xC6368183,0x74165D93,0x49767423,0x0ED60EF3,0x33B62743,
+                0xD1062710,0xEC660EA0,0xABC67470,0x96A65DC0,0x248681D0,0x19E6A860,0x5E46D2B0,0x6326FB00,
+                0xE1766CD1,0xDC164561,0x9BB63FB1,0xA6D61601,0x14F6CA11,0x2996E3A1,0x6E369971,0x5356B0C1,
+                0x70279F96,0x4D47B626,0x0AE7CCF6,0x3787E546,0x85A73956,0xB8C710E6,0xFF676A36,0xC2074386,
+                0x4057D457,0x7D37FDE7,0x3A978737,0x07F7AE87,0xB5D77297,0x88B75B27,0xCF1721F7,0xF2770847,
+                0x10C70814,0x2DA721A4,0x6A075B74,0x576772C4,0xE547AED4,0xD8278764,0x9F87FDB4,0xA2E7D404,
+                0x20B743D5,0x1DD76A65,0x5A7710B5,0x67173905,0xD537E515,0xE857CCA5,0xAFF7B675,0x92979FC5,
+                0xE915E8DB,0xD475C16B,0x93D5BBBB,0xAEB5920B,0x1C954E1B,0x21F567AB,0x66551D7B,0x5B3534CB,
+                0xD965A31A,0xE4058AAA,0xA3A5F07A,0x9EC5D9CA,0x2CE505DA,0x11852C6A,0x562556BA,0x6B457F0A,
+                0x89F57F59,0xB49556E9,0xF3352C39,0xCE550589,0x7C75D999,0x4115F029,0x06B58AF9,0x3BD5A349,
+                0xB9853498,0x84E51D28,0xC34567F8,0xFE254E48,0x4C059258,0x7165BBE8,0x36C5C138,0x0BA5E888,
+                0x28D4C7DF,0x15B4EE6F,0x521494BF,0x6F74BD0F,0xDD54611F,0xE03448AF,0xA794327F,0x9AF41BCF,
+                0x18A48C1E,0x25C4A5AE,0x6264DF7E,0x5F04F6CE,0xED242ADE,0xD044036E,0x97E479BE,0xAA84500E,
+                0x4834505D,0x755479ED,0x32F4033D,0x0F942A8D,0xBDB4F69D,0x80D4DF2D,0xC774A5FD,0xFA148C4D,
+                0x78441B9C,0x4524322C,0x028448FC,0x3FE4614C,0x8DC4BD5C,0xB0A494EC,0xF704EE3C,0xCA64C78C,
+            },
+            {
+                0x00000000,0xCB5CD3A5,0x4DC8A10B,0x869472AE,0x9B914216,0x50CD91B3,0xD659E31D,0x1D0530B8,
+                0xEC53826D,0x270F51C8,0xA19B2366,0x6AC7F0C3,0x77C2C07B,0xBC9E13DE,0x3A0A6170,0xF156B2D5,
+                0x03D6029B,0xC88AD13E,0x4E1EA390,0x85427035,0x9847408D,0x531B9328,0xD58FE186,0x1ED33223,
+                0xEF8580F6,0x24D95353,0xA24D21FD,0x6911F258,0x7414C2E0,0xBF481145,0x39DC63EB,0xF280B04E,
+                0x07AC0536,0xCCF0D693,0x4A64A43D,0x81387798,0x9C3D4720,0x57619485,0xD1F5E62B,0x1AA9358E,
+                0xEBFF875B,0x20A354FE,0xA6372650,0x6D6BF5F5,0x706EC54D,0xBB3216E8,0x3DA66446,0xF6FAB7E3,
+                0x047A07AD,0xCF26D408,0x49B2A6A6,0x82EE7503,0x9FEB45BB,0x54B7961E,0xD223E4B0,0x197F3715,
+                0xE82985C0,0x23755665,0xA5E124CB,0x6EBDF76E,0x73B8C7D6,0xB8E41473,0x3E7066DD,0xF52CB578,
+                0x0F580A6C,0xC404D9C9,0x4290AB67,0x89CC78C2,0x94C9487A,0x5F959BDF,0xD901E971,0x125D3AD4,
+                0xE30B8801,0x28575BA4,0xAEC3290A,0x659FFAAF,0x789ACA17,0xB3C619B2,0x35526B1C,0xFE0EB8B9,
+                0x0C8E08F7,0xC7D2DB52,0x4146A9FC,0x8A1A7A59,0x971F4AE1,0x5C439944,0xDAD7EBEA,0x118B384F,
+                0xE0DD8A9A,0x2B81593F,0xAD152B91,0x6649F834,0x7B4CC88C,0xB0101B29,0x36846987,0xFDD8BA22,
+                0x08F40F5A,0xC3A8DCFF,0x453CAE51,0x8E607DF4,0x93654D4C,0x58399EE9,0xDEADEC47,0x15F13FE2,
+                0xE4A78D37,0x2FFB5E92,0xA96F2C3C,0x6233FF99,0x7F36CF21,0xB46A1C84,0x32FE6E2A,0xF9A2BD8F,
+                0x0B220DC1,0xC07EDE64,0x46EAACCA,0x8DB67F6F,0x90B34FD7,0x5BEF9C72,0xDD7BEEDC,0x16273D79,
+                0xE7718FAC,0x2C2D5C09,0xAAB92EA7,0x61E5FD02,0x7CE0CDBA,0xB7BC1E1F,0x31286CB1,0xFA74BF14,
+                0x1EB014D8,0xD5ECC77D,0x5378B5D3,0x98246676,0x852156CE,0x4E7D856B,0xC8E9F7C5,0x03B52460,
+                0xF2E396B5,0x39BF4510,0xBF2B37BE,0x7477E41B,0x6972D4A3,0xA22E0706,0x24BA75A8,0xEFE6A60D,
+                0x1D661643,0xD63AC5E6,0x50AEB748,0x9BF264ED,0x86F75455,0x4DAB87F0,0xCB3FF55E,0x006326FB,
+                0xF135942E,0x3A69478B,0xBCFD3525,0x77A1E680,0x6AA4D638,0xA1F8059D,0x276C7733,0xEC30A496,
+                0x191C11EE,0xD240C24B,0x54D4B0E5,0x9F886340,0x828D53F8,0x49D1805D,0xCF45F2F3,0x04192156,
+                0xF54F9383,0x3E134026,0xB8873288,0x73DBE12D,0x6EDED195,0xA5820230,0x2316709E,0xE84AA33B,
+                0x1ACA1375,0xD196C0D0,0x5702B27E,0x9C5E61DB,0x815B5163,0x4A0782C6,0xCC93F068,0x07CF23CD,
+                0xF6999118,0x3DC542BD,0xBB513013,0x700DE3B6,0x6D08D30E,0xA65400AB,0x20C07205,0xEB9CA1A0,
+                0x11E81EB4,0xDAB4CD11,0x5C20BFBF,0x977C6C1A,0x8A795CA2,0x41258F07,0xC7B1FDA9,0x0CED2E0C,
+                0xFDBB9CD9,0x36E74F7C,0xB0733DD2,0x7B2FEE77,0x662ADECF,0xAD760D6A,0x2BE27FC4,0xE0BEAC61,
+                0x123E1C2F,0xD962CF8A,0x5FF6BD24,0x94AA6E81,0x89AF5E39,0x42F38D9C,0xC467FF32,0x0F3B2C97,
+                0xFE6D9E42,0x35314DE7,0xB3A53F49,0x78F9ECEC,0x65FCDC54,0xAEA00FF1,0x28347D5F,0xE368AEFA,
+                0x16441B82,0xDD18C827,0x5B8CBA89,0x90D0692C,0x8DD55994,0x46898A31,0xC01DF89F,0x0B412B3A,
+                0xFA1799EF,0x314B4A4A,0xB7DF38E4,0x7C83EB41,0x6186DBF9,0xAADA085C,0x2C4E7AF2,0xE712A957,
+                0x15921919,0xDECECABC,0x585AB812,0x93066BB7,0x8E035B0F,0x455F88AA,0xC3CBFA04,0x089729A1,
+                0xF9C19B74,0x329D48D1,0xB4093A7F,0x7F55E9DA,0x6250D962,0xA90C0AC7,0x2F987869,0xE4C4ABCC,
+            },
+            {
+                0x00000000,0xA6770BB4,0x979F1129,0x31E81A9D,0xF44F2413,0x52382FA7,0x63D0353A,0xC5A73E8E,
+                0x33EF4E67,0x959845D3,0xA4705F4E,0x020754FA,0xC7A06A74,0x61D761C0,0x503F7B5D,0xF64870E9,
+                0x67DE9CCE,0xC1A9977A,0xF0418DE7,0x56368653,0x9391B8DD,0x35E6B369,0x040EA9F4,0xA279A240,
+                0x5431D2A9,0xF246D91D,0xC3AEC380,0x65D9C834,0xA07EF6BA,0x0609FD0E,0x37E1E793,0x9196EC27,
+                0xCFBD399C,0x69CA3228,0x582228B5,0xFE552301,0x3BF21D8F,0x9D85163B,0xAC6D0CA6,0x0A1A0712,
+                0xFC5277FB,0x5A257C4F,0x6BCD66D2,0xCDBA6D66,0x081D53E8,0xAE6A585C,0x9F8242C1,0x39F54975,
+                0xA863A552,0x0E14AEE6,0x3FFCB47B,0x998BBFCF,0x5C2C8141,0xFA5B8AF5,0xCBB39068,0x6DC49BDC,
+                0x9B8CEB35,0x3DFBE081,0x0C13FA1C,0xAA64F1A8,0x6FC3CF26,0xC9B4C492,0xF85CDE0F,0x5E2BD5BB,
+                0x440B7579,0xE27C7ECD,0xD3946450,0x75E36FE4,0xB044516A,0x16335ADE,0x27DB4043,0x81AC4BF7,
+                0x77E43B1E,0xD19330AA,0xE07B2A37,0x460C2183,0x83AB1F0D,0x25DC14B9,0x14340E24,0xB2430590,
+                0x23D5E9B7,0x85A2E203,0xB44AF89E,0x123DF32A,0xD79ACDA4,0x71EDC610,0x4005DC8D,0xE672D739,
+                0x103AA7D0,0xB64DAC64,0x87A5B6F9,0x21D2BD4D,0xE47583C3,0x42028877,0x73EA92EA,0xD59D995E,
+                0x8BB64CE5,0x2DC14751,0x1C295DCC,0xBA5E5678,0x7FF968F6,0xD98E6342,0xE86679DF,0x4E11726B,
+                0xB8590282,0x1E2E0936,0x2FC613AB,0x89B1181F,0x4C162691,0xEA612D25,0xDB8937B8,0x7DFE3C0C,
+                0xEC68D02B,0x4A1FDB9F,0x7BF7C102,0xDD80CAB6,0x1827F438,0xBE50FF8C,0x8FB8E511,0x29CFEEA5,
+                0xDF879E4C,0x79F095F8,0x48188F65,0xEE6F84D1,0x2BC8BA5F,0x8DBFB1EB,0xBC57AB76,0x1A20A0C2,
+                0x8816EAF2,0x2E61E146,0x1F89FBDB,0xB9FEF06F,0x7C59CEE1,0xDA2EC555,0xEBC6DFC8,0x4DB1D47C,
+                0xBBF9A495,0x1D8EAF21,0x2C66B5BC,0x8A11BE08,0x4FB68086,0xE9C18B32,0xD82991AF,0x7E5E9A1B,
+                0xEFC8763C,0x49BF7D88,0x78576715,0xDE206CA1,0x1B87522F,0xBDF0599B,0x8C184306,0x2A6F48B2,
+                0xDC27385B,0x7A5033EF,0x4BB82972,0xEDCF22C6,0x28681C48,0x8E1F17FC,0xBFF70D61,0x198006D5,
+                0x47ABD36E,0xE1DCD8DA,0xD034C247,0x7643C9F3,0xB3E4F77D,0x1593FCC9,0x247BE654,0x820CEDE0,
+                0x74449D09,0xD23396BD,0xE3DB8C20,0x45AC8794,0x800BB91A,0x267CB2AE,0x1794A833,0xB1E3A387,
+                0x20754FA0,0x86024414,0xB7EA5E89,0x119D553D,0xD43A6BB3,0x724D6007,0x43A57A9A,0xE5D2712E,
+                0x139A01C7,0xB5ED0A73,0x840510EE,0x22721B5A,0xE7D525D4,0x41A22E60,0x704A34FD,0xD63D3F49,
+                0xCC1D9F8B,0x6A6A943F,0x5B828EA2,0xFDF58516,0x3852BB98,0x9E25B02C,0xAFCDAAB1,0x09BAA105,
+                0xFFF2D1EC,0x5985DA58,0x686DC0C5,0xCE1ACB71,0x0BBDF5FF,0xADCAFE4B,0x9C22E4D6,0x3A55EF62,
+                0xABC30345,0x0DB408F1,0x3C5C126C,0x9A2B19D8,0x5F8C2756,0xF9FB2CE2,0xC813367F,0x6E643DCB,
+                0x982C4D22,0x3E5B4696,0x0FB35C0B,0xA9C457BF,0x6C636931,0xCA146285,0xFBFC7818,0x5D8B73AC,
+                0x03A0A617,0xA5D7ADA3,0x943FB73E,0x3248BC8A,0xF7EF8204,0x519889B0,0x6070932D,0xC6079899,
+                0x304FE870,0x9638E3C4,0xA7D0F959,0x01A7F2ED,0xC400CC63,0x6277C7D7,0x539FDD4A,0xF5E8D6FE,
+                0x647E3AD9,0xC209316D,0xF3E12BF0,0x55962044,0x90311ECA,0x3646157E,0x07AE0FE3,0xA1D90457,
+                0x579174BE,0xF1E67F0A,0xC00E6597,0x66796E23,0xA3DE50AD,0x05A95B19,0x34414184,0x92364A30,
+            },
+            {
+                0x00000000,0xCCAA009E,0x4225077D,0x8E8F07E3,0x844A0EFA,0x48E00E64,0xC66F0987,0x0AC50919,
+                0xD3E51BB5,0x1F4F1B2B,0x91C01CC8,0x5D6A1C56,0x57AF154F,0x9B0515D1,0x158A1232,0xD92012AC,
+                0x7CBB312B,0xB01131B5,0x3E9E3656,0xF23436C8,0xF8F13FD1,0x345B3F4F,0xBAD438AC,0x767E3832,
+                0xAF5E2A9E,0x63F42A00,0xED7B2DE3,0x21D12D7D,0x2B142464,0xE7BE24FA,0x69312319,0xA59B2387,
+                0xF9766256,0x35DC62C8,0xBB53652B,0x77F965B5,0x7D3C6CAC,0xB1966C32,0x3F196BD1,0xF3B36B4F,
+                0x2A9379E3,0xE639797D,0x68B67E9E,0xA41C7E00,0xAED97719,0x62737787,0xECFC7064,0x205670FA,
+                0x85CD537D,0x496753E3,0xC7E85400,0x0B42549E,0x01875D87,0xCD2D5D19,0x43A25AFA,0x8F085A64,
+                0x562848C8,0x9A824856,0x140D4FB5,0xD8A74F2B,0xD2624632,0x1EC846AC,0x9047414F,0x5CED41D1,
+                0x299DC2ED,0xE537C273,0x6BB8C590,0xA712C50E,0xADD7CC17,0x617DCC89,0xEFF2CB6A,0x2358CBF4,
+                0xFA78D958,0x36D2D9C6,0xB85DDE25,0x74F7DEBB,0x7E32D7A2,0xB298D73C,0x3C17D0DF,0xF0BDD041,
+                0x5526F3C6,0x998CF358,0x1703F4BB,0xDBA9F425,0xD16CFD3C,0x1DC6FDA2,0x9349FA41,0x5FE3FADF,
+                0x86C3E873,0x4A69E8ED,0xC4E6EF0E,0x084CEF90,0x0289E689,0xCE23E617,0x40ACE1F4,0x8C06E16A,
+                0xD0EBA0BB,0x1C41A025,0x92CEA7C6,0x5E64A758,0x54A1AE41,0x980BAEDF,0x1684A93C,0xDA2EA9A2,
+                0x030EBB0E,0xCFA4BB90,0x412BBC73,0x8D81BCED,0x8744B5F4,0x4BEEB56A,0xC561B289,0x09CBB217,
+                0xAC509190,0x60FA910E,0xEE7596ED,0x22DF9673,0x281A9F6A,0xE4B09FF4,0x6A3F9817,0xA6959889,
+                0x7FB58A25,0xB31F8ABB,0x3D908D58,0xF13A8DC6,0xFBFF84DF,0x37558441,0xB9DA83A2,0x7570833C,
+                0x533B85DA,0x9F918544,0x111E82A7,0xDDB48239,0xD7718B20,0x1BDB8BBE,0x95548C5D,0x59FE8CC3,
+                0x80DE9E6F,0x4C749EF1,0xC2FB9912,0x0E51998C,0x04949095,0xC83E900B,0x46B197E8,0x8A1B9776,
+                0x2F80B4F1,0xE32AB46F,0x6DA5B38C,0xA10FB312,0xABCABA0B,0x6760BA95,0xE9EFBD76,0x2545BDE8,
+                0xFC65AF44,0x30CFAFDA,0xBE40A839,0x72EAA8A7,0x782FA1BE,0xB485A120,0x3A0AA6C3,0xF6A0A65D,
+                0xAA4DE78C,0x66E7E712,0xE868E0F1,0x24C2E06F,0x2E07E976,0xE2ADE9E8,0x6C22EE0B,0xA088EE95,
+                0x79A8FC39,0xB502FCA7,0x3B8DFB44,0xF727FBDA,0xFDE2F2C3,0x3148F25D,0xBFC7F5BE,0x736DF520,
+                0xD6F6D6A7,0x1A5CD639,0x94D3D1DA,0x5879D144,0x52BCD85D,0x9E16D8C3,0x1099DF20,0xDC33DFBE,
+                0x0513CD12,0xC9B9CD8C,0x4736CA6F,0x8B9CCAF1,0x8159C3E8,0x4DF3C376,0xC37CC495,0x0FD6C40B,
+                0x7AA64737,0xB60C47A9,0x3883404A,0xF42940D4,0xFEEC49CD,0x32464953,0xBCC94EB0,0x70634E2E,
+                0xA9435C82,0x65E95C1C,0xEB665BFF,0x27CC5B61,0x2D095278,0xE1A352E6,0x6F2C5505,0xA386559B,
+                0x061D761C,0xCAB77682,0x44387161,0x889271FF,0x825778E6,0x4EFD7878,0xC0727F9B,0x0CD87F05,
+                0xD5F86DA9,0x19526D37,0x97DD6AD4,0x5B776A4A,0x51B26353,0x9D1863CD,0x1397642E,0xDF3D64B0,
+                0x83D02561,0x4F7A25FF,0xC1F5221C,0x0D5F2282,0x079A2B9B,0xCB302B05,0x45BF2CE6,0x89152C78,
+                0x50353ED4,0x9C9F3E4A,0x121039A9,0xDEBA3937,0xD47F302E,0x18D530B0,0x965A3753,0x5AF037CD,
+                0xFF6B144A,0x33C114D4,0xBD4E1337,0x71E413A9,0x7B211AB0,0xB78B1A2E,0x39041DCD,0xF5AE1D53,
+                0x2C8E0FFF,0xE0240F61,0x6EAB0882,0xA201081C,0xA8C40105,0x646E019B,0xEAE10678,0x264B06E6,
+            },
+            {
+                0x00000000,0x177B1443,0x2EF62886,0x398D3CC5,0x5DEC510C,0x4A97454F,0x731A798A,0x64616DC9,
+                0xBBD8A218,0xACA3B65B,0x952E8A9E,0x82559EDD,0xE634F314,0xF14FE757,0xC8C2DB92,0xDFB9CFD1,
+                0xACC04271,0xBBBB5632,0x82366AF7,0x954D7EB4,0xF12C137D,0xE657073E,0xDFDA3BFB,0xC8A12FB8,
+                0x1718E069,0x0063F42A,0x39EEC8EF,0x2E95DCAC,0x4AF4B165,0x5D8FA526,0x640299E3,0x73798DA0,
+                0x82F182A3,0x958A96E0,0xAC07AA25,0xBB7CBE66,0xDF1DD3AF,0xC866C7EC,0xF1EBFB29,0xE690EF6A,
+                0x392920BB,0x2E5234F8,0x17DF083D,0x00A41C7E,0x64C571B7,0x73BE65F4,0x4A335931,0x5D484D72,
+                0x2E31C0D2,0x394AD491,0x00C7E854,0x17BCFC17,0x73DD91DE,0x64A6859D,0x5D2BB958,0x4A50AD1B,
+                0x95E962CA,0x82927689,0xBB1F4A4C,0xAC645E0F,0xC80533C6,0xDF7E2785,0xE6F31B40,0xF1880F03,
+                0xDE920307,0xC9E91744,0xF0642B81,0xE71F3FC2,0x837E520B,0x94054648,0xAD887A8D,0xBAF36ECE,
+                0x654AA11F,0x7231B55C,0x4BBC8999,0x5CC79DDA,0x38A6F013,0x2FDDE450,0x1650D895,0x012BCCD6,
+                0x72524176,0x65295535,0x5CA469F0,0x4BDF7DB3,0x2FBE107A,0x38C50439,0x014838FC,0x16332CBF,
+                0xC98AE36E,0xDEF1F72D,0xE77CCBE8,0xF007DFAB,0x9466B262,0x831DA621,0xBA909AE4,0xADEB8EA7,
+                0x5C6381A4,0x4B1895E7,0x7295A922,0x65EEBD61,0x018FD0A8,0x16F4C4EB,0x2F79F82E,0x3802EC6D,
+                0xE7BB23BC,0xF0C037FF,0xC94D0B3A,0xDE361F79,0xBA5772B0,0xAD2C66F3,0x94A15A36,0x83DA4E75,
+                0xF0A3C3D5,0xE7D8D796,0xDE55EB53,0xC92EFF10,0xAD4F92D9,0xBA34869A,0x83B9BA5F,0x94C2AE1C,
+                0x4B7B61CD,0x5C00758E,0x658D494B,0x72F65D08,0x169730C1,0x01EC2482,0x38611847,0x2F1A0C04,
+                0x6655004F,0x712E140C,0x48A328C9,0x5FD83C8A,0x3BB95143,0x2CC24500,0x154F79C5,0x02346D86,
+                0xDD8DA257,0xCAF6B614,0xF37B8AD1,0xE4009E92,0x8061F35B,0x971AE718,0xAE97DBDD,0xB9ECCF9E,
+                0xCA95423E,0xDDEE567D,0xE4636AB8,0xF3187EFB,0x97791332,0x80020771,0xB98F3BB4,0xAEF42FF7,
+                0x714DE026,0x6636F465,0x5FBBC8A0,0x48C0DCE3,0x2CA1B12A,0x3BDAA569,0x025799AC,0x152C8DEF,
+                0xE4A482EC,0xF3DF96AF,0xCA52AA6A,0xDD29BE29,0xB948D3E0,0xAE33C7A3,0x97BEFB66,0x80C5EF25,
+                0x5F7C20F4,0x480734B7,0x718A0872,0x66F11C31,0x029071F8,0x15EB65BB,0x2C66597E,0x3B1D4D3D,
+                0x4864C09D,0x5F1FD4DE,0x6692E81B,0x71E9FC58,0x15889191,0x02F385D2,0x3B7EB917,0x2C05AD54,
+                0xF3BC6285,0xE4C776C6,0xDD4A4A03,0xCA315E40,0xAE503389,0xB92B27CA,0x80A61B0F,0x97DD0F4C,
+                0xB8C70348,0xAFBC170B,0x96312BCE,0x814A3F8D,0xE52B5244,0xF2504607,0xCBDD7AC2,0xDCA66E81,
+                0x031FA150,0x1464B513,0x2DE989D6,0x3A929D95,0x5EF3F05C,0x4988E41F,0x7005D8DA,0x677ECC99,
+                0x14074139,0x037C557A,0x3AF169BF,0x2D8A7DFC,0x49EB1035,0x5E900476,0x671D38B3,0x70662CF0,
+                0xAFDFE321,0xB8A4F762,0x8129CBA7,0x9652DFE4,0xF233B22D,0xE548A66E,0xDCC59AAB,0xCBBE8EE8,
+                0x3A3681EB,0x2D4D95A8,0x14C0A96D,0x03BBBD2E,0x67DAD0E7,0x70A1C4A4,0x492CF861,0x5E57EC22,
+                0x81EE23F3,0x969537B0,0xAF180B75,0xB8631F36,0xDC0272FF,0xCB7966BC,0xF2F45A79,0xE58F4E3A,
+                0x96F6C39A,0x818DD7D9,0xB800EB1C,0xAF7BFF5F,0xCB1A9296,0xDC6186D5,0xE5ECBA10,0xF297AE53,
+                0x2D2E6182,0x3A5575C1,0x03D84904,0x14A35D47,0x70C2308E,0x67B924CD,0x5E341808,0x494F0C4B,
+            },
+            {
+                0x00000000,0xEFC26B3E,0x04F5D03D,0xEB37BB03,0x09EBA07A,0xE629CB44,0x0D1E7047,0xE2DC1B79,
+                0x13D740F4,0xFC152BCA,0x172290C9,0xF8E0FBF7,0x1A3CE08E,0xF5FE8BB0,0x1EC930B3,0xF10B5B8D,
+                0x27AE81E8,0xC86CEAD6,0x235B51D5,0xCC993AEB,0x2E452192,0xC1874AAC,0x2AB0F1AF,0xC5729A91,
+                0x3479C11C,0xDBBBAA22,0x308C1121,0xDF4E7A1F,0x3D926166,0xD2500A58,0x3967B15B,0xD6A5DA65,
+                0x4F5D03D0,0xA09F68EE,0x4BA8D3ED,0xA46AB8D3,0x46B6A3AA,0xA974C894,0x42437397,0xAD8118A9,
+                0x5C8A4324,0xB348281A,0x587F9319,0xB7BDF827,0x5561E35E,0xBAA38860,0x51943363,0xBE56585D,
+                0x68F38238,0x8731E906,0x6C065205,0x83C4393B,0x61182242,0x8EDA497C,0x65EDF27F,0x8A2F9941,
+                0x7B24C2CC,0x94E6A9F2,0x7FD112F1,0x901379CF,0x72CF62B6,0x9D0D0988,0x763AB28B,0x99F8D9B5,
+                0x9EBA07A0,0x71786C9E,0x9A4FD79D,0x758DBCA3,0x9751A7DA,0x7893CCE4,0x93A477E7,0x7C661CD9,
+                0x8D6D4754,0x62AF2C6A,0x89989769,0x665AFC57,0x8486E72E,0x6B448C10,0x80733713,0x6FB15C2D,
+                0xB9148648,0x56D6ED76,0xBDE15675,0x52233D4B,0xB0FF2632,0x5F3D4D0C,0xB40AF60F,0x5BC89D31,
+                0xAAC3C6BC,0x4501AD82,0xAE361681,0x41F47DBF,0xA32866C6,0x4CEA0DF8,0xA7DDB6FB,0x481FDDC5,
+                0xD1E70470,0x3E256F4E,0xD512D44D,0x3AD0BF73,0xD80CA40A,0x37CECF34,0xDCF97437,0x333B1F09,
+                0xC2304484,0x2DF22FBA,0xC6C594B9,0x2907FF87,0xCBDBE4FE,0x24198FC0,0xCF2E34C3,0x20EC5FFD,
+                0xF6498598,0x198BEEA6,0xF2BC55A5,0x1D7E3E9B,0xFFA225E2,0x10604EDC,0xFB57F5DF,0x14959EE1,
+                0xE59EC56C,0x0A5CAE52,0xE16B1551,0x0EA97E6F,0xEC756516,0x03B70E28,0xE880B52B,0x0742DE15,
+                0xE6050901,0x09C7623F,0xE2F0D93C,0x0D32B202,0xEFEEA97B,0x002CC245,0xEB1B7946,0x04D91278,
+                0xF5D249F5,0x1A1022CB,0xF12799C8,0x1EE5F2F6,0xFC39E98F,0x13FB82B1,0xF8CC39B2,0x170E528C,
+                0xC1AB88E9,0x2E69E3D7,0xC55E58D4,0x2A9C33EA,0xC8402893,0x278243AD,0xCCB5F8AE,0x23779390,
+                0xD27CC81D,0x3DBEA323,0xD6891820,0x394B731E,0xDB976867,0x34550359,0xDF62B85A,0x30A0D364,
+                0xA9580AD1,0x469A61EF,0xADADDAEC,0x426FB1D2,0xA0B3AAAB,0x4F71C195,0xA4467A96,0x4B8411A8,
+                0xBA8F4A25,0x554D211B,0xBE7A9A18,0x51B8F126,0xB364EA5F,0x5CA68161,0xB7913A62,0x5853515C,
+                0x8EF68B39,0x6134E007,0x8A035B04,0x65C1303A,0x871D2B43,0x68DF407D,0x83E8FB7E,0x6C2A9040,
+                0x9D21CBCD,0x72E3A0F3,0x99D41BF0,0x761670CE,0x94CA6BB7,0x7B080089,0x903FBB8A,0x7FFDD0B4,
+                0x78BF0EA1,0x977D659F,0x7C4ADE9C,0x9388B5A2,0x7154AEDB,0x9E96C5E5,0x75A17EE6,0x9A6315D8,
+                0x6B684E55,0x84AA256B,0x6F9D9E68,0x805FF556,0x6283EE2F,0x8D418511,0x66763E12,0x89B4552C,
+                0x5F118F49,0xB0D3E477,0x5BE45F74,0xB426344A,0x56FA2F33,0xB938440D,0x520FFF0E,0xBDCD9430,
+                0x4CC6CFBD,0xA304A483,0x48331F80,0xA7F174BE,0x452D6FC7,0xAAEF04F9,0x41D8BFFA,0xAE1AD4C4,
+                0x37E20D71,0xD820664F,0x3317DD4C,0xDCD5B672,0x3E09AD0B,0xD1CBC635,0x3AFC7D36,0xD53E1608,
+                0x24354D85,0xCBF726BB,0x20C09DB8,0xCF02F686,0x2DDEEDFF,0xC21C86C1,0x292B3DC2,0xC6E956FC,
+                0x104C8C99,0xFF8EE7A7,0x14B95CA4,0xFB7B379A,0x19A72CE3,0xF66547DD,0x1D52FCDE,0xF29097E0,
+                0x039BCC6D,0xEC59A753,0x076E1C50,0xE8AC776E,0x0A706C17,0xE5B20729,0x0E85BC2A,0xE147D714,
+            },
+            {
+                0x00000000,0xC18EDFC0,0x586CB9C1,0x99E26601,0xB0D97382,0x7157AC42,0xE8B5CA43,0x293B1583,
+                0xBAC3E145,0x7B4D3E85,0xE2AF5884,0x23218744,0x0A1A92C7,0xCB944D07,0x52762B06,0x93F8F4C6,
+                0xAEF6C4CB,0x6F781B0B,0xF69A7D0A,0x3714A2CA,0x1E2FB749,0xDFA16889,0x46430E88,0x87CDD148,
+                0x1435258E,0xD5BBFA4E,0x4C599C4F,0x8DD7438F,0xA4EC560C,0x656289CC,0xFC80EFCD,0x3D0E300D,
+                0x869C8FD7,0x47125017,0xDEF03616,0x1F7EE9D6,0x3645FC55,0xF7CB2395,0x6E294594,0xAFA79A54,
+                0x3C5F6E92,0xFDD1B152,0x6433D753,0xA5BD0893,0x8C861D10,0x4D08C2D0,0xD4EAA4D1,0x15647B11,
+                0x286A4B1C,0xE9E494DC,0x7006F2DD,0xB1882D1D,0x98B3389E,0x593DE75E,0xC0DF815F,0x01515E9F,
+                0x92A9AA59,0x53277599,0xCAC51398,0x0B4BCC58,0x2270D9DB,0xE3FE061B,0x7A1C601A,0xBB92BFDA,
+                0xD64819EF,0x17C6C62F,0x8E24A02E,0x4FAA7FEE,0x66916A6D,0xA71FB5AD,0x3EFDD3AC,0xFF730C6C,
+                0x6C8BF8AA,0xAD05276A,0x34E7416B,0xF5699EAB,0xDC528B28,0x1DDC54E8,0x843E32E9,0x45B0ED29,
+                0x78BEDD24,0xB93002E4,0x20D264E5,0xE15CBB25,0xC867AEA6,0x09E97166,0x900B1767,0x5185C8A7,
+                0xC27D3C61,0x03F3E3A1,0x9A1185A0,0x5B9F5A60,0x72A44FE3,0xB32A9023,0x2AC8F622,0xEB4629E2,
+                0x50D49638,0x915A49F8,0x08B82FF9,0xC936F039,0xE00DE5BA,0x21833A7A,0xB8615C7B,0x79EF83BB,
+                0xEA17777D,0x2B99A8BD,0xB27BCEBC,0x73F5117C,0x5ACE04FF,0x9B40DB3F,0x02A2BD3E,0xC32C62FE,
+                0xFE2252F3,0x3FAC8D33,0xA64EEB32,0x67C034F2,0x4EFB2171,0x8F75FEB1,0x169798B0,0xD7194770,
+                0x44E1B3B6,0x856F6C76,0x1C8D0A77,0xDD03D5B7,0xF438C034,0x35B61FF4,0xAC5479F5,0x6DDAA635,
+                0x77E1359F,0xB66FEA5F,0x2F8D8C5E,0xEE03539E,0xC738461D,0x06B699DD,0x9F54FFDC,0x5EDA201C,
+                0xCD22D4DA,0x0CAC0B1A,0x954E6D1B,0x54C0B2DB,0x7DFBA758,0xBC757898,0x25971E99,0xE419C159,
+                0xD917F154,0x18992E94,0x817B4895,0x40F59755,0x69CE82D6,0xA8405D16,0x31A23B17,0xF02CE4D7,
+                0x63D41011,0xA25ACFD1,0x3BB8A9D0,0xFA367610,0xD30D6393,0x1283BC53,0x8B61DA52,0x4AEF0592,
+                0xF17DBA48,0x30F36588,0xA9110389,0x689FDC49,0x41A4C9CA,0x802A160A,0x19C8700B,0xD846AFCB,
+                0x4BBE5B0D,0x8A3084CD,0x13D2E2CC,0xD25C3D0C,0xFB67288F,0x3AE9F74F,0xA30B914E,0x62854E8E,
+                0x5F8B7E83,0x9E05A143,0x07E7C742,0xC6691882,0xEF520D01,0x2EDCD2C1,0xB73EB4C0,0x76B06B00,
+                0xE5489FC6,0x24C64006,0xBD242607,0x7CAAF9C7,0x5591EC44,0x941F3384,0x0DFD5585,0xCC738A45,
+                0xA1A92C70,0x6027F3B0,0xF9C595B1,0x384B4A71,0x11705FF2,0xD0FE8032,0x491CE633,0x889239F3,
+                0x1B6ACD35,0xDAE412F5,0x430674F4,0x8288AB34,0xABB3BEB7,0x6A3D6177,0xF3DF0776,0x3251D8B6,
+                0x0F5FE8BB,0xCED1377B,0x5733517A,0x96BD8EBA,0xBF869B39,0x7E0844F9,0xE7EA22F8,0x2664FD38,
+                0xB59C09FE,0x7412D63E,0xEDF0B03F,0x2C7E6FFF,0x05457A7C,0xC4CBA5BC,0x5D29C3BD,0x9CA71C7D,
+                0x2735A3A7,0xE6BB7C67,0x7F591A66,0xBED7C5A6,0x97ECD025,0x56620FE5,0xCF8069E4,0x0E0EB624,
+                0x9DF642E2,0x5C789D22,0xC59AFB23,0x041424E3,0x2D2F3160,0xECA1EEA0,0x754388A1,0xB4CD5761,
+                0x89C3676C,0x484DB8AC,0xD1AFDEAD,0x1021016D,0x391A14EE,0xF894CB2E,0x6176AD2F,0xA0F872EF,
+                0x33008629,0xF28E59E9,0x6B6C3FE8,0xAAE2E028,0x83D9F5AB,0x42572A6B,0xDBB54C6A,0x1A3B93AA,
+            },
+            {
+                0x00000000,0x9BA54C6F,0xEC3B9E9F,0x779ED2F0,0x03063B7F,0x98A37710,0xEF3DA5E0,0x7498E98F,
+                0x060C76FE,0x9DA93A91,0xEA37E861,0x7192A40E,0x050A4D81,0x9EAF01EE,0xE931D31E,0x72949F71,
+                0x0C18EDFC,0x97BDA193,0xE0237363,0x7B863F0C,0x0F1ED683,0x94BB9AEC,0xE325481C,0x78800473,
+                0x0A149B02,0x91B1D76D,0xE62F059D,0x7D8A49F2,0x0912A07D,0x92B7EC12,0xE5293EE2,0x7E8C728D,
+                0x1831DBF8,0x83949797,0xF40A4567,0x6FAF0908,0x1B37E087,0x8092ACE8,0xF70C7E18,0x6CA93277,
+                0x1E3DAD06,0x8598E169,0xF2063399,0x69A37FF6,0x1D3B9679,0x869EDA16,0xF10008E6,0x6AA54489,
+                0x14293604,0x8F8C7A6B,0xF812A89B,0x63B7E4F4,0x172F0D7B,0x8C8A4114,0xFB1493E4,0x60B1DF8B,
+                0x122540FA,0x89800C95,0xFE1EDE65,0x65BB920A,0x11237B85,0x8A8637EA,0xFD18E51A,0x66BDA975,
+                0x3063B7F0,0xABC6FB9F,0xDC58296F,0x47FD6500,0x33658C8F,0xA8C0C0E0,0xDF5E1210,0x44FB5E7F,
+                0x366FC10E,0xADCA8D61,0xDA545F91,0x41F113FE,0x3569FA71,0xAECCB61E,0xD95264EE,0x42F72881,
+                0x3C7B5A0C,0xA7DE1663,0xD040C493,0x4BE588FC,0x3F7D6173,0xA4D82D1C,0xD346FFEC,0x48E3B383,
+                0x3A772CF2,0xA1D2609D,0xD64CB26D,0x4DE9FE02,0x3971178D,0xA2D45BE2,0xD54A8912,0x4EEFC57D,
+                0x28526C08,0xB3F72067,0xC469F297,0x5FCCBEF8,0x2B545777,0xB0F11B18,0xC76FC9E8,0x5CCA8587,
+                0x2E5E1AF6,0xB5FB5699,0xC2658469,0x59C0C806,0x2D582189,0xB6FD6DE6,0xC163BF16,0x5AC6F379,
+                0x244A81F4,0xBFEFCD9B,0xC8711F6B,0x53D45304,0x274CBA8B,0xBCE9F6E4,0xCB772414,0x50D2687B,
+                0x2246F70A,0xB9E3BB65,0xCE7D6995,0x55D825FA,0x2140CC75,0xBAE5801A,0xCD7B52EA,0x56DE1E85,
+                0x60C76FE0,0xFB62238F,0x8CFCF17F,0x1759BD10,0x63C1549F,0xF86418F0,0x8FFACA00,0x145F866F,
+                0x66CB191E,0xFD6E5571,0x8AF08781,0x1155CBEE,0x65CD2261,0xFE686E0E,0x89F6BCFE,0x1253F091,
+                0x6CDF821C,0xF77ACE73,0x80E41C83,0x1B4150EC,0x6FD9B963,0xF47CF50C,0x83E227FC,0x18476B93,
+                0x6AD3F4E2,0xF176B88D,0x86E86A7D,0x1D4D2612,0x69D5CF9D,0xF27083F2,0x85EE5102,0x1E4B1D6D,
+                0x78F6B418,0xE353F877,0x94CD2A87,0x0F6866E8,0x7BF08F67,0xE055C308,0x97CB11F8,0x0C6E5D97,
+                0x7EFAC2E6,0xE55F8E89,0x92C15C79,0x09641016,0x7DFCF999,0xE659B5F6,0x91C76706,0x0A622B69,
+                0x74EE59E4,0xEF4B158B,0x98D5C77B,0x03708B14,0x77E8629B,0xEC4D2EF4,0x9BD3FC04,0x0076B06B,
+                0x72E22F1A,0xE9476375,0x9ED9B185,0x057CFDEA,0x71E41465,0xEA41580A,0x9DDF8AFA,0x067AC695,
+                0x50A4D810,0xCB01947F,0xBC9F468F,0x273A0AE0,0x53A2E36F,0xC807AF00,0xBF997DF0,0x243C319F,
+                0x56A8AEEE,0xCD0DE281,0xBA933071,0x21367C1E,0x55AE9591,0xCE0BD9FE,0xB9950B0E,0x22304761,
+                0x5CBC35EC,0xC7197983,0xB087AB73,0x2B22E71C,0x5FBA0E93,0xC41F42FC,0xB381900C,0x2824DC63,
+                0x5AB04312,0xC1150F7D,0xB68BDD8D,0x2D2E91E2,0x59B6786D,0xC2133402,0xB58DE6F2,0x2E28AA9D,
+                0x489503E8,0xD3304F87,0xA4AE9D77,0x3F0BD118,0x4B933897,0xD03674F8,0xA7A8A608,0x3C0DEA67,
+                0x4E997516,0xD53C3979,0xA2A2EB89,0x3907A7E6,0x4D9F4E69,0xD63A0206,0xA1A4D0F6,0x3A019C99,
+                0x448DEE14,0xDF28A27B,0xA8B6708B,0x33133CE4,0x478BD56B,0xDC2E9904,0xABB04BF4,0x3015079B,
+                0x428198EA,0xD924D485,0xAEBA0675,0x351F4A1A,0x4187A395,0xDA22EFFA,0xADBC3D0A,0x36197165,
+            },
+            {
+                0x00000000,0xDD96D985,0x605CB54B,0xBDCA6CCE,0xC0B96A96,0x1D2FB313,0xA0E5DFDD,0x7D730658,
+                0x5A03D36D,0x87950AE8,0x3A5F6626,0xE7C9BFA3,0x9ABAB9FB,0x472C607E,0xFAE60CB0,0x2770D535,
+                0xB407A6DA,0x69917F5F,0xD45B1391,0x09CDCA14,0x74BECC4C,0xA92815C9,0x14E27907,0xC974A082,
+                0xEE0475B7,0x3392AC32,0x8E58C0FC,0x53CE1979,0x2EBD1F21,0xF32BC6A4,0x4EE1AA6A,0x937773EF,
+                0xB37E4BF5,0x6EE89270,0xD322FEBE,0x0EB4273B,0x73C72163,0xAE51F8E6,0x139B9428,0xCE0D4DAD,
+                0xE97D9898,0x34EB411D,0x89212DD3,0x54B7F456,0x29C4F20E,0xF4522B8B,0x49984745,0x940E9EC0,
+                0x0779ED2F,0xDAEF34AA,0x67255864,0xBAB381E1,0xC7C087B9,0x1A565E3C,0xA79C32F2,0x7A0AEB77,
+                0x5D7A3E42,0x80ECE7C7,0x3D268B09,0xE0B0528C,0x9DC354D4,0x40558D51,0xFD9FE19F,0x2009381A,
+                0xBD8D91AB,0x601B482E,0xDDD124E0,0x0047FD65,0x7D34FB3D,0xA0A222B8,0x1D684E76,0xC0FE97F3,
+                0xE78E42C6,0x3A189B43,0x87D2F78D,0x5A442E08,0x27372850,0xFAA1F1D5,0x476B9D1B,0x9AFD449E,
+                0x098A3771,0xD41CEEF4,0x69D6823A,0xB4405BBF,0xC9335DE7,0x14A58462,0xA96FE8AC,0x74F93129,
+                0x5389E41C,0x8E1F3D99,0x33D55157,0xEE4388D2,0x93308E8A,0x4EA6570F,0xF36C3BC1,0x2EFAE244,
+                0x0EF3DA5E,0xD36503DB,0x6EAF6F15,0xB339B690,0xCE4AB0C8,0x13DC694D,0xAE160583,0x7380DC06,
+                0x54F00933,0x8966D0B6,0x34ACBC78,0xE93A65FD,0x944963A5,0x49DFBA20,0xF415D6EE,0x29830F6B,
+                0xBAF47C84,0x6762A501,0xDAA8C9CF,0x073E104A,0x7A4D1612,0xA7DBCF97,0x1A11A359,0xC7877ADC,
+                0xE0F7AFE9,0x3D61766C,0x80AB1AA2,0x5D3DC327,0x204EC57F,0xFDD81CFA,0x40127034,0x9D84A9B1,
+                0xA06A2517,0x7DFCFC92,0xC036905C,0x1DA049D9,0x60D34F81,0xBD459604,0x008FFACA,0xDD19234F,
+                0xFA69F67A,0x27FF2FFF,0x9A354331,0x47A39AB4,0x3AD09CEC,0xE7464569,0x5A8C29A7,0x871AF022,
+                0x146D83CD,0xC9FB5A48,0x74313686,0xA9A7EF03,0xD4D4E95B,0x094230DE,0xB4885C10,0x691E8595,
+                0x4E6E50A0,0x93F88925,0x2E32E5EB,0xF3A43C6E,0x8ED73A36,0x5341E3B3,0xEE8B8F7D,0x331D56F8,
+                0x13146EE2,0xCE82B767,0x7348DBA9,0xAEDE022C,0xD3AD0474,0x0E3BDDF1,0xB3F1B13F,0x6E6768BA,
+                0x4917BD8F,0x9481640A,0x294B08C4,0xF4DDD141,0x89AED719,0x54380E9C,0xE9F26252,0x3464BBD7,
+                0xA713C838,0x7A8511BD,0xC74F7D73,0x1AD9A4F6,0x67AAA2AE,0xBA3C7B2B,0x07F617E5,0xDA60CE60,
+                0xFD101B55,0x2086C2D0,0x9D4CAE1E,0x40DA779B,0x3DA971C3,0xE03FA846,0x5DF5C488,0x80631D0D,
+                0x1DE7B4BC,0xC0716D39,0x7DBB01F7,0xA02DD872,0xDD5EDE2A,0x00C807AF,0xBD026B61,0x6094B2E4,
+                0x47E467D1,0x9A72BE54,0x27B8D29A,0xFA2E0B1F,0x875D0D47,0x5ACBD4C2,0xE701B80C,0x3A976189,
+                0xA9E01266,0x7476CBE3,0xC9BCA72D,0x142A7EA8,0x695978F0,0xB4CFA175,0x0905CDBB,0xD493143E,
+                0xF3E3C10B,0x2E75188E,0x93BF7440,0x4E29ADC5,0x335AAB9D,0xEECC7218,0x53061ED6,0x8E90C753,
+                0xAE99FF49,0x730F26CC,0xCEC54A02,0x13539387,0x6E2095DF,0xB3B64C5A,0x0E7C2094,0xD3EAF911,
+                0xF49A2C24,0x290CF5A1,0x94C6996F,0x495040EA,0x342346B2,0xE9B59F37,0x547FF3F9,0x89E92A7C,
+                0x1A9E5993,0xC7088016,0x7AC2ECD8,0xA754355D,0xDA273305,0x07B1EA80,0xBA7B864E,0x67ED5FCB,
+                0x409D8AFE,0x9D0B537B,0x20C13FB5,0xFD57E630,0x8024E068,0x5DB239ED,0xE0785523,0x3DEE8CA6,
+            },
+            {
+                0x00000000,0x9D0FE176,0xE16EC4AD,0x7C6125DB,0x19AC8F1B,0x84A36E6D,0xF8C24BB6,0x65CDAAC0,
+                0x33591E36,0xAE56FF40,0xD237DA9B,0x4F383BED,0x2AF5912D,0xB7FA705B,0xCB9B5580,0x5694B4F6,
+                0x66B23C6C,0xFBBDDD1A,0x87DCF8C1,0x1AD319B7,0x7F1EB377,0xE2115201,0x9E7077DA,0x037F96AC,
+                0x55EB225A,0xC8E4C32C,0xB485E6F7,0x298A0781,0x4C47AD41,0xD1484C37,0xAD2969EC,0x3026889A,
+                0xCD6478D8,0x506B99AE,0x2C0ABC75,0xB1055D03,0xD4C8F7C3,0x49C716B5,0x35A6336E,0xA8A9D218,
+                0xFE3D66EE,0x63328798,0x1F53A243,0x825C4335,0xE791E9F5,0x7A9E0883,0x06FF2D58,0x9BF0CC2E,
+                0xABD644B4,0x36D9A5C2,0x4AB88019,0xD7B7616F,0xB27ACBAF,0x2F752AD9,0x53140F02,0xCE1BEE74,
+                0x988F5A82,0x0580BBF4,0x79E19E2F,0xE4EE7F59,0x8123D599,0x1C2C34EF,0x604D1134,0xFD42F042,
+                0x41B9F7F1,0xDCB61687,0xA0D7335C,0x3DD8D22A,0x581578EA,0xC51A999C,0xB97BBC47,0x24745D31,
+                0x72E0E9C7,0xEFEF08B1,0x938E2D6A,0x0E81CC1C,0x6B4C66DC,0xF64387AA,0x8A22A271,0x172D4307,
+                0x270BCB9D,0xBA042AEB,0xC6650F30,0x5B6AEE46,0x3EA74486,0xA3A8A5F0,0xDFC9802B,0x42C6615D,
+                0x1452D5AB,0x895D34DD,0xF53C1106,0x6833F070,0x0DFE5AB0,0x90F1BBC6,0xEC909E1D,0x719F7F6B,
+                0x8CDD8F29,0x11D26E5F,0x6DB34B84,0xF0BCAAF2,0x95710032,0x087EE144,0x741FC49F,0xE91025E9,
+                0xBF84911F,0x228B7069,0x5EEA55B2,0xC3E5B4C4,0xA6281E04,0x3B27FF72,0x4746DAA9,0xDA493BDF,
+                0xEA6FB345,0x77605233,0x0B0177E8,0x960E969E,0xF3C33C5E,0x6ECCDD28,0x12ADF8F3,0x8FA21985,
+                0xD936AD73,0x44394C05,0x385869DE,0xA55788A8,0xC09A2268,0x5D95C31E,0x21F4E6C5,0xBCFB07B3,
+                0x8373EFE2,0x1E7C0E94,0x621D2B4F,0xFF12CA39,0x9ADF60F9,0x07D0818F,0x7BB1A454,0xE6BE4522,
+                0xB02AF1D4,0x2D2510A2,0x51443579,0xCC4BD40F,0xA9867ECF,0x34899FB9,0x48E8BA62,0xD5E75B14,
+                0xE5C1D38E,0x78CE32F8,0x04AF1723,0x99A0F655,0xFC6D5C95,0x6162BDE3,0x1D039838,0x800C794E,
+                0xD698CDB8,0x4B972CCE,0x37F60915,0xAAF9E863,0xCF3442A3,0x523BA3D5,0x2E5A860E,0xB3556778,
+                0x4E17973A,0xD318764C,0xAF795397,0x3276B2E1,0x57BB1821,0xCAB4F957,0xB6D5DC8C,0x2BDA3DFA,
+                0x7D4E890C,0xE041687A,0x9C204DA1,0x012FACD7,0x64E20617,0xF9EDE761,0x858CC2BA,0x188323CC,
+                0x28A5AB56,0xB5AA4A20,0xC9CB6FFB,0x54C48E8D,0x3109244D,0xAC06C53B,0xD067E0E0,0x4D680196,
+                0x1BFCB560,0x86F35416,0xFA9271CD,0x679D90BB,0x02503A7B,0x9F5FDB0D,0xE33EFED6,0x7E311FA0,
+                0xC2CA1813,0x5FC5F965,0x23A4DCBE,0xBEAB3DC8,0xDB669708,0x4669767E,0x3A0853A5,0xA707B2D3,
+                0xF1930625,0x6C9CE753,0x10FDC288,0x8DF223FE,0xE83F893E,0x75306848,0x09514D93,0x945EACE5,
+                0xA478247F,0x3977C509,0x4516E0D2,0xD81901A4,0xBDD4AB64,0x20DB4A12,0x5CBA6FC9,0xC1B58EBF,
+                0x97213A49,0x0A2EDB3F,0x764FFEE4,0xEB401F92,0x8E8DB552,0x13825424,0x6FE371FF,0xF2EC9089,
+                0x0FAE60CB,0x92A181BD,0xEEC0A466,0x73CF4510,0x1602EFD0,0x8B0D0EA6,0xF76C2B7D,0x6A63CA0B,
+                0x3CF77EFD,0xA1F89F8B,0xDD99BA50,0x40965B26,0x255BF1E6,0xB8541090,0xC435354B,0x593AD43D,
+                0x691C5CA7,0xF413BDD1,0x8872980A,0x157D797C,0x70B0D3BC,0xEDBF32CA,0x91DE1711,0x0CD1F667,
+                0x5A454291,0xC74AA3E7,0xBB2B863C,0x2624674A,0x43E9CD8A,0xDEE62CFC,0xA2870927,0x3F88E851,
+            },
+            {
+                0x00000000,0xB9FBDBE8,0xA886B191,0x117D6A79,0x8A7C6563,0x3387BE8B,0x22FAD4F2,0x9B010F1A,
+                0xCF89CC87,0x7672176F,0x670F7D16,0xDEF4A6FE,0x45F5A9E4,0xFC0E720C,0xED731875,0x5488C39D,
+                0x44629F4F,0xFD9944A7,0xECE42EDE,0x551FF536,0xCE1EFA2C,0x77E521C4,0x66984BBD,0xDF639055,
+                0x8BEB53C8,0x32108820,0x236DE259,0x9A9639B1,0x019736AB,0xB86CED43,0xA911873A,0x10EA5CD2,
+                0x88C53E9E,0x313EE576,0x20438F0F,0x99B854E7,0x02B95BFD,0xBB428015,0xAA3FEA6C,0x13C43184,
+                0x474CF219,0xFEB729F1,0xEFCA4388,0x56319860,0xCD30977A,0x74CB4C92,0x65B626EB,0xDC4DFD03,
+                0xCCA7A1D1,0x755C7A39,0x64211040,0xDDDACBA8,0x46DBC4B2,0xFF201F5A,0xEE5D7523,0x57A6AECB,
+                0x032E6D56,0xBAD5B6BE,0xABA8DCC7,0x1253072F,0x89520835,0x30A9D3DD,0x21D4B9A4,0x982F624C,
+                0xCAFB7B7D,0x7300A095,0x627DCAEC,0xDB861104,0x40871E1E,0xF97CC5F6,0xE801AF8F,0x51FA7467,
+                0x0572B7FA,0xBC896C12,0xADF4066B,0x140FDD83,0x8F0ED299,0x36F50971,0x27886308,0x9E73B8E0,
+                0x8E99E432,0x37623FDA,0x261F55A3,0x9FE48E4B,0x04E58151,0xBD1E5AB9,0xAC6330C0,0x1598EB28,
+                0x411028B5,0xF8EBF35D,0xE9969924,0x506D42CC,0xCB6C4DD6,0x7297963E,0x63EAFC47,0xDA1127AF,
+                0x423E45E3,0xFBC59E0B,0xEAB8F472,0x53432F9A,0xC8422080,0x71B9FB68,0x60C49111,0xD93F4AF9,
+                0x8DB78964,0x344C528C,0x253138F5,0x9CCAE31D,0x07CBEC07,0xBE3037EF,0xAF4D5D96,0x16B6867E,
+                0x065CDAAC,0xBFA70144,0xAEDA6B3D,0x1721B0D5,0x8C20BFCF,0x35DB6427,0x24A60E5E,0x9D5DD5B6,
+                0xC9D5162B,0x702ECDC3,0x6153A7BA,0xD8A87C52,0x43A97348,0xFA52A8A0,0xEB2FC2D9,0x52D41931,
+                0x4E87F0BB,0xF77C2B53,0xE601412A,0x5FFA9AC2,0xC4FB95D8,0x7D004E30,0x6C7D2449,0xD586FFA1,
+                0x810E3C3C,0x38F5E7D4,0x29888DAD,0x90735645,0x0B72595F,0xB28982B7,0xA3F4E8CE,0x1A0F3326,
+                0x0AE56FF4,0xB31EB41C,0xA263DE65,0x1B98058D,0x80990A97,0x3962D17F,0x281FBB06,0x91E460EE,
+                0xC56CA373,0x7C97789B,0x6DEA12E2,0xD411C90A,0x4F10C610,0xF6EB1DF8,0xE7967781,0x5E6DAC69,
+                0xC642CE25,0x7FB915CD,0x6EC47FB4,0xD73FA45C,0x4C3EAB46,0xF5C570AE,0xE4B81AD7,0x5D43C13F,
+                0x09CB02A2,0xB030D94A,0xA14DB333,0x18B668DB,0x83B767C1,0x3A4CBC29,0x2B31D650,0x92CA0DB8,
+                0x8220516A,0x3BDB8A82,0x2AA6E0FB,0x935D3B13,0x085C3409,0xB1A7EFE1,0xA0DA8598,0x19215E70,
+                0x4DA99DED,0xF4524605,0xE52F2C7C,0x5CD4F794,0xC7D5F88E,0x7E2E2366,0x6F53491F,0xD6A892F7,
+                0x847C8BC6,0x3D87502E,0x2CFA3A57,0x9501E1BF,0x0E00EEA5,0xB7FB354D,0xA6865F34,0x1F7D84DC,
+                0x4BF54741,0xF20E9CA9,0xE373F6D0,0x5A882D38,0xC1892222,0x7872F9CA,0x690F93B3,0xD0F4485B,
+                0xC01E1489,0x79E5CF61,0x6898A518,0xD1637EF0,0x4A6271EA,0xF399AA02,0xE2E4C07B,0x5B1F1B93,
+                0x0F97D80E,0xB66C03E6,0xA711699F,0x1EEAB277,0x85EBBD6D,0x3C106685,0x2D6D0CFC,0x9496D714,
+                0x0CB9B558,0xB5426EB0,0xA43F04C9,0x1DC4DF21,0x86C5D03B,0x3F3E0BD3,0x2E4361AA,0x97B8BA42,
+                0xC33079DF,0x7ACBA237,0x6BB6C84E,0xD24D13A6,0x494C1CBC,0xF0B7C754,0xE1CAAD2D,0x583176C5,
+                0x48DB2A17,0xF120F1FF,0xE05D9B86,0x59A6406E,0xC2A74F74,0x7B5C949C,0x6A21FEE5,0xD3DA250D,
+                0x8752E690,0x3EA93D78,0x2FD45701,0x962F8CE9,0x0D2E83F3,0xB4D5581B,0xA5A83262,0x1C53E98A,
+            },
+            {
+                0x00000000,0xAE689191,0x87A02563,0x29C8B4F2,0xD4314C87,0x7A59DD16,0x539169E4,0xFDF9F875,
+                0x73139F4F,0xDD7B0EDE,0xF4B3BA2C,0x5ADB2BBD,0xA722D3C8,0x094A4259,0x2082F6AB,0x8EEA673A,
+                0xE6273E9E,0x484FAF0F,0x61871BFD,0xCFEF8A6C,0x32167219,0x9C7EE388,0xB5B6577A,0x1BDEC6EB,
+                0x9534A1D1,0x3B5C3040,0x129484B2,0xBCFC1523,0x4105ED56,0xEF6D7CC7,0xC6A5C835,0x68CD59A4,
+                0x173F7B7D,0xB957EAEC,0x909F5E1E,0x3EF7CF8F,0xC30E37FA,0x6D66A66B,0x44AE1299,0xEAC68308,
+                0x642CE432,0xCA4475A3,0xE38CC151,0x4DE450C0,0xB01DA8B5,0x1E753924,0x37BD8DD6,0x99D51C47,
+                0xF11845E3,0x5F70D472,0x76B86080,0xD8D0F111,0x25290964,0x8B4198F5,0xA2892C07,0x0CE1BD96,
+                0x820BDAAC,0x2C634B3D,0x05ABFFCF,0xABC36E5E,0x563A962B,0xF85207BA,0xD19AB348,0x7FF222D9,
+                0x2E7EF6FA,0x8016676B,0xA9DED399,0x07B64208,0xFA4FBA7D,0x54272BEC,0x7DEF9F1E,0xD3870E8F,
+                0x5D6D69B5,0xF305F824,0xDACD4CD6,0x74A5DD47,0x895C2532,0x2734B4A3,0x0EFC0051,0xA09491C0,
+                0xC859C864,0x663159F5,0x4FF9ED07,0xE1917C96,0x1C6884E3,0xB2001572,0x9BC8A180,0x35A03011,
+                0xBB4A572B,0x1522C6BA,0x3CEA7248,0x9282E3D9,0x6F7B1BAC,0xC1138A3D,0xE8DB3ECF,0x46B3AF5E,
+                0x39418D87,0x97291C16,0xBEE1A8E4,0x10893975,0xED70C100,0x43185091,0x6AD0E463,0xC4B875F2,
+                0x4A5212C8,0xE43A8359,0xCDF237AB,0x639AA63A,0x9E635E4F,0x300BCFDE,0x19C37B2C,0xB7ABEABD,
+                0xDF66B319,0x710E2288,0x58C6967A,0xF6AE07EB,0x0B57FF9E,0xA53F6E0F,0x8CF7DAFD,0x229F4B6C,
+                0xAC752C56,0x021DBDC7,0x2BD50935,0x85BD98A4,0x784460D1,0xD62CF140,0xFFE445B2,0x518CD423,
+                0x5CFDEDF4,0xF2957C65,0xDB5DC897,0x75355906,0x88CCA173,0x26A430E2,0x0F6C8410,0xA1041581,
+                0x2FEE72BB,0x8186E32A,0xA84E57D8,0x0626C649,0xFBDF3E3C,0x55B7AFAD,0x7C7F1B5F,0xD2178ACE,
+                0xBADAD36A,0x14B242FB,0x3D7AF609,0x93126798,0x6EEB9FED,0xC0830E7C,0xE94BBA8E,0x47232B1F,
+                0xC9C94C25,0x67A1DDB4,0x4E696946,0xE001F8D7,0x1DF800A2,0xB3909133,0x9A5825C1,0x3430B450,
+                0x4BC29689,0xE5AA0718,0xCC62B3EA,0x620A227B,0x9FF3DA0E,0x319B4B9F,0x1853FF6D,0xB63B6EFC,
+                0x38D109C6,0x96B99857,0xBF712CA5,0x1119BD34,0xECE04541,0x4288D4D0,0x6B406022,0xC528F1B3,
+                0xADE5A817,0x038D3986,0x2A458D74,0x842D1CE5,0x79D4E490,0xD7BC7501,0xFE74C1F3,0x501C5062,
+                0xDEF63758,0x709EA6C9,0x5956123B,0xF73E83AA,0x0AC77BDF,0xA4AFEA4E,0x8D675EBC,0x230FCF2D,
+                0x72831B0E,0xDCEB8A9F,0xF5233E6D,0x5B4BAFFC,0xA6B25789,0x08DAC618,0x211272EA,0x8F7AE37B,
+                0x01908441,0xAFF815D0,0x8630A122,0x285830B3,0xD5A1C8C6,0x7BC95957,0x5201EDA5,0xFC697C34,
+                0x94A42590,0x3ACCB401,0x130400F3,0xBD6C9162,0x40956917,0xEEFDF886,0xC7354C74,0x695DDDE5,
+                0xE7B7BADF,0x49DF2B4E,0x60179FBC,0xCE7F0E2D,0x3386F658,0x9DEE67C9,0xB426D33B,0x1A4E42AA,
+                0x65BC6073,0xCBD4F1E2,0xE21C4510,0x4C74D481,0xB18D2CF4,0x1FE5BD65,0x362D0997,0x98459806,
+                0x16AFFF3C,0xB8C76EAD,0x910FDA5F,0x3F674BCE,0xC29EB3BB,0x6CF6222A,0x453E96D8,0xEB560749,
+                0x839B5EED,0x2DF3CF7C,0x043B7B8E,0xAA53EA1F,0x57AA126A,0xF9C283FB,0xD00A3709,0x7E62A698,
+                0xF088C1A2,0x5EE05033,0x7728E4C1,0xD9407550,0x24B98D25,0x8AD11CB4,0xA319A846,0x0D7139D7,
+            }
+        };
+
+        uint32_t Crc32(const void* src, size_t size)
+        {
+            const uint8_t* p8 = (const uint8_t*)src;
+            uint32_t crc = 0xFFFFFFFF;
+
+            for (; ((uintptr_t)p8 & (sizeof(uint32_t) - 1)) != 0 && size > 0; ++p8, --size)
+                crc = Crc32Table[0][(crc ^ *p8) & 0xFF] ^ (crc >> 8);
+
+            const uint32_t* p32 = (const uint32_t*)p8;
+            for (; size >= 16; size -= 16)
+            {
+#ifdef SIMD_BIG_ENDIAN
+                uint32_t v0 = *p32++ ^ Reorder32(crc);
+                uint32_t v1 = *p32++;
+                uint32_t v2 = *p32++;
+                uint32_t v3 = *p32++;
+                crc = 
+                    Crc32Table[0x0][v3 & 0xFF] ^
+                    Crc32Table[0x1][(v3 >> 8) & 0xFF] ^
+                    Crc32Table[0x2][(v3 >> 16) & 0xFF] ^
+                    Crc32Table[0x3][(v3 >> 24) & 0xFF] ^
+                    Crc32Table[0x4][v2 & 0xFF] ^
+                    Crc32Table[0x5][(v2 >> 8) & 0xFF] ^
+                    Crc32Table[0x6][(v2 >> 16) & 0xFF] ^
+                    Crc32Table[0x7][(v2 >> 24) & 0xFF] ^
+                    Crc32Table[0x8][v1 & 0xFF] ^
+                    Crc32Table[0x9][(v1 >> 8) & 0xFF] ^
+                    Crc32Table[0xA][(v1 >> 16) & 0xFF] ^
+                    Crc32Table[0xB][(v1 >> 24) & 0xFF] ^
+                    Crc32Table[0xC][v0 & 0xFF] ^
+                    Crc32Table[0xD][(v0 >> 8) & 0xFF] ^
+                    Crc32Table[0xE][(v0 >> 16) & 0xFF] ^
+                    Crc32Table[0xF][(v0 >> 24) & 0xFF];
+#else
+                uint32_t v0 = *p32++ ^ crc;
+                uint32_t v1 = *p32++;
+                uint32_t v2 = *p32++;
+                uint32_t v3 = *p32++;
+                crc = 
+                    Crc32Table[0x0][(v3 >> 24) & 0xFF] ^
+                    Crc32Table[0x1][(v3 >> 16) & 0xFF] ^
+                    Crc32Table[0x2][(v3 >> 8) & 0xFF] ^
+                    Crc32Table[0x3][v3 & 0xFF] ^
+                    Crc32Table[0x4][(v2 >> 24) & 0xFF] ^
+                    Crc32Table[0x5][(v2 >> 16) & 0xFF] ^
+                    Crc32Table[0x6][(v2 >> 8) & 0xFF] ^
+                    Crc32Table[0x7][v2 & 0xFF] ^
+                    Crc32Table[0x8][(v1 >> 24) & 0xFF] ^
+                    Crc32Table[0x9][(v1 >> 16) & 0xFF] ^
+                    Crc32Table[0xA][(v1 >> 8) & 0xFF] ^
+                    Crc32Table[0xB][v1 & 0xFF] ^
+                    Crc32Table[0xC][(v0 >> 24) & 0xFF] ^
+                    Crc32Table[0xD][(v0 >> 16) & 0xFF] ^
+                    Crc32Table[0xE][(v0 >> 8) & 0xFF] ^
+                    Crc32Table[0xF][v0 & 0xFF];
+#endif
+            }
+
+            for (p8 = (const uint8_t*)p32; size > 0; ++p8, size--)
+                crc = Crc32Table[0][(crc ^ *p8) & 0xFF] ^ (crc >> 8);
+
+            return (~crc);
+        }
+
+        //---------------------------------------------------------------------
+
+        // Precalculated CRC32c lookup table for polynomial 0x1EDC6F41 (castagnoli-crc).
+        static const uint32_t Crc32cTable[8][256] =
+        {
+            {
+                0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
+                0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
+                0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+                0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
+                0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
+                0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+                0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
+                0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
+                0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+                0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
+                0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
+                0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+                0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
+                0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
+                0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+                0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
+                0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
+                0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+                0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
+                0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
+                0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+                0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
+                0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
+                0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
+                0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
+                0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
+                0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
+                0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
+                0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
+                0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
+                0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
+                0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351
+            },
+            {
+                0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945,
+                0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd,
+                0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4,
+                0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c,
+                0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47,
+                0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff,
+                0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6,
+                0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e,
+                0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41,
+                0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9,
+                0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0,
+                0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78,
+                0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43,
+                0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb,
+                0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2,
+                0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a,
+                0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc,
+                0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004,
+                0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d,
+                0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185,
+                0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be,
+                0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306,
+                0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f,
+                0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287,
+                0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8,
+                0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600,
+                0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439,
+                0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781,
+                0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba,
+                0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502,
+                0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b,
+                0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483
+            },
+            {
+                0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469,
+                0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac,
+                0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3,
+                0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726,
+                0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d,
+                0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8,
+                0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7,
+                0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32,
+                0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0,
+                0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75,
+                0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a,
+                0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff,
+                0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4,
+                0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161,
+                0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e,
+                0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb,
+                0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a,
+                0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def,
+                0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0,
+                0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065,
+                0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e,
+                0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb,
+                0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4,
+                0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71,
+                0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3,
+                0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36,
+                0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79,
+                0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc,
+                0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7,
+                0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622,
+                0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d,
+                0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8
+            },
+            {
+                0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca,
+                0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c,
+                0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7,
+                0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11,
+                0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41,
+                0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7,
+                0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c,
+                0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a,
+                0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d,
+                0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb,
+                0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610,
+                0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6,
+                0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6,
+                0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040,
+                0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b,
+                0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d,
+                0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5,
+                0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213,
+                0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8,
+                0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e,
+                0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e,
+                0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698,
+                0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443,
+                0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5,
+                0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12,
+                0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4,
+                0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f,
+                0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9,
+                0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99,
+                0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f,
+                0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4,
+                0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842
+            },
+            {
+                0x00000000, 0x38116fac, 0x7022df58, 0x4833b0f4, 0xe045beb0, 0xd854d11c, 0x906761e8, 0xa8760e44,
+                0xc5670b91, 0xfd76643d, 0xb545d4c9, 0x8d54bb65, 0x2522b521, 0x1d33da8d, 0x55006a79, 0x6d1105d5,
+                0x8f2261d3, 0xb7330e7f, 0xff00be8b, 0xc711d127, 0x6f67df63, 0x5776b0cf, 0x1f45003b, 0x27546f97,
+                0x4a456a42, 0x725405ee, 0x3a67b51a, 0x0276dab6, 0xaa00d4f2, 0x9211bb5e, 0xda220baa, 0xe2336406,
+                0x1ba8b557, 0x23b9dafb, 0x6b8a6a0f, 0x539b05a3, 0xfbed0be7, 0xc3fc644b, 0x8bcfd4bf, 0xb3debb13,
+                0xdecfbec6, 0xe6ded16a, 0xaeed619e, 0x96fc0e32, 0x3e8a0076, 0x069b6fda, 0x4ea8df2e, 0x76b9b082,
+                0x948ad484, 0xac9bbb28, 0xe4a80bdc, 0xdcb96470, 0x74cf6a34, 0x4cde0598, 0x04edb56c, 0x3cfcdac0,
+                0x51eddf15, 0x69fcb0b9, 0x21cf004d, 0x19de6fe1, 0xb1a861a5, 0x89b90e09, 0xc18abefd, 0xf99bd151,
+                0x37516aae, 0x0f400502, 0x4773b5f6, 0x7f62da5a, 0xd714d41e, 0xef05bbb2, 0xa7360b46, 0x9f2764ea,
+                0xf236613f, 0xca270e93, 0x8214be67, 0xba05d1cb, 0x1273df8f, 0x2a62b023, 0x625100d7, 0x5a406f7b,
+                0xb8730b7d, 0x806264d1, 0xc851d425, 0xf040bb89, 0x5836b5cd, 0x6027da61, 0x28146a95, 0x10050539,
+                0x7d1400ec, 0x45056f40, 0x0d36dfb4, 0x3527b018, 0x9d51be5c, 0xa540d1f0, 0xed736104, 0xd5620ea8,
+                0x2cf9dff9, 0x14e8b055, 0x5cdb00a1, 0x64ca6f0d, 0xccbc6149, 0xf4ad0ee5, 0xbc9ebe11, 0x848fd1bd,
+                0xe99ed468, 0xd18fbbc4, 0x99bc0b30, 0xa1ad649c, 0x09db6ad8, 0x31ca0574, 0x79f9b580, 0x41e8da2c,
+                0xa3dbbe2a, 0x9bcad186, 0xd3f96172, 0xebe80ede, 0x439e009a, 0x7b8f6f36, 0x33bcdfc2, 0x0badb06e,
+                0x66bcb5bb, 0x5eadda17, 0x169e6ae3, 0x2e8f054f, 0x86f90b0b, 0xbee864a7, 0xf6dbd453, 0xcecabbff,
+                0x6ea2d55c, 0x56b3baf0, 0x1e800a04, 0x269165a8, 0x8ee76bec, 0xb6f60440, 0xfec5b4b4, 0xc6d4db18,
+                0xabc5decd, 0x93d4b161, 0xdbe70195, 0xe3f66e39, 0x4b80607d, 0x73910fd1, 0x3ba2bf25, 0x03b3d089,
+                0xe180b48f, 0xd991db23, 0x91a26bd7, 0xa9b3047b, 0x01c50a3f, 0x39d46593, 0x71e7d567, 0x49f6bacb,
+                0x24e7bf1e, 0x1cf6d0b2, 0x54c56046, 0x6cd40fea, 0xc4a201ae, 0xfcb36e02, 0xb480def6, 0x8c91b15a,
+                0x750a600b, 0x4d1b0fa7, 0x0528bf53, 0x3d39d0ff, 0x954fdebb, 0xad5eb117, 0xe56d01e3, 0xdd7c6e4f,
+                0xb06d6b9a, 0x887c0436, 0xc04fb4c2, 0xf85edb6e, 0x5028d52a, 0x6839ba86, 0x200a0a72, 0x181b65de,
+                0xfa2801d8, 0xc2396e74, 0x8a0ade80, 0xb21bb12c, 0x1a6dbf68, 0x227cd0c4, 0x6a4f6030, 0x525e0f9c,
+                0x3f4f0a49, 0x075e65e5, 0x4f6dd511, 0x777cbabd, 0xdf0ab4f9, 0xe71bdb55, 0xaf286ba1, 0x9739040d,
+                0x59f3bff2, 0x61e2d05e, 0x29d160aa, 0x11c00f06, 0xb9b60142, 0x81a76eee, 0xc994de1a, 0xf185b1b6,
+                0x9c94b463, 0xa485dbcf, 0xecb66b3b, 0xd4a70497, 0x7cd10ad3, 0x44c0657f, 0x0cf3d58b, 0x34e2ba27,
+                0xd6d1de21, 0xeec0b18d, 0xa6f30179, 0x9ee26ed5, 0x36946091, 0x0e850f3d, 0x46b6bfc9, 0x7ea7d065,
+                0x13b6d5b0, 0x2ba7ba1c, 0x63940ae8, 0x5b856544, 0xf3f36b00, 0xcbe204ac, 0x83d1b458, 0xbbc0dbf4,
+                0x425b0aa5, 0x7a4a6509, 0x3279d5fd, 0x0a68ba51, 0xa21eb415, 0x9a0fdbb9, 0xd23c6b4d, 0xea2d04e1,
+                0x873c0134, 0xbf2d6e98, 0xf71ede6c, 0xcf0fb1c0, 0x6779bf84, 0x5f68d028, 0x175b60dc, 0x2f4a0f70,
+                0xcd796b76, 0xf56804da, 0xbd5bb42e, 0x854adb82, 0x2d3cd5c6, 0x152dba6a, 0x5d1e0a9e, 0x650f6532,
+                0x081e60e7, 0x300f0f4b, 0x783cbfbf, 0x402dd013, 0xe85bde57, 0xd04ab1fb, 0x9879010f, 0xa0686ea3
+            },
+            {
+                0x00000000, 0xef306b19, 0xdb8ca0c3, 0x34bccbda, 0xb2f53777, 0x5dc55c6e, 0x697997b4, 0x8649fcad,
+                0x6006181f, 0x8f367306, 0xbb8ab8dc, 0x54bad3c5, 0xd2f32f68, 0x3dc34471, 0x097f8fab, 0xe64fe4b2,
+                0xc00c303e, 0x2f3c5b27, 0x1b8090fd, 0xf4b0fbe4, 0x72f90749, 0x9dc96c50, 0xa975a78a, 0x4645cc93,
+                0xa00a2821, 0x4f3a4338, 0x7b8688e2, 0x94b6e3fb, 0x12ff1f56, 0xfdcf744f, 0xc973bf95, 0x2643d48c,
+                0x85f4168d, 0x6ac47d94, 0x5e78b64e, 0xb148dd57, 0x370121fa, 0xd8314ae3, 0xec8d8139, 0x03bdea20,
+                0xe5f20e92, 0x0ac2658b, 0x3e7eae51, 0xd14ec548, 0x570739e5, 0xb83752fc, 0x8c8b9926, 0x63bbf23f,
+                0x45f826b3, 0xaac84daa, 0x9e748670, 0x7144ed69, 0xf70d11c4, 0x183d7add, 0x2c81b107, 0xc3b1da1e,
+                0x25fe3eac, 0xcace55b5, 0xfe729e6f, 0x1142f576, 0x970b09db, 0x783b62c2, 0x4c87a918, 0xa3b7c201,
+                0x0e045beb, 0xe13430f2, 0xd588fb28, 0x3ab89031, 0xbcf16c9c, 0x53c10785, 0x677dcc5f, 0x884da746,
+                0x6e0243f4, 0x813228ed, 0xb58ee337, 0x5abe882e, 0xdcf77483, 0x33c71f9a, 0x077bd440, 0xe84bbf59,
+                0xce086bd5, 0x213800cc, 0x1584cb16, 0xfab4a00f, 0x7cfd5ca2, 0x93cd37bb, 0xa771fc61, 0x48419778,
+                0xae0e73ca, 0x413e18d3, 0x7582d309, 0x9ab2b810, 0x1cfb44bd, 0xf3cb2fa4, 0xc777e47e, 0x28478f67,
+                0x8bf04d66, 0x64c0267f, 0x507ceda5, 0xbf4c86bc, 0x39057a11, 0xd6351108, 0xe289dad2, 0x0db9b1cb,
+                0xebf65579, 0x04c63e60, 0x307af5ba, 0xdf4a9ea3, 0x5903620e, 0xb6330917, 0x828fc2cd, 0x6dbfa9d4,
+                0x4bfc7d58, 0xa4cc1641, 0x9070dd9b, 0x7f40b682, 0xf9094a2f, 0x16392136, 0x2285eaec, 0xcdb581f5,
+                0x2bfa6547, 0xc4ca0e5e, 0xf076c584, 0x1f46ae9d, 0x990f5230, 0x763f3929, 0x4283f2f3, 0xadb399ea,
+                0x1c08b7d6, 0xf338dccf, 0xc7841715, 0x28b47c0c, 0xaefd80a1, 0x41cdebb8, 0x75712062, 0x9a414b7b,
+                0x7c0eafc9, 0x933ec4d0, 0xa7820f0a, 0x48b26413, 0xcefb98be, 0x21cbf3a7, 0x1577387d, 0xfa475364,
+                0xdc0487e8, 0x3334ecf1, 0x0788272b, 0xe8b84c32, 0x6ef1b09f, 0x81c1db86, 0xb57d105c, 0x5a4d7b45,
+                0xbc029ff7, 0x5332f4ee, 0x678e3f34, 0x88be542d, 0x0ef7a880, 0xe1c7c399, 0xd57b0843, 0x3a4b635a,
+                0x99fca15b, 0x76ccca42, 0x42700198, 0xad406a81, 0x2b09962c, 0xc439fd35, 0xf08536ef, 0x1fb55df6,
+                0xf9fab944, 0x16cad25d, 0x22761987, 0xcd46729e, 0x4b0f8e33, 0xa43fe52a, 0x90832ef0, 0x7fb345e9,
+                0x59f09165, 0xb6c0fa7c, 0x827c31a6, 0x6d4c5abf, 0xeb05a612, 0x0435cd0b, 0x308906d1, 0xdfb96dc8,
+                0x39f6897a, 0xd6c6e263, 0xe27a29b9, 0x0d4a42a0, 0x8b03be0d, 0x6433d514, 0x508f1ece, 0xbfbf75d7,
+                0x120cec3d, 0xfd3c8724, 0xc9804cfe, 0x26b027e7, 0xa0f9db4a, 0x4fc9b053, 0x7b757b89, 0x94451090,
+                0x720af422, 0x9d3a9f3b, 0xa98654e1, 0x46b63ff8, 0xc0ffc355, 0x2fcfa84c, 0x1b736396, 0xf443088f,
+                0xd200dc03, 0x3d30b71a, 0x098c7cc0, 0xe6bc17d9, 0x60f5eb74, 0x8fc5806d, 0xbb794bb7, 0x544920ae,
+                0xb206c41c, 0x5d36af05, 0x698a64df, 0x86ba0fc6, 0x00f3f36b, 0xefc39872, 0xdb7f53a8, 0x344f38b1,
+                0x97f8fab0, 0x78c891a9, 0x4c745a73, 0xa344316a, 0x250dcdc7, 0xca3da6de, 0xfe816d04, 0x11b1061d,
+                0xf7fee2af, 0x18ce89b6, 0x2c72426c, 0xc3422975, 0x450bd5d8, 0xaa3bbec1, 0x9e87751b, 0x71b71e02,
+                0x57f4ca8e, 0xb8c4a197, 0x8c786a4d, 0x63480154, 0xe501fdf9, 0x0a3196e0, 0x3e8d5d3a, 0xd1bd3623,
+                0x37f2d291, 0xd8c2b988, 0xec7e7252, 0x034e194b, 0x8507e5e6, 0x6a378eff, 0x5e8b4525, 0xb1bb2e3c
+            },
+            {
+                0x00000000, 0x68032cc8, 0xd0065990, 0xb8057558, 0xa5e0c5d1, 0xcde3e919, 0x75e69c41, 0x1de5b089,
+                0x4e2dfd53, 0x262ed19b, 0x9e2ba4c3, 0xf628880b, 0xebcd3882, 0x83ce144a, 0x3bcb6112, 0x53c84dda,
+                0x9c5bfaa6, 0xf458d66e, 0x4c5da336, 0x245e8ffe, 0x39bb3f77, 0x51b813bf, 0xe9bd66e7, 0x81be4a2f,
+                0xd27607f5, 0xba752b3d, 0x02705e65, 0x6a7372ad, 0x7796c224, 0x1f95eeec, 0xa7909bb4, 0xcf93b77c,
+                0x3d5b83bd, 0x5558af75, 0xed5dda2d, 0x855ef6e5, 0x98bb466c, 0xf0b86aa4, 0x48bd1ffc, 0x20be3334,
+                0x73767eee, 0x1b755226, 0xa370277e, 0xcb730bb6, 0xd696bb3f, 0xbe9597f7, 0x0690e2af, 0x6e93ce67,
+                0xa100791b, 0xc90355d3, 0x7106208b, 0x19050c43, 0x04e0bcca, 0x6ce39002, 0xd4e6e55a, 0xbce5c992,
+                0xef2d8448, 0x872ea880, 0x3f2bddd8, 0x5728f110, 0x4acd4199, 0x22ce6d51, 0x9acb1809, 0xf2c834c1,
+                0x7ab7077a, 0x12b42bb2, 0xaab15eea, 0xc2b27222, 0xdf57c2ab, 0xb754ee63, 0x0f519b3b, 0x6752b7f3,
+                0x349afa29, 0x5c99d6e1, 0xe49ca3b9, 0x8c9f8f71, 0x917a3ff8, 0xf9791330, 0x417c6668, 0x297f4aa0,
+                0xe6ecfddc, 0x8eefd114, 0x36eaa44c, 0x5ee98884, 0x430c380d, 0x2b0f14c5, 0x930a619d, 0xfb094d55,
+                0xa8c1008f, 0xc0c22c47, 0x78c7591f, 0x10c475d7, 0x0d21c55e, 0x6522e996, 0xdd279cce, 0xb524b006,
+                0x47ec84c7, 0x2fefa80f, 0x97eadd57, 0xffe9f19f, 0xe20c4116, 0x8a0f6dde, 0x320a1886, 0x5a09344e,
+                0x09c17994, 0x61c2555c, 0xd9c72004, 0xb1c40ccc, 0xac21bc45, 0xc422908d, 0x7c27e5d5, 0x1424c91d,
+                0xdbb77e61, 0xb3b452a9, 0x0bb127f1, 0x63b20b39, 0x7e57bbb0, 0x16549778, 0xae51e220, 0xc652cee8,
+                0x959a8332, 0xfd99affa, 0x459cdaa2, 0x2d9ff66a, 0x307a46e3, 0x58796a2b, 0xe07c1f73, 0x887f33bb,
+                0xf56e0ef4, 0x9d6d223c, 0x25685764, 0x4d6b7bac, 0x508ecb25, 0x388de7ed, 0x808892b5, 0xe88bbe7d,
+                0xbb43f3a7, 0xd340df6f, 0x6b45aa37, 0x034686ff, 0x1ea33676, 0x76a01abe, 0xcea56fe6, 0xa6a6432e,
+                0x6935f452, 0x0136d89a, 0xb933adc2, 0xd130810a, 0xccd53183, 0xa4d61d4b, 0x1cd36813, 0x74d044db,
+                0x27180901, 0x4f1b25c9, 0xf71e5091, 0x9f1d7c59, 0x82f8ccd0, 0xeafbe018, 0x52fe9540, 0x3afdb988,
+                0xc8358d49, 0xa036a181, 0x1833d4d9, 0x7030f811, 0x6dd54898, 0x05d66450, 0xbdd31108, 0xd5d03dc0,
+                0x8618701a, 0xee1b5cd2, 0x561e298a, 0x3e1d0542, 0x23f8b5cb, 0x4bfb9903, 0xf3feec5b, 0x9bfdc093,
+                0x546e77ef, 0x3c6d5b27, 0x84682e7f, 0xec6b02b7, 0xf18eb23e, 0x998d9ef6, 0x2188ebae, 0x498bc766,
+                0x1a438abc, 0x7240a674, 0xca45d32c, 0xa246ffe4, 0xbfa34f6d, 0xd7a063a5, 0x6fa516fd, 0x07a63a35,
+                0x8fd9098e, 0xe7da2546, 0x5fdf501e, 0x37dc7cd6, 0x2a39cc5f, 0x423ae097, 0xfa3f95cf, 0x923cb907,
+                0xc1f4f4dd, 0xa9f7d815, 0x11f2ad4d, 0x79f18185, 0x6414310c, 0x0c171dc4, 0xb412689c, 0xdc114454,
+                0x1382f328, 0x7b81dfe0, 0xc384aab8, 0xab878670, 0xb66236f9, 0xde611a31, 0x66646f69, 0x0e6743a1,
+                0x5daf0e7b, 0x35ac22b3, 0x8da957eb, 0xe5aa7b23, 0xf84fcbaa, 0x904ce762, 0x2849923a, 0x404abef2,
+                0xb2828a33, 0xda81a6fb, 0x6284d3a3, 0x0a87ff6b, 0x17624fe2, 0x7f61632a, 0xc7641672, 0xaf673aba,
+                0xfcaf7760, 0x94ac5ba8, 0x2ca92ef0, 0x44aa0238, 0x594fb2b1, 0x314c9e79, 0x8949eb21, 0xe14ac7e9,
+                0x2ed97095, 0x46da5c5d, 0xfedf2905, 0x96dc05cd, 0x8b39b544, 0xe33a998c, 0x5b3fecd4, 0x333cc01c,
+                0x60f48dc6, 0x08f7a10e, 0xb0f2d456, 0xd8f1f89e, 0xc5144817, 0xad1764df, 0x15121187, 0x7d113d4f
+            },
+            {
+                0x00000000, 0x493c7d27, 0x9278fa4e, 0xdb448769, 0x211d826d, 0x6821ff4a, 0xb3657823, 0xfa590504,
+                0x423b04da, 0x0b0779fd, 0xd043fe94, 0x997f83b3, 0x632686b7, 0x2a1afb90, 0xf15e7cf9, 0xb86201de,
+                0x847609b4, 0xcd4a7493, 0x160ef3fa, 0x5f328edd, 0xa56b8bd9, 0xec57f6fe, 0x37137197, 0x7e2f0cb0,
+                0xc64d0d6e, 0x8f717049, 0x5435f720, 0x1d098a07, 0xe7508f03, 0xae6cf224, 0x7528754d, 0x3c14086a,
+                0x0d006599, 0x443c18be, 0x9f789fd7, 0xd644e2f0, 0x2c1de7f4, 0x65219ad3, 0xbe651dba, 0xf759609d,
+                0x4f3b6143, 0x06071c64, 0xdd439b0d, 0x947fe62a, 0x6e26e32e, 0x271a9e09, 0xfc5e1960, 0xb5626447,
+                0x89766c2d, 0xc04a110a, 0x1b0e9663, 0x5232eb44, 0xa86bee40, 0xe1579367, 0x3a13140e, 0x732f6929,
+                0xcb4d68f7, 0x827115d0, 0x593592b9, 0x1009ef9e, 0xea50ea9a, 0xa36c97bd, 0x782810d4, 0x31146df3,
+                0x1a00cb32, 0x533cb615, 0x8878317c, 0xc1444c5b, 0x3b1d495f, 0x72213478, 0xa965b311, 0xe059ce36,
+                0x583bcfe8, 0x1107b2cf, 0xca4335a6, 0x837f4881, 0x79264d85, 0x301a30a2, 0xeb5eb7cb, 0xa262caec,
+                0x9e76c286, 0xd74abfa1, 0x0c0e38c8, 0x453245ef, 0xbf6b40eb, 0xf6573dcc, 0x2d13baa5, 0x642fc782,
+                0xdc4dc65c, 0x9571bb7b, 0x4e353c12, 0x07094135, 0xfd504431, 0xb46c3916, 0x6f28be7f, 0x2614c358,
+                0x1700aeab, 0x5e3cd38c, 0x857854e5, 0xcc4429c2, 0x361d2cc6, 0x7f2151e1, 0xa465d688, 0xed59abaf,
+                0x553baa71, 0x1c07d756, 0xc743503f, 0x8e7f2d18, 0x7426281c, 0x3d1a553b, 0xe65ed252, 0xaf62af75,
+                0x9376a71f, 0xda4ada38, 0x010e5d51, 0x48322076, 0xb26b2572, 0xfb575855, 0x2013df3c, 0x692fa21b,
+                0xd14da3c5, 0x9871dee2, 0x4335598b, 0x0a0924ac, 0xf05021a8, 0xb96c5c8f, 0x6228dbe6, 0x2b14a6c1,
+                0x34019664, 0x7d3deb43, 0xa6796c2a, 0xef45110d, 0x151c1409, 0x5c20692e, 0x8764ee47, 0xce589360,
+                0x763a92be, 0x3f06ef99, 0xe44268f0, 0xad7e15d7, 0x572710d3, 0x1e1b6df4, 0xc55fea9d, 0x8c6397ba,
+                0xb0779fd0, 0xf94be2f7, 0x220f659e, 0x6b3318b9, 0x916a1dbd, 0xd856609a, 0x0312e7f3, 0x4a2e9ad4,
+                0xf24c9b0a, 0xbb70e62d, 0x60346144, 0x29081c63, 0xd3511967, 0x9a6d6440, 0x4129e329, 0x08159e0e,
+                0x3901f3fd, 0x703d8eda, 0xab7909b3, 0xe2457494, 0x181c7190, 0x51200cb7, 0x8a648bde, 0xc358f6f9,
+                0x7b3af727, 0x32068a00, 0xe9420d69, 0xa07e704e, 0x5a27754a, 0x131b086d, 0xc85f8f04, 0x8163f223,
+                0xbd77fa49, 0xf44b876e, 0x2f0f0007, 0x66337d20, 0x9c6a7824, 0xd5560503, 0x0e12826a, 0x472eff4d,
+                0xff4cfe93, 0xb67083b4, 0x6d3404dd, 0x240879fa, 0xde517cfe, 0x976d01d9, 0x4c2986b0, 0x0515fb97,
+                0x2e015d56, 0x673d2071, 0xbc79a718, 0xf545da3f, 0x0f1cdf3b, 0x4620a21c, 0x9d642575, 0xd4585852,
+                0x6c3a598c, 0x250624ab, 0xfe42a3c2, 0xb77edee5, 0x4d27dbe1, 0x041ba6c6, 0xdf5f21af, 0x96635c88,
+                0xaa7754e2, 0xe34b29c5, 0x380faeac, 0x7133d38b, 0x8b6ad68f, 0xc256aba8, 0x19122cc1, 0x502e51e6,
+                0xe84c5038, 0xa1702d1f, 0x7a34aa76, 0x3308d751, 0xc951d255, 0x806daf72, 0x5b29281b, 0x1215553c,
+                0x230138cf, 0x6a3d45e8, 0xb179c281, 0xf845bfa6, 0x021cbaa2, 0x4b20c785, 0x906440ec, 0xd9583dcb,
+                0x613a3c15, 0x28064132, 0xf342c65b, 0xba7ebb7c, 0x4027be78, 0x091bc35f, 0xd25f4436, 0x9b633911,
+                0xa777317b, 0xee4b4c5c, 0x350fcb35, 0x7c33b612, 0x866ab316, 0xcf56ce31, 0x14124958, 0x5d2e347f,
+                0xe54c35a1, 0xac704886, 0x7734cfef, 0x3e08b2c8, 0xc451b7cc, 0x8d6dcaeb, 0x56294d82, 0x1f1530a5
+            }
+        };
+
+        uint32_t Crc32c(const void* src, size_t size)
+        {
+            const uint8_t* p8 = (const uint8_t*)src;
+            uint32_t crc = 0xFFFFFFFF;
+
+            for (; ((uintptr_t)p8 & (sizeof(uint32_t) - 1)) != 0 && size > 0; ++p8, --size)
+                crc = Crc32cTable[0][(crc ^ *p8) & 0xFF] ^ (crc >> 8);
+
+            const uint32_t* p32 = (const uint32_t*)p8;
+            for (; size >= 8; size -= 8)
+            {
+#ifdef SIMD_BIG_ENDIAN
+                uint32_t v0 = *p32++ ^ Reorder32(crc);
+                uint32_t v1 = *p32++;
+                crc =
+                    Crc32cTable[0x0][v1 & 0xFF] ^
+                    Crc32cTable[0x1][(v1 >> 8) & 0xFF] ^
+                    Crc32cTable[0x2][(v1 >> 16) & 0xFF] ^
+                    Crc32cTable[0x3][(v1 >> 24) & 0xFF] ^
+                    Crc32cTable[0x4][v0 & 0xFF] ^
+                    Crc32cTable[0x5][(v0 >> 8) & 0xFF] ^
+                    Crc32cTable[0x6][(v0 >> 16) & 0xFF] ^
+                    Crc32cTable[0x7][(v0 >> 24) & 0xFF];
+#else
+                uint32_t v0 = *p32++ ^ crc;
+                uint32_t v1 = *p32++;
+                crc =
+                    Crc32cTable[0x0][(v1 >> 24) & 0xFF] ^
+                    Crc32cTable[0x1][(v1 >> 16) & 0xFF] ^
+                    Crc32cTable[0x2][(v1 >> 8) & 0xFF] ^
+                    Crc32cTable[0x3][v1 & 0xFF] ^
+                    Crc32cTable[0x4][(v0 >> 24) & 0xFF] ^
+                    Crc32cTable[0x5][(v0 >> 16) & 0xFF] ^
+                    Crc32cTable[0x6][(v0 >> 8) & 0xFF] ^
+                    Crc32cTable[0x7][v0 & 0xFF];
+#endif
+            }
+
+            for (p8 = (const uint8_t*)p32; size > 0; ++p8, size--)
+                crc = Crc32cTable[0][(crc ^ *p8) & 0xFF] ^ (crc >> 8);
+
+            return (~crc);
+        }
+    }
+}
diff --git a/3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp b/3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp
new file mode 100644
index 0000000000..b064ca50a2
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp
@@ -0,0 +1,371 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdArray.h"
+#include "Simd/SimdCpu.h"
+#include "Simd/SimdBase.h"
+
+#include <stdio.h>
+
+#if defined(_MSC_VER)
+#pragma warning (push)
+#pragma warning (disable: 4996)
+#endif
+
+namespace Simd
+{
+    uint8_t* ImageLoadFromFile(const ImageLoadFromMemoryPtr loader, const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+    {
+        uint8_t* data = NULL;
+        ::FILE* file = ::fopen(path, "rb");
+        if (file)
+        {
+            ::fseek(file, 0, SEEK_END);
+            Array8u buffer(::ftell(file));
+            ::fseek(file, 0, SEEK_SET);
+            if (::fread(buffer.data, 1, buffer.size, file) == buffer.size)
+                data = loader(buffer.data, buffer.size, stride, width, height, format);
+            ::fclose(file);
+        }
+        return data;
+    }
+
+    //-------------------------------------------------------------------------
+
+    ImageLoaderParam::ImageLoaderParam(const uint8_t* d, size_t s, SimdPixelFormatType f)
+        : data(d)
+        , size(s)
+        , format(f)
+        , file(SimdImageFileUndefined)
+    {
+    }
+
+    bool ImageLoaderParam::Validate()
+    {
+        if (size >= 3)
+        {
+            if (data[0] == 'P' && data[2] == '\n')
+            {
+                if (data[1] == '2')
+                    file = SimdImageFilePgmTxt;
+                if (data[1] == '3')
+                    file = SimdImageFilePpmTxt;
+                if (data[1] == '5')
+                    file = SimdImageFilePgmBin;
+                if (data[1] == '6')
+                    file = SimdImageFilePpmBin;
+            }
+        }
+        if (size >= 8)
+        {
+            const uint8_t SIGNATURE[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+            if(memcmp(data, SIGNATURE, 8) == 0)
+                file = SimdImageFilePng;
+        }
+        if (size >= 2)
+        {
+            if (data[0] == 0xFF && data[1] == 0xD8)
+                file = SimdImageFileJpeg;
+        }
+        return
+            file != SimdImageFileUndefined && 
+                (format == SimdPixelFormatNone || format == SimdPixelFormatGray8 || 
+                format == SimdPixelFormatBgr24 || format == SimdPixelFormatBgra32 || 
+                format == SimdPixelFormatRgb24 || format == SimdPixelFormatRgba32);
+    }
+        
+    namespace Base
+    {
+        ImagePxmLoader::ImagePxmLoader(const ImageLoaderParam& param)
+            : ImageLoader(param)
+            , _toAny(NULL)
+            , _toBgra(NULL)
+        {
+        }
+
+        bool ImagePxmLoader::ReadHeader(size_t version)
+        {
+            if (_stream.Size() < 3 ||
+                _stream.Data()[0] != 'P' ||
+                _stream.Data()[1] != '0' + version ||
+                _stream.Data()[2] != '\n')
+                return false;
+            _stream.Seek(3);
+            uint32_t width, height, max;
+            if (!(_stream.ReadUnsigned(width) && _stream.ReadUnsigned(height) && _stream.ReadUnsigned(max)))
+                return false;
+            if (!(width > 0 && height > 0 && max == 255))
+                return false;
+            uint8_t byte;
+            if (!(_stream.Read(byte) && byte == '\n'))
+                return false;
+            _image.Recreate(width, height, (Image::Format)_param.format);
+            _block = height;
+            if (_param.file == SimdImageFilePgmTxt || _param.file == SimdImageFilePgmBin)
+            {
+                _size = width * 1;
+                if (_param.format != SimdPixelFormatGray8)
+                {
+                    _block = Simd::RestrictRange<size_t>(Base::AlgCacheL1() / _size, 1, height);
+                    _buffer.Resize(_block * _size);
+                }
+            }
+            else if (_param.file == SimdImageFilePpmTxt || _param.file == SimdImageFilePpmBin)
+            {
+                _size = width * 3;
+                if (_param.format != SimdPixelFormatRgb24)
+                {
+                    _block = Simd::RestrictRange<size_t>(Base::AlgCacheL1() / _size, 1, height);
+                    _buffer.Resize(_block * _size);
+                }
+            }
+            else
+                return false;
+            SetConverters();
+            return true;
+        }
+
+        //-------------------------------------------------------------------------
+
+        ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param)
+            : ImagePxmLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatGray8;
+        }
+
+        bool ImagePgmTxtLoader::FromStream()
+        {
+            if (!ReadHeader(2))
+                return false;
+            size_t grayStride = _param.format == SimdPixelFormatGray8 ? _image.stride : _size;
+            for (size_t row = 0; row < _image.height;)
+            {
+                size_t block = Simd::Min(row + _block, _image.height) - row;
+                uint8_t * gray = _param.format == SimdPixelFormatGray8 ? _image.Row<uint8_t>(row) : _buffer.data;
+                for (size_t b = 0; b < block; ++b)
+                {
+                    for (size_t i = 0; i < _size; ++i)
+                    {
+                        if (!_stream.ReadUnsigned(gray[i]))
+                            return false;
+                    }
+                    gray += grayStride;
+                }
+                if(_param.format == SimdPixelFormatBgr24 || _param.format == SimdPixelFormatRgb24)
+                    _toAny(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride);
+                if (_param.format == SimdPixelFormatBgra32 || _param.format == SimdPixelFormatRgba32)
+                    _toBgra(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride, 0xFF);
+                row += block;
+            }
+            return true;
+        }
+
+        void ImagePgmTxtLoader::SetConverters()
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24: _toAny = Base::GrayToBgr; break;
+            case SimdPixelFormatBgra32: _toBgra = Base::GrayToBgra; break;
+            case SimdPixelFormatRgb24: _toAny = Base::GrayToBgr; break;
+            case SimdPixelFormatRgba32: _toBgra = Base::GrayToBgra; break;
+            default: break;
+            }
+        }
+
+        //-------------------------------------------------------------------------
+
+        ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param)
+            : ImagePxmLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatGray8;
+        }
+
+        bool ImagePgmBinLoader::FromStream()
+        {
+            if (!ReadHeader(5))
+                return false;
+            size_t grayStride = _param.format == SimdPixelFormatGray8 ? _image.stride : _size;
+            for (size_t row = 0; row < _image.height;)
+            {
+                size_t block = Simd::Min(row + _block, _image.height) - row;
+                uint8_t* gray = _param.format == SimdPixelFormatGray8 ? _image.Row<uint8_t>(row) : _buffer.data;
+                for (size_t b = 0; b < block; ++b)
+                {
+                    if (_stream.Read(_size, gray) != _size)
+                        return false;
+                    gray += grayStride;
+                }
+                if (_param.format == SimdPixelFormatBgr24 || _param.format == SimdPixelFormatRgb24)
+                    _toAny(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride);
+                if (_param.format == SimdPixelFormatBgra32 || _param.format == SimdPixelFormatRgba32)
+                    _toBgra(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride, 0xFF);
+                row += block;
+            }
+            return true;
+        }
+
+        void ImagePgmBinLoader::SetConverters()
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24: _toAny = Base::GrayToBgr; break;
+            case SimdPixelFormatBgra32: _toBgra = Base::GrayToBgra; break;
+            case SimdPixelFormatRgb24: _toAny = Base::GrayToBgr; break;
+            case SimdPixelFormatRgba32: _toBgra = Base::GrayToBgra; break;
+            default: break;
+            }
+        }
+
+        //-------------------------------------------------------------------------
+
+        ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param)
+            : ImagePxmLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatRgb24;
+        }
+
+        bool ImagePpmTxtLoader::FromStream()
+        {
+            if (!ReadHeader(3))
+                return false;
+            size_t rgbStride = _param.format == SimdPixelFormatRgb24 ? _image.stride : _size;
+            for (size_t row = 0; row < _image.height;)
+            {
+                size_t block = Simd::Min(row + _block, _image.height) - row;
+                uint8_t* rgb = _param.format == SimdPixelFormatRgb24 ? _image.Row<uint8_t>(row) : _buffer.data;
+                for (size_t b = 0; b < block; ++b)
+                {
+                    for (size_t i = 0; i < _size; ++i)
+                    {
+                        if (!_stream.ReadUnsigned(rgb[i]))
+                            return false;
+                    }
+                    rgb += rgbStride;
+                }
+                if (_param.format == SimdPixelFormatGray8 || _param.format == SimdPixelFormatBgr24)
+                    _toAny(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride);
+                if (_param.format == SimdPixelFormatBgra32 || _param.format == SimdPixelFormatRgba32)
+                    _toBgra(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride, 0xFF);
+                row += block;
+            }
+            return true;
+        }
+
+        void ImagePpmTxtLoader::SetConverters()
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatGray8: _toAny = Base::RgbToGray; break;
+            case SimdPixelFormatBgr24: _toAny = Base::BgrToRgb; break;
+            case SimdPixelFormatBgra32: _toBgra = Base::RgbToBgra; break;
+            case SimdPixelFormatRgba32: _toBgra = Base::BgrToBgra; break;
+            default: break;
+            }
+        }
+
+        //-------------------------------------------------------------------------
+
+        ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param)
+            : ImagePxmLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatRgb24;
+        }
+
+        bool ImagePpmBinLoader::FromStream()
+        {
+            if (!ReadHeader(6))
+                return false;
+            size_t rgbStride = _param.format == SimdPixelFormatRgb24 ? _image.stride : _size;
+            for (size_t row = 0; row < _image.height;)
+            {
+                size_t block = Simd::Min(row + _block, _image.height) - row;
+                uint8_t* rgb = _param.format == SimdPixelFormatRgb24 ? _image.Row<uint8_t>(row) : _buffer.data;
+                for (size_t b = 0; b < block; ++b)
+                {
+                    if (_stream.Read(_size, rgb) != _size)
+                        return false;
+                    rgb += rgbStride;
+                }
+                if (_param.format == SimdPixelFormatGray8 || _param.format == SimdPixelFormatBgr24)
+                    _toAny(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride);
+                if (_param.format == SimdPixelFormatBgra32 || _param.format == SimdPixelFormatRgba32)
+                    _toBgra(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride, 0xFF);
+                row += block;
+            }
+            return true;
+        }
+
+        void ImagePpmBinLoader::SetConverters()
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatGray8: _toAny = Base::RgbToGray; break;
+            case SimdPixelFormatBgr24: _toAny = Base::BgrToRgb; break;
+            case SimdPixelFormatBgra32: _toBgra = Base::RgbToBgra; break;
+            case SimdPixelFormatRgba32: _toBgra = Base::BgrToBgra; break;
+            default: break;
+            }
+        }
+
+        //-------------------------------------------------------------------------
+
+        ImageLoader* CreateImageLoader(const ImageLoaderParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtLoader(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinLoader(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtLoader(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinLoader(param);
+            case SimdImageFilePng: return new ImagePngLoader(param);
+            case SimdImageFileJpeg: return new ImageJpegLoader(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+        {
+            ImageLoaderParam param(data, size, *format);
+            if (param.Validate())
+            {
+                Holder<ImageLoader> loader(CreateImageLoader(param));
+                if (loader)
+                {
+                    if (loader->FromStream())
+                        return loader->Release(stride, width, height, format);
+                }
+            }
+            return NULL;
+        }
+    }
+}
+
+#if defined(_MSC_VER)
+#pragma warning (pop)
+#endif
diff --git a/3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp b/3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp
new file mode 100644
index 0000000000..88c5da73d0
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp
@@ -0,0 +1,2456 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdArray.h"
+#include "Simd/SimdCpu.h"
+#include "Simd/SimdBase.h"
+
+namespace Simd
+{
+    namespace Base
+    {
+#if defined(SIMD_X64_ENABLE) && !defined(SIMD_SSE2_DISABLE)
+#define JPEG_SSE2
+        static int jpeg__sse2_available(void)
+        {
+            return 1;
+        }
+#endif
+
+#if defined(SIMD_ARM64_ENABLE) && !defined(SIMD_NEON_DISABLE)
+#define JPEG_NEON
+#endif
+
+        typedef unsigned char jpeg_uc;
+        typedef unsigned short jpeg_us;
+        typedef unsigned short jpeg__uint16;
+        typedef   signed short jpeg__int16;
+        typedef unsigned int   jpeg__uint32;
+        typedef   signed int   jpeg__int32;
+
+        typedef struct
+        {
+            int      (*read)  (void* user, char* data, int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
+            void     (*skip)  (void* user, int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+            int      (*eof)   (void* user);                       // returns nonzero if we are at end of file/data
+        } jpeg_io_callbacks;
+
+#define jpeg_inline SIMD_INLINE
+#define JPEG_ASSERT assert
+
+#ifdef _MSC_VER
+#define JPEG_NOTUSED(v)  (void)(v)
+#else
+#define JPEG_NOTUSED(v)  (void)sizeof(v)
+#endif
+
+        typedef struct
+        {
+            jpeg__uint32 img_x, img_y;
+            int img_n, img_out_n;
+
+            jpeg_io_callbacks io;
+            void* io_user_data;
+
+            int read_from_callbacks;
+            int buflen;
+            jpeg_uc buffer_start[128];
+            int callback_already_read;
+
+            jpeg_uc* img_buffer, * img_buffer_end;
+            jpeg_uc* img_buffer_original, * img_buffer_original_end;
+        } jpeg__context;
+
+        static int jpeg__err(const char* str)
+        {
+            //jpeg__g_failure_reason = str;
+            return 0;
+        }
+
+        static int jpeg__err(const char* str1, const char* str2)
+        {
+            //jpeg__g_failure_reason = str;
+            return 0;
+        }
+
+#define jpeg__errpuc(x,y)  ((unsigned char *)(size_t) (jpeg__err(x,y)?NULL:NULL))
+
+        static void jpeg__refill_buffer(jpeg__context* s)
+        {
+            int n = (s->io.read)(s->io_user_data, (char*)s->buffer_start, s->buflen);
+            s->callback_already_read += (int)(s->img_buffer - s->img_buffer_original);
+            if (n == 0) {
+                // at end of file, treat same as if from memory, but need to handle case
+                // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
+                s->read_from_callbacks = 0;
+                s->img_buffer = s->buffer_start;
+                s->img_buffer_end = s->buffer_start + 1;
+                *s->img_buffer = 0;
+            }
+            else {
+                s->img_buffer = s->buffer_start;
+                s->img_buffer_end = s->buffer_start + n;
+            }
+        }
+
+        jpeg_inline static jpeg_uc jpeg__get8(jpeg__context* s)
+        {
+            if (s->img_buffer < s->img_buffer_end)
+                return *s->img_buffer++;
+            if (s->read_from_callbacks) {
+                jpeg__refill_buffer(s);
+                return *s->img_buffer++;
+            }
+            return 0;
+        }
+
+#define jpeg_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
+
+#define JPEG_SIMD_ALIGN(type, name) SIMD_ALIGNED(16) type name
+
+        static int jpeg__get16be(jpeg__context* s)
+        {
+            int z = jpeg__get8(s);
+            return (z << 8) + jpeg__get8(s);
+        }
+
+        static void jpeg__skip(jpeg__context* s, int n)
+        {
+            if (n == 0) return;  // already there!
+            if (n < 0) {
+                s->img_buffer = s->img_buffer_end;
+                return;
+            }
+            if (s->io.read) {
+                int blen = (int)(s->img_buffer_end - s->img_buffer);
+                if (blen < n) {
+                    s->img_buffer = s->img_buffer_end;
+                    (s->io.skip)(s->io_user_data, n - blen);
+                    return;
+                }
+            }
+            s->img_buffer += n;
+        }
+
+        jpeg_inline static int jpeg__at_eof(jpeg__context* s)
+        {
+            if (s->io.read) {
+                if (!(s->io.eof)(s->io_user_data)) return 0;
+                // if feof() is true, check if buffer = end
+                // special case: we've only got the special 0 character at the end
+                if (s->read_from_callbacks == 0) return 1;
+            }
+
+            return s->img_buffer >= s->img_buffer_end;
+        }
+
+#define JPEG_MALLOC(sz)           malloc(sz)
+#define JPEG_REALLOC(p,newsz)     realloc(p,newsz)
+#define JPEG_FREE(p)              free(p)
+
+#define JPEG_MAX_DIMENSIONS (1 << 24)
+
+        enum
+        {
+            JPEG__SCAN_load = 0,
+            JPEG__SCAN_type,
+            JPEG__SCAN_header
+        };
+
+        static void* jpeg__malloc(size_t size)
+        {
+            return JPEG_MALLOC(size);
+        }
+
+        static int jpeg__addsizes_valid(int a, int b)
+        {
+            if (b < 0) return 0;
+            // now 0 <= b <= INT_MAX, hence also
+            // 0 <= INT_MAX - b <= INTMAX.
+            // And "a + b <= INT_MAX" (which might overflow) is the
+            // same as a <= INT_MAX - b (no overflow)
+            return a <= INT_MAX - b;
+        }
+
+        static int jpeg__mul2sizes_valid(int a, int b)
+        {
+            if (a < 0 || b < 0) return 0;
+            if (b == 0) return 1; // mul-by-0 is always safe
+            // portable way to check for no overflows in a*b
+            return a <= INT_MAX / b;
+        }
+
+        static int jpeg__mad2sizes_valid(int a, int b, int add)
+        {
+            return jpeg__mul2sizes_valid(a, b) && jpeg__addsizes_valid(a * b, add);
+        }
+
+        // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
+        static int jpeg__mad3sizes_valid(int a, int b, int c, int add)
+        {
+            return jpeg__mul2sizes_valid(a, b) && jpeg__mul2sizes_valid(a * b, c) &&
+                jpeg__addsizes_valid(a * b * c, add);
+        }
+
+        static int jpeg__mad4sizes_valid(int a, int b, int c, int d, int add)
+        {
+            return jpeg__mul2sizes_valid(a, b) && jpeg__mul2sizes_valid(a * b, c) &&
+                jpeg__mul2sizes_valid(a * b * c, d) && jpeg__addsizes_valid(a * b * c * d, add);
+        }
+
+        static void* jpeg__malloc_mad2(int a, int b, int add)
+        {
+            if (!jpeg__mad2sizes_valid(a, b, add)) return NULL;
+            return jpeg__malloc(a * b + add);
+        }
+
+        static void* jpeg__malloc_mad3(int a, int b, int c, int add)
+        {
+            if (!jpeg__mad3sizes_valid(a, b, c, add)) return NULL;
+            return jpeg__malloc(a * b * c + add);
+        }
+
+        static jpeg_uc jpeg__compute_y(int r, int g, int b)
+        {
+            return (jpeg_uc)(((r * 77) + (g * 150) + (29 * b)) >> 8);
+        }
+
+        typedef struct
+        {
+            int bits_per_channel;
+            int num_channels;
+            int channel_order;
+        } jpeg__result_info;
+
+        static void jpeg__rewind(jpeg__context* s)
+        {
+            // conceptually rewind SHOULD rewind to the beginning of the stream,
+            // but we just rewind to the beginning of the initial buffer, because
+            // we only use it after doing 'test', which only ever looks at at most 92 bytes
+            s->img_buffer = s->img_buffer_original;
+            s->img_buffer_end = s->img_buffer_original_end;
+        }
+
+        //------------------------------------------------------------------------------
+
+        // huffman decoding acceleration
+#define FAST_BITS   9  // larger handles more cases; smaller stomps less cache
+
+        typedef struct
+        {
+            jpeg_uc  fast[1 << FAST_BITS];
+            // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
+            jpeg__uint16 code[256];
+            jpeg_uc  values[256];
+            jpeg_uc  size[257];
+            unsigned int maxcode[18];
+            int    delta[17];   // old 'firstsymbol' - old 'firstcode'
+        } jpeg__huffman;
+
+        typedef struct
+        {
+            jpeg__context* s;
+            jpeg__huffman huff_dc[4];
+            jpeg__huffman huff_ac[4];
+            jpeg__uint16 dequant[4][64];
+            jpeg__int16 fast_ac[4][1 << FAST_BITS];
+
+            // sizes for components, interleaved MCUs
+            int img_h_max, img_v_max;
+            int img_mcu_x, img_mcu_y;
+            int img_mcu_w, img_mcu_h;
+
+            // definition of jpeg image component
+            struct
+            {
+                int id;
+                int h, v;
+                int tq;
+                int hd, ha;
+                int dc_pred;
+
+                int x, y, w2, h2;
+                jpeg_uc* data;
+                void* raw_data, * raw_coeff;
+                jpeg_uc* linebuf;
+                short* coeff;   // progressive only
+                int      coeff_w, coeff_h; // number of 8x8 coefficient blocks
+            } img_comp[4];
+
+            jpeg__uint32   code_buffer; // jpeg entropy-coded buffer
+            int            code_bits;   // number of valid bits
+            unsigned char  marker;      // marker seen while filling entropy buffer
+            int            nomore;      // flag if we saw a marker so must stop
+
+            int            progressive;
+            int            spec_start;
+            int            spec_end;
+            int            succ_high;
+            int            succ_low;
+            int            eob_run;
+            int            jfif;
+            int            app14_color_transform; // Adobe APP14 tag
+            int            rgb;
+
+            int scan_n, order[4];
+            int restart_interval, todo;
+
+            // kernels
+            void (*idct_block_kernel)(jpeg_uc* out, int out_stride, short data[64]);
+            void (*YCbCr_to_RGB_kernel)(jpeg_uc* out, const jpeg_uc* y, const jpeg_uc* pcb, const jpeg_uc* pcr, int count, int step);
+            jpeg_uc* (*resample_row_hv_2_kernel)(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs);
+        } jpeg__jpeg;
+
+        static int jpeg__build_huffman(jpeg__huffman* h, int* count)
+        {
+            int i, j, k = 0;
+            unsigned int code;
+            // build size list for each symbol (from JPEG spec)
+            for (i = 0; i < 16; ++i)
+                for (j = 0; j < count[i]; ++j)
+                    h->size[k++] = (jpeg_uc)(i + 1);
+            h->size[k] = 0;
+
+            // compute actual symbols (from jpeg spec)
+            code = 0;
+            k = 0;
+            for (j = 1; j <= 16; ++j) {
+                // compute delta to add to code to compute symbol id
+                h->delta[j] = k - code;
+                if (h->size[k] == j) {
+                    while (h->size[k] == j)
+                        h->code[k++] = (jpeg__uint16)(code++);
+                    if (code - 1 >= (1u << j)) return jpeg__err("bad code lengths", "Corrupt JPEG");
+                }
+                // compute largest code + 1 for this size, preshifted as needed later
+                h->maxcode[j] = code << (16 - j);
+                code <<= 1;
+            }
+            h->maxcode[j] = 0xffffffff;
+
+            // build non-spec acceleration table; 255 is flag for not-accelerated
+            memset(h->fast, 255, 1 << FAST_BITS);
+            for (i = 0; i < k; ++i) {
+                int s = h->size[i];
+                if (s <= FAST_BITS) {
+                    int c = h->code[i] << (FAST_BITS - s);
+                    int m = 1 << (FAST_BITS - s);
+                    for (j = 0; j < m; ++j) {
+                        h->fast[c + j] = (jpeg_uc)i;
+                    }
+                }
+            }
+            return 1;
+        }
+
+        // build a table that decodes both magnitude and value of small ACs in
+        // one go.
+        static void jpeg__build_fast_ac(jpeg__int16* fast_ac, jpeg__huffman* h)
+        {
+            int i;
+            for (i = 0; i < (1 << FAST_BITS); ++i) {
+                jpeg_uc fast = h->fast[i];
+                fast_ac[i] = 0;
+                if (fast < 255) {
+                    int rs = h->values[fast];
+                    int run = (rs >> 4) & 15;
+                    int magbits = rs & 15;
+                    int len = h->size[fast];
+
+                    if (magbits && len + magbits <= FAST_BITS) {
+                        // magnitude code followed by receive_extend code
+                        int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
+                        int m = 1 << (magbits - 1);
+                        if (k < m) k += (~0U << magbits) + 1;
+                        // if the result is small enough, we can fit it in fast_ac table
+                        if (k >= -128 && k <= 127)
+                            fast_ac[i] = (jpeg__int16)((k * 256) + (run * 16) + (len + magbits));
+                    }
+                }
+            }
+        }
+
+        static void jpeg__grow_buffer_unsafe(jpeg__jpeg* j)
+        {
+            do {
+                unsigned int b = j->nomore ? 0 : jpeg__get8(j->s);
+                if (b == 0xff) {
+                    int c = jpeg__get8(j->s);
+                    while (c == 0xff) c = jpeg__get8(j->s); // consume fill bytes
+                    if (c != 0) {
+                        j->marker = (unsigned char)c;
+                        j->nomore = 1;
+                        return;
+                    }
+                }
+                j->code_buffer |= b << (24 - j->code_bits);
+                j->code_bits += 8;
+            } while (j->code_bits <= 24);
+        }
+
+        // (1 << n) - 1
+        static const jpeg__uint32 jpeg__bmask[17] = { 0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535 };
+
+        // decode a jpeg huffman value from the bitstream
+        jpeg_inline static int jpeg__jpeg_huff_decode(jpeg__jpeg* j, jpeg__huffman* h)
+        {
+            unsigned int temp;
+            int c, k;
+
+            if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j);
+
+            // look at the top FAST_BITS and determine what symbol ID it is,
+            // if the code is <= FAST_BITS
+            c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
+            k = h->fast[c];
+            if (k < 255) {
+                int s = h->size[k];
+                if (s > j->code_bits)
+                    return -1;
+                j->code_buffer <<= s;
+                j->code_bits -= s;
+                return h->values[k];
+            }
+
+            // naive test is to shift the code_buffer down so k bits are
+            // valid, then test against maxcode. To speed this up, we've
+            // preshifted maxcode left so that it has (16-k) 0s at the
+            // end; in other words, regardless of the number of bits, it
+            // wants to be compared against something shifted to have 16;
+            // that way we don't need to shift inside the loop.
+            temp = j->code_buffer >> 16;
+            for (k = FAST_BITS + 1; ; ++k)
+                if (temp < h->maxcode[k])
+                    break;
+            if (k == 17) {
+                // error! code not found
+                j->code_bits -= 16;
+                return -1;
+            }
+
+            if (k > j->code_bits)
+                return -1;
+
+            // convert the huffman code to the symbol id
+            c = ((j->code_buffer >> (32 - k)) & jpeg__bmask[k]) + h->delta[k];
+            JPEG_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & jpeg__bmask[h->size[c]]) == h->code[c]);
+
+            // convert the id to a symbol
+            j->code_bits -= k;
+            j->code_buffer <<= k;
+            return h->values[c];
+        }
+
+        // bias[n] = (-1<<n) + 1
+        static const int jpeg__jbias[16] = { 0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767 };
+
+        // combined JPEG 'receive' and JPEG 'extend', since baseline
+        // always extends everything it receives.
+        jpeg_inline static int jpeg__extend_receive(jpeg__jpeg* j, int n)
+        {
+            unsigned int k;
+            int sgn;
+            if (j->code_bits < n) jpeg__grow_buffer_unsafe(j);
+
+            sgn = (jpeg__int32)j->code_buffer >> 31; // sign bit is always in MSB
+            k = jpeg_lrot(j->code_buffer, n);
+            if (n < 0 || n >= (int)(sizeof(jpeg__bmask) / sizeof(*jpeg__bmask))) return 0;
+            j->code_buffer = k & ~jpeg__bmask[n];
+            k &= jpeg__bmask[n];
+            j->code_bits -= n;
+            return k + (jpeg__jbias[n] & ~sgn);
+        }
+
+        // get some unsigned bits
+        jpeg_inline static int jpeg__jpeg_get_bits(jpeg__jpeg* j, int n)
+        {
+            unsigned int k;
+            if (j->code_bits < n) jpeg__grow_buffer_unsafe(j);
+            k = jpeg_lrot(j->code_buffer, n);
+            j->code_buffer = k & ~jpeg__bmask[n];
+            k &= jpeg__bmask[n];
+            j->code_bits -= n;
+            return k;
+        }
+
+        jpeg_inline static int jpeg__jpeg_get_bit(jpeg__jpeg* j)
+        {
+            unsigned int k;
+            if (j->code_bits < 1) jpeg__grow_buffer_unsafe(j);
+            k = j->code_buffer;
+            j->code_buffer <<= 1;
+            --j->code_bits;
+            return k & 0x80000000;
+        }
+
+        // given a value that's at position X in the zigzag stream,
+        // where does it appear in the 8x8 matrix coded as row-major?
+        static const jpeg_uc jpeg__jpeg_dezigzag[64 + 15] =
+        {
+            0,  1,  8, 16,  9,  2,  3, 10,
+           17, 24, 32, 25, 18, 11,  4,  5,
+           12, 19, 26, 33, 40, 48, 41, 34,
+           27, 20, 13,  6,  7, 14, 21, 28,
+           35, 42, 49, 56, 57, 50, 43, 36,
+           29, 22, 15, 23, 30, 37, 44, 51,
+           58, 59, 52, 45, 38, 31, 39, 46,
+           53, 60, 61, 54, 47, 55, 62, 63,
+           // let corrupt input sample past end
+           63, 63, 63, 63, 63, 63, 63, 63,
+           63, 63, 63, 63, 63, 63, 63
+        };
+
+        // decode one 64-entry block--
+        static int jpeg__jpeg_decode_block(jpeg__jpeg* j, short data[64], jpeg__huffman* hdc, jpeg__huffman* hac, jpeg__int16* fac, int b, jpeg__uint16* dequant)
+        {
+            int diff, dc, k;
+            int t;
+
+            if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j);
+            t = jpeg__jpeg_huff_decode(j, hdc);
+            if (t < 0) return jpeg__err("bad huffman code", "Corrupt JPEG");
+
+            // 0 all the ac values now so we can do it 32-bits at a time
+            memset(data, 0, 64 * sizeof(data[0]));
+
+            diff = t ? jpeg__extend_receive(j, t) : 0;
+            dc = j->img_comp[b].dc_pred + diff;
+            j->img_comp[b].dc_pred = dc;
+            data[0] = (short)(dc * dequant[0]);
+
+            // decode AC components, see JPEG spec
+            k = 1;
+            do {
+                unsigned int zig;
+                int c, r, s;
+                if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j);
+                c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
+                r = fac[c];
+                if (r) { // fast-AC path
+                    k += (r >> 4) & 15; // run
+                    s = r & 15; // combined length
+                    j->code_buffer <<= s;
+                    j->code_bits -= s;
+                    // decode into unzigzag'd location
+                    zig = jpeg__jpeg_dezigzag[k++];
+                    data[zig] = (short)((r >> 8) * dequant[zig]);
+                }
+                else {
+                    int rs = jpeg__jpeg_huff_decode(j, hac);
+                    if (rs < 0) return jpeg__err("bad huffman code", "Corrupt JPEG");
+                    s = rs & 15;
+                    r = rs >> 4;
+                    if (s == 0) {
+                        if (rs != 0xf0) break; // end block
+                        k += 16;
+                    }
+                    else {
+                        k += r;
+                        // decode into unzigzag'd location
+                        zig = jpeg__jpeg_dezigzag[k++];
+                        data[zig] = (short)(jpeg__extend_receive(j, s) * dequant[zig]);
+                    }
+                }
+            } while (k < 64);
+            return 1;
+        }
+
+        static int jpeg__jpeg_decode_block_prog_dc(jpeg__jpeg* j, short data[64], jpeg__huffman* hdc, int b)
+        {
+            int diff, dc;
+            int t;
+            if (j->spec_end != 0) return jpeg__err("can't merge dc and ac", "Corrupt JPEG");
+
+            if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j);
+
+            if (j->succ_high == 0) {
+                // first scan for DC coefficient, must be first
+                memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now
+                t = jpeg__jpeg_huff_decode(j, hdc);
+                if (t == -1) return jpeg__err("can't merge dc and ac", "Corrupt JPEG");
+                diff = t ? jpeg__extend_receive(j, t) : 0;
+
+                dc = j->img_comp[b].dc_pred + diff;
+                j->img_comp[b].dc_pred = dc;
+                data[0] = (short)(dc << j->succ_low);
+            }
+            else {
+                // refinement scan for DC coefficient
+                if (jpeg__jpeg_get_bit(j))
+                    data[0] += (short)(1 << j->succ_low);
+            }
+            return 1;
+        }
+
+        // @OPTIMIZE: store non-zigzagged during the decode passes,
+        // and only de-zigzag when dequantizing
+        static int jpeg__jpeg_decode_block_prog_ac(jpeg__jpeg* j, short data[64], jpeg__huffman* hac, jpeg__int16* fac)
+        {
+            int k;
+            if (j->spec_start == 0) return jpeg__err("can't merge dc and ac", "Corrupt JPEG");
+
+            if (j->succ_high == 0) {
+                int shift = j->succ_low;
+
+                if (j->eob_run) {
+                    --j->eob_run;
+                    return 1;
+                }
+
+                k = j->spec_start;
+                do {
+                    unsigned int zig;
+                    int c, r, s;
+                    if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j);
+                    c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
+                    r = fac[c];
+                    if (r) { // fast-AC path
+                        k += (r >> 4) & 15; // run
+                        s = r & 15; // combined length
+                        j->code_buffer <<= s;
+                        j->code_bits -= s;
+                        zig = jpeg__jpeg_dezigzag[k++];
+                        data[zig] = (short)((r >> 8) << shift);
+                    }
+                    else {
+                        int rs = jpeg__jpeg_huff_decode(j, hac);
+                        if (rs < 0) return jpeg__err("bad huffman code", "Corrupt JPEG");
+                        s = rs & 15;
+                        r = rs >> 4;
+                        if (s == 0) {
+                            if (r < 15) {
+                                j->eob_run = (1 << r);
+                                if (r)
+                                    j->eob_run += jpeg__jpeg_get_bits(j, r);
+                                --j->eob_run;
+                                break;
+                            }
+                            k += 16;
+                        }
+                        else {
+                            k += r;
+                            zig = jpeg__jpeg_dezigzag[k++];
+                            data[zig] = (short)(jpeg__extend_receive(j, s) << shift);
+                        }
+                    }
+                } while (k <= j->spec_end);
+            }
+            else {
+                // refinement scan for these AC coefficients
+
+                short bit = (short)(1 << j->succ_low);
+
+                if (j->eob_run) {
+                    --j->eob_run;
+                    for (k = j->spec_start; k <= j->spec_end; ++k) {
+                        short* p = &data[jpeg__jpeg_dezigzag[k]];
+                        if (*p != 0)
+                            if (jpeg__jpeg_get_bit(j))
+                                if ((*p & bit) == 0) {
+                                    if (*p > 0)
+                                        *p += bit;
+                                    else
+                                        *p -= bit;
+                                }
+                    }
+                }
+                else {
+                    k = j->spec_start;
+                    do {
+                        int r, s;
+                        int rs = jpeg__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
+                        if (rs < 0) return jpeg__err("bad huffman code", "Corrupt JPEG");
+                        s = rs & 15;
+                        r = rs >> 4;
+                        if (s == 0) {
+                            if (r < 15) {
+                                j->eob_run = (1 << r) - 1;
+                                if (r)
+                                    j->eob_run += jpeg__jpeg_get_bits(j, r);
+                                r = 64; // force end of block
+                            }
+                            else {
+                                // r=15 s=0 should write 16 0s, so we just do
+                                // a run of 15 0s and then write s (which is 0),
+                                // so we don't have to do anything special here
+                            }
+                        }
+                        else {
+                            if (s != 1) return jpeg__err("bad huffman code", "Corrupt JPEG");
+                            // sign bit
+                            if (jpeg__jpeg_get_bit(j))
+                                s = bit;
+                            else
+                                s = -bit;
+                        }
+
+                        // advance by r
+                        while (k <= j->spec_end) {
+                            short* p = &data[jpeg__jpeg_dezigzag[k++]];
+                            if (*p != 0) {
+                                if (jpeg__jpeg_get_bit(j))
+                                    if ((*p & bit) == 0) {
+                                        if (*p > 0)
+                                            *p += bit;
+                                        else
+                                            *p -= bit;
+                                    }
+                            }
+                            else {
+                                if (r == 0) {
+                                    *p = (short)s;
+                                    break;
+                                }
+                                --r;
+                            }
+                        }
+                    } while (k <= j->spec_end);
+                }
+            }
+            return 1;
+        }
+
+        // take a -128..127 value and jpeg__clamp it and convert to 0..255
+        jpeg_inline static jpeg_uc jpeg__clamp(int x)
+        {
+            // trick to use a single test to catch both cases
+            if ((unsigned int)x > 255) {
+                if (x < 0) return 0;
+                if (x > 255) return 255;
+            }
+            return (jpeg_uc)x;
+        }
+
+#define jpeg__f2f(x)  ((int) (((x) * 4096 + 0.5)))
+#define jpeg__fsh(x)  ((x) * 4096)
+
+        // derived from jidctint -- DCT_ISLOW
+#define JPEG__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
+   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
+   p2 = s2;                                    \
+   p3 = s6;                                    \
+   p1 = (p2+p3) * jpeg__f2f(0.5411961f);       \
+   t2 = p1 + p3*jpeg__f2f(-1.847759065f);      \
+   t3 = p1 + p2*jpeg__f2f( 0.765366865f);      \
+   p2 = s0;                                    \
+   p3 = s4;                                    \
+   t0 = jpeg__fsh(p2+p3);                      \
+   t1 = jpeg__fsh(p2-p3);                      \
+   x0 = t0+t3;                                 \
+   x3 = t0-t3;                                 \
+   x1 = t1+t2;                                 \
+   x2 = t1-t2;                                 \
+   t0 = s7;                                    \
+   t1 = s5;                                    \
+   t2 = s3;                                    \
+   t3 = s1;                                    \
+   p3 = t0+t2;                                 \
+   p4 = t1+t3;                                 \
+   p1 = t0+t3;                                 \
+   p2 = t1+t2;                                 \
+   p5 = (p3+p4)*jpeg__f2f( 1.175875602f);      \
+   t0 = t0*jpeg__f2f( 0.298631336f);           \
+   t1 = t1*jpeg__f2f( 2.053119869f);           \
+   t2 = t2*jpeg__f2f( 3.072711026f);           \
+   t3 = t3*jpeg__f2f( 1.501321110f);           \
+   p1 = p5 + p1*jpeg__f2f(-0.899976223f);      \
+   p2 = p5 + p2*jpeg__f2f(-2.562915447f);      \
+   p3 = p3*jpeg__f2f(-1.961570560f);           \
+   p4 = p4*jpeg__f2f(-0.390180644f);           \
+   t3 += p1+p4;                                \
+   t2 += p2+p3;                                \
+   t1 += p2+p4;                                \
+   t0 += p1+p3;
+
+        static void jpeg__idct_block(jpeg_uc* out, int out_stride, short data[64])
+        {
+            int i, val[64], * v = val;
+            jpeg_uc* o;
+            short* d = data;
+
+            // columns
+            for (i = 0; i < 8; ++i, ++d, ++v) {
+                // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
+                if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0
+                    && d[40] == 0 && d[48] == 0 && d[56] == 0) {
+                    //    no shortcut                 0     seconds
+                    //    (1|2|3|4|5|6|7)==0          0     seconds
+                    //    all separate               -0.047 seconds
+                    //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
+                    int dcterm = d[0] * 4;
+                    v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
+                }
+                else {
+                    JPEG__IDCT_1D(d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56])
+                        // constants scaled things up by 1<<12; let's bring them back
+                        // down, but keep 2 extra bits of precision
+                        x0 += 512; x1 += 512; x2 += 512; x3 += 512;
+                    v[0] = (x0 + t3) >> 10;
+                    v[56] = (x0 - t3) >> 10;
+                    v[8] = (x1 + t2) >> 10;
+                    v[48] = (x1 - t2) >> 10;
+                    v[16] = (x2 + t1) >> 10;
+                    v[40] = (x2 - t1) >> 10;
+                    v[24] = (x3 + t0) >> 10;
+                    v[32] = (x3 - t0) >> 10;
+                }
+            }
+
+            for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) {
+                // no fast case since the first 1D IDCT spread components out
+                JPEG__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7])
+                    // constants scaled things up by 1<<12, plus we had 1<<2 from first
+                    // loop, plus horizontal and vertical each scale by sqrt(8) so together
+                    // we've got an extra 1<<3, so 1<<17 total we need to remove.
+                    // so we want to round that, which means adding 0.5 * 1<<17,
+                    // aka 65536. Also, we'll end up with -128 to 127 that we want
+                    // to encode as 0..255 by adding 128, so we'll add that before the shift
+                    x0 += 65536 + (128 << 17);
+                x1 += 65536 + (128 << 17);
+                x2 += 65536 + (128 << 17);
+                x3 += 65536 + (128 << 17);
+                // tried computing the shifts into temps, or'ing the temps to see
+                // if any were out of range, but that was slower
+                o[0] = jpeg__clamp((x0 + t3) >> 17);
+                o[7] = jpeg__clamp((x0 - t3) >> 17);
+                o[1] = jpeg__clamp((x1 + t2) >> 17);
+                o[6] = jpeg__clamp((x1 - t2) >> 17);
+                o[2] = jpeg__clamp((x2 + t1) >> 17);
+                o[5] = jpeg__clamp((x2 - t1) >> 17);
+                o[3] = jpeg__clamp((x3 + t0) >> 17);
+                o[4] = jpeg__clamp((x3 - t0) >> 17);
+            }
+        }
+
+#ifdef JPEG_SSE2
+        // sse2 integer IDCT. not the fastest possible implementation but it
+        // produces bit-identical results to the generic C version so it's
+        // fully "transparent".
+        static void jpeg__idct_simd(jpeg_uc* out, int out_stride, short data[64])
+        {
+            // This is constructed to match our regular (generic) integer IDCT exactly.
+            __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+            __m128i tmp;
+
+            // dot product constant: even elems=x, odd elems=y
+#define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
+
+// out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
+// out(1) = c1[even]*x + c1[odd]*y
+#define dct_rot(out0,out1, x,y,c0,c1) \
+      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
+      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
+      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
+      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
+      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
+      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
+
+   // out = in << 12  (in 16-bit, out 32-bit)
+#define dct_widen(out, in) \
+      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
+      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
+
+   // wide add
+#define dct_wadd(out, a, b) \
+      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
+
+   // wide sub
+#define dct_wsub(out, a, b) \
+      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
+
+   // butterfly a/b, add bias, then shift by "s" and pack
+#define dct_bfly32o(out0, out1, a,b,bias,s) \
+      { \
+         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
+         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
+         dct_wadd(sum, abiased, b); \
+         dct_wsub(dif, abiased, b); \
+         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
+         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
+      }
+
+   // 8-bit interleave step (for transposes)
+#define dct_interleave8(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi8(a, b); \
+      b = _mm_unpackhi_epi8(tmp, b)
+
+   // 16-bit interleave step (for transposes)
+#define dct_interleave16(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi16(a, b); \
+      b = _mm_unpackhi_epi16(tmp, b)
+
+#define dct_pass(bias,shift) \
+      { \
+         /* even part */ \
+         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
+         __m128i sum04 = _mm_add_epi16(row0, row4); \
+         __m128i dif04 = _mm_sub_epi16(row0, row4); \
+         dct_widen(t0e, sum04); \
+         dct_widen(t1e, dif04); \
+         dct_wadd(x0, t0e, t3e); \
+         dct_wsub(x3, t0e, t3e); \
+         dct_wadd(x1, t1e, t2e); \
+         dct_wsub(x2, t1e, t2e); \
+         /* odd part */ \
+         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
+         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
+         __m128i sum17 = _mm_add_epi16(row1, row7); \
+         __m128i sum35 = _mm_add_epi16(row3, row5); \
+         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
+         dct_wadd(x4, y0o, y4o); \
+         dct_wadd(x5, y1o, y5o); \
+         dct_wadd(x6, y2o, y5o); \
+         dct_wadd(x7, y3o, y4o); \
+         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
+         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
+         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
+         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
+      }
+
+            __m128i rot0_0 = dct_const(jpeg__f2f(0.5411961f), jpeg__f2f(0.5411961f) + jpeg__f2f(-1.847759065f));
+            __m128i rot0_1 = dct_const(jpeg__f2f(0.5411961f) + jpeg__f2f(0.765366865f), jpeg__f2f(0.5411961f));
+            __m128i rot1_0 = dct_const(jpeg__f2f(1.175875602f) + jpeg__f2f(-0.899976223f), jpeg__f2f(1.175875602f));
+            __m128i rot1_1 = dct_const(jpeg__f2f(1.175875602f), jpeg__f2f(1.175875602f) + jpeg__f2f(-2.562915447f));
+            __m128i rot2_0 = dct_const(jpeg__f2f(-1.961570560f) + jpeg__f2f(0.298631336f), jpeg__f2f(-1.961570560f));
+            __m128i rot2_1 = dct_const(jpeg__f2f(-1.961570560f), jpeg__f2f(-1.961570560f) + jpeg__f2f(3.072711026f));
+            __m128i rot3_0 = dct_const(jpeg__f2f(-0.390180644f) + jpeg__f2f(2.053119869f), jpeg__f2f(-0.390180644f));
+            __m128i rot3_1 = dct_const(jpeg__f2f(-0.390180644f), jpeg__f2f(-0.390180644f) + jpeg__f2f(1.501321110f));
+
+            // rounding biases in column/row passes, see jpeg__idct_block for explanation.
+            __m128i bias_0 = _mm_set1_epi32(512);
+            __m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17));
+
+            // load
+            row0 = _mm_load_si128((const __m128i*) (data + 0 * 8));
+            row1 = _mm_load_si128((const __m128i*) (data + 1 * 8));
+            row2 = _mm_load_si128((const __m128i*) (data + 2 * 8));
+            row3 = _mm_load_si128((const __m128i*) (data + 3 * 8));
+            row4 = _mm_load_si128((const __m128i*) (data + 4 * 8));
+            row5 = _mm_load_si128((const __m128i*) (data + 5 * 8));
+            row6 = _mm_load_si128((const __m128i*) (data + 6 * 8));
+            row7 = _mm_load_si128((const __m128i*) (data + 7 * 8));
+
+            // column pass
+            dct_pass(bias_0, 10);
+
+            {
+                // 16bit 8x8 transpose pass 1
+                dct_interleave16(row0, row4);
+                dct_interleave16(row1, row5);
+                dct_interleave16(row2, row6);
+                dct_interleave16(row3, row7);
+
+                // transpose pass 2
+                dct_interleave16(row0, row2);
+                dct_interleave16(row1, row3);
+                dct_interleave16(row4, row6);
+                dct_interleave16(row5, row7);
+
+                // transpose pass 3
+                dct_interleave16(row0, row1);
+                dct_interleave16(row2, row3);
+                dct_interleave16(row4, row5);
+                dct_interleave16(row6, row7);
+            }
+
+            // row pass
+            dct_pass(bias_1, 17);
+
+            {
+                // pack
+                __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
+                __m128i p1 = _mm_packus_epi16(row2, row3);
+                __m128i p2 = _mm_packus_epi16(row4, row5);
+                __m128i p3 = _mm_packus_epi16(row6, row7);
+
+                // 8bit 8x8 transpose pass 1
+                dct_interleave8(p0, p2); // a0e0a1e1...
+                dct_interleave8(p1, p3); // c0g0c1g1...
+
+                // transpose pass 2
+                dct_interleave8(p0, p1); // a0c0e0g0...
+                dct_interleave8(p2, p3); // b0d0f0h0...
+
+                // transpose pass 3
+                dct_interleave8(p0, p2); // a0b0c0d0...
+                dct_interleave8(p1, p3); // a4b4c4d4...
+
+                // store
+                _mm_storel_epi64((__m128i*) out, p0); out += out_stride;
+                _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
+                _mm_storel_epi64((__m128i*) out, p2); out += out_stride;
+                _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
+                _mm_storel_epi64((__m128i*) out, p1); out += out_stride;
+                _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
+                _mm_storel_epi64((__m128i*) out, p3); out += out_stride;
+                _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p3, 0x4e));
+            }
+
+#undef dct_const
+#undef dct_rot
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_interleave8
+#undef dct_interleave16
+#undef dct_pass
+        }
+
+#endif // JPEG_SSE2
+
+#ifdef JPEG_NEON
+
+        // NEON integer IDCT. should produce bit-identical
+        // results to the generic C version.
+        static void jpeg__idct_simd(jpeg_uc* out, int out_stride, short data[64])
+        {
+            int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
+
+            int16x4_t rot0_0 = vdup_n_s16(jpeg__f2f(0.5411961f));
+            int16x4_t rot0_1 = vdup_n_s16(jpeg__f2f(-1.847759065f));
+            int16x4_t rot0_2 = vdup_n_s16(jpeg__f2f(0.765366865f));
+            int16x4_t rot1_0 = vdup_n_s16(jpeg__f2f(1.175875602f));
+            int16x4_t rot1_1 = vdup_n_s16(jpeg__f2f(-0.899976223f));
+            int16x4_t rot1_2 = vdup_n_s16(jpeg__f2f(-2.562915447f));
+            int16x4_t rot2_0 = vdup_n_s16(jpeg__f2f(-1.961570560f));
+            int16x4_t rot2_1 = vdup_n_s16(jpeg__f2f(-0.390180644f));
+            int16x4_t rot3_0 = vdup_n_s16(jpeg__f2f(0.298631336f));
+            int16x4_t rot3_1 = vdup_n_s16(jpeg__f2f(2.053119869f));
+            int16x4_t rot3_2 = vdup_n_s16(jpeg__f2f(3.072711026f));
+            int16x4_t rot3_3 = vdup_n_s16(jpeg__f2f(1.501321110f));
+
+#define dct_long_mul(out, inq, coeff) \
+   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
+
+#define dct_long_mac(out, acc, inq, coeff) \
+   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
+
+#define dct_widen(out, inq) \
+   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
+   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
+
+            // wide add
+#define dct_wadd(out, a, b) \
+   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
+
+// wide sub
+#define dct_wsub(out, a, b) \
+   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
+
+// butterfly a/b, then shift using "shiftop" by "s" and pack
+#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
+   { \
+      dct_wadd(sum, a, b); \
+      dct_wsub(dif, a, b); \
+      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
+      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
+   }
+
+#define dct_pass(shiftop, shift) \
+   { \
+      /* even part */ \
+      int16x8_t sum26 = vaddq_s16(row2, row6); \
+      dct_long_mul(p1e, sum26, rot0_0); \
+      dct_long_mac(t2e, p1e, row6, rot0_1); \
+      dct_long_mac(t3e, p1e, row2, rot0_2); \
+      int16x8_t sum04 = vaddq_s16(row0, row4); \
+      int16x8_t dif04 = vsubq_s16(row0, row4); \
+      dct_widen(t0e, sum04); \
+      dct_widen(t1e, dif04); \
+      dct_wadd(x0, t0e, t3e); \
+      dct_wsub(x3, t0e, t3e); \
+      dct_wadd(x1, t1e, t2e); \
+      dct_wsub(x2, t1e, t2e); \
+      /* odd part */ \
+      int16x8_t sum15 = vaddq_s16(row1, row5); \
+      int16x8_t sum17 = vaddq_s16(row1, row7); \
+      int16x8_t sum35 = vaddq_s16(row3, row5); \
+      int16x8_t sum37 = vaddq_s16(row3, row7); \
+      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
+      dct_long_mul(p5o, sumodd, rot1_0); \
+      dct_long_mac(p1o, p5o, sum17, rot1_1); \
+      dct_long_mac(p2o, p5o, sum35, rot1_2); \
+      dct_long_mul(p3o, sum37, rot2_0); \
+      dct_long_mul(p4o, sum15, rot2_1); \
+      dct_wadd(sump13o, p1o, p3o); \
+      dct_wadd(sump24o, p2o, p4o); \
+      dct_wadd(sump23o, p2o, p3o); \
+      dct_wadd(sump14o, p1o, p4o); \
+      dct_long_mac(x4, sump13o, row7, rot3_0); \
+      dct_long_mac(x5, sump24o, row5, rot3_1); \
+      dct_long_mac(x6, sump23o, row3, rot3_2); \
+      dct_long_mac(x7, sump14o, row1, rot3_3); \
+      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
+      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
+      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
+      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
+   }
+
+   // load
+            row0 = vld1q_s16(data + 0 * 8);
+            row1 = vld1q_s16(data + 1 * 8);
+            row2 = vld1q_s16(data + 2 * 8);
+            row3 = vld1q_s16(data + 3 * 8);
+            row4 = vld1q_s16(data + 4 * 8);
+            row5 = vld1q_s16(data + 5 * 8);
+            row6 = vld1q_s16(data + 6 * 8);
+            row7 = vld1q_s16(data + 7 * 8);
+
+            // add DC bias
+            row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
+
+            // column pass
+            dct_pass(vrshrn_n_s32, 10);
+
+            // 16bit 8x8 transpose
+            {
+                // these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
+                // whether compilers actually get this is another story, sadly.
+#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
+#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
+
+      // pass 1
+                dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
+                dct_trn16(row2, row3);
+                dct_trn16(row4, row5);
+                dct_trn16(row6, row7);
+
+                // pass 2
+                dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
+                dct_trn32(row1, row3);
+                dct_trn32(row4, row6);
+                dct_trn32(row5, row7);
+
+                // pass 3
+                dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
+                dct_trn64(row1, row5);
+                dct_trn64(row2, row6);
+                dct_trn64(row3, row7);
+
+#undef dct_trn16
+#undef dct_trn32
+#undef dct_trn64
+            }
+
+            // row pass
+            // vrshrn_n_s32 only supports shifts up to 16, we need
+            // 17. so do a non-rounding shift of 16 first then follow
+            // up with a rounding shift by 1.
+            dct_pass(vshrn_n_s32, 16);
+
+            {
+                // pack and round
+                uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
+                uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
+                uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
+                uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
+                uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
+                uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
+                uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
+                uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
+
+                // again, these can translate into one instruction, but often don't.
+#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
+#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
+
+      // sadly can't use interleaved stores here since we only write
+      // 8 bytes to each scan line!
+
+      // 8x8 8-bit transpose pass 1
+                dct_trn8_8(p0, p1);
+                dct_trn8_8(p2, p3);
+                dct_trn8_8(p4, p5);
+                dct_trn8_8(p6, p7);
+
+                // pass 2
+                dct_trn8_16(p0, p2);
+                dct_trn8_16(p1, p3);
+                dct_trn8_16(p4, p6);
+                dct_trn8_16(p5, p7);
+
+                // pass 3
+                dct_trn8_32(p0, p4);
+                dct_trn8_32(p1, p5);
+                dct_trn8_32(p2, p6);
+                dct_trn8_32(p3, p7);
+
+                // store
+                vst1_u8(out, p0); out += out_stride;
+                vst1_u8(out, p1); out += out_stride;
+                vst1_u8(out, p2); out += out_stride;
+                vst1_u8(out, p3); out += out_stride;
+                vst1_u8(out, p4); out += out_stride;
+                vst1_u8(out, p5); out += out_stride;
+                vst1_u8(out, p6); out += out_stride;
+                vst1_u8(out, p7);
+
+#undef dct_trn8_8
+#undef dct_trn8_16
+#undef dct_trn8_32
+            }
+
+#undef dct_long_mul
+#undef dct_long_mac
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_pass
+        }
+
+#endif // JPEG_NEON
+
+#define JPEG__MARKER_none  0xff
+        // if there's a pending marker from the entropy stream, return that
+        // otherwise, fetch from the stream and get a marker. if there's no
+        // marker, return 0xff, which is never a valid marker value
+        static jpeg_uc jpeg__get_marker(jpeg__jpeg* j)
+        {
+            jpeg_uc x;
+            if (j->marker != JPEG__MARKER_none) { x = j->marker; j->marker = JPEG__MARKER_none; return x; }
+            x = jpeg__get8(j->s);
+            if (x != 0xff) return JPEG__MARKER_none;
+            while (x == 0xff)
+                x = jpeg__get8(j->s); // consume repeated 0xff fill bytes
+            return x;
+        }
+
+        // in each scan, we'll have scan_n components, and the order
+        // of the components is specified by order[]
+#define JPEG__RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)
+
+// after a restart interval, jpeg__jpeg_reset the entropy decoder and
+// the dc prediction
+        static void jpeg__jpeg_reset(jpeg__jpeg* j)
+        {
+            j->code_bits = 0;
+            j->code_buffer = 0;
+            j->nomore = 0;
+            j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0;
+            j->marker = JPEG__MARKER_none;
+            j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
+            j->eob_run = 0;
+            // no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
+            // since we don't even allow 1<<30 pixels
+        }
+
+        static int jpeg__parse_entropy_coded_data(jpeg__jpeg* z)
+        {
+            jpeg__jpeg_reset(z);
+            if (!z->progressive) {
+                if (z->scan_n == 1) {
+                    int i, j;
+                    JPEG_SIMD_ALIGN(short, data[64]);
+                    int n = z->order[0];
+                    // non-interleaved data, we just need to process one block at a time,
+                    // in trivial scanline order
+                    // number of blocks to do just depends on how many actual "pixels" this
+                    // component has, independent of interleaved MCU blocking and such
+                    int w = (z->img_comp[n].x + 7) >> 3;
+                    int h = (z->img_comp[n].y + 7) >> 3;
+                    for (j = 0; j < h; ++j) {
+                        for (i = 0; i < w; ++i) {
+                            int ha = z->img_comp[n].ha;
+                            if (!jpeg__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+                            z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data);
+                            // every data block is an MCU, so countdown the restart interval
+                            if (--z->todo <= 0) {
+                                if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z);
+                                // if it's NOT a restart, then just bail, so we get corrupt data
+                                // rather than no data
+                                if (!JPEG__RESTART(z->marker)) return 1;
+                                jpeg__jpeg_reset(z);
+                            }
+                        }
+                    }
+                    return 1;
+                }
+                else { // interleaved
+                    int i, j, k, x, y;
+                    JPEG_SIMD_ALIGN(short, data[64]);
+                    for (j = 0; j < z->img_mcu_y; ++j) {
+                        for (i = 0; i < z->img_mcu_x; ++i) {
+                            // scan an interleaved mcu... process scan_n components in order
+                            for (k = 0; k < z->scan_n; ++k) {
+                                int n = z->order[k];
+                                // scan out an mcu's worth of this component; that's just determined
+                                // by the basic H and V specified for the component
+                                for (y = 0; y < z->img_comp[n].v; ++y) {
+                                    for (x = 0; x < z->img_comp[n].h; ++x) {
+                                        int x2 = (i * z->img_comp[n].h + x) * 8;
+                                        int y2 = (j * z->img_comp[n].v + y) * 8;
+                                        int ha = z->img_comp[n].ha;
+                                        if (!jpeg__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+                                        z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2, z->img_comp[n].w2, data);
+                                    }
+                                }
+                            }
+                            // after all interleaved components, that's an interleaved MCU,
+                            // so now count down the restart interval
+                            if (--z->todo <= 0) {
+                                if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z);
+                                if (!JPEG__RESTART(z->marker)) return 1;
+                                jpeg__jpeg_reset(z);
+                            }
+                        }
+                    }
+                    return 1;
+                }
+            }
+            else {
+                if (z->scan_n == 1) {
+                    int i, j;
+                    int n = z->order[0];
+                    // non-interleaved data, we just need to process one block at a time,
+                    // in trivial scanline order
+                    // number of blocks to do just depends on how many actual "pixels" this
+                    // component has, independent of interleaved MCU blocking and such
+                    int w = (z->img_comp[n].x + 7) >> 3;
+                    int h = (z->img_comp[n].y + 7) >> 3;
+                    for (j = 0; j < h; ++j) {
+                        for (i = 0; i < w; ++i) {
+                            short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
+                            if (z->spec_start == 0) {
+                                if (!jpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                                    return 0;
+                            }
+                            else {
+                                int ha = z->img_comp[n].ha;
+                                if (!jpeg__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
+                                    return 0;
+                            }
+                            // every data block is an MCU, so countdown the restart interval
+                            if (--z->todo <= 0) {
+                                if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z);
+                                if (!JPEG__RESTART(z->marker)) return 1;
+                                jpeg__jpeg_reset(z);
+                            }
+                        }
+                    }
+                    return 1;
+                }
+                else { // interleaved
+                    int i, j, k, x, y;
+                    for (j = 0; j < z->img_mcu_y; ++j) {
+                        for (i = 0; i < z->img_mcu_x; ++i) {
+                            // scan an interleaved mcu... process scan_n components in order
+                            for (k = 0; k < z->scan_n; ++k) {
+                                int n = z->order[k];
+                                // scan out an mcu's worth of this component; that's just determined
+                                // by the basic H and V specified for the component
+                                for (y = 0; y < z->img_comp[n].v; ++y) {
+                                    for (x = 0; x < z->img_comp[n].h; ++x) {
+                                        int x2 = (i * z->img_comp[n].h + x);
+                                        int y2 = (j * z->img_comp[n].v + y);
+                                        short* data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
+                                        if (!jpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                                            return 0;
+                                    }
+                                }
+                            }
+                            // after all interleaved components, that's an interleaved MCU,
+                            // so now count down the restart interval
+                            if (--z->todo <= 0) {
+                                if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z);
+                                if (!JPEG__RESTART(z->marker)) return 1;
+                                jpeg__jpeg_reset(z);
+                            }
+                        }
+                    }
+                    return 1;
+                }
+            }
+        }
+
+        static void jpeg__jpeg_dequantize(short* data, jpeg__uint16* dequant)
+        {
+            int i;
+            for (i = 0; i < 64; ++i)
+                data[i] *= dequant[i];
+        }
+
+        static void jpeg__jpeg_finish(jpeg__jpeg* z)
+        {
+            if (z->progressive) {
+                // dequantize and idct the data
+                int i, j, n;
+                for (n = 0; n < z->s->img_n; ++n) {
+                    int w = (z->img_comp[n].x + 7) >> 3;
+                    int h = (z->img_comp[n].y + 7) >> 3;
+                    for (j = 0; j < h; ++j) {
+                        for (i = 0; i < w; ++i) {
+                            short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
+                            jpeg__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
+                            z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data);
+                        }
+                    }
+                }
+            }
+        }
+
+        static int jpeg__process_marker(jpeg__jpeg* z, int m)
+        {
+            int L;
+            switch (m) {
+            case JPEG__MARKER_none: // no marker found
+                return jpeg__err("expected marker", "Corrupt JPEG");
+
+            case 0xDD: // DRI - specify restart interval
+                if (jpeg__get16be(z->s) != 4) return jpeg__err("bad DRI len", "Corrupt JPEG");
+                z->restart_interval = jpeg__get16be(z->s);
+                return 1;
+
+            case 0xDB: // DQT - define quantization table
+                L = jpeg__get16be(z->s) - 2;
+                while (L > 0) {
+                    int q = jpeg__get8(z->s);
+                    int p = q >> 4, sixteen = (p != 0);
+                    int t = q & 15, i;
+                    if (p != 0 && p != 1) return jpeg__err("bad DQT type", "Corrupt JPEG");
+                    if (t > 3) return jpeg__err("bad DQT table", "Corrupt JPEG");
+
+                    for (i = 0; i < 64; ++i)
+                        z->dequant[t][jpeg__jpeg_dezigzag[i]] = (jpeg__uint16)(sixteen ? jpeg__get16be(z->s) : jpeg__get8(z->s));
+                    L -= (sixteen ? 129 : 65);
+                }
+                return L == 0;
+
+            case 0xC4: // DHT - define huffman table
+                L = jpeg__get16be(z->s) - 2;
+                while (L > 0) {
+                    jpeg_uc* v;
+                    int sizes[16], i, n = 0;
+                    int q = jpeg__get8(z->s);
+                    int tc = q >> 4;
+                    int th = q & 15;
+                    if (tc > 1 || th > 3) return jpeg__err("bad DHT header", "Corrupt JPEG");
+                    for (i = 0; i < 16; ++i) {
+                        sizes[i] = jpeg__get8(z->s);
+                        n += sizes[i];
+                    }
+                    L -= 17;
+                    if (tc == 0) {
+                        if (!jpeg__build_huffman(z->huff_dc + th, sizes)) return 0;
+                        v = z->huff_dc[th].values;
+                    }
+                    else {
+                        if (!jpeg__build_huffman(z->huff_ac + th, sizes)) return 0;
+                        v = z->huff_ac[th].values;
+                    }
+                    for (i = 0; i < n; ++i)
+                        v[i] = jpeg__get8(z->s);
+                    if (tc != 0)
+                        jpeg__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
+                    L -= n;
+                }
+                return L == 0;
+            }
+
+            // check for comment block or APP blocks
+            if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
+                L = jpeg__get16be(z->s);
+                if (L < 2) {
+                    if (m == 0xFE)
+                        return jpeg__err("bad COM len", "Corrupt JPEG");
+                    else
+                        return jpeg__err("bad APP len", "Corrupt JPEG");
+                }
+                L -= 2;
+
+                if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
+                    static const unsigned char tag[5] = { 'J','F','I','F','\0' };
+                    int ok = 1;
+                    int i;
+                    for (i = 0; i < 5; ++i)
+                        if (jpeg__get8(z->s) != tag[i])
+                            ok = 0;
+                    L -= 5;
+                    if (ok)
+                        z->jfif = 1;
+                }
+                else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
+                    static const unsigned char tag[6] = { 'A','d','o','b','e','\0' };
+                    int ok = 1;
+                    int i;
+                    for (i = 0; i < 6; ++i)
+                        if (jpeg__get8(z->s) != tag[i])
+                            ok = 0;
+                    L -= 6;
+                    if (ok) {
+                        jpeg__get8(z->s); // version
+                        jpeg__get16be(z->s); // flags0
+                        jpeg__get16be(z->s); // flags1
+                        z->app14_color_transform = jpeg__get8(z->s); // color transform
+                        L -= 6;
+                    }
+                }
+
+                jpeg__skip(z->s, L);
+                return 1;
+            }
+
+            return jpeg__err("unknown marker", "Corrupt JPEG");
+        }
+
+        // after we see SOS
+        static int jpeg__process_scan_header(jpeg__jpeg* z)
+        {
+            int i;
+            int Ls = jpeg__get16be(z->s);
+            z->scan_n = jpeg__get8(z->s);
+            if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n) return jpeg__err("bad SOS component count", "Corrupt JPEG");
+            if (Ls != 6 + 2 * z->scan_n) return jpeg__err("bad SOS len", "Corrupt JPEG");
+            for (i = 0; i < z->scan_n; ++i) {
+                int id = jpeg__get8(z->s), which;
+                int q = jpeg__get8(z->s);
+                for (which = 0; which < z->s->img_n; ++which)
+                    if (z->img_comp[which].id == id)
+                        break;
+                if (which == z->s->img_n) return 0; // no match
+                z->img_comp[which].hd = q >> 4;   if (z->img_comp[which].hd > 3) return jpeg__err("bad DC huff", "Corrupt JPEG");
+                z->img_comp[which].ha = q & 15;   if (z->img_comp[which].ha > 3) return jpeg__err("bad AC huff", "Corrupt JPEG");
+                z->order[i] = which;
+            }
+
+            {
+                int aa;
+                z->spec_start = jpeg__get8(z->s);
+                z->spec_end = jpeg__get8(z->s); // should be 63, but might be 0
+                aa = jpeg__get8(z->s);
+                z->succ_high = (aa >> 4);
+                z->succ_low = (aa & 15);
+                if (z->progressive) {
+                    if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
+                        return jpeg__err("bad SOS", "Corrupt JPEG");
+                }
+                else {
+                    if (z->spec_start != 0) return jpeg__err("bad SOS", "Corrupt JPEG");
+                    if (z->succ_high != 0 || z->succ_low != 0) return jpeg__err("bad SOS", "Corrupt JPEG");
+                    z->spec_end = 63;
+                }
+            }
+
+            return 1;
+        }
+
+        static int jpeg__free_jpeg_components(jpeg__jpeg* z, int ncomp, int why)
+        {
+            int i;
+            for (i = 0; i < ncomp; ++i) {
+                if (z->img_comp[i].raw_data) {
+                    JPEG_FREE(z->img_comp[i].raw_data);
+                    z->img_comp[i].raw_data = NULL;
+                    z->img_comp[i].data = NULL;
+                }
+                if (z->img_comp[i].raw_coeff) {
+                    JPEG_FREE(z->img_comp[i].raw_coeff);
+                    z->img_comp[i].raw_coeff = 0;
+                    z->img_comp[i].coeff = 0;
+                }
+                if (z->img_comp[i].linebuf) {
+                    JPEG_FREE(z->img_comp[i].linebuf);
+                    z->img_comp[i].linebuf = NULL;
+                }
+            }
+            return why;
+        }
+
+        static int jpeg__process_frame_header(jpeg__jpeg* z, int scan)
+        {
+            jpeg__context* s = z->s;
+            int Lf, p, i, q, h_max = 1, v_max = 1, c;
+            Lf = jpeg__get16be(s);         if (Lf < 11) return jpeg__err("bad SOF len", "Corrupt JPEG"); // JPEG
+            p = jpeg__get8(s);            if (p != 8) return jpeg__err("only 8-bit", "JPEG format not supported: 8-bit only"); // JPEG baseline
+            s->img_y = jpeg__get16be(s);   if (s->img_y == 0) return jpeg__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
+            s->img_x = jpeg__get16be(s);   if (s->img_x == 0) return jpeg__err("0 width", "Corrupt JPEG"); // JPEG requires
+            if (s->img_y > JPEG_MAX_DIMENSIONS) return jpeg__err("too large", "Very large image (corrupt?)");
+            if (s->img_x > JPEG_MAX_DIMENSIONS) return jpeg__err("too large", "Very large image (corrupt?)");
+            c = jpeg__get8(s);
+            if (c != 3 && c != 1 && c != 4) return jpeg__err("bad component count", "Corrupt JPEG");
+            s->img_n = c;
+            for (i = 0; i < c; ++i) {
+                z->img_comp[i].data = NULL;
+                z->img_comp[i].linebuf = NULL;
+            }
+
+            if (Lf != 8 + 3 * s->img_n) return jpeg__err("bad SOF len", "Corrupt JPEG");
+
+            z->rgb = 0;
+            for (i = 0; i < s->img_n; ++i) {
+                static const unsigned char rgb[3] = { 'R', 'G', 'B' };
+                z->img_comp[i].id = jpeg__get8(s);
+                if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
+                    ++z->rgb;
+                q = jpeg__get8(s);
+                z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return jpeg__err("bad H", "Corrupt JPEG");
+                z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return jpeg__err("bad V", "Corrupt JPEG");
+                z->img_comp[i].tq = jpeg__get8(s);  if (z->img_comp[i].tq > 3) return jpeg__err("bad TQ", "Corrupt JPEG");
+            }
+
+            if (scan != JPEG__SCAN_load) return 1;
+
+            if (!jpeg__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return jpeg__err("too large", "Image too large to decode");
+
+            for (i = 0; i < s->img_n; ++i) {
+                if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
+                if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
+            }
+
+            // compute interleaved mcu info
+            z->img_h_max = h_max;
+            z->img_v_max = v_max;
+            z->img_mcu_w = h_max * 8;
+            z->img_mcu_h = v_max * 8;
+            // these sizes can't be more than 17 bits
+            z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w;
+            z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h;
+
+            for (i = 0; i < s->img_n; ++i) {
+                // number of effective pixels (e.g. for non-interleaved MCU)
+                z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max;
+                z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max - 1) / v_max;
+                // to simplify generation, we'll allocate enough memory to decode
+                // the bogus oversized data from using interleaved MCUs and their
+                // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
+                // discard the extra data until colorspace conversion
+                //
+                // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
+                // so these muls can't overflow with 32-bit ints (which we require)
+                z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
+                z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
+                z->img_comp[i].coeff = 0;
+                z->img_comp[i].raw_coeff = 0;
+                z->img_comp[i].linebuf = NULL;
+                z->img_comp[i].raw_data = jpeg__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
+                if (z->img_comp[i].raw_data == NULL)
+                    return jpeg__free_jpeg_components(z, i + 1, jpeg__err("outofmem", "Out of memory"));
+                // align blocks for idct using mmx/sse
+                z->img_comp[i].data = (jpeg_uc*)(((size_t)z->img_comp[i].raw_data + 15) & ~15);
+                if (z->progressive) {
+                    // w2, h2 are multiples of 8 (see above)
+                    z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
+                    z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
+                    z->img_comp[i].raw_coeff = jpeg__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
+                    if (z->img_comp[i].raw_coeff == NULL)
+                        return jpeg__free_jpeg_components(z, i + 1, jpeg__err("outofmem", "Out of memory"));
+                    z->img_comp[i].coeff = (short*)(((size_t)z->img_comp[i].raw_coeff + 15) & ~15);
+                }
+            }
+
+            return 1;
+        }
+
+        // use comparisons since in some cases we handle more than one case (e.g. SOF)
+#define jpeg__DNL(x)         ((x) == 0xdc)
+#define jpeg__SOI(x)         ((x) == 0xd8)
+#define jpeg__EOI(x)         ((x) == 0xd9)
+#define jpeg__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
+#define jpeg__SOS(x)         ((x) == 0xda)
+
+#define jpeg__SOF_progressive(x)   ((x) == 0xc2)
+
+        static int jpeg__decode_jpeg_header(jpeg__jpeg* z, int scan)
+        {
+            int m;
+            z->jfif = 0;
+            z->app14_color_transform = -1; // valid values are 0,1,2
+            z->marker = JPEG__MARKER_none; // initialize cached marker to empty
+            m = jpeg__get_marker(z);
+            if (!jpeg__SOI(m)) return jpeg__err("no SOI", "Corrupt JPEG");
+            if (scan == JPEG__SCAN_type) return 1;
+            m = jpeg__get_marker(z);
+            while (!jpeg__SOF(m)) {
+                if (!jpeg__process_marker(z, m)) return 0;
+                m = jpeg__get_marker(z);
+                while (m == JPEG__MARKER_none) {
+                    // some files have extra padding after their blocks, so ok, we'll scan
+                    if (jpeg__at_eof(z->s)) return jpeg__err("no SOF", "Corrupt JPEG");
+                    m = jpeg__get_marker(z);
+                }
+            }
+            z->progressive = jpeg__SOF_progressive(m);
+            if (!jpeg__process_frame_header(z, scan)) return 0;
+            return 1;
+        }
+
+        // decode image to YCbCr format
+        static int jpeg__decode_jpeg_image(jpeg__jpeg* j)
+        {
+            int m;
+            for (m = 0; m < 4; m++) {
+                j->img_comp[m].raw_data = NULL;
+                j->img_comp[m].raw_coeff = NULL;
+            }
+            j->restart_interval = 0;
+            if (!jpeg__decode_jpeg_header(j, JPEG__SCAN_load)) return 0;
+            m = jpeg__get_marker(j);
+            while (!jpeg__EOI(m)) {
+                if (jpeg__SOS(m)) {
+                    if (!jpeg__process_scan_header(j)) return 0;
+                    if (!jpeg__parse_entropy_coded_data(j)) return 0;
+                    if (j->marker == JPEG__MARKER_none) {
+                        // handle 0s at the end of image data from IP Kamera 9060
+                        while (!jpeg__at_eof(j->s)) {
+                            int x = jpeg__get8(j->s);
+                            if (x == 255) {
+                                j->marker = jpeg__get8(j->s);
+                                break;
+                            }
+                        }
+                        // if we reach eof without hitting a marker, jpeg__get_marker() below will fail and we'll eventually return 0
+                    }
+                }
+                else if (jpeg__DNL(m)) {
+                    int Ld = jpeg__get16be(j->s);
+                    jpeg__uint32 NL = jpeg__get16be(j->s);
+                    if (Ld != 4) return jpeg__err("bad DNL len", "Corrupt JPEG");
+                    if (NL != j->s->img_y) return jpeg__err("bad DNL height", "Corrupt JPEG");
+                }
+                else {
+                    if (!jpeg__process_marker(j, m)) return 0;
+                }
+                m = jpeg__get_marker(j);
+            }
+            if (j->progressive)
+                jpeg__jpeg_finish(j);
+            return 1;
+        }
+
+        // static jfif-centered resampling (across block boundaries)
+
+        typedef jpeg_uc* (*resample_row_func)(jpeg_uc* out, jpeg_uc* in0, jpeg_uc* in1,
+            int w, int hs);
+
+#define jpeg__div4(x) ((jpeg_uc) ((x) >> 2))
+
+        static jpeg_uc* resample_row_1(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs)
+        {
+            JPEG_NOTUSED(out);
+            JPEG_NOTUSED(in_far);
+            JPEG_NOTUSED(w);
+            JPEG_NOTUSED(hs);
+            return in_near;
+        }
+
+        static jpeg_uc* jpeg__resample_row_v_2(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs)
+        {
+            // need to generate two samples vertically for every one in input
+            int i;
+            JPEG_NOTUSED(hs);
+            for (i = 0; i < w; ++i)
+                out[i] = jpeg__div4(3 * in_near[i] + in_far[i] + 2);
+            return out;
+        }
+
+        static jpeg_uc* jpeg__resample_row_h_2(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs)
+        {
+            // need to generate two samples horizontally for every one in input
+            int i;
+            jpeg_uc* input = in_near;
+
+            if (w == 1) {
+                // if only one sample, can't do any interpolation
+                out[0] = out[1] = input[0];
+                return out;
+            }
+
+            out[0] = input[0];
+            out[1] = jpeg__div4(input[0] * 3 + input[1] + 2);
+            for (i = 1; i < w - 1; ++i) {
+                int n = 3 * input[i] + 2;
+                out[i * 2 + 0] = jpeg__div4(n + input[i - 1]);
+                out[i * 2 + 1] = jpeg__div4(n + input[i + 1]);
+            }
+            out[i * 2 + 0] = jpeg__div4(input[w - 2] * 3 + input[w - 1] + 2);
+            out[i * 2 + 1] = input[w - 1];
+
+            JPEG_NOTUSED(in_far);
+            JPEG_NOTUSED(hs);
+
+            return out;
+        }
+
+#define jpeg__div16(x) ((jpeg_uc) ((x) >> 4))
+
+        static jpeg_uc* jpeg__resample_row_hv_2(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs)
+        {
+            // need to generate 2x2 samples for every one in input
+            int i, t0, t1;
+            if (w == 1) {
+                out[0] = out[1] = jpeg__div4(3 * in_near[0] + in_far[0] + 2);
+                return out;
+            }
+
+            t1 = 3 * in_near[0] + in_far[0];
+            out[0] = jpeg__div4(t1 + 2);
+            for (i = 1; i < w; ++i) {
+                t0 = t1;
+                t1 = 3 * in_near[i] + in_far[i];
+                out[i * 2 - 1] = jpeg__div16(3 * t0 + t1 + 8);
+                out[i * 2] = jpeg__div16(3 * t1 + t0 + 8);
+            }
+            out[w * 2 - 1] = jpeg__div4(t1 + 2);
+
+            JPEG_NOTUSED(hs);
+
+            return out;
+        }
+
+#if defined(JPEG_SSE2) || defined(JPEG_NEON)
+        static jpeg_uc* jpeg__resample_row_hv_2_simd(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs)
+        {
+            // need to generate 2x2 samples for every one in input
+            int i = 0, t0, t1;
+
+            if (w == 1) {
+                out[0] = out[1] = jpeg__div4(3 * in_near[0] + in_far[0] + 2);
+                return out;
+            }
+
+            t1 = 3 * in_near[0] + in_far[0];
+            // process groups of 8 pixels for as long as we can.
+            // note we can't handle the last pixel in a row in this loop
+            // because we need to handle the filter boundary conditions.
+            for (; i < ((w - 1) & ~7); i += 8) {
+#if defined(JPEG_SSE2)
+                // load and perform the vertical filtering pass
+                // this uses 3*x + y = 4*x + (y - x)
+                __m128i zero = _mm_setzero_si128();
+                __m128i farb = _mm_loadl_epi64((__m128i*) (in_far + i));
+                __m128i nearb = _mm_loadl_epi64((__m128i*) (in_near + i));
+                __m128i farw = _mm_unpacklo_epi8(farb, zero);
+                __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
+                __m128i diff = _mm_sub_epi16(farw, nearw);
+                __m128i nears = _mm_slli_epi16(nearw, 2);
+                __m128i curr = _mm_add_epi16(nears, diff); // current row
+
+                // horizontal filter works the same based on shifted vers of current
+                // row. "prev" is current row shifted right by 1 pixel; we need to
+                // insert the previous pixel value (from t1).
+                // "next" is current row shifted left by 1 pixel, with first pixel
+                // of next block of 8 pixels added in.
+                __m128i prv0 = _mm_slli_si128(curr, 2);
+                __m128i nxt0 = _mm_srli_si128(curr, 2);
+                __m128i prev = _mm_insert_epi16(prv0, t1, 0);
+                __m128i next = _mm_insert_epi16(nxt0, 3 * in_near[i + 8] + in_far[i + 8], 7);
+
+                // horizontal filter, polyphase implementation since it's convenient:
+                // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+                // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+                // note the shared term.
+                __m128i bias = _mm_set1_epi16(8);
+                __m128i curs = _mm_slli_epi16(curr, 2);
+                __m128i prvd = _mm_sub_epi16(prev, curr);
+                __m128i nxtd = _mm_sub_epi16(next, curr);
+                __m128i curb = _mm_add_epi16(curs, bias);
+                __m128i even = _mm_add_epi16(prvd, curb);
+                __m128i odd = _mm_add_epi16(nxtd, curb);
+
+                // interleave even and odd pixels, then undo scaling.
+                __m128i int0 = _mm_unpacklo_epi16(even, odd);
+                __m128i int1 = _mm_unpackhi_epi16(even, odd);
+                __m128i de0 = _mm_srli_epi16(int0, 4);
+                __m128i de1 = _mm_srli_epi16(int1, 4);
+
+                // pack and write output
+                __m128i outv = _mm_packus_epi16(de0, de1);
+                _mm_storeu_si128((__m128i*) (out + i * 2), outv);
+#elif defined(JPEG_NEON)
+                // load and perform the vertical filtering pass
+                // this uses 3*x + y = 4*x + (y - x)
+                uint8x8_t farb = vld1_u8(in_far + i);
+                uint8x8_t nearb = vld1_u8(in_near + i);
+                int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
+                int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
+                int16x8_t curr = vaddq_s16(nears, diff); // current row
+
+                // horizontal filter works the same based on shifted vers of current
+                // row. "prev" is current row shifted right by 1 pixel; we need to
+                // insert the previous pixel value (from t1).
+                // "next" is current row shifted left by 1 pixel, with first pixel
+                // of next block of 8 pixels added in.
+                int16x8_t prv0 = vextq_s16(curr, curr, 7);
+                int16x8_t nxt0 = vextq_s16(curr, curr, 1);
+                int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
+                int16x8_t next = vsetq_lane_s16(3 * in_near[i + 8] + in_far[i + 8], nxt0, 7);
+
+                // horizontal filter, polyphase implementation since it's convenient:
+                // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+                // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+                // note the shared term.
+                int16x8_t curs = vshlq_n_s16(curr, 2);
+                int16x8_t prvd = vsubq_s16(prev, curr);
+                int16x8_t nxtd = vsubq_s16(next, curr);
+                int16x8_t even = vaddq_s16(curs, prvd);
+                int16x8_t odd = vaddq_s16(curs, nxtd);
+
+                // undo scaling and round, then store with even/odd phases interleaved
+                uint8x8x2_t o;
+                o.val[0] = vqrshrun_n_s16(even, 4);
+                o.val[1] = vqrshrun_n_s16(odd, 4);
+                vst2_u8(out + i * 2, o);
+#endif
+
+                // "previous" value for next iter
+                t1 = 3 * in_near[i + 7] + in_far[i + 7];
+            }
+
+            t0 = t1;
+            t1 = 3 * in_near[i] + in_far[i];
+            out[i * 2] = jpeg__div16(3 * t1 + t0 + 8);
+
+            for (++i; i < w; ++i) {
+                t0 = t1;
+                t1 = 3 * in_near[i] + in_far[i];
+                out[i * 2 - 1] = jpeg__div16(3 * t0 + t1 + 8);
+                out[i * 2] = jpeg__div16(3 * t1 + t0 + 8);
+            }
+            out[w * 2 - 1] = jpeg__div4(t1 + 2);
+
+            JPEG_NOTUSED(hs);
+
+            return out;
+        }
+#endif
+
+        static jpeg_uc* jpeg__resample_row_generic(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs)
+        {
+            // resample with nearest-neighbor
+            int i, j;
+            JPEG_NOTUSED(in_far);
+            for (i = 0; i < w; ++i)
+                for (j = 0; j < hs; ++j)
+                    out[i * hs + j] = in_near[i];
+            return out;
+        }
+
+        // this is a reduced-precision calculation of YCbCr-to-RGB introduced
+        // to make sure the code produces the same results in both SIMD and scalar
+#define jpeg__float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
+        static void jpeg__YCbCr_to_RGB_row(jpeg_uc* out, const jpeg_uc* y, const jpeg_uc* pcb, const jpeg_uc* pcr, int count, int step)
+        {
+            int i;
+            for (i = 0; i < count; ++i) {
+                int y_fixed = (y[i] << 20) + (1 << 19); // rounding
+                int r, g, b;
+                int cr = pcr[i] - 128;
+                int cb = pcb[i] - 128;
+                r = y_fixed + cr * jpeg__float2fixed(1.40200f);
+                g = y_fixed + (cr * -jpeg__float2fixed(0.71414f)) + ((cb * -jpeg__float2fixed(0.34414f)) & 0xffff0000);
+                b = y_fixed + cb * jpeg__float2fixed(1.77200f);
+                r >>= 20;
+                g >>= 20;
+                b >>= 20;
+                if ((unsigned)r > 255) { if (r < 0) r = 0; else r = 255; }
+                if ((unsigned)g > 255) { if (g < 0) g = 0; else g = 255; }
+                if ((unsigned)b > 255) { if (b < 0) b = 0; else b = 255; }
+                out[0] = (jpeg_uc)r;
+                out[1] = (jpeg_uc)g;
+                out[2] = (jpeg_uc)b;
+                out[3] = 255;
+                out += step;
+            }
+        }
+
+#if defined(JPEG_SSE2) || defined(JPEG_NEON)
+        static void jpeg__YCbCr_to_RGB_simd(jpeg_uc* out, jpeg_uc const* y, jpeg_uc const* pcb, jpeg_uc const* pcr, int count, int step)
+        {
+            int i = 0;
+
+#ifdef JPEG_SSE2
+            // step == 3 is pretty ugly on the final interleave, and i'm not convinced
+            // it's useful in practice (you wouldn't use it for textures, for example).
+            // so just accelerate step == 4 case.
+            if (step == 4) {
+                // this is a fairly straightforward implementation and not super-optimized.
+                __m128i signflip = _mm_set1_epi8(-0x80);
+                __m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f));
+                __m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f));
+                __m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f));
+                __m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f));
+                __m128i y_bias = _mm_set1_epi8((char)(unsigned char)128);
+                __m128i xw = _mm_set1_epi16(255); // alpha channel
+
+                for (; i + 7 < count; i += 8) {
+                    // load
+                    __m128i y_bytes = _mm_loadl_epi64((__m128i*) (y + i));
+                    __m128i cr_bytes = _mm_loadl_epi64((__m128i*) (pcr + i));
+                    __m128i cb_bytes = _mm_loadl_epi64((__m128i*) (pcb + i));
+                    __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
+                    __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
+
+                    // unpack to short (and left-shift cr, cb by 8)
+                    __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
+                    __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
+                    __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
+
+                    // color transform
+                    __m128i yws = _mm_srli_epi16(yw, 4);
+                    __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
+                    __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
+                    __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
+                    __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
+                    __m128i rws = _mm_add_epi16(cr0, yws);
+                    __m128i gwt = _mm_add_epi16(cb0, yws);
+                    __m128i bws = _mm_add_epi16(yws, cb1);
+                    __m128i gws = _mm_add_epi16(gwt, cr1);
+
+                    // descale
+                    __m128i rw = _mm_srai_epi16(rws, 4);
+                    __m128i bw = _mm_srai_epi16(bws, 4);
+                    __m128i gw = _mm_srai_epi16(gws, 4);
+
+                    // back to byte, set up for transpose
+                    __m128i brb = _mm_packus_epi16(rw, bw);
+                    __m128i gxb = _mm_packus_epi16(gw, xw);
+
+                    // transpose to interleave channels
+                    __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
+                    __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
+                    __m128i o0 = _mm_unpacklo_epi16(t0, t1);
+                    __m128i o1 = _mm_unpackhi_epi16(t0, t1);
+
+                    // store
+                    _mm_storeu_si128((__m128i*) (out + 0), o0);
+                    _mm_storeu_si128((__m128i*) (out + 16), o1);
+                    out += 32;
+                }
+            }
+#endif
+
+#ifdef JPEG_NEON
+            // in this version, step=3 support would be easy to add. but is there demand?
+            if (step == 4) {
+                // this is a fairly straightforward implementation and not super-optimized.
+                uint8x8_t signflip = vdup_n_u8(0x80);
+                int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f));
+                int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f));
+                int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f));
+                int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f));
+
+                for (; i + 7 < count; i += 8) {
+                    // load
+                    uint8x8_t y_bytes = vld1_u8(y + i);
+                    uint8x8_t cr_bytes = vld1_u8(pcr + i);
+                    uint8x8_t cb_bytes = vld1_u8(pcb + i);
+                    int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
+                    int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
+
+                    // expand to s16
+                    int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
+                    int16x8_t crw = vshll_n_s8(cr_biased, 7);
+                    int16x8_t cbw = vshll_n_s8(cb_biased, 7);
+
+                    // color transform
+                    int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
+                    int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
+                    int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
+                    int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
+                    int16x8_t rws = vaddq_s16(yws, cr0);
+                    int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
+                    int16x8_t bws = vaddq_s16(yws, cb1);
+
+                    // undo scaling, round, convert to byte
+                    uint8x8x4_t o;
+                    o.val[0] = vqrshrun_n_s16(rws, 4);
+                    o.val[1] = vqrshrun_n_s16(gws, 4);
+                    o.val[2] = vqrshrun_n_s16(bws, 4);
+                    o.val[3] = vdup_n_u8(255);
+
+                    // store, interleaving r/g/b/a
+                    vst4_u8(out, o);
+                    out += 8 * 4;
+                }
+            }
+#endif
+
+            for (; i < count; ++i) {
+                int y_fixed = (y[i] << 20) + (1 << 19); // rounding
+                int r, g, b;
+                int cr = pcr[i] - 128;
+                int cb = pcb[i] - 128;
+                r = y_fixed + cr * jpeg__float2fixed(1.40200f);
+                g = y_fixed + cr * -jpeg__float2fixed(0.71414f) + ((cb * -jpeg__float2fixed(0.34414f)) & 0xffff0000);
+                b = y_fixed + cb * jpeg__float2fixed(1.77200f);
+                r >>= 20;
+                g >>= 20;
+                b >>= 20;
+                if ((unsigned)r > 255) { if (r < 0) r = 0; else r = 255; }
+                if ((unsigned)g > 255) { if (g < 0) g = 0; else g = 255; }
+                if ((unsigned)b > 255) { if (b < 0) b = 0; else b = 255; }
+                out[0] = (jpeg_uc)r;
+                out[1] = (jpeg_uc)g;
+                out[2] = (jpeg_uc)b;
+                out[3] = 255;
+                out += step;
+            }
+        }
+#endif
+
+        // set up the kernels
+        static void jpeg__setup_jpeg(jpeg__jpeg* j)
+        {
+            j->idct_block_kernel = jpeg__idct_block;
+            j->YCbCr_to_RGB_kernel = jpeg__YCbCr_to_RGB_row;
+            j->resample_row_hv_2_kernel = jpeg__resample_row_hv_2;
+
+#ifdef JPEG_SSE2
+            if (jpeg__sse2_available()) {
+                j->idct_block_kernel = jpeg__idct_simd;
+                j->YCbCr_to_RGB_kernel = jpeg__YCbCr_to_RGB_simd;
+                j->resample_row_hv_2_kernel = jpeg__resample_row_hv_2_simd;
+            }
+#endif
+
+#ifdef JPEG_NEON
+            j->idct_block_kernel = jpeg__idct_simd;
+            j->YCbCr_to_RGB_kernel = jpeg__YCbCr_to_RGB_simd;
+            j->resample_row_hv_2_kernel = jpeg__resample_row_hv_2_simd;
+#endif
+        }
+
+        // clean up the temporary component buffers
+        static void jpeg__cleanup_jpeg(jpeg__jpeg* j)
+        {
+            jpeg__free_jpeg_components(j, j->s->img_n, 0);
+        }
+
+        typedef struct
+        {
+            resample_row_func resample;
+            jpeg_uc* line0, * line1;
+            int hs, vs;   // expansion factor in each axis
+            int w_lores; // horizontal pixels pre-expansion
+            int ystep;   // how far through vertical expansion we are
+            int ypos;    // which pre-expansion row we're on
+        } jpeg__resample;
+
+        // fast 0..255 * 0..255 => 0..255 rounded multiplication
+        static jpeg_uc jpeg__blinn_8x8(jpeg_uc x, jpeg_uc y)
+        {
+            unsigned int t = x * y + 128;
+            return (jpeg_uc)((t + (t >> 8)) >> 8);
+        }
+
+        static jpeg_uc* load_jpeg_image(jpeg__jpeg* z, int* out_x, int* out_y, int* comp, int req_comp)
+        {
+            int n, decode_n, is_rgb;
+            z->s->img_n = 0; // make jpeg__cleanup_jpeg safe
+
+            // validate req_comp
+            if (req_comp < 0 || req_comp > 4) return jpeg__errpuc("bad req_comp", "Internal error");
+
+            // load a jpeg image from whichever source, but leave in YCbCr format
+            if (!jpeg__decode_jpeg_image(z)) { jpeg__cleanup_jpeg(z); return NULL; }
+
+            // determine actual number of components to generate
+            n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
+
+            is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
+
+            if (z->s->img_n == 3 && n < 3 && !is_rgb)
+                decode_n = 1;
+            else
+                decode_n = z->s->img_n;
+
+            // resample and color-convert
+            {
+                int k;
+                unsigned int i, j;
+                jpeg_uc* output;
+                jpeg_uc* coutput[4] = { NULL, NULL, NULL, NULL };
+
+                jpeg__resample res_comp[4];
+
+                for (k = 0; k < decode_n; ++k) {
+                    jpeg__resample* r = &res_comp[k];
+
+                    // allocate line buffer big enough for upsampling off the edges
+                    // with upsample factor of 4
+                    z->img_comp[k].linebuf = (jpeg_uc*)jpeg__malloc(z->s->img_x + 3);
+                    if (!z->img_comp[k].linebuf) { jpeg__cleanup_jpeg(z); return jpeg__errpuc("outofmem", "Out of memory"); }
+
+                    r->hs = z->img_h_max / z->img_comp[k].h;
+                    r->vs = z->img_v_max / z->img_comp[k].v;
+                    r->ystep = r->vs >> 1;
+                    r->w_lores = (z->s->img_x + r->hs - 1) / r->hs;
+                    r->ypos = 0;
+                    r->line0 = r->line1 = z->img_comp[k].data;
+
+                    if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
+                    else if (r->hs == 1 && r->vs == 2) r->resample = jpeg__resample_row_v_2;
+                    else if (r->hs == 2 && r->vs == 1) r->resample = jpeg__resample_row_h_2;
+                    else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
+                    else                               r->resample = jpeg__resample_row_generic;
+                }
+
+                // can't error after this so, this is safe
+                output = (jpeg_uc*)jpeg__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
+                if (!output) { jpeg__cleanup_jpeg(z); return jpeg__errpuc("outofmem", "Out of memory"); }
+
+                // now go ahead and resample
+                for (j = 0; j < z->s->img_y; ++j) {
+                    jpeg_uc* out = output + n * z->s->img_x * j;
+                    for (k = 0; k < decode_n; ++k) {
+                        jpeg__resample* r = &res_comp[k];
+                        int y_bot = r->ystep >= (r->vs >> 1);
+                        coutput[k] = r->resample(z->img_comp[k].linebuf,
+                            y_bot ? r->line1 : r->line0,
+                            y_bot ? r->line0 : r->line1,
+                            r->w_lores, r->hs);
+                        if (++r->ystep >= r->vs) {
+                            r->ystep = 0;
+                            r->line0 = r->line1;
+                            if (++r->ypos < z->img_comp[k].y)
+                                r->line1 += z->img_comp[k].w2;
+                        }
+                    }
+                    if (n >= 3) {
+                        jpeg_uc* y = coutput[0];
+                        if (z->s->img_n == 3) {
+                            if (is_rgb) {
+                                for (i = 0; i < z->s->img_x; ++i) {
+                                    out[0] = y[i];
+                                    out[1] = coutput[1][i];
+                                    out[2] = coutput[2][i];
+                                    out[3] = 255;
+                                    out += n;
+                                }
+                            }
+                            else {
+                                z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+                            }
+                        }
+                        else if (z->s->img_n == 4) {
+                            if (z->app14_color_transform == 0) { // CMYK
+                                for (i = 0; i < z->s->img_x; ++i) {
+                                    jpeg_uc m = coutput[3][i];
+                                    out[0] = jpeg__blinn_8x8(coutput[0][i], m);
+                                    out[1] = jpeg__blinn_8x8(coutput[1][i], m);
+                                    out[2] = jpeg__blinn_8x8(coutput[2][i], m);
+                                    out[3] = 255;
+                                    out += n;
+                                }
+                            }
+                            else if (z->app14_color_transform == 2) { // YCCK
+                                z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+                                for (i = 0; i < z->s->img_x; ++i) {
+                                    jpeg_uc m = coutput[3][i];
+                                    out[0] = jpeg__blinn_8x8(255 - out[0], m);
+                                    out[1] = jpeg__blinn_8x8(255 - out[1], m);
+                                    out[2] = jpeg__blinn_8x8(255 - out[2], m);
+                                    out += n;
+                                }
+                            }
+                            else { // YCbCr + alpha?  Ignore the fourth channel for now
+                                z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+                            }
+                        }
+                        else
+                            for (i = 0; i < z->s->img_x; ++i) {
+                                out[0] = out[1] = out[2] = y[i];
+                                out[3] = 255; // not used if n==3
+                                out += n;
+                            }
+                    }
+                    else {
+                        if (is_rgb) {
+                            if (n == 1)
+                                for (i = 0; i < z->s->img_x; ++i)
+                                    *out++ = jpeg__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+                            else {
+                                for (i = 0; i < z->s->img_x; ++i, out += 2) {
+                                    out[0] = jpeg__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+                                    out[1] = 255;
+                                }
+                            }
+                        }
+                        else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
+                            for (i = 0; i < z->s->img_x; ++i) {
+                                jpeg_uc m = coutput[3][i];
+                                jpeg_uc r = jpeg__blinn_8x8(coutput[0][i], m);
+                                jpeg_uc g = jpeg__blinn_8x8(coutput[1][i], m);
+                                jpeg_uc b = jpeg__blinn_8x8(coutput[2][i], m);
+                                out[0] = jpeg__compute_y(r, g, b);
+                                out[1] = 255;
+                                out += n;
+                            }
+                        }
+                        else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
+                            for (i = 0; i < z->s->img_x; ++i) {
+                                out[0] = jpeg__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
+                                out[1] = 255;
+                                out += n;
+                            }
+                        }
+                        else {
+                            jpeg_uc* y = coutput[0];
+                            if (n == 1)
+                                for (i = 0; i < z->s->img_x; ++i) out[i] = y[i];
+                            else
+                                for (i = 0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; }
+                        }
+                    }
+                }
+                jpeg__cleanup_jpeg(z);
+                *out_x = z->s->img_x;
+                *out_y = z->s->img_y;
+                if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
+                return output;
+            }
+        }
+
+        static void* jpeg__jpeg_load(jpeg__context* s, int* x, int* y, int* comp, int req_comp, jpeg__result_info* ri)
+        {
+            unsigned char* result;
+            jpeg__jpeg* j = (jpeg__jpeg*)jpeg__malloc(sizeof(jpeg__jpeg));
+            JPEG_NOTUSED(ri);
+            j->s = s;
+            jpeg__setup_jpeg(j);
+            result = load_jpeg_image(j, x, y, comp, req_comp);
+            JPEG_FREE(j);
+            return result;
+        }
+
+        static int jpeg__jpeg_test(jpeg__context* s)
+        {
+            int r;
+            jpeg__jpeg* j = (jpeg__jpeg*)jpeg__malloc(sizeof(jpeg__jpeg));
+            j->s = s;
+            jpeg__setup_jpeg(j);
+            r = jpeg__decode_jpeg_header(j, JPEG__SCAN_type);
+            jpeg__rewind(s);
+            JPEG_FREE(j);
+            return r;
+        }
+
+        static int jpeg__jpeg_info_raw(jpeg__jpeg* j, int* x, int* y, int* comp)
+        {
+            if (!jpeg__decode_jpeg_header(j, JPEG__SCAN_header)) {
+                jpeg__rewind(j->s);
+                return 0;
+            }
+            if (x) *x = j->s->img_x;
+            if (y) *y = j->s->img_y;
+            if (comp) *comp = j->s->img_n >= 3 ? 3 : 1;
+            return 1;
+        }
+
+        static int jpeg__jpeg_info(jpeg__context* s, int* x, int* y, int* comp)
+        {
+            int result;
+            jpeg__jpeg* j = (jpeg__jpeg*)(jpeg__malloc(sizeof(jpeg__jpeg)));
+            j->s = s;
+            result = jpeg__jpeg_info_raw(j, x, y, comp);
+            JPEG_FREE(j);
+            return result;
+        }
+
+        //------------------------------------------------------------------------
+
+        static int jpeg__stdio_read(void* user, char* data, int size)
+        {
+            InputMemoryStream* stream = (InputMemoryStream*)user;
+            return (int)stream->Read(size, data);
+        }
+
+        static void jpeg__stdio_skip(void* user, int n)
+        {
+            InputMemoryStream* stream = (InputMemoryStream*)user;
+            stream->Skip(n);
+        }
+
+        static int jpeg__stdio_eof(void* user)
+        {
+            InputMemoryStream* stream = (InputMemoryStream*)user;
+            return stream->Pos() == stream->Size() ? 1 : 0;
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageJpegLoader::ImageJpegLoader(const ImageLoaderParam& param)
+            : ImageLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatRgb24;
+        }
+
+        bool ImageJpegLoader::FromStream()
+        {
+            int x, y, comp;
+            jpeg__context s;
+            s.io.eof = jpeg__stdio_eof;
+            s.io.read = jpeg__stdio_read;
+            s.io.skip = jpeg__stdio_skip;
+            s.io_user_data = &_stream;
+            s.buflen = sizeof(s.buffer_start);
+            s.read_from_callbacks = 1;
+            s.callback_already_read = 0;
+            s.img_buffer = s.img_buffer_original = s.buffer_start;
+            jpeg__refill_buffer(&s);
+            s.img_buffer_original_end = s.img_buffer_end;
+            jpeg__result_info ri;
+            uint8_t * data = (uint8_t*)jpeg__jpeg_load(&s, &x, &y, &comp, 3, &ri);
+            if (data)
+            {
+                size_t stride = 3 * x;
+                _image.Recreate(x, y, (Image::Format)_param.format);
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8:
+                    Base::RgbToGray(data, x, y, stride, _image.data, _image.stride);
+                    break;
+                case SimdPixelFormatBgr24:
+                    Base::BgrToRgb(data, x, y, stride, _image.data, _image.stride);
+                    break;
+                case SimdPixelFormatBgra32:
+                    Base::RgbToBgra(data, x, y, stride, _image.data, _image.stride, 0xFF);
+                    break;
+                case SimdPixelFormatRgb24:
+                    Base::Copy(data, stride, x, y, 3, _image.data, _image.stride);
+                    break;
+                case SimdPixelFormatRgba32:
+                    Base::BgrToBgra(data, x, y, stride, _image.data, _image.stride, 0xFF);
+                    break;
+                default: 
+                    break;
+                }
+                JPEG_FREE(data);
+                return true;
+            }
+            return false;
+        }
+    }
+}
diff --git a/3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp b/3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp
new file mode 100644
index 0000000000..03ae0fab6f
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp
@@ -0,0 +1,1317 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdImageSavePng.h"
+#include "Simd/SimdArray.h"
+#include "Simd/SimdCpu.h"
+#include "Simd/SimdBase.h"
+
+namespace Simd
+{
+    namespace Base
+    {
+#define PNG_MALLOC(sz)           malloc(sz)
+#define PNG_REALLOC(p,newsz)     realloc(p,newsz)
+#define PNG_FREE(p)              free(p)
+
+#define PNG__BYTECAST(x)  ((uint8_t) ((x) & 255))  // truncate int to byte without warnings
+
+        SIMD_INLINE int PngError(const char* str, const char* stub)
+        {
+            std::cout << "PNG load error: " << str << ", " << stub << "!" << std::endl;
+            return 0;
+        }
+
+        SIMD_INLINE uint8_t * PngErrorPtr(const char* str, const char* stub)
+        {
+            return (uint8_t*)(size_t)(PngError(str, stub) ? NULL : NULL);
+        }
+
+        static void* png__malloc(size_t size)
+        {
+            return PNG_MALLOC(size);
+        }
+
+        struct PngContext
+        {
+            uint32_t img_x, img_y;
+            int img_n, img_out_n;
+        };
+
+        static int png__addsizes_valid(int a, int b)
+        {
+            if (b < 0) return 0;
+            // now 0 <= b <= INT_MAX, hence also
+            // 0 <= INT_MAX - b <= INTMAX.
+            // And "a + b <= INT_MAX" (which might overflow) is the
+            // same as a <= INT_MAX - b (no overflow)
+            return a <= INT_MAX - b;
+        }
+
+        // returns 1 if the product is valid, 0 on overflow.
+        // negative factors are considered invalid.
+        static int png__mul2sizes_valid(int a, int b)
+        {
+            if (a < 0 || b < 0) return 0;
+            if (b == 0) return 1; // mul-by-0 is always safe
+            // portable way to check for no overflows in a*b
+            return a <= INT_MAX / b;
+        }
+
+        // returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
+        static int png__mad2sizes_valid(int a, int b, int add)
+        {
+            return png__mul2sizes_valid(a, b) && png__addsizes_valid(a * b, add);
+        }
+
+        // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
+        static int png__mad3sizes_valid(int a, int b, int c, int add)
+        {
+            return png__mul2sizes_valid(a, b) && png__mul2sizes_valid(a * b, c) &&
+                png__addsizes_valid(a * b * c, add);
+        }
+
+        // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
+        static int png__mad4sizes_valid(int a, int b, int c, int d, int add)
+        {
+            return png__mul2sizes_valid(a, b) && png__mul2sizes_valid(a * b, c) &&
+                png__mul2sizes_valid(a * b * c, d) && png__addsizes_valid(a * b * c * d, add);
+        }
+
+        // mallocs with size overflow checking
+        static void* png__malloc_mad2(int a, int b, int add)
+        {
+            if (!png__mad2sizes_valid(a, b, add)) return NULL;
+            return png__malloc(a * b + add);
+        }
+
+        static void* png__malloc_mad3(int a, int b, int c, int add)
+        {
+            if (!png__mad3sizes_valid(a, b, c, add)) return NULL;
+            return png__malloc(a * b * c + add);
+        }
+
+        static void* png__malloc_mad4(int a, int b, int c, int d, int add)
+        {
+            if (!png__mad4sizes_valid(a, b, c, d, add)) return NULL;
+            return png__malloc(a * b * c * d + add);
+        }
+
+        static uint8_t png__compute_y(int r, int g, int b)
+        {
+            return (uint8_t)(((r * 77) + (g * 150) + (29 * b)) >> 8);
+        }
+
+        static uint8_t* png__convert_format(uint8_t* data, int img_n, int req_comp, unsigned int x, unsigned int y)
+        {
+            int i, j;
+            uint8_t* good;
+
+            if (req_comp == img_n) 
+                return data;
+            assert(req_comp >= 1 && req_comp <= 4);
+
+            good = (uint8_t*)png__malloc_mad3(req_comp, x, y, 0);
+            if (good == NULL) 
+            {
+                PNG_FREE(data);
+                return PngErrorPtr("outofmem", "Out of memory");
+            }
+
+            for (j = 0; j < (int)y; ++j) 
+            {
+                uint8_t* src = data + j * x * img_n;
+                uint8_t* dest = good + j * x * req_comp;
+
+#define PNG__COMBO(a,b)  ((a)*8+(b))
+#define PNG__CASE(a,b)   case PNG__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+                // convert source image with img_n components to one with req_comp components;
+                // avoid switch per pixel, so use switch per scanline and massive macros
+                switch (PNG__COMBO(img_n, req_comp)) 
+                {
+                    PNG__CASE(1, 2) { dest[0] = src[0]; dest[1] = 255; } break;
+                    PNG__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(1, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = 255; } break;
+                    PNG__CASE(2, 1) { dest[0] = src[0]; } break;
+                    PNG__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(2, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = src[1]; } break;
+                    PNG__CASE(3, 4) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; dest[3] = 255; } break;
+                    PNG__CASE(3, 1) { dest[0] = png__compute_y(src[0], src[1], src[2]); } break;
+                    PNG__CASE(3, 2) { dest[0] = png__compute_y(src[0], src[1], src[2]); dest[1] = 255; } break;
+                    PNG__CASE(4, 1) { dest[0] = png__compute_y(src[0], src[1], src[2]); } break;
+                    PNG__CASE(4, 2) { dest[0] = png__compute_y(src[0], src[1], src[2]); dest[1] = src[3]; } break;
+                    PNG__CASE(4, 3) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; } break;
+                default: assert(0); PNG_FREE(data); PNG_FREE(good); return PngErrorPtr("unsupported", "Unsupported format conversion");
+                }
+#undef PNG__CASE
+            }
+
+            PNG_FREE(data);
+            return good;
+        }
+
+        static uint16_t png__compute_y_16(int r, int g, int b)
+        {
+            return (uint16_t)(((r * 77) + (g * 150) + (29 * b)) >> 8);
+        }
+
+        static uint16_t* png__convert_format16(uint16_t* data, int img_n, int req_comp, unsigned int x, unsigned int y)
+        {
+            int i, j;
+            uint16_t* good;
+
+            if (req_comp == img_n) 
+                return data;
+            assert(req_comp >= 1 && req_comp <= 4);
+
+            good = (uint16_t*)png__malloc(req_comp * x * y * 2);
+            if (good == NULL) 
+            {
+                PNG_FREE(data);
+                return (uint16_t*)PngErrorPtr("outofmem", "Out of memory");
+            }
+
+            for (j = 0; j < (int)y; ++j) 
+            {
+                uint16_t* src = data + j * x * img_n;
+                uint16_t* dest = good + j * x * req_comp;
+
+#define PNG__COMBO(a,b)  ((a)*8+(b))
+#define PNG__CASE(a,b)   case PNG__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+                // convert source image with img_n components to one with req_comp components;
+                // avoid switch per pixel, so use switch per scanline and massive macros
+                switch (PNG__COMBO(img_n, req_comp)) {
+                    PNG__CASE(1, 2) { dest[0] = src[0]; dest[1] = 0xffff; } break;
+                    PNG__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(1, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = 0xffff; } break;
+                    PNG__CASE(2, 1) { dest[0] = src[0]; } break;
+                    PNG__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(2, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = src[1]; } break;
+                    PNG__CASE(3, 4) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; dest[3] = 0xffff; } break;
+                    PNG__CASE(3, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break;
+                    PNG__CASE(3, 2) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); dest[1] = 0xffff; } break;
+                    PNG__CASE(4, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break;
+                    PNG__CASE(4, 2) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); dest[1] = src[3]; } break;
+                    PNG__CASE(4, 3) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; } break;
+                default: assert(0); PNG_FREE(data); PNG_FREE(good); return (uint16_t*)PngErrorPtr("unsupported", "Unsupported format conversion");
+                }
+#undef PNG__CASE
+            }
+
+            PNG_FREE(data);
+            return good;
+        }
+
+        namespace Zlib
+        {
+            const size_t ZFAST_BITS = 9;
+            const size_t ZFAST_SIZE = 1 << ZFAST_BITS;
+            const size_t ZFAST_MASK = ZFAST_SIZE - 1;
+
+            struct Zhuffman
+            {
+                uint16_t fast[ZFAST_SIZE];
+                uint16_t firstCode[16];
+                int maxCode[17];
+                uint16_t firstSymbol[16];
+                uint8_t  size[288];
+                uint16_t value[288];
+
+                bool Build(const uint8_t* sizelist, int num)
+                {
+                    int i, k = 0;
+                    int code, nextCode[16], sizes[17];
+
+                    memset(sizes, 0, sizeof(sizes));
+                    memset(fast, 0, sizeof(fast));
+                    for (i = 0; i < num; ++i)
+                        ++sizes[sizelist[i]];
+                    sizes[0] = 0;
+                    for (i = 1; i < 16; ++i)
+                        if (sizes[i] > (1 << i))
+                            return PngError("bad sizes", "Corrupt PNG");
+                    code = 0;
+                    for (i = 1; i < 16; ++i)
+                    {
+                        nextCode[i] = code;
+                        firstCode[i] = (uint16_t)code;
+                        firstSymbol[i] = (uint16_t)k;
+                        code = (code + sizes[i]);
+                        if (sizes[i] && code - 1 >= (1 << i))
+                            return PngError("bad codelengths", "Corrupt PNG");
+                        maxCode[i] = code << (16 - i); // preshift for inner loop
+                        code <<= 1;
+                        k += sizes[i];
+                    }
+                    maxCode[16] = 0x10000; // sentinel
+                    for (i = 0; i < num; ++i)
+                    {
+                        int s = sizelist[i];
+                        if (s)
+                        {
+                            int c = nextCode[s] - firstCode[s] + firstSymbol[s];
+                            uint16_t fastv = (uint16_t)((s << 9) | i);
+                            size[c] = (uint8_t)s;
+                            value[c] = (uint16_t)i;
+                            if (s <= (int)ZFAST_BITS)
+                            {
+                                int j = ZlibBitRev(nextCode[s], s);
+                                while (j < (1 << ZFAST_BITS))
+                                {
+                                    fast[j] = fastv;
+                                    j += (1 << s);
+                                }
+                            }
+                            ++nextCode[s];
+                        }
+                    }
+                    return 1;
+                }
+            };
+
+            SIMD_INLINE static int BitRev16(int n)
+            {
+                n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1);
+                n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2);
+                n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4);
+                n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8);
+                return n;
+            }
+
+            static int ZhuffmanDecode(InputMemoryStream& is, const Zhuffman& z)
+            {
+                int b, s;
+                if (is.BitCount() < 16)
+                {
+                    if (is.Eof())
+                        return -1;
+                    is.FillBits();
+                }
+                b = z.fast[is.BitBuffer() & ZFAST_MASK];
+                if (b)
+                {
+                    s = b >> 9;
+                    is.BitBuffer() >>= s;
+                    is.BitCount() -= s;
+                    return b & 511;
+                }
+                else
+                {
+                    int k;
+                    k = BitRev16(is.BitBuffer());
+                    for (s = ZFAST_BITS + 1; k >= z.maxCode[s]; ++s);
+                    if (s >= 16)
+                        return -1;
+                    b = (k >> (16 - s)) - z.firstCode[s] + z.firstSymbol[s];
+                    if (b >= sizeof(z.size) || z.size[b] != s)
+                        return -1;
+                    is.BitBuffer() >>= s;
+                    is.BitCount() -= s;
+                    return z.value[b];
+                }
+            }
+
+            static int ParseHuffmanBlock(InputMemoryStream& is, const Zhuffman& zLength, const Zhuffman& zDistance, OutputMemoryStream& os)
+            {
+                static const int zlengthBase[31] = { 3,4,5,6,7,8,9,10,11,13, 15,17,19,23,27,31,35,43,51,59, 67,83,99,115,131,163,195,227,258,0,0 };
+                static const int zlengthExtra[31] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
+                static const int zdistBase[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, 257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0 };
+                static const int zdistExtra[32] = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
+
+                uint8_t* beg = os.Data(), * dst = os.Current(), * end = beg + os.Capacity();
+                for (;;)
+                {
+                    ptrdiff_t z = ZhuffmanDecode(is, zLength);
+                    if (z < 256)
+                    {
+                        if (z < 0)
+                            return PngError("bad huffman code", "Corrupt PNG");
+                        if (dst >= end)
+                        {
+                            os.Reserve(end - beg + 1);
+                            beg = os.Data();
+                            dst = os.Current();
+                            end = beg + os.Capacity();
+                        }
+                        *dst++ = (uint8_t)z;
+                    }
+                    else
+                    {
+                        uint8_t* p;
+                        ptrdiff_t len, dist;
+                        if (z == 256)
+                        {
+                            os.Seek(dst - beg);
+                            return 1;
+                        }
+                        z -= 257;
+                        len = zlengthBase[z];
+                        if (zlengthExtra[z])
+                            len += is.ReadBits(zlengthExtra[z]);
+                        z = ZhuffmanDecode(is, zDistance);
+                        if (z < 0)
+                            return PngError("bad huffman code", "Corrupt PNG");
+                        dist = zdistBase[z];
+                        if (zdistExtra[z])
+                            dist += is.ReadBits(zdistExtra[z]);
+                        if (dst - beg < dist)
+                            return PngError("bad dist", "Corrupt PNG");
+                        if (dst + len > end)
+                        {
+                            os.Reserve(end - beg + 1);
+                            beg = os.Data();
+                            dst = os.Current();
+                            end = beg + os.Capacity();
+                        }
+                        uint8_t* src = dst - dist;
+                        if (dist == 1)
+                        {
+                            memset(dst, *src, len);
+                            dst += len;
+                        }
+                        else if (dist < len || len < 16)
+                        {
+                            for (; len; len--)
+                                *dst++ = *src++;
+                        }
+                        else
+                        {
+                            memcpy(dst, src, len);
+                            dst += len;
+                        }
+                    }
+                }
+            }
+
+            static int ComputeHuffmanCodes(InputMemoryStream& is, Zhuffman& zLength, Zhuffman& zDistance)
+            {
+                static const uint8_t length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
+                Zhuffman z_codelength;
+                uint8_t lencodes[286 + 32 + 137];
+                uint8_t codelength_sizes[19];
+                int i, n;
+
+                int hlit = is.ReadBits(5) + 257;
+                int hdist = is.ReadBits(5) + 1;
+                int hclen = is.ReadBits(4) + 4;
+                int ntot = hlit + hdist;
+
+                memset(codelength_sizes, 0, sizeof(codelength_sizes));
+                for (i = 0; i < hclen; ++i)
+                {
+                    int s = is.ReadBits(3);
+                    codelength_sizes[length_dezigzag[i]] = (uint8_t)s;
+                }
+                if (!z_codelength.Build(codelength_sizes, 19))
+                    return 0;
+                n = 0;
+                while (n < ntot)
+                {
+                    int c = ZhuffmanDecode(is, z_codelength);
+                    if (c < 0 || c >= 19)
+                        return PngError("bad codelengths", "Corrupt PNG");
+                    if (c < 16)
+                        lencodes[n++] = (uint8_t)c;
+                    else
+                    {
+                        uint8_t fill = 0;
+                        if (c == 16)
+                        {
+                            c = is.ReadBits(2) + 3;
+                            if (n == 0) return PngError("bad codelengths", "Corrupt PNG");
+                            fill = lencodes[n - 1];
+                        }
+                        else if (c == 17)
+                            c = is.ReadBits(3) + 3;
+                        else if (c == 18)
+                            c = is.ReadBits(7) + 11;
+                        else
+                            return PngError("bad codelengths", "Corrupt PNG");
+                        if (ntot - n < c)
+                            return PngError("bad codelengths", "Corrupt PNG");
+                        memset(lencodes + n, fill, c);
+                        n += c;
+                    }
+                }
+                if (n != ntot)
+                    return PngError("bad codelengths", "Corrupt PNG");
+                if (!zLength.Build(lencodes, hlit))
+                    return 0;
+                if (!zDistance.Build(lencodes + hlit, hdist))
+                    return 0;
+                return 1;
+            }
+
+            static int ParseUncompressedBlock(InputMemoryStream& is, OutputMemoryStream& os)
+            {
+                is.ClearBits();
+                uint16_t len, nlen;
+                if (!is.Read16u(len) || !is.Read16u(nlen) || nlen != (len ^ 0xffff))
+                    return PngError("zlib corrupt", "Corrupt PNG");
+                if (!os.Write(is, len))
+                    return PngError("read past buffer", "Corrupt PNG");
+                return 1;
+            }
+
+            static int ParseHeader(InputMemoryStream& is)
+            {
+                uint8_t cmf, flg;
+                if (!(is.Read8u(cmf) && is.Read8u(flg)))
+                    return PngError("bad zlib header", "Corrupt PNG");
+                if ((int(cmf) * 256 + flg) % 31 != 0)
+                    return PngError("bad zlib header", "Corrupt PNG");
+                if (flg & 32)
+                    return PngError("no preset dict", "Corrupt PNG");
+                if ((cmf & 15) != 8)
+                    return PngError("bad compression", "Corrupt PNG");
+                return 1;
+            }
+
+            bool Decode(InputMemoryStream& is, OutputMemoryStream& os, bool parseHeader)
+            {
+                static const uint8_t ZdefaultLength[288] = {
+                   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+                   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+                   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+                   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+                   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+                   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+                   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+                   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+                   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
+                };
+                static const uint8_t ZdefaultDistance[32] = {
+                   5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+                };
+
+                Zhuffman zLength, zDistance;
+                int final, type;
+                if (parseHeader)
+                {
+                    if (!ParseHeader(is))
+                        return false;
+                }
+                do
+                {
+                    final = is.ReadBits(1);
+                    type = is.ReadBits(2);
+                    if (type == 0)
+                    {
+                        if (!ParseUncompressedBlock(is, os))
+                            return false;
+                    }
+                    else if (type == 3)
+                        return false;
+                    else
+                    {
+                        if (type == 1)
+                        {
+                            if (!zLength.Build(ZdefaultLength, 288))
+                                return false;
+                            if (!zDistance.Build(ZdefaultDistance, 32))
+                                return false;
+                        }
+                        else
+                        {
+                            if (!ComputeHuffmanCodes(is, zLength, zDistance))
+                                return false;
+                        }
+                        if (!ParseHuffmanBlock(is, zLength, zDistance, os))
+                            return false;
+                    }
+                } while (!final);
+                return true;
+            }
+        }
+
+        typedef struct
+        {
+            PngContext* s;
+            uint8_t * out;
+            uint8_t depth;
+        } png__png;
+
+        enum 
+        {
+            PNG__F_none = 0,
+            PNG__F_sub = 1,
+            PNG__F_up = 2,
+            PNG__F_avg = 3,
+            PNG__F_paeth = 4,
+            // synthetic filters used for first scanline to avoid needing a dummy row of 0s
+            PNG__F_avg_first,
+            PNG__F_paeth_first
+        };
+
+        static uint8_t first_row_filter[5] =
+        {
+           PNG__F_none,
+           PNG__F_sub,
+           PNG__F_none,
+           PNG__F_avg_first,
+           PNG__F_paeth_first
+        };
+
+        static int png__paeth(int a, int b, int c)
+        {
+            int p = a + b - c;
+            int pa = abs(p - a);
+            int pb = abs(p - b);
+            int pc = abs(p - c);
+            if (pa <= pb && pa <= pc) return a;
+            if (pb <= pc) return b;
+            return c;
+        }
+
+        static const uint8_t png__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
+
+        // create the png data from post-deflated data
+        static int png__create_png_image_raw(png__png* a, uint8_t* raw, uint32_t raw_len, int out_n, uint32_t x, uint32_t y, int depth, int color)
+        {
+            int bytes = (depth == 16 ? 2 : 1);
+            PngContext* s = a->s;
+            uint32_t i, j, stride = x * out_n * bytes;
+            uint32_t img_len, img_width_bytes;
+            int k;
+            int img_n = s->img_n; // copy it into a local for later
+
+            int output_bytes = out_n * bytes;
+            int filter_bytes = img_n * bytes;
+            int width = x;
+
+            assert(out_n == s->img_n || out_n == s->img_n + 1);
+            a->out = (uint8_t*)png__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
+            if (!a->out) return PngError("outofmem", "Out of memory");
+
+            if (!png__mad3sizes_valid(img_n, x, depth, 7)) return PngError("too large", "Corrupt PNG");
+            img_width_bytes = (((img_n * x * depth) + 7) >> 3);
+            img_len = (img_width_bytes + 1) * y;
+
+            // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
+            // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
+            // so just check for raw_len < img_len always.
+            if (raw_len < img_len) 
+                return PngError("not enough pixels", "Corrupt PNG");
+
+            for (j = 0; j < y; ++j) 
+            {
+                uint8_t* cur = a->out + stride * j;
+                uint8_t* prior;
+                int filter = *raw++;
+
+                if (filter > 4)
+                    return PngError("invalid filter", "Corrupt PNG");
+
+                if (depth < 8) 
+                {
+                    if (img_width_bytes > x) 
+                        return PngError("invalid width", "Corrupt PNG");
+                    cur += x * out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
+                    filter_bytes = 1;
+                    width = img_width_bytes;
+                }
+                prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
+
+                // if first row, use special filter that doesn't sample previous row
+                if (j == 0) filter = first_row_filter[filter];
+
+                // handle first byte explicitly
+                for (k = 0; k < filter_bytes; ++k) 
+                {
+                    switch (filter) {
+                    case PNG__F_none: cur[k] = raw[k]; break;
+                    case PNG__F_sub: cur[k] = raw[k]; break;
+                    case PNG__F_up: cur[k] = PNG__BYTECAST(raw[k] + prior[k]); break;
+                    case PNG__F_avg: cur[k] = PNG__BYTECAST(raw[k] + (prior[k] >> 1)); break;
+                    case PNG__F_paeth: cur[k] = PNG__BYTECAST(raw[k] + png__paeth(0, prior[k], 0)); break;
+                    case PNG__F_avg_first: cur[k] = raw[k]; break;
+                    case PNG__F_paeth_first: cur[k] = raw[k]; break;
+                    }
+                }
+
+                if (depth == 8) 
+                {
+                    if (img_n != out_n)
+                        cur[img_n] = 255; // first pixel
+                    raw += img_n;
+                    cur += out_n;
+                    prior += out_n;
+                }
+                else if (depth == 16) 
+                {
+                    if (img_n != out_n) 
+                    {
+                        cur[filter_bytes] = 255; // first pixel top byte
+                        cur[filter_bytes + 1] = 255; // first pixel bottom byte
+                    }
+                    raw += filter_bytes;
+                    cur += output_bytes;
+                    prior += output_bytes;
+                }
+                else 
+                {
+                    raw += 1;
+                    cur += 1;
+                    prior += 1;
+                }
+
+                // this is a little gross, so that we don't switch per-pixel or per-component
+                if (depth < 8 || img_n == out_n) 
+                {
+                    int nk = (width - 1) * filter_bytes;
+#define PNG__CASE(f) \
+             case f:     \
+                for (k=0; k < nk; ++k)
+                    switch (filter) {
+                        // "none" filter turns into a memcpy here; make that explicit.
+                    case PNG__F_none:         memcpy(cur, raw, nk); break;
+                        PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - filter_bytes]); } break;
+                        PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break;
+                        PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes])); } break;
+                        PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], 0, 0)); } break;
+                    }
+#undef PNG__CASE
+                    raw += nk;
+                }
+                else 
+                {
+                    assert(img_n + 1 == out_n);
+#define PNG__CASE(f) \
+             case f:     \
+                for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
+                   for (k=0; k < filter_bytes; ++k)
+                    switch (filter) {
+                        PNG__CASE(PNG__F_none) { cur[k] = raw[k]; } break;
+                        PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - output_bytes]); } break;
+                        PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break;
+                        PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - output_bytes]) >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes])); } break;
+                        PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - output_bytes] >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], 0, 0)); } break;
+                    }
+#undef PNG__CASE
+
+                    // the loop above sets the high byte of the pixels' alpha, but for
+                    // 16 bit png files we also need the low byte set. we'll do that here.
+                    if (depth == 16) 
+                    {
+                        cur = a->out + stride * j; // start at the beginning of the row again
+                        for (i = 0; i < x; ++i, cur += output_bytes) 
+                            cur[filter_bytes + 1] = 255;
+                    }
+                }
+            }
+
+            // we make a separate pass to expand bits to pixels; for performance,
+            // this could run two scanlines behind the above code, so it won't
+            // intefere with filtering but will still be in the cache.
+            if (depth < 8)
+            {
+                for (j = 0; j < y; ++j)
+                {
+                    uint8_t* cur = a->out + stride * j;
+                    uint8_t* in = a->out + stride * j + x * out_n - img_width_bytes;
+                    // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
+                    // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
+                    uint8_t scale = (color == 0) ? png__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
+
+                    // note that the final byte might overshoot and write more data than desired.
+                    // we can allocate enough data that this never writes out of memory, but it
+                    // could also overwrite the next scanline. can it overwrite non-empty data
+                    // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
+                    // so we need to explicitly clamp the final ones
+
+                    if (depth == 4) 
+                    {
+                        for (k = x * img_n; k >= 2; k -= 2, ++in) 
+                        {
+                            *cur++ = scale * ((*in >> 4));
+                            *cur++ = scale * ((*in) & 0x0f);
+                        }
+                        if (k > 0) 
+                            *cur++ = scale * ((*in >> 4));
+                    }
+                    else if (depth == 2) 
+                    {
+                        for (k = x * img_n; k >= 4; k -= 4, ++in) 
+                        {
+                            *cur++ = scale * ((*in >> 6));
+                            *cur++ = scale * ((*in >> 4) & 0x03);
+                            *cur++ = scale * ((*in >> 2) & 0x03);
+                            *cur++ = scale * ((*in) & 0x03);
+                        }
+                        if (k > 0) 
+                            *cur++ = scale * ((*in >> 6));
+                        if (k > 1) 
+                            *cur++ = scale * ((*in >> 4) & 0x03);
+                        if (k > 2) 
+                            *cur++ = scale * ((*in >> 2) & 0x03);
+                    }
+                    else if (depth == 1)
+                    {
+                        for (k = x * img_n; k >= 8; k -= 8, ++in) 
+                        {
+                            *cur++ = scale * ((*in >> 7));
+                            *cur++ = scale * ((*in >> 6) & 0x01);
+                            *cur++ = scale * ((*in >> 5) & 0x01);
+                            *cur++ = scale * ((*in >> 4) & 0x01);
+                            *cur++ = scale * ((*in >> 3) & 0x01);
+                            *cur++ = scale * ((*in >> 2) & 0x01);
+                            *cur++ = scale * ((*in >> 1) & 0x01);
+                            *cur++ = scale * ((*in) & 0x01);
+                        }
+                        if (k > 0) *cur++ = scale * ((*in >> 7));
+                        if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01);
+                        if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01);
+                        if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01);
+                        if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01);
+                        if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01);
+                        if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
+                    }
+                    if (img_n != out_n) 
+                    {
+                        int q;
+                        // insert alpha = 255
+                        cur = a->out + stride * j;
+                        if (img_n == 1) 
+                        {
+                            for (q = x - 1; q >= 0; --q)
+                            {
+                                cur[q * 2 + 1] = 255;
+                                cur[q * 2 + 0] = cur[q];
+                            }
+                        }
+                        else
+                        {
+                            assert(img_n == 3);
+                            for (q = x - 1; q >= 0; --q) 
+                            {
+                                cur[q * 4 + 3] = 255;
+                                cur[q * 4 + 2] = cur[q * 3 + 2];
+                                cur[q * 4 + 1] = cur[q * 3 + 1];
+                                cur[q * 4 + 0] = cur[q * 3 + 0];
+                            }
+                        }
+                    }
+                }
+            }
+            else if (depth == 16) 
+            {
+                // force the image data from big-endian to platform-native.
+                // this is done in a separate pass due to the decoding relying
+                // on the data being untouched, but could probably be done
+                // per-line during decode if care is taken.
+                uint8_t* cur = a->out;
+                uint16_t* cur16 = (uint16_t*)cur;
+
+                for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2)
+                    *cur16 = (cur[0] << 8) | cur[1];
+            }
+
+            return 1;
+        }
+
+        static int png__create_png_image(png__png* a, uint8_t* image_data, uint32_t image_data_len, int out_n, int depth, int color, int interlaced)
+        {
+            int bytes = (depth == 16 ? 2 : 1);
+            int out_bytes = out_n * bytes;
+            uint8_t* final;
+            int p;
+            if (!interlaced)
+                return png__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
+
+            // de-interlacing
+            final = (uint8_t*)png__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+            for (p = 0; p < 7; ++p) 
+            {
+                int xorig[] = { 0,4,0,2,0,1,0 };
+                int yorig[] = { 0,0,4,0,2,0,1 };
+                int xspc[] = { 8,8,4,4,2,2,1 };
+                int yspc[] = { 8,8,8,4,4,2,2 };
+                int i, j, x, y;
+                // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
+                x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p];
+                y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p];
+                if (x && y) 
+                {
+                    uint32_t img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
+                    if (!png__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color))
+                    {
+                        PNG_FREE(final);
+                        return 0;
+                    }
+                    for (j = 0; j < y; ++j) 
+                    {
+                        for (i = 0; i < x; ++i) 
+                        {
+                            int out_y = j * yspc[p] + yorig[p];
+                            int out_x = i * xspc[p] + xorig[p];
+                            memcpy(final + out_y * a->s->img_x * out_bytes + out_x * out_bytes,
+                                a->out + (j * x + i) * out_bytes, out_bytes);
+                        }
+                    }
+                    PNG_FREE(a->out);
+                    image_data += img_len;
+                    image_data_len -= img_len;
+                }
+            }
+            a->out = final;
+
+            return 1;
+        }
+
+        static int png__compute_transparency(png__png* z, uint8_t tc[3], int out_n)
+        {
+            PngContext* s = z->s;
+            uint32_t i, pixel_count = s->img_x * s->img_y;
+            uint8_t* p = z->out;
+
+            // compute color-based transparency, assuming we've
+            // already got 255 as the alpha value in the output
+            assert(out_n == 2 || out_n == 4);
+
+            if (out_n == 2) 
+            {
+                for (i = 0; i < pixel_count; ++i) 
+                {
+                    p[1] = (p[0] == tc[0] ? 0 : 255);
+                    p += 2;
+                }
+            }
+            else 
+            {
+                for (i = 0; i < pixel_count; ++i) 
+                {
+                    if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+                        p[3] = 0;
+                    p += 4;
+                }
+            }
+            return 1;
+        }
+
+        static int png__compute_transparency16(png__png* z, uint16_t tc[3], int out_n)
+        {
+            PngContext* s = z->s;
+            uint32_t i, pixel_count = s->img_x * s->img_y;
+            uint16_t* p = (uint16_t*)z->out;
+
+            // compute color-based transparency, assuming we've
+            // already got 65535 as the alpha value in the output
+            assert(out_n == 2 || out_n == 4);
+
+            if (out_n == 2) 
+            {
+                for (i = 0; i < pixel_count; ++i)
+                {
+                    p[1] = (p[0] == tc[0] ? 0 : 65535);
+                    p += 2;
+                }
+            }
+            else 
+            {
+                for (i = 0; i < pixel_count; ++i)
+                {
+                    if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+                        p[3] = 0;
+                    p += 4;
+                }
+            }
+            return 1;
+        }
+
+        static int png__expand_png_palette(png__png* a, uint8_t* palette, int len, int pal_img_n)
+        {
+            uint32_t i, pixel_count = a->s->img_x * a->s->img_y;
+            uint8_t* p, * temp_out, * orig = a->out;
+
+            p = (uint8_t*)png__malloc_mad2(pixel_count, pal_img_n, 0);
+            if (p == NULL) 
+                return PngError("outofmem", "Out of memory");
+
+            // between here and free(out) below, exitting would leak
+            temp_out = p;
+
+            if (pal_img_n == 3) 
+            {
+                for (i = 0; i < pixel_count; ++i) 
+                {
+                    int n = orig[i] * 4;
+                    p[0] = palette[n];
+                    p[1] = palette[n + 1];
+                    p[2] = palette[n + 2];
+                    p += 3;
+                }
+            }
+            else 
+            {
+                for (i = 0; i < pixel_count; ++i) 
+                {
+                    int n = orig[i] * 4;
+                    p[0] = palette[n];
+                    p[1] = palette[n + 1];
+                    p[2] = palette[n + 2];
+                    p[3] = palette[n + 3];
+                    p += 4;
+                }
+            }
+            PNG_FREE(a->out);
+            a->out = temp_out;
+
+            return 1;
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePngLoader::ImagePngLoader(const ImageLoaderParam& param)
+            : ImageLoader(param)
+            , _toAny8(NULL)
+            , _toBgra8(NULL)
+            , _toAny16(NULL)
+            , _toBgra16(NULL)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatRgba32;
+        }
+
+        void ImagePngLoader::SetConverters()
+        {
+            _bgrToBgra = Base::BgrToBgra;
+        }
+
+        SIMD_INLINE constexpr uint32_t ChunkType(char a, char b, char c, char d)
+        {
+            return ((uint32_t(a) << 24) + (uint32_t(b) << 16) + (uint32_t(c) << 8) + uint32_t(d));
+        }
+
+        bool ImagePngLoader::FromStream()
+        {
+            const int req_comp = 4;
+            PngContext context;
+            png__png p;
+            p.s = &context;
+            png__png* z = &p;
+
+            PngContext* s = z->s;
+
+            z->out = NULL;
+
+            if (!ParseFile())
+                return false;
+
+            s->img_x = _width;
+            s->img_y = _height;
+            z->depth = _depth;
+            s->img_n = _channels;
+
+            InputMemoryStream zSrc = MergedDataStream();
+            OutputMemoryStream zDst(AlignHi(size_t(_width) * _depth, 8) * _height * _channels + _height);
+            if(!Zlib::Decode(zSrc, zDst, !_iPhone))
+                return false;
+
+            if ((req_comp == s->img_n + 1 && req_comp != 3 && !_paletteChannels) || _hasTrans)
+                s->img_out_n = s->img_n + 1;
+            else
+                s->img_out_n = s->img_n;
+            if (!png__create_png_image(z, zDst.Data(), zDst.Size(), s->img_out_n, z->depth, _color, _interlace))
+                return 0;
+            if (_hasTrans) 
+            {
+                if (z->depth == 16)
+                {
+                    if (!png__compute_transparency16(z, _tc16, s->img_out_n))
+                        return false;
+                }
+                else
+                {
+                    if (!png__compute_transparency(z, _tc, s->img_out_n))
+                        return false;
+                }
+            }
+            if (_paletteChannels)
+            {
+                s->img_n = _paletteChannels; // record the actual colors we had
+                s->img_out_n = _paletteChannels;
+                if (req_comp >= 3) 
+                    s->img_out_n = req_comp;
+                if (!png__expand_png_palette(z, _palette.data, (int)_palette.size, s->img_out_n))
+                    return false;
+            }
+            else if (_hasTrans)
+                ++s->img_n;
+
+            if (!(p.depth <= 8 || p.depth == 16))
+                return false;
+            uint8_t* data = p.out;
+            p.out = NULL;
+            if (req_comp && req_comp != p.s->img_out_n)
+            {
+                if (p.depth <= 8)
+                    data = png__convert_format((uint8_t*)data, p.s->img_out_n, req_comp, _width, _height);
+                else
+                    data = (uint8_t*)png__convert_format16((uint16_t*)data, p.s->img_out_n, req_comp, _width, _height);
+                p.s->img_out_n = req_comp;
+                if (data == NULL)
+                    return false;
+            }
+            if (p.depth == 16)
+            {
+                size_t size = context.img_x * context.img_y * req_comp;
+                const uint16_t* src = (uint16_t*)data;
+                uint8_t* dst = (uint8_t*)PNG_MALLOC(size);
+                for (size_t i = 0; i < size; ++i)
+                    dst[i] = uint8_t(src[i] >> 8);
+                PNG_FREE(data);
+                data = dst;
+            }
+            PNG_FREE(p.out);
+            if (data)
+            {
+                size_t stride = 4 * context.img_x;
+                _image.Recreate(context.img_x, context.img_y, (Image::Format)_param.format);
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8:
+                    Base::RgbaToGray(data, context.img_x, context.img_y, stride, _image.data, _image.stride);
+                    break;
+                case SimdPixelFormatBgr24:
+                    Base::BgraToRgb(data, context.img_x, context.img_y, stride, _image.data, _image.stride);
+                    break;
+                case SimdPixelFormatBgra32:
+                    Base::BgraToRgba(data, context.img_x, context.img_y, stride, _image.data, _image.stride);
+                    break;
+                case SimdPixelFormatRgb24:
+                    Base::BgraToBgr(data, context.img_x, context.img_y, stride, _image.data, _image.stride);
+                    break;
+                case SimdPixelFormatRgba32:
+                    Base::Copy(data, stride, context.img_x, context.img_y, 4, _image.data, _image.stride);
+                    break;
+                default: 
+                    break;
+                }
+                PNG_FREE(data);
+                return true;
+            }
+            return false;
+        }
+
+        bool ImagePngLoader::ParseFile()
+        {
+            _first = true, _iPhone = false, _hasTrans = false;
+            if (!CheckHeader())
+                return false;
+            for (bool run = true; run;)
+            {
+                Chunk chunk;
+                if (!ReadChunk(chunk))
+                    return 0;
+                if (chunk.type == ChunkType('C', 'g', 'B', 'I'))
+                {
+                    _iPhone = true;
+                    _stream.Skip(chunk.size);
+                }
+                else if (chunk.type == ChunkType('I', 'H', 'D', 'R'))
+                {
+                    if (!ReadHeader(chunk))
+                        return false;
+                    SetConverters();
+                }
+                else if (chunk.type == ChunkType('P', 'L', 'T', 'E'))
+                {
+                    if (!ReadPalette(chunk))
+                        return false;
+                }
+                else if (chunk.type == ChunkType('t', 'R', 'N', 'S'))
+                {
+                    if (!ReadTransparency(chunk))
+                        return false;
+                }
+                else if (chunk.type == ChunkType('I', 'D', 'A', 'T'))
+                {
+                    if (!ReadData(chunk))
+                        return false;
+                }
+                else if (chunk.type == ChunkType('I', 'E', 'N', 'D'))
+                {
+                    if (_first)
+                        return false;
+                    run = false;
+                }
+                else
+                {
+                    if (_first || (chunk.type & (1 << 29)) == 0)
+                        return false;
+                    _stream.Skip(chunk.size);
+                }
+                uint32_t crc32;
+                if (!_stream.ReadBe32u(crc32))
+                    return false;
+            }
+            return _idats.size() != 0;
+        }
+
+        bool ImagePngLoader::CheckHeader()
+        {
+            const size_t size = 8;
+            const uint8_t control[size] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+            uint8_t buffer[size];
+            return _stream.Read(size, buffer) == size && memcmp(buffer, control, size) == 0;
+        }
+
+        SIMD_INLINE bool ImagePngLoader::ReadChunk(Chunk& chunk)
+        {
+            if (_stream.ReadBe32u(chunk.size) && _stream.ReadBe32u(chunk.type))
+            {
+                chunk.offs = (uint32_t)_stream.Pos();
+                return true;
+            }
+            return false;
+        }
+
+        bool ImagePngLoader::ReadHeader(const Chunk& chunk)
+        {
+            const int MAX_SIZE = 1 << 24;
+            if (!_first)
+                return false;
+            _first = false;
+            if (!(chunk.size == 13 && _stream.CanRead(13)))
+                return false;
+            uint8_t comp, filter;
+            if (!(_stream.ReadBe32u(_width) && _stream.ReadBe32u(_height) &&
+                _stream.Read8u(_depth) && _stream.Read8u(_color) && _stream.Read8u(comp) &&
+                _stream.Read8u(filter) && _stream.Read8u(_interlace)))
+                return false;
+            if (_width == 0 || _width > MAX_SIZE || _height == 0 || _height > MAX_SIZE)
+                return false;
+            if (_depth != 1 && _depth != 2 && _depth != 4 && _depth != 8 && _depth != 16)
+                return false;
+            if (_color > 6 || (_color == 3 && _depth == 16))
+                return false;
+            _paletteChannels = 0;
+            if (_color == 3)
+                _paletteChannels = 3;
+            else if (_color & 1)
+                return false;
+            if (comp != 0 || filter != 0 || _interlace > 1)
+                return false;
+            if (!_paletteChannels)
+            {
+                _channels = (_color & 2 ? 3 : 1) + (_color & 4 ? 1 : 0);
+                if ((1 << 30) / _width / _channels < _height)
+                    return false;
+            }
+            else
+            {
+                _channels = 1;
+                if ((1 << 30) / _width / 4 < _height)
+                    return false;
+            }
+            return true;
+        }
+
+        bool ImagePngLoader::ReadPalette(const Chunk& chunk)
+        {
+            if (_first || chunk.size > 256 * 3)
+                return false;
+            size_t length = chunk.size / 3;
+            if (length * 3 != chunk.size)
+                return false;
+            if (_stream.CanRead(chunk.size))
+            {
+                _palette.Resize(length * 4);
+                _bgrToBgra(_stream.Current(), length, 1, length, _palette.data, _palette.size, 0xFF);
+                _stream.Skip(chunk.size);
+                return true;
+            }
+            else
+                return false;
+        }
+
+        bool ImagePngLoader::ReadTransparency(const Chunk& chunk)
+        {
+            if (_first)
+                return false;
+            if (_idats.size())
+                return false;
+            if (_paletteChannels)
+            {
+                if (_palette.size == 0 || chunk.size > _palette.size || !_stream.CanRead(chunk.size))
+                    return false;
+                _paletteChannels = 4;
+                for (size_t i = 0; i < chunk.size; ++i)
+                    _palette.data[i * 4 + 3] = _stream.Current()[i];
+                _stream.Skip(chunk.size);
+            }
+            else
+            {
+                if (!(_channels & 1) || chunk.size != _channels * 2)
+                    return false;
+                _hasTrans = true;
+                for (size_t k = 0; k < _channels; ++k)
+                    if (!_stream.ReadBe16u(_tc16[k]))
+                        return false;
+                if (_depth != 16)
+                {
+                    for (size_t k = 0; k < _channels; ++k)
+                        _tc[k] = uint8_t(_tc16[k]) * png__depth_scale_table[_depth];
+                }
+            }
+            return true;
+        }
+
+        bool ImagePngLoader::ReadData(const Chunk& chunk)
+        {
+            if (_first)
+                return false;
+            if (_paletteChannels && !_palette.size)
+                return false;
+            if (!_stream.CanRead(chunk.size))
+                return false;
+            _idats.push_back(chunk);
+            _stream.Skip(chunk.size);
+            return true;
+        }
+
+        InputMemoryStream ImagePngLoader::MergedDataStream()
+        {
+            if (_idats.size() == 1)
+                return InputMemoryStream((uint8_t*)_stream.Data() + _idats[0].offs, _idats[0].size);
+            else
+            {
+                size_t size = 0;
+                for (size_t i = 0; i < _idats.size(); ++i)
+                    size += _idats[i].size;
+                _idat.Resize(size);
+                for (size_t i = 0, offset = 0; i < _idats.size(); ++i)
+                {
+                    memcpy(_idat.data + offset, _stream.Data() + _idats[i].offs, _idats[i].size);
+                    offset += _idats[i].size;
+                }
+                return InputMemoryStream(_idat.data, _idat.size);
+            }
+        }
+    }
+}
diff --git a/3rdparty/simdlib/Simd/SimdBaseImageSave.cpp b/3rdparty/simdlib/Simd/SimdBaseImageSave.cpp
new file mode 100644
index 0000000000..fb5a8eacef
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseImageSave.cpp
@@ -0,0 +1,340 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdCpu.h"
+#include "Simd/SimdBase.h"
+
+#include <stdio.h>
+
+#include <memory>
+#include <sstream>
+
+#if defined(_MSC_VER)
+#pragma warning (push)
+#pragma warning (disable: 4996)
+#endif
+
+namespace Simd
+{        
+    SimdBool ImageSaveToFile(const ImageSaveToMemoryPtr saver, const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char* path)
+    {
+        SimdBool result = SimdFalse;
+        size_t size;
+        uint8_t * data = saver(src, stride, width, height, format, file, quality, &size);
+        if (data)
+        {
+            ::FILE* file = ::fopen(path, "wb");
+            if (file)
+            {
+                if (::fwrite(data, 1, size, file) == size)
+                    result = SimdTrue;
+                ::fclose(file);
+            }
+            Simd::Free(data);
+        }
+        return result;
+    }
+
+    //-------------------------------------------------------------------------
+
+    namespace Base
+    {
+        ImagePxmSaver::ImagePxmSaver(const ImageSaverParam& param)
+            : ImageSaver(param)
+            , _convert(NULL)
+        {
+            _block = _param.height;
+            if (_param.file == SimdImageFilePgmTxt || _param.file == SimdImageFilePgmBin)
+            {
+                _size = _param.width * 1;
+                if (_param.format != SimdPixelFormatGray8)
+                {
+                    _block = Simd::RestrictRange<size_t>(Base::AlgCacheL1() / _size, 1, _param.height);
+                    _buffer.Resize(_block * _size);
+                }
+            }
+            else if (_param.file == SimdImageFilePpmTxt || _param.file == SimdImageFilePpmBin)
+            {
+                _size = _param.width * 3;
+                if (_param.format != SimdPixelFormatRgb24)
+                {
+                    _block = Simd::RestrictRange<size_t>(Base::AlgCacheL1() / _size, 1, _param.height);
+                    _buffer.Resize(_block * _size);
+                }
+            }
+            else
+                assert(0);
+        }
+
+        void ImagePxmSaver::WriteHeader(size_t version)
+        {
+            std::stringstream header;
+            header << "P" << version << "\n" << _param.width << " " << _param.height << "\n255\n";
+            _stream.Write(header.str().c_str(), header.str().size());
+        }
+
+        uint8_t g_pxmPrint[256][4];
+        bool PxmPrintInit()
+        {
+            for (int i = 0; i < 256; ++i)
+            {
+                int d0 = i / 100;
+                int d1 = (i / 10) % 10;
+                int d2 = i % 10;
+                g_pxmPrint[i][0] = d0 ? '0' + d0 : ' ';
+                g_pxmPrint[i][1] = (d1 || d0) ? '0' + d1 : ' ';
+                g_pxmPrint[i][2] = '0' + d2;
+                g_pxmPrint[i][3] = ' ';
+            }
+            return true;
+        }
+        bool g_pxmPrintInited = PxmPrintInit();
+
+        //---------------------------------------------------------------------
+
+        ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param)
+            : ImagePxmSaver(param)
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24: _convert = Base::BgrToGray; break;
+            case SimdPixelFormatBgra32: _convert = Base::BgraToGray; break;
+            case SimdPixelFormatRgb24: _convert = Base::RgbToGray; break;
+            case SimdPixelFormatRgba32: _convert = Base::RgbaToGray; break;
+            default: break;
+            }
+        }
+
+        bool ImagePgmTxtSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            size_t grayStride = _param.format == SimdPixelFormatGray8 ? stride : _size;
+            _stream.Reserve(32 + _param.height * (_param.width * 4 + DivHi(_param.width, 17)));
+            WriteHeader(2);
+            for (size_t row = 0; row < _param.height;)
+            {
+                size_t block = Simd::Min(row + _block, _param.height) - row;
+                const uint8_t* gray = src;
+                if (_param.format != SimdPixelFormatGray8)
+                {
+                    _convert(src, _param.width, block, stride, _buffer.data, grayStride);
+                    gray = _buffer.data;
+                }                
+                for (size_t b = 0; b < block; ++b)
+                {
+                    uint8_t string[70];
+                    for (size_t col = 0, offset = 0; col < _param.width; ++col)
+                    {
+                        *(uint32_t*)(string + offset) = *(uint32_t*)g_pxmPrint[gray[col]];
+                        offset += 4;
+                        if (offset >= 68 || col == _param.width - 1)
+                        {
+                            string[offset++] = '\n';
+                            _stream.Write(string, offset);
+                            offset = 0;
+                        }
+                    }
+                    gray += grayStride;
+                }
+                src += stride * block;
+                row += block;
+            }
+            return true;
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param)
+            : ImagePxmSaver(param)
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24: _convert = Base::BgrToGray; break;
+            case SimdPixelFormatBgra32: _convert = Base::BgraToGray; break;
+            case SimdPixelFormatRgb24: _convert = Base::RgbToGray; break;
+            case SimdPixelFormatRgba32: _convert = Base::RgbaToGray; break;
+            default: break;
+            }
+        }
+
+        bool ImagePgmBinSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            size_t grayStride = _param.format == SimdPixelFormatGray8 ? stride : _size;
+            _stream.Reserve(32 + _param.height * _size);
+            WriteHeader(5);
+            for (size_t row = 0; row < _param.height;)
+            {
+                size_t block = Simd::Min(row + _block, _param.height) - row;
+                const uint8_t* gray = src;
+                if (_param.format != SimdPixelFormatGray8)
+                {
+                    _convert(src, _param.width, block, stride, _buffer.data, grayStride);
+                    gray = _buffer.data;
+                }
+                for (size_t b = 0; b < block; ++b)
+                {
+                    _stream.Write(gray, _size);
+                    gray += grayStride;
+                }
+                src += stride * block;
+                row += block;
+            }
+            return true;
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param)
+            : ImagePxmSaver(param)
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatGray8: _convert = Base::GrayToBgr; break;
+            case SimdPixelFormatBgr24: _convert = Base::BgrToRgb; break;
+            case SimdPixelFormatBgra32: _convert = Base::BgraToRgb; break;
+            case SimdPixelFormatRgba32: _convert = Base::BgraToBgr; break;
+            default: break;
+            }
+        }
+
+        bool ImagePpmTxtSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            size_t rgbStride = _param.format == SimdPixelFormatRgb24 ? stride : _size;
+            _stream.Reserve(32 + _param.height * (_param.width * 13 + DivHi(_param.width, 5)));
+            WriteHeader(3);
+            for (size_t row = 0; row < _param.height;)
+            {
+                size_t block = Simd::Min(row + _block, _param.height) - row;
+                const uint8_t* rgb = src;
+                if (_param.format != SimdPixelFormatRgb24)
+                {
+                    _convert(src, _param.width, block, stride, _buffer.data, rgbStride);
+                    rgb = _buffer.data;
+                }
+                for (size_t b = 0; b < block; ++b)
+                {
+                    uint8_t string[70];
+                    for (size_t col = 0, offset = 0; col < _size; col += 3)
+                    {
+                        ((uint32_t*)(string + offset))[0] = *(uint32_t*)g_pxmPrint[rgb[col + 0]];
+                        ((uint32_t*)(string + offset))[1] = *(uint32_t*)g_pxmPrint[rgb[col + 1]];
+                        ((uint32_t*)(string + offset))[2] = *(uint32_t*)g_pxmPrint[rgb[col + 2]];
+                        offset += 12;
+                        if (offset >= 68 || col == _size - 3)
+                        {
+                            string[offset++] = '\n';
+                            _stream.Write(string, offset);
+                            offset = 0;
+                        }
+                        else
+                        {
+                            string[offset++] = ' ';
+                            string[offset++] = ' ';
+                        }
+                    }
+                    rgb += rgbStride;
+                }
+                src += stride * block;
+                row += block;
+            }
+            return true;
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param)
+            : ImagePxmSaver(param)
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatGray8: _convert = Base::GrayToBgr; break;
+            case SimdPixelFormatBgr24: _convert = Base::BgrToRgb; break;
+            case SimdPixelFormatBgra32: _convert = Base::BgraToRgb; break;
+            case SimdPixelFormatRgba32: _convert = Base::BgraToBgr; break;
+            default: break;
+            }
+        }
+
+        bool ImagePpmBinSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            size_t rgbStride = _param.format == SimdPixelFormatRgb24 ? stride : _size;
+            _stream.Reserve(32 + _param.height * _size);
+            WriteHeader(6);
+            for (size_t row = 0; row < _param.height;)
+            {
+                size_t block = Simd::Min(row + _block, _param.height) - row;
+                const uint8_t* rgb = src;
+                if (_param.format != SimdPixelFormatRgb24)
+                {
+                    _convert(src, _param.width, block, stride, _buffer.data, rgbStride);
+                    rgb = _buffer.data;
+                }
+                for (size_t b = 0; b < block; ++b)
+                {
+                    _stream.Write(rgb, _size);
+                    rgb += rgbStride;
+                }
+                src += stride * block;
+                row += block;
+            }
+            return true;
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageSaver* CreateImageSaver(const ImageSaverParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinSaver(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinSaver(param);
+            case SimdImageFilePng:    return new ImagePngSaver(param);
+            case SimdImageFileJpeg:   return new ImageJpegSaver(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size)
+        {
+            ImageSaverParam param(width, height, format, file, quality);
+            if (param.Validate())
+            {
+                Holder<ImageSaver> saver(CreateImageSaver(param));
+                if (saver)
+                {
+                    if (saver->ToStream(src, stride))
+                        return saver->Release(size);
+                }
+            }
+            return NULL;
+        }
+    }
+}
+
+#if defined(_MSC_VER)
+#pragma warning (pop)
+#endif
diff --git a/3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp b/3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp
new file mode 100644
index 0000000000..f7ba583247
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp
@@ -0,0 +1,451 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSaveJpeg.h"
+#include "Simd/SimdBase.h"
+
+namespace Simd
+{
+    namespace Base
+    {
+        const uint8_t JpegZigZagD[64] = { 
+            0, 1, 5, 6, 14, 15, 27, 28, 
+            2, 4, 7, 13, 16, 26, 29, 42, 
+            3, 8, 12, 17, 25, 30, 41, 43, 
+            9, 11, 18, 24, 31, 40, 44, 53, 
+            10, 19, 23, 32, 39, 45, 52, 54, 
+            20, 22, 33, 38, 46, 51, 55, 60, 
+            21, 34, 37, 47, 50, 56, 59, 61, 
+            35, 36, 48, 49, 57, 58, 62, 63 };
+
+        const uint8_t JpegZigZagT[64] = { 
+            0, 2, 3, 9, 10, 20, 21, 35,
+            1, 4, 8, 11, 19, 22, 34, 36,
+            5, 7, 12, 18, 23, 33, 37, 48,
+            6, 13, 17, 24, 32, 38, 47, 49,
+            14, 16, 25, 31, 39, 46, 50, 57,
+            15, 26, 30, 40, 45, 51, 56, 58,
+            27, 29, 41, 44, 52, 55, 59, 62,
+            28, 42, 43, 53, 54, 60, 61, 63 };        
+
+        const uint16_t HuffmanYdc[256][2] = { {0, 2}, {2, 3}, {3, 3}, {4, 3}, {5, 3}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9} };
+        const uint16_t HuffmanUVdc[256][2] = { {0, 2}, {1, 2}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}, {1022, 10}, {2046, 11} };
+        const uint16_t HuffmanYac[256][2] = {
+           {10, 4}, {0, 2}, {1, 2}, {4, 3}, {11, 4}, {26, 5}, {120, 7}, {248, 8}, {1014, 10}, {65410, 16}, {65411, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {12, 4}, {27, 5}, {121, 7}, {502, 9}, {2038, 11}, {65412, 16}, {65413, 16}, {65414, 16}, {65415, 16}, {65416, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {28, 5}, {249, 8}, {1015, 10}, {4084, 12}, {65417, 16}, {65418, 16}, {65419, 16}, {65420, 16}, {65421, 16}, {65422, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {58, 6}, {503, 9}, {4085, 12}, {65423, 16}, {65424, 16}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {59, 6}, {1016, 10}, {65430, 16}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {122, 7}, {2039, 11}, {65438, 16}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {123, 7}, {4086, 12}, {65446, 16}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {250, 8}, {4087, 12}, {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {504, 9}, {32704, 15}, {65462, 16}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {505, 9}, {65470, 16}, {65471, 16}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {506, 9}, {65479, 16}, {65480, 16}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {1017, 10}, {65488, 16}, {65489, 16}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {1018, 10}, {65497, 16}, {65498, 16}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {2040, 11}, {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {2041, 11}, {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+        };
+        const uint16_t HuffmanUVac[256][2] = {
+           {0, 2}, {1, 2}, {4, 3}, {10, 4}, {24, 5}, {25, 5}, {56, 6}, {120, 7}, {500, 9}, {1014, 10}, {4084, 12}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {11, 4}, {57, 6}, {246, 8}, {501, 9}, {2038, 11}, {4085, 12}, {65416, 16}, {65417, 16}, {65418, 16}, {65419, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {26, 5}, {247, 8}, {1015, 10}, {4086, 12}, {32706, 15}, {65420, 16}, {65421, 16}, {65422, 16}, {65423, 16}, {65424, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {27, 5}, {248, 8}, {1016, 10}, {4087, 12}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {65430, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {58, 6}, {502, 9}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {65438, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {59, 6}, {1017, 10}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {65446, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {121, 7}, {2039, 11}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {65454, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {122, 7}, {2040, 11}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {65462, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {249, 8}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {65470, 16}, {65471, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {503, 9}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {65479, 16}, {65480, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {504, 9}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {65488, 16}, {65489, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {505, 9}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, {65497, 16}, {65498, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {506, 9}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {65506, 16}, {65507, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {2041, 11}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {65515, 16}, {65516, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {16352, 14}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {65525, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {1018, 10}, {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+        };
+
+#if defined(SIMD_JPEG_CALC_BITS_TABLE)
+        uint16_t JpegCalcBitsTable[JpegCalcBitsRange * 2][2];
+        bool JpegCalcBitsTableInit()
+        {
+            for (int i = 0, n = JpegCalcBitsRange * 2; i < n; ++i)
+            {
+                int val = i - JpegCalcBitsRange;
+                int tmp = val < 0 ? -val : val;
+                val = val < 0 ? val - 1 : val;
+                int cnt = 1;
+                while (tmp >>= 1)
+                    ++cnt;
+                JpegCalcBitsTable[i][0] = val & ((1 << cnt) - 1);
+                JpegCalcBitsTable[i][1] = cnt;
+            }
+            return true;
+        }
+        bool JpegCalcBitsTableInited = JpegCalcBitsTableInit();
+#endif
+
+        SIMD_INLINE void JpegDct(float* d0p, float* d1p, float* d2p, float* d3p, float* d4p, float* d5p, float* d6p, float* d7p)
+        {
+            float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p;
+            float z1, z2, z3, z4, z5, z11, z13;
+            float tmp0 = d0 + d7;
+            float tmp7 = d0 - d7;
+            float tmp1 = d1 + d6;
+            float tmp6 = d1 - d6;
+            float tmp2 = d2 + d5;
+            float tmp5 = d2 - d5;
+            float tmp3 = d3 + d4;
+            float tmp4 = d3 - d4;
+
+            float tmp10 = tmp0 + tmp3;
+            float tmp13 = tmp0 - tmp3;
+            float tmp11 = tmp1 + tmp2;
+            float tmp12 = tmp1 - tmp2;
+
+            d0 = tmp10 + tmp11;
+            d4 = tmp10 - tmp11;
+
+            z1 = (tmp12 + tmp13) * 0.707106781f;
+            d2 = tmp13 + z1;
+            d6 = tmp13 - z1;
+
+            tmp10 = tmp4 + tmp5;
+            tmp11 = tmp5 + tmp6;
+            tmp12 = tmp6 + tmp7;
+
+            z5 = (tmp10 - tmp12) * 0.382683433f;
+            z2 = tmp10 * 0.541196100f + z5;
+            z4 = tmp12 * 1.306562965f + z5;
+            z3 = tmp11 * 0.707106781f;
+
+            z11 = tmp7 + z3;
+            z13 = tmp7 - z3;
+
+            *d5p = z13 + z2;
+            *d3p = z13 - z2;
+            *d1p = z11 + z4;
+            *d7p = z11 - z4;
+
+            *d0p = d0;  *d2p = d2;  *d4p = d4;  *d6p = d6;
+        }
+
+        static int JpegProcessDu(Base::BitBuf& bitBuf, float* CDU, int stride, const float* fdtbl, int DC, const uint16_t HTDC[256][2], const uint16_t HTAC[256][2])
+        {
+            int offs, i, j, n, diff, end0pos, x, y;
+            for (offs = 0; offs < 8; ++offs) 
+                JpegDct(&CDU[offs], &CDU[offs + stride], &CDU[offs + stride * 2], &CDU[offs + stride * 3], &CDU[offs + stride * 4],
+                    &CDU[offs + stride * 5], &CDU[offs + stride * 6], &CDU[offs + stride * 7]);
+            for (offs = 0, n = stride * 8; offs < n; offs += stride)
+                JpegDct(&CDU[offs], &CDU[offs + 1], &CDU[offs + 2], &CDU[offs + 3], &CDU[offs + 4], &CDU[offs + 5], &CDU[offs + 6], &CDU[offs + 7]);
+            int DU[64];
+            for (y = 0, j = 0; y < 8; ++y) 
+            {
+                for (x = 0; x < 8; ++x, ++j) 
+                {
+                    i = y * stride + x;
+                    float v = CDU[i] * fdtbl[j];
+                    DU[JpegZigZagD[j]] = Round(v);
+                }
+            }
+            diff = DU[0] - DC;
+            if (diff == 0) 
+                bitBuf.Push(HTDC[0]);
+            else 
+            {
+                uint16_t bits[2];
+                JpegCalcBits(diff, bits);
+                bitBuf.Push(HTDC[bits[1]]);
+                bitBuf.Push(bits);
+            }
+            end0pos = 63;
+            for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos);
+            if (end0pos == 0) 
+            {
+                bitBuf.Push(HTAC[0x00]);
+                return DU[0];
+            }
+            for (i = 1; i <= end0pos; ++i)
+            {
+                int startpos = i;
+                int nrzeroes;
+                uint16_t bits[2];
+                for (; DU[i] == 0 && i <= end0pos; ++i);
+                nrzeroes = i - startpos;
+                if (nrzeroes >= 16) 
+                {
+                    int lng = nrzeroes >> 4;
+                    int nrmarker;
+                    for (nrmarker = 1; nrmarker <= lng; ++nrmarker)
+                        bitBuf.Push(HTAC[0xF0]);
+                    nrzeroes &= 15;
+                }
+                JpegCalcBits(DU[i], bits);
+                bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]);
+                bitBuf.Push(bits);
+            }
+            if (end0pos != 63) 
+                bitBuf.Push(HTAC[0x00]);
+            return DU[0];
+        }
+
+        void JpegWriteBlockSubs(OutputMemoryStream & stream, int width, int height, const uint8_t * red,
+            const uint8_t* green, const uint8_t* blue, int stride, const float * fY, const float* fUv, int dc[3])
+        {
+            int & DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            float Y[256], U[256], V[256];
+            float subU[64], subV[64];
+            bool gray = red == green && red == blue;
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 16)
+            {
+                for (int x = 0; x < width; x += 16)
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 16);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 16);
+                    DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        for (int yy = 0, pos = 0; yy < 8; ++yy)
+                        {
+                            for (int xx = 0; xx < 8; ++xx, ++pos)
+                            {
+                                int j = yy * 32 + xx * 2;
+                                subU[pos] = (U[j + 0] + U[j + 1] + U[j + 16] + U[j + 17]) * 0.25f;
+                                subV[pos] = (V[j + 0] + V[j + 1] + V[j + 16] + V[j + 17]) * 0.25f;
+                            }
+                        }
+                        DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+            }
+            Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+            bitBuf.Clear();
+        }
+
+        void JpegWriteBlockFull(OutputMemoryStream& stream, int width, int height, const uint8_t* red,
+            const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3])
+        {
+            int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            float Y[64], U[64], V[64];
+            bool gray = red == green && red == blue;
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 8)
+            {
+                for (int x = 0; x < width; x += 8)
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 8);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 8);
+                    DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+            }
+            Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+            bitBuf.Clear();
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageJpegSaver::ImageJpegSaver(const ImageSaverParam& param)
+            : ImageSaver(param)
+            , _deintBgra(NULL)
+            , _deintBgr(NULL)
+        {
+        }
+
+        void ImageJpegSaver::Init()
+        {
+            InitParams(false);
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24:
+            case SimdPixelFormatRgb24:
+                _deintBgr = Base::DeinterleaveBgr;
+                break;
+            case SimdPixelFormatBgra32:
+            case SimdPixelFormatRgba32:
+                _deintBgra = Base::DeinterleaveBgra;
+                break;
+            default: 
+                break;
+            }
+            _writeBlock = _subSample ? JpegWriteBlockSubs : JpegWriteBlockFull;
+        }
+
+        void ImageJpegSaver::InitParams(bool trans)
+        {
+            static const int YQT[] = { 16, 11, 10, 16, 24, 40, 51, 61, 12, 12, 14, 19, 26, 58, 60, 55, 14, 13, 
+                16, 24, 40, 57, 69, 56, 14, 17, 22, 29, 51, 87, 80, 62, 18, 22, 37, 56, 68, 109, 103, 77, 24, 
+                35, 55, 64, 81, 104, 113, 92, 49, 64, 78, 87, 103, 121, 120, 101, 72, 92, 95, 98, 112, 100, 103, 99 };
+            static const int UVQT[] = { 17, 18, 24, 47, 99, 99, 99, 99, 18, 21, 26, 66, 99, 99, 99, 99, 24, 
+                26, 56, 99, 99, 99, 99, 99, 47, 66, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 
+                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99 };
+            static const float AASF[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 
+                1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f, 1.0f * 2.828427125f, 
+                0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
+            _quality = _param.quality;
+            _quality = _quality ? _quality : 90;
+            _subSample = _quality <= 90 ? 1 : 0;
+            _quality = _quality < 1 ? 1 : _quality > 100 ? 100 : _quality;
+            _quality = _quality < 50 ? 5000 / _quality : 200 - _quality * 2;
+            for (size_t i = 0; i < 64; ++i)
+            {
+                int uvti, yti = (YQT[i] * _quality + 50) / 100;
+                _uY[Base::JpegZigZagD[i]] = uint8_t(yti < 1 ? 1 : yti > 255 ? 255 : yti);
+                uvti = (UVQT[i] * _quality + 50) / 100;
+                _uUv[Base::JpegZigZagD[i]] = uint8_t(uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
+            }
+            const uint8_t *ZigZag = trans ? Base::JpegZigZagT : Base::JpegZigZagD;
+            for (size_t y = 0, i = 0; y < 8; ++y)
+            {
+                for (size_t x = 0; x < 8; ++x, ++i)
+                {
+                    _fY[i] = 1.0f / (_uY[ZigZag[i]] * AASF[y] * AASF[x]);
+                    _fUv[i] = 1.0f / (_uUv[ZigZag[i]] * AASF[y] * AASF[x]);
+                }
+            }
+            _block = _subSample ? 16 : 8;
+            _width = (int)AlignHi(_param.width, _block);
+            if (_param.format != SimdPixelFormatGray8)
+                _buffer.Resize(_width * _block * 3);
+        }
+
+        void ImageJpegSaver::WriteHeader()
+        {
+            static const uint8_t DC_LUM_COD[] = { 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
+            static const uint8_t DC_LUM_VAL[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+            static const uint8_t AC_LUM_COD[] = { 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d };
+            static const uint8_t AC_LUM_VAL[] = {
+               0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08, 
+               0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0, 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28, 
+               0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 
+               0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 
+               0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 
+               0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2, 
+               0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa
+            };
+            static const uint8_t DC_CHR_COD[] = { 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
+            static const uint8_t DC_CHR_VAL[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+            static const uint8_t AC_CHR_COD[] = { 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77 };
+            static const uint8_t AC_CHR_VAL[] = {
+               0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71, 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, 
+               0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0, 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34, 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26, 
+               0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 
+               0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 
+               0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 
+               0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 
+               0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa
+            };
+            static const uint8_t head0[] = { 0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F', 'I', 'F', 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0xFF, 0xDB, 0, 0x84, 0 };
+            static const uint8_t head2[] = { 0xFF, 0xDA, 0, 0xC, 3, 1, 0, 2, 0x11, 3, 0x11, 0, 0x3F, 0 };
+            const uint8_t head1[] = { 0xFF, 0xC0, 0, 0x11, 8,  uint8_t(_param.height >> 8),  uint8_t(_param.height),  uint8_t(_param.width >> 8),  
+                uint8_t(_param.width), 3, 1, uint8_t(_subSample ? 0x22 : 0x11), 0, 2, 0x11, 1, 3, 0x11, 1, 0xFF, 0xC4, 0x01, 0xA2, 0 };
+            _stream.Write(head0, sizeof(head0));
+            _stream.Write(_uY, 64);
+            _stream.Write8u(1);
+            _stream.Write(_uUv, 64);
+            _stream.Write(head1, sizeof(head1));
+            _stream.Write(DC_LUM_COD + 1, sizeof(DC_LUM_COD) - 1);
+            _stream.Write(DC_LUM_VAL, sizeof(DC_LUM_VAL));
+            _stream.Write8u(0x10); // HTYACinfo
+            _stream.Write(AC_LUM_COD + 1, sizeof(AC_LUM_COD) - 1);
+            _stream.Write(AC_LUM_VAL, sizeof(AC_LUM_VAL));
+            _stream.Write8u(1); // HTUDCinfo
+            _stream.Write(DC_CHR_COD + 1, sizeof(DC_CHR_COD) - 1);
+            _stream.Write(DC_CHR_VAL, sizeof(DC_CHR_VAL));
+            _stream.Write8u(0x11); // HTUACinfo
+            _stream.Write(AC_CHR_COD + 1, sizeof(AC_CHR_COD) - 1);
+            _stream.Write(AC_CHR_VAL, sizeof(AC_CHR_VAL));
+            _stream.Write(head2, sizeof(head2));
+        }
+
+        bool ImageJpegSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            Init();
+            WriteHeader();
+            uint8_t* r = _buffer.data, * g = r + _width * _block,* b = g + _width * _block;
+            int dc[3] = { 0, 0, 0 };
+            for (int row = 0; row < (int)_param.height; row += _block)
+            {
+                int block = Simd::Min(row + _block, (int)_param.height) - row;
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24:
+                    _deintBgr(src, stride, _param.width, block, b, _width, g, _width, r, _width);
+                    break;
+                case SimdPixelFormatBgra32:
+                    _deintBgra(src, stride, _param.width, block, b, _width, g, _width, r, _width, NULL, 0);
+                    break;
+                case SimdPixelFormatRgb24:
+                    _deintBgr(src, stride, _param.width, block, r, _width, g, _width, b, _width);
+                    break;
+                case SimdPixelFormatRgba32:
+                    _deintBgra(src, stride, _param.width, block, r, _width, g, _width, b, _width, NULL, 0);
+                    break;
+                default: 
+                    break;
+                }
+                if(_param.format == SimdPixelFormatGray8)
+                    _writeBlock(_stream, (int)_param.width, block, src, src, src, (int)stride, _fY, _fUv, dc);
+                else
+                    _writeBlock(_stream, (int)_param.width, block, r, g, b, _width, _fY, _fUv, dc);
+                src += block * stride;
+            }
+            static const uint16_t FILL_BITS[] = { 0x7F, 7 };
+            Base::WriteBits(_stream, FILL_BITS);
+            _stream.Write8u(0xFF);
+            _stream.Write8u(0xD9);
+            return true;
+        }
+    }
+}
diff --git a/3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp b/3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp
new file mode 100644
index 0000000000..dcb8f2efbb
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp
@@ -0,0 +1,379 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSavePng.h"
+#include "Simd/SimdBase.h"
+#include "Simd/SimdCpu.h"
+
+namespace Simd
+{
+    namespace Base
+    {
+        const uint16_t ZlibLenC[30] = { 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 259 };
+        const uint8_t  ZlibLenEb[29] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,  4,  5,  5,  5,  5,  0 };
+        const uint16_t ZlibDistC[31] = { 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 32768 };
+        const uint8_t  ZlibDistEb[30] = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13 };
+
+#if defined(SIMD_PNG_ZLIB_BIT_REV_TABLE)
+        int ZlibBitRevTable[512];
+        static bool ZlibBitRevTableInit()
+        {
+            for (int i = 0; i < 512; i++)
+            {
+                int rev = 0, val = i;
+                for (size_t b = 0; b < 9; b++)
+                {
+                    rev = (rev << 1) | (val & 1);
+                    val >>= 1;
+                }
+                ZlibBitRevTable[i] = rev;
+            }
+            return true;
+        }
+        bool ZlibBitRevTableInited = ZlibBitRevTableInit();
+
+#endif
+
+        uint32_t ZlibAdler32(uint8_t* data, int size)
+        {
+            uint32_t lo = 1, hi = 0;
+            for (int b = 0, n = (int)(size % 5552); b < size;)
+            {
+                for (int i = 0; i < n; ++i)
+                {
+                    lo += data[b + i];
+                    hi += lo;
+                }
+                lo %= 65521;
+                hi %= 65521;
+                b += n;
+                n = 5552;
+            }
+            return (hi << 16) | lo;
+        }
+
+        void ZlibCompress(uint8_t* data, int size, int quality, OutputMemoryStream& stream)
+        {
+            const int ZHASH = 16384;
+            if (quality < 5)
+                quality = 5;
+            const int basket = quality * 2;
+            Array32i hashTable(ZHASH * basket);
+            memset(hashTable.data, -1, hashTable.RawSize());
+
+            stream.Write(uint8_t(0x78));
+            stream.Write(uint8_t(0x5e));
+            stream.WriteBits(1, 1);
+            stream.WriteBits(1, 2);
+
+            int i = 0, j;
+            while (i < size - 3)
+            {
+                int h = ZlibHash(data + i) & (ZHASH - 1), best = 3;
+                uint8_t* bestLoc = 0;
+                int* hList = hashTable.data + h * basket;
+                for (j = 0; hList[j] != -1 && j < basket; ++j)
+                {
+                    if (hList[j] > i - 32768)
+                    {
+                        int d = ZlibCount(data + hList[j], data + i, size - i);
+                        if (d >= best)
+                        {
+                            best = d;
+                            bestLoc = data + hList[j];
+                        }
+                    }
+                }
+                if (j == basket)
+                {
+                    memcpy(hList, hList + quality, quality * sizeof(int));
+                    memset(hList + quality, -1, quality * sizeof(int));
+                    j = quality;
+                }
+                hList[j] = i;
+
+                if (bestLoc)
+                {
+                    h = ZlibHash(data + i + 1) & (ZHASH - 1);
+                    int* hList = hashTable.data + h * basket;
+                    for (j = 0; hList[j] != -1 && j < basket; ++j)
+                    {
+                        if (hList[j] > i - 32767)
+                        {
+                            int e = ZlibCount(data + hList[j], data + i + 1, size - i - 1);
+                            if (e > best)
+                            {
+                                bestLoc = NULL;
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                if (bestLoc)
+                {
+                    int d = (int)(data + i - bestLoc);
+                    assert(d <= 32767 && best <= 258);
+                    for (j = 0; best > Base::ZlibLenC[j + 1] - 1; ++j);
+                    Base::ZlibHuff(j + 257, stream);
+                    if (Base::ZlibLenEb[j])
+                        stream.WriteBits(best - Base::ZlibLenC[j], Base::ZlibLenEb[j]);
+                    for (j = 0; d > Base::ZlibDistC[j + 1] - 1; ++j);
+                    stream.WriteBits(Base::ZlibBitRev(j, 5), 5);
+                    if (Base::ZlibDistEb[j])
+                        stream.WriteBits(d - Base::ZlibDistC[j], Base::ZlibDistEb[j]);
+                    i += best;
+                }
+                else
+                {
+                    ZlibHuffB(data[i], stream);
+                    ++i;
+                }
+            }
+            for (; i < size; ++i)
+                ZlibHuffB(data[i], stream);
+            ZlibHuff(256, stream);
+            stream.FlushBits();
+            stream.WriteBe32u(ZlibAdler32(data, size));
+        }
+
+        uint32_t EncodeLine0(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            uint32_t sum = 0;
+            for (size_t i = 0; i < size; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine1(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            uint32_t sum = 0;
+            for (size_t i = 0; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            for (size_t i = n; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine2(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            uint32_t sum = 0;
+            for (size_t i = 0; i < n; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            for (size_t i = n; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine3(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            uint32_t sum = 0;
+            for (size_t i = 0; i < n; ++i)
+            {
+                dst[i] = src[i] - (src[i - stride] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            for (size_t i = n; i < size; ++i)
+            {
+                dst[i] = src[i] - ((src[i - n] + src[i - stride]) >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine4(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            uint32_t sum = 0;
+            for (size_t i = 0; i < n; ++i)
+            {
+                dst[i] = (int8_t)(src[i] - src[i - stride]);
+                sum += ::abs(dst[i]);
+            }
+            for (size_t i = n; i < size; ++i)
+            {
+                dst[i] = src[i] - Paeth(src[i - n], src[i - stride], src[i - stride - n]);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine5(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            uint32_t sum = 0;
+            for (size_t i = 0; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            for (size_t i = n; i < size; ++i)
+            {
+                dst[i] = src[i] - (src[i - n] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine6(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            uint32_t sum = 0;
+            for (size_t i = 0; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            for (size_t i = n; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        ImagePngSaver::ImagePngSaver(const ImageSaverParam& param)
+            : ImageSaver(param)
+            , _channels(0)
+            , _size(0)
+            , _convert(NULL)
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatGray8:
+                _channels = 1;
+                break;
+            case SimdPixelFormatBgr24:
+                _channels = 3;
+                break;
+            case SimdPixelFormatBgra32:
+                _channels = 4;
+                break;
+            case SimdPixelFormatRgb24:
+                _channels = 3;
+                break;
+            case SimdPixelFormatRgba32:
+                _channels = 4;
+                break;
+            default: 
+                break;
+            }
+            _size = _param.width * _channels;
+            if (_param.format == SimdPixelFormatBgr24)
+            {
+                _convert = Base::BgrToRgb;
+                _buff.Resize(_param.height * _size);
+            }
+            else if (_param.format == SimdPixelFormatBgra32)
+            {
+                _convert = Base::BgraToRgba;
+                _buff.Resize(_param.height * _size);
+            }
+            _filt.Resize((_size + 1) * _param.height);
+            _line.Resize(_size * FILTERS);
+            _encode[0] = Base::EncodeLine0;
+            _encode[1] = Base::EncodeLine1;
+            _encode[2] = Base::EncodeLine2;
+            _encode[3] = Base::EncodeLine3;
+            _encode[4] = Base::EncodeLine4;
+            _encode[5] = Base::EncodeLine5;
+            _encode[6] = Base::EncodeLine6;
+            _compress = Base::ZlibCompress;
+        }
+
+        bool ImagePngSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            if (_convert)
+            {
+                _convert(src, _param.width, _param.height, stride, _buff.data, _size);
+                src = _buff.data;
+                stride = _size;
+            }
+            for (size_t row = 0; row < _param.height; ++row)
+            {
+                int bestFilter = 0, bestSum = INT_MAX;
+                for (int filter = 0; filter < FILTERS; filter++)
+                {
+                    static const int TYPES[] = { 0, 1, 0, 5, 6, 0, 1, 2, 3, 4 };
+                    int type = TYPES[filter + (row ? 1 : 0) * FILTERS];
+                    int sum = _encode[type](src + stride * row, stride, _channels, _size, _line.data + _size * filter);
+                    if (sum < bestSum)
+                    {
+                        bestSum = sum;
+                        bestFilter = filter;
+                    }
+                }
+                _filt[row * (_size + 1)] = (uint8_t)bestFilter;
+                memcpy(_filt.data + row * (_size + 1) + 1, _line.data + _size * bestFilter, _size);
+            }
+            OutputMemoryStream zlib(Min(_param.width * _param.height, Base::AlgCacheL1()));
+            _compress(_filt.data, (int)_filt.size, COMPRESSION, zlib);
+            WriteToStream(zlib.Data(), zlib.Size());
+            return true;
+        }
+
+        SIMD_INLINE void WriteCrc32(OutputMemoryStream& stream, size_t size)
+        {
+            stream.WriteBe32u(Base::Crc32(stream.Current() - size - 4, size + 4));
+        }
+
+        void ImagePngSaver::WriteToStream(const uint8_t* zlib, size_t zlen)
+        {
+            const uint8_t SIGNATURE[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+            const int8_t CTYPE[5] = { -1, 0, 4, 2, 6 };
+            _stream.Reserve(8 + 12 + 13 + 12 + zlen + 12);
+            _stream.Write(SIGNATURE, 8);
+            _stream.WriteBe32u(13);
+            _stream.Write("IHDR", 4);
+            _stream.WriteBe32u((uint32_t)_param.width);
+            _stream.WriteBe32u((uint32_t)_param.height);
+            _stream.Write8u(8);
+            _stream.Write8u(CTYPE[_channels]);
+            _stream.Write8u(0);
+            _stream.Write8u(0);
+            _stream.Write8u(0);
+            WriteCrc32(_stream, 13);
+            _stream.WriteBe32u((uint32_t)zlen);
+            _stream.Write("IDAT", 4);
+            _stream.Write(zlib, zlen);
+            WriteCrc32(_stream, zlen);
+            _stream.WriteBe32u(0);
+            _stream.Write("IEND", 4);
+            WriteCrc32(_stream, 0);
+        }
+    }
+}
diff --git a/3rdparty/simdlib/Simd/SimdImageLoad.h b/3rdparty/simdlib/Simd/SimdImageLoad.h
new file mode 100644
index 0000000000..43e44961e6
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdImageLoad.h
@@ -0,0 +1,396 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdImageLoad_h__
+#define __SimdImageLoad_h__
+
+#include "Simd/SimdMemoryStream.h"
+#include "Simd/SimdArray.h"
+#include "Simd/SimdAlignment.h"
+
+#include "Simd/SimdView.hpp"
+
+#include <vector>
+
+namespace Simd
+{
+    typedef uint8_t* (*ImageLoadFromMemoryPtr)(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format);
+
+    uint8_t* ImageLoadFromFile(const ImageLoadFromMemoryPtr loader, const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format);
+
+    //-------------------------------------------------------------------------
+
+    struct ImageLoaderParam
+    {
+        const uint8_t* data;
+        size_t size;
+        SimdImageFileType file;
+        SimdPixelFormatType format;
+
+        ImageLoaderParam(const uint8_t* d, size_t s, SimdPixelFormatType f);
+
+        bool Validate();
+    };
+
+    class ImageLoader
+    {
+    protected:
+        typedef Simd::View<Simd::Allocator> Image;
+
+        ImageLoaderParam _param;
+        InputMemoryStream _stream;
+        Image _image;
+        
+    public:
+        ImageLoader(const ImageLoaderParam& param)
+            : _param(param)
+            , _stream(_param.data, _param.size)
+        {
+        }
+
+        virtual ~ImageLoader()
+        {
+        }
+
+        virtual bool FromStream() = 0;
+
+        SIMD_INLINE uint8_t* Release(size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+        {
+            *stride = _image.stride;
+            *width = _image.width;
+            *height = _image.height;
+            *format = (SimdPixelFormatType)_image.format;
+            return _image.Release();
+        }
+    };
+
+    namespace Base
+    {
+        class ImagePxmLoader : public ImageLoader
+        {
+        public:
+            ImagePxmLoader(const ImageLoaderParam& param);
+
+        protected:
+            typedef void (*ToAnyPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride);
+            typedef void (*ToBgraPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+            ToAnyPtr _toAny;
+            ToBgraPtr _toBgra;
+            Array8u _buffer;
+            size_t _block, _size;
+
+            bool ReadHeader(size_t version);
+            virtual void SetConverters() = 0;
+        };
+
+        class ImagePgmTxtLoader : public ImagePxmLoader
+        {
+        public:
+            ImagePgmTxtLoader(const ImageLoaderParam& param);
+
+            virtual bool FromStream();
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePgmBinLoader : public ImagePxmLoader
+        {
+        public:
+            ImagePgmBinLoader(const ImageLoaderParam& param);
+
+            virtual bool FromStream();
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmTxtLoader : public ImagePxmLoader
+        {
+        public:
+            ImagePpmTxtLoader(const ImageLoaderParam& param);
+
+            virtual bool FromStream();
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmBinLoader : public ImagePxmLoader
+        {
+        public:
+            ImagePpmBinLoader(const ImageLoaderParam& param);
+
+            virtual bool FromStream();
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePngLoader : public ImageLoader
+        {
+        public:
+            ImagePngLoader(const ImageLoaderParam& param);
+
+            virtual bool FromStream();
+
+        protected:
+            typedef void (*ToAny8Ptr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride);
+            typedef void (*ToBgra8Ptr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+            typedef void (*ToAny16Ptr)(const uint16_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride);
+            typedef void (*ToBgra16Ptr)(const uint16_t* src, size_t width, size_t height, size_t srcStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+            ToAny8Ptr _toAny8;
+            ToBgra8Ptr _toBgra8, _bgrToBgra;
+            ToAny16Ptr _toAny16;
+            ToBgra16Ptr _toBgra16;
+
+            virtual void SetConverters();
+        private:
+            bool _first, _hasTrans, _iPhone;
+            uint32_t _width, _height, _channels;
+            uint16_t _tc16[3];
+            uint8_t _depth, _color, _interlace, _paletteChannels, _tc[3];
+            Array8u _palette, _idat;
+
+            struct Chunk
+            {
+                uint32_t size;
+                uint32_t type;
+                uint32_t offs;
+            };
+            typedef std::vector<Chunk> Chunks;
+            Chunks _idats;
+
+            bool ParseFile();
+            bool CheckHeader();
+            bool ReadChunk(Chunk& chunk);
+            bool ReadHeader(const Chunk & chunk);
+            bool ReadPalette(const Chunk& chunk);
+            bool ReadTransparency(const Chunk& chunk);
+            bool ReadData(const Chunk& chunk);
+            InputMemoryStream MergedDataStream();
+        };
+
+        class ImageJpegLoader : public ImageLoader
+        {
+        public:
+            ImageJpegLoader(const ImageLoaderParam& param);
+
+            virtual bool FromStream();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format);
+    }
+
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        class ImagePgmTxtLoader : public Base::ImagePgmTxtLoader
+        {
+        public:
+            ImagePgmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePgmBinLoader : public Base::ImagePgmBinLoader
+        {
+        public:
+            ImagePgmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmTxtLoader : public Base::ImagePpmTxtLoader
+        {
+        public:
+            ImagePpmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmBinLoader : public Base::ImagePpmBinLoader
+        {
+        public:
+            ImagePpmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePngLoader : public Base::ImagePngLoader
+        {
+        public:
+            ImagePngLoader(const ImageLoaderParam& param);
+
+            virtual bool FromStream();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format);
+    }
+#endif// SIMD_SSE41_ENABLE
+
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        class ImagePgmTxtLoader : public Sse41::ImagePgmTxtLoader
+        {
+        public:
+            ImagePgmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePgmBinLoader : public Sse41::ImagePgmBinLoader
+        {
+        public:
+            ImagePgmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmTxtLoader : public Sse41::ImagePpmTxtLoader
+        {
+        public:
+            ImagePpmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmBinLoader : public Sse41::ImagePpmBinLoader
+        {
+        public:
+            ImagePpmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format);
+    }
+#endif// SIMD_AVX2_ENABLE
+
+#ifdef SIMD_AVX512BW_ENABLE    
+    namespace Avx512bw
+    {
+        class ImagePgmTxtLoader : public Avx2::ImagePgmTxtLoader
+        {
+        public:
+            ImagePgmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePgmBinLoader : public Avx2::ImagePgmBinLoader
+        {
+        public:
+            ImagePgmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmTxtLoader : public Avx2::ImagePpmTxtLoader
+        {
+        public:
+            ImagePpmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmBinLoader : public Avx2::ImagePpmBinLoader
+        {
+        public:
+            ImagePpmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format);
+    }
+#endif// SIMD_AVX512BW_ENABLE
+
+#ifdef SIMD_NEON_ENABLE    
+    namespace Neon
+    {
+        class ImagePgmTxtLoader : public Base::ImagePgmTxtLoader
+        {
+        public:
+            ImagePgmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePgmBinLoader : public Base::ImagePgmBinLoader
+        {
+        public:
+            ImagePgmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmTxtLoader : public Base::ImagePpmTxtLoader
+        {
+        public:
+            ImagePpmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmBinLoader : public Base::ImagePpmBinLoader
+        {
+        public:
+            ImagePpmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format);
+    }
+#endif// SIMD_NEON_ENABLE
+}
+
+#endif//__SimdImageLoad_h__
diff --git a/3rdparty/simdlib/Simd/SimdImageSave.h b/3rdparty/simdlib/Simd/SimdImageSave.h
new file mode 100644
index 0000000000..4e1945c077
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdImageSave.h
@@ -0,0 +1,386 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdImageSave_h__
+#define __SimdImageSave_h__
+
+#include "Simd/SimdMemoryStream.h"
+#include "Simd/SimdArray.h"
+#include "Simd/SimdPerformance.h"
+
+namespace Simd
+{
+    typedef uint8_t* (*ImageSaveToMemoryPtr)(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size);
+
+    SimdBool ImageSaveToFile(const ImageSaveToMemoryPtr saver, const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char* path);
+
+    //---------------------------------------------------------------------
+
+    struct ImageSaverParam
+    {
+        size_t width, height;
+        SimdPixelFormatType format;
+        SimdImageFileType file;
+        int quality;
+
+        SIMD_INLINE ImageSaverParam(size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality)
+        {
+            this->width = width;
+            this->height = height;
+            this->format = format;
+            this->file = file;
+            this->quality = quality;
+        }
+
+        bool Validate()
+        {
+            if (file == SimdImageFileUndefined)
+            {
+                if (format == SimdPixelFormatGray8)
+                    file = SimdImageFilePgmBin;
+                else
+                    file = SimdImageFilePpmBin;
+            }            
+            if (format < SimdPixelFormatGray8 || format > SimdPixelFormatRgba32)
+                return false;
+            if (width == 0 || height == 0)
+                return false;
+            if (file <= SimdImageFileUndefined || file > SimdImageFileJpeg)
+                return false;
+            return true;
+        }
+    };
+
+    class ImageSaver
+    {
+    protected:
+        ImageSaverParam _param;
+        OutputMemoryStream _stream;
+    public:
+        ImageSaver(const ImageSaverParam& param)
+            : _param(param)
+        {
+        }
+
+        virtual ~ImageSaver()
+        {
+        }
+
+        virtual bool ToStream(const uint8_t* src, size_t stride) = 0;
+
+        SIMD_INLINE uint8_t* Release(size_t* size)
+        {
+            return _stream.Release(size);
+        }
+    };
+       
+    namespace Base
+    {
+        class ImagePxmSaver : public ImageSaver
+        {
+        public:
+            ImagePxmSaver(const ImageSaverParam& param);
+
+        protected:
+            typedef void (*ConvertPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride);
+            ConvertPtr _convert;
+            Array8u _buffer;
+            size_t _block, _size;
+
+            void WriteHeader(size_t version);
+        };
+
+        class ImagePgmTxtSaver : public ImagePxmSaver
+        {
+        public:
+            ImagePgmTxtSaver(const ImageSaverParam& param);
+
+            virtual bool ToStream(const uint8_t* src, size_t stride);
+        };
+
+        class ImagePgmBinSaver : public ImagePxmSaver
+        {
+        public:
+            ImagePgmBinSaver(const ImageSaverParam& param);
+
+            virtual bool ToStream(const uint8_t* src, size_t stride);
+        };
+
+        class ImagePpmTxtSaver : public ImagePxmSaver
+        {
+        public:
+            ImagePpmTxtSaver(const ImageSaverParam& param);
+
+            virtual bool ToStream(const uint8_t* src, size_t stride);
+        };
+
+        class ImagePpmBinSaver : public ImagePxmSaver
+        {
+        public:
+            ImagePpmBinSaver(const ImageSaverParam& param);
+
+            virtual bool ToStream(const uint8_t* src, size_t stride);
+        };
+
+        class ImagePngSaver : public ImageSaver
+        {
+        public:
+            ImagePngSaver(const ImageSaverParam& param);
+
+            virtual bool ToStream(const uint8_t* src, size_t stride);
+        protected:
+            static const int COMPRESSION = 8;
+            static const int FILTERS = 5;
+            static const int TYPES = 7;
+            typedef void (*ConvertPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride);
+            typedef uint32_t (*EncodePtr)(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst);
+            typedef void (*CompressPtr)(uint8_t* data, int size, int quality, OutputMemoryStream& stream);
+            ConvertPtr _convert;
+            EncodePtr _encode[TYPES];
+            CompressPtr _compress;
+            size_t _channels, _size;
+            Array8u _filt, _buff;
+            Array8i _line;
+
+            void WriteToStream(const uint8_t* zlib, size_t zlen);
+        };
+
+        class ImageJpegSaver : public ImageSaver
+        {
+        public:
+            ImageJpegSaver(const ImageSaverParam& param);
+
+            virtual bool ToStream(const uint8_t* src, size_t stride);
+        protected:
+            typedef void (*DeintBgrPtr)(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height,
+                uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride);
+            typedef void (*DeintBgraPtr)(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height,
+                uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride);
+            typedef void (*WriteBlockPtr)(OutputMemoryStream& stream, int width, int height, const uint8_t* red,
+                const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3]);
+
+            Array8u _buffer;
+            DeintBgrPtr _deintBgr;
+            DeintBgraPtr _deintBgra;
+            WriteBlockPtr _writeBlock;
+            bool _subSample;
+            int _quality, _block, _width;
+            float _fY[64], _fUv[64];
+            uint8_t _uY[64], _uUv[64];
+
+            virtual void Init();
+
+            void InitParams(bool trans);
+            void WriteHeader();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size);
+    }
+
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        class ImagePgmTxtSaver : public Base::ImagePgmTxtSaver
+        {
+        public:
+            ImagePgmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePgmBinSaver : public Base::ImagePgmBinSaver
+        {
+        public:
+            ImagePgmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmTxtSaver : public Base::ImagePpmTxtSaver
+        {
+        public:
+            ImagePpmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmBinSaver : public Base::ImagePpmBinSaver
+        {
+        public:
+            ImagePpmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePngSaver : public Base::ImagePngSaver
+        {
+        public:
+            ImagePngSaver(const ImageSaverParam& param);
+        };
+
+        class ImageJpegSaver : public Base::ImageJpegSaver
+        {
+        public:
+            ImageJpegSaver(const ImageSaverParam& param);
+
+        protected:
+            virtual void Init();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size);
+    }
+#endif// SIMD_SSE41_ENABLE
+
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        class ImagePgmTxtSaver : public Sse41::ImagePgmTxtSaver
+        {
+        public:
+            ImagePgmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePgmBinSaver : public Sse41::ImagePgmBinSaver
+        {
+        public:
+            ImagePgmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmTxtSaver : public Sse41::ImagePpmTxtSaver
+        {
+        public:
+            ImagePpmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmBinSaver : public Sse41::ImagePpmBinSaver
+        {
+        public:
+            ImagePpmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePngSaver : public Sse41::ImagePngSaver
+        {
+        public:
+            ImagePngSaver(const ImageSaverParam& param);
+        };
+
+        class ImageJpegSaver : public Sse41::ImageJpegSaver
+        {
+        public:
+            ImageJpegSaver(const ImageSaverParam& param);
+
+        protected:
+            virtual void Init();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size);
+    }
+#endif// SIMD_AVX2_ENABLE
+
+#ifdef SIMD_AVX512BW_ENABLE    
+    namespace Avx512bw
+    {
+        class ImagePgmTxtSaver : public Avx2::ImagePgmTxtSaver
+        {
+        public:
+            ImagePgmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePgmBinSaver : public Avx2::ImagePgmBinSaver
+        {
+        public:
+            ImagePgmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmTxtSaver : public Avx2::ImagePpmTxtSaver
+        {
+        public:
+            ImagePpmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmBinSaver : public Avx2::ImagePpmBinSaver
+        {
+        public:
+            ImagePpmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePngSaver : public Avx2::ImagePngSaver
+        {
+        public:
+            ImagePngSaver(const ImageSaverParam& param);
+        };
+
+        class ImageJpegSaver : public Avx2::ImageJpegSaver
+        {
+        public:
+            ImageJpegSaver(const ImageSaverParam& param);
+
+        protected:
+            virtual void Init();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size);
+    }
+#endif// SIMD_AVX512BW_ENABLE
+
+#ifdef SIMD_NEON_ENABLE    
+    namespace Neon
+    {
+        class ImagePgmTxtSaver : public Base::ImagePgmTxtSaver
+        {
+        public:
+            ImagePgmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePgmBinSaver : public Base::ImagePgmBinSaver
+        {
+        public:
+            ImagePgmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmTxtSaver : public Base::ImagePpmTxtSaver
+        {
+        public:
+            ImagePpmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmBinSaver : public Base::ImagePpmBinSaver
+        {
+        public:
+            ImagePpmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePngSaver : public Base::ImagePngSaver
+        {
+        public:
+            ImagePngSaver(const ImageSaverParam& param);
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size);
+    }
+#endif// SIMD_NEON_ENABLE
+}
+
+#endif//__SimdImageSave_h__
diff --git a/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h
new file mode 100644
index 0000000000..d54164f7d4
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h
@@ -0,0 +1,649 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdImageSaveJpeg_h__
+#define __SimdImageSaveJpeg_h__
+
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdMath.h"
+
+#define SIMD_JPEG_CALC_BITS_TABLE
+
+namespace Simd
+{
+    namespace Base
+    {
+        struct BitBuf
+        {
+            static const uint32_t capacity = 1024;
+            uint32_t size;
+            uint16_t data[1024][2];
+
+            SIMD_INLINE BitBuf()
+                : size(0) 
+            {
+            }
+
+            SIMD_INLINE void Push(const uint16_t* bits)
+            {
+                ((uint32_t*)data)[size++] = ((uint32_t*)bits)[0];
+            }
+
+            SIMD_INLINE bool Full(uint32_t tail = capacity / 2) const
+            {
+                return size + tail >= capacity;
+            }
+
+            SIMD_INLINE uint32_t Capacity() const 
+            {
+                return capacity;
+            }
+
+            SIMD_INLINE void Clear()
+            {
+                size = 0;
+            }
+        }; 
+
+        extern const uint8_t JpegZigZagD[64];
+        extern const uint8_t JpegZigZagT[64];
+
+        extern const uint16_t HuffmanYdc[256][2];
+        extern const uint16_t HuffmanUVdc[256][2];
+        extern const uint16_t HuffmanYac[256][2];
+        extern const uint16_t HuffmanUVac[256][2];
+
+#if defined(SIMD_JPEG_CALC_BITS_TABLE)
+        const int JpegCalcBitsRange = 2048;
+        extern uint16_t JpegCalcBitsTable[JpegCalcBitsRange * 2][2];
+        SIMD_INLINE void JpegCalcBits(int val, uint16_t bits[2])
+        {
+            assert(val >= -JpegCalcBitsRange && val < JpegCalcBitsRange);
+            ((uint32_t*)bits)[0] = ((uint32_t*)JpegCalcBitsTable)[val + JpegCalcBitsRange];
+        }
+#else
+        SIMD_INLINE void JpegCalcBits(int val, uint16_t bits[2])
+        {
+            int tmp = val < 0 ? -val : val;
+            val = val < 0 ? val - 1 : val;
+            bits[1] = 1;
+            while (tmp >>= 1)
+                ++bits[1];
+            bits[0] = val & ((1 << bits[1]) - 1);
+        }
+#endif
+
+        SIMD_INLINE void RgbToYuv(const uint8_t* r, const uint8_t* g, const uint8_t* b, int stride, int height, int width, float* y, float* u, float* v, int size)
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 1)
+                {
+                    int offs = (col < width ? col : width - 1);
+                    float _r = r[offs], _g = g[offs], _b = b[offs];
+                    y[col] = +0.29900f * _r + 0.58700f * _g + 0.11400f * _b - 128.000f;
+                    u[col] = -0.16874f * _r - 0.33126f * _g + 0.50000f * _b;
+                    v[col] = +0.50000f * _r - 0.41869f * _g - 0.08131f * _b;
+                }
+                if (++row < height)
+                    r += stride, g += stride, b += stride;
+                y += size, u += size, v += size;
+            }
+        }
+
+        SIMD_INLINE void GrayToY(const uint8_t* g, int stride, int height, int width, float* y, int size)
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 1)
+                {
+                    int offs = (col < width ? col : width - 1);
+                    y[col] = g[offs] - 128.000f;
+                }
+                if (++row < height)
+                    g += stride;
+                y += size;
+            }
+        }
+
+        SIMD_INLINE void JpegProcessDuGrayUv(BitBuf & bitBuf)
+        {
+            bitBuf.Push(Base::HuffmanUVdc[0]);
+            bitBuf.Push(Base::HuffmanUVac[0]);
+            bitBuf.Push(Base::HuffmanUVdc[0]);
+            bitBuf.Push(Base::HuffmanUVac[0]);
+        }
+
+        SIMD_INLINE void WriteBits(OutputMemoryStream & stream, const uint16_t bits[2])
+        {
+            stream.BitCount() += bits[1];
+#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+            stream.BitBuffer() |= uint64_t(bits[0]) << (64 - stream.BitCount());
+            while (stream.BitCount() >= 8)
+            {
+                uint8_t byte = stream.BitBuffer() >> 56;
+                stream.Write8u(byte);
+                if (byte == 255)
+                    stream.Write8u(0);
+                stream.BitBuffer() <<= 8;
+                stream.BitCount() -= 8;
+            }
+#else
+            stream.BitBuffer() |= uint32_t(bits[0]) << (32 - stream.BitCount());
+            while (stream.BitCount() >= 8)
+            {
+                uint8_t byte = stream.BitBuffer() >> 24;
+                stream.Write8u(byte);
+                if (byte == 255)
+                    stream.Write8u(0);
+                stream.BitBuffer() <<= 8;
+                stream.BitCount() -= 8;
+            }
+#endif
+        }
+
+        SIMD_INLINE void WriteBits(OutputMemoryStream& stream, const uint16_t bits[][2], size_t size)
+        {
+            size_t pos = stream.Pos();
+            stream.Reserve(pos + size * 2);
+            uint8_t* data = stream.Data();
+            size_t & bitCount = stream.BitCount();
+            size_t i = 0;
+#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+            uint64_t &bitBuffer = stream.BitBuffer();
+            for (size_t size3 = AlignLoAny(size, 3); i < size3; i += 3, bits += 3)
+            {
+                bitCount += bits[0][1];
+                bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount);
+                bitCount += bits[1][1];
+                bitBuffer |= uint64_t(bits[1][0]) << (64 - bitCount);
+                bitCount += bits[2][1];
+                bitBuffer |= uint64_t(bits[2][0]) << (64 - bitCount);
+                assert(bitCount <= 64);
+                while (bitCount >= 16)
+                {
+                    uint8_t byte = uint8_t(bitBuffer >> 56);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    byte = uint8_t(bitBuffer >> 48);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    bitBuffer <<= 16;
+                    bitCount -= 16;
+                }
+            }
+            if(bitCount >= 8)
+            {
+                assert(bitCount < 16);
+                uint8_t byte = uint8_t(bitBuffer >> 56);
+                data[pos++] = byte;
+                if (byte == 255)
+                    data[pos++] = 0;
+                bitBuffer <<= 8;
+                bitCount -= 8;
+            }
+            for (; i < size; ++i, ++bits)
+            {
+                bitCount += bits[0][1];
+                bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount);
+                while (bitCount >= 8)
+                {
+                    uint8_t byte = uint8_t(bitBuffer >> 56);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    bitBuffer <<= 8;
+                    bitCount -= 8;
+                }
+            }
+#else
+            uint32_t &bitBuffer = stream.BitBuffer();
+            for (; i < size; ++i, ++bits)
+            {
+                bitCount += bits[0][1];
+                bitBuffer |= uint32_t(bits[0][0]) << (32 - bitCount);
+                while (bitCount >= 8)
+                {
+                    uint8_t byte = uint8_t(bitBuffer >> 24);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    bitBuffer <<= 8;
+                    bitCount -= 8;
+                }
+            }
+#endif
+            stream.Seek(pos);
+        }
+    }
+
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+    }
+#endif// SIMD_SSE41_ENABLE
+
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        extern const uint32_t JpegZigZagTi32[64];
+
+        SIMD_INLINE void JpegDctV(const float* src, size_t srcStride, float* dst, size_t dstStride)
+        {
+            static const __m256 _0_707106781 = _mm256_set1_ps(0.707106781f);
+            static const __m256 _0_382683433 = _mm256_set1_ps(0.382683433f);
+            static const __m256 _0_541196100 = _mm256_set1_ps(0.541196100f);
+            static const __m256 _1_306562965 = _mm256_set1_ps(1.306562965f);
+
+            __m256 d0 = _mm256_loadu_ps(src + 0 * srcStride);
+            __m256 d1 = _mm256_loadu_ps(src + 1 * srcStride);
+            __m256 d2 = _mm256_loadu_ps(src + 2 * srcStride);
+            __m256 d3 = _mm256_loadu_ps(src + 3 * srcStride);
+            __m256 d4 = _mm256_loadu_ps(src + 4 * srcStride);
+            __m256 d5 = _mm256_loadu_ps(src + 5 * srcStride);
+            __m256 d6 = _mm256_loadu_ps(src + 6 * srcStride);
+            __m256 d7 = _mm256_loadu_ps(src + 7 * srcStride);
+
+            __m256 tmp0 = _mm256_add_ps(d0, d7);
+            __m256 tmp7 = _mm256_sub_ps(d0, d7);
+            __m256 tmp1 = _mm256_add_ps(d1, d6);
+            __m256 tmp6 = _mm256_sub_ps(d1, d6);
+            __m256 tmp2 = _mm256_add_ps(d2, d5);
+            __m256 tmp5 = _mm256_sub_ps(d2, d5);
+            __m256 tmp3 = _mm256_add_ps(d3, d4);
+            __m256 tmp4 = _mm256_sub_ps(d3, d4);
+
+            __m256 tmp10 = _mm256_add_ps(tmp0, tmp3);
+            __m256 tmp13 = _mm256_sub_ps(tmp0, tmp3);
+            __m256 tmp11 = _mm256_add_ps(tmp1, tmp2);
+            __m256 tmp12 = _mm256_sub_ps(tmp1, tmp2);
+
+            d0 = _mm256_add_ps(tmp10, tmp11);
+            d4 = _mm256_sub_ps(tmp10, tmp11);
+
+            __m256 z1 = _mm256_mul_ps(_mm256_add_ps(tmp12, tmp13), _0_707106781);
+            d2 = _mm256_add_ps(tmp13, z1);
+            d6 = _mm256_sub_ps(tmp13, z1);
+
+            tmp10 = _mm256_add_ps(tmp4, tmp5);
+            tmp11 = _mm256_add_ps(tmp5, tmp6);
+            tmp12 = _mm256_add_ps(tmp6, tmp7);
+
+            __m256 z5 = _mm256_mul_ps(_mm256_sub_ps(tmp10, tmp12), _0_382683433);
+            __m256 z2 = _mm256_add_ps(_mm256_mul_ps(tmp10, _0_541196100), z5);
+            __m256 z4 = _mm256_add_ps(_mm256_mul_ps(tmp12, _1_306562965), z5);
+            __m256 z3 = _mm256_mul_ps(tmp11, _0_707106781);
+
+            __m256 z11 = _mm256_add_ps(tmp7, z3);
+            __m256 z13 = _mm256_sub_ps(tmp7, z3);
+
+            _mm256_storeu_ps(dst + 0 * dstStride, d0);
+            _mm256_storeu_ps(dst + 1 * dstStride, _mm256_add_ps(z11, z4));
+            _mm256_storeu_ps(dst + 2 * dstStride, d2);
+            _mm256_storeu_ps(dst + 3 * dstStride, _mm256_sub_ps(z13, z2));
+            _mm256_storeu_ps(dst + 4 * dstStride, d4);
+            _mm256_storeu_ps(dst + 5 * dstStride, _mm256_add_ps(z13, z2));
+            _mm256_storeu_ps(dst + 6 * dstStride, d6);
+            _mm256_storeu_ps(dst + 7 * dstStride, _mm256_sub_ps(z11, z4));
+        }
+
+        SIMD_INLINE void JpegDct(const float* src, size_t stride, const float* fdt, int* dst)
+        {
+            static const __m256 _0_707106781 = _mm256_set1_ps(0.707106781f);
+            static const __m256 _0_382683433 = _mm256_set1_ps(0.382683433f);
+            static const __m256 _0_541196100 = _mm256_set1_ps(0.541196100f);
+            static const __m256 _1_306562965 = _mm256_set1_ps(1.306562965f);
+
+            __m256 d0 = _mm256_loadu_ps(src + 0 * stride);
+            __m256 d1 = _mm256_loadu_ps(src + 1 * stride);
+            __m256 d2 = _mm256_loadu_ps(src + 2 * stride);
+            __m256 d3 = _mm256_loadu_ps(src + 3 * stride);
+            __m256 d4 = _mm256_loadu_ps(src + 4 * stride);
+            __m256 d5 = _mm256_loadu_ps(src + 5 * stride);
+            __m256 d6 = _mm256_loadu_ps(src + 6 * stride);
+            __m256 d7 = _mm256_loadu_ps(src + 7 * stride);
+
+            __m256 tmp0 = _mm256_add_ps(d0, d7);
+            __m256 tmp7 = _mm256_sub_ps(d0, d7);
+            __m256 tmp1 = _mm256_add_ps(d1, d6);
+            __m256 tmp6 = _mm256_sub_ps(d1, d6);
+            __m256 tmp2 = _mm256_add_ps(d2, d5);
+            __m256 tmp5 = _mm256_sub_ps(d2, d5);
+            __m256 tmp3 = _mm256_add_ps(d3, d4);
+            __m256 tmp4 = _mm256_sub_ps(d3, d4);
+
+            __m256 tmp10 = _mm256_add_ps(tmp0, tmp3);
+            __m256 tmp13 = _mm256_sub_ps(tmp0, tmp3);
+            __m256 tmp11 = _mm256_add_ps(tmp1, tmp2);
+            __m256 tmp12 = _mm256_sub_ps(tmp1, tmp2);
+
+            d0 = _mm256_add_ps(tmp10, tmp11);
+            d4 = _mm256_sub_ps(tmp10, tmp11);
+
+            __m256 z1 = _mm256_mul_ps(_mm256_add_ps(tmp12, tmp13), _0_707106781);
+            d2 = _mm256_add_ps(tmp13, z1);
+            d6 = _mm256_sub_ps(tmp13, z1);
+
+            tmp10 = _mm256_add_ps(tmp4, tmp5);
+            tmp11 = _mm256_add_ps(tmp5, tmp6);
+            tmp12 = _mm256_add_ps(tmp6, tmp7);
+
+            __m256 z5 = _mm256_mul_ps(_mm256_sub_ps(tmp10, tmp12), _0_382683433);
+            __m256 z2 = _mm256_add_ps(_mm256_mul_ps(tmp10, _0_541196100), z5);
+            __m256 z4 = _mm256_add_ps(_mm256_mul_ps(tmp12, _1_306562965), z5);
+            __m256 z3 = _mm256_mul_ps(tmp11, _0_707106781);
+
+            __m256 z11 = _mm256_add_ps(tmp7, z3);
+            __m256 z13 = _mm256_sub_ps(tmp7, z3);
+
+            d1 = _mm256_add_ps(z11, z4);
+            d3 = _mm256_sub_ps(z13, z2);
+            d5 = _mm256_add_ps(z13, z2);
+            d7 = _mm256_sub_ps(z11, z4);
+
+            tmp10 = _mm256_permute2f128_ps(d0, d4, 0x20);
+            tmp11 = _mm256_permute2f128_ps(d1, d5, 0x20);
+            tmp12 = _mm256_permute2f128_ps(d2, d6, 0x20);
+            tmp13 = _mm256_permute2f128_ps(d3, d7, 0x20);
+            d4 = _mm256_permute2f128_ps(d0, d4, 0x31);
+            d5 = _mm256_permute2f128_ps(d1, d5, 0x31);
+            d6 = _mm256_permute2f128_ps(d2, d6, 0x31);
+            d7 = _mm256_permute2f128_ps(d3, d7, 0x31);
+
+            tmp0 = _mm256_unpacklo_ps(tmp10, tmp12);
+            tmp1 = _mm256_unpackhi_ps(tmp10, tmp12);
+            tmp2 = _mm256_unpacklo_ps(tmp11, tmp13);
+            tmp3 = _mm256_unpackhi_ps(tmp11, tmp13);
+            d0 = _mm256_unpacklo_ps(tmp0, tmp2);
+            d1 = _mm256_unpackhi_ps(tmp0, tmp2);
+            d2 = _mm256_unpacklo_ps(tmp1, tmp3);
+            d3 = _mm256_unpackhi_ps(tmp1, tmp3);
+
+            tmp0 = _mm256_unpacklo_ps(d4, d6);
+            tmp1 = _mm256_unpackhi_ps(d4, d6);
+            tmp2 = _mm256_unpacklo_ps(d5, d7);
+            tmp3 = _mm256_unpackhi_ps(d5, d7);
+            d4 = _mm256_unpacklo_ps(tmp0, tmp2);
+            d5 = _mm256_unpackhi_ps(tmp0, tmp2);
+            d6 = _mm256_unpacklo_ps(tmp1, tmp3);
+            d7 = _mm256_unpackhi_ps(tmp1, tmp3);
+
+            tmp0 = _mm256_add_ps(d0, d7);
+            tmp1 = _mm256_add_ps(d1, d6);
+            tmp2 = _mm256_add_ps(d2, d5);
+            tmp3 = _mm256_add_ps(d3, d4);
+            tmp7 = _mm256_sub_ps(d0, d7);
+            tmp6 = _mm256_sub_ps(d1, d6);
+            tmp5 = _mm256_sub_ps(d2, d5);
+            tmp4 = _mm256_sub_ps(d3, d4);
+
+            tmp10 = _mm256_add_ps(tmp0, tmp3);
+            tmp13 = _mm256_sub_ps(tmp0, tmp3);
+            tmp11 = _mm256_add_ps(tmp1, tmp2);
+            tmp12 = _mm256_sub_ps(tmp1, tmp2);
+
+            d0 = _mm256_add_ps(tmp10, tmp11);
+            d4 = _mm256_sub_ps(tmp10, tmp11);
+
+            z1 = _mm256_mul_ps(_mm256_add_ps(tmp12, tmp13), _0_707106781);
+            d2 = _mm256_add_ps(tmp13, z1);
+            d6 = _mm256_sub_ps(tmp13, z1);
+
+            tmp10 = _mm256_add_ps(tmp4, tmp5);
+            tmp11 = _mm256_add_ps(tmp5, tmp6);
+            tmp12 = _mm256_add_ps(tmp6, tmp7);
+
+            z5 = _mm256_mul_ps(_mm256_sub_ps(tmp10, tmp12), _0_382683433);
+            z2 = _mm256_add_ps(_mm256_mul_ps(tmp10, _0_541196100), z5);
+            z4 = _mm256_add_ps(_mm256_mul_ps(tmp12, _1_306562965), z5);
+            z3 = _mm256_mul_ps(tmp11, _0_707106781);
+
+            z11 = _mm256_add_ps(tmp7, z3);
+            z13 = _mm256_sub_ps(tmp7, z3);
+
+            d1 = _mm256_add_ps(z11, z4);
+            d3 = _mm256_sub_ps(z13, z2);
+            d5 = _mm256_add_ps(z13, z2);
+            d7 = _mm256_sub_ps(z11, z4);
+
+            _mm256_storeu_si256((__m256i*)dst + 0, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 0), d0)));
+            _mm256_storeu_si256((__m256i*)dst + 1, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 1), d1)));
+            _mm256_storeu_si256((__m256i*)dst + 2, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 2), d2)));
+            _mm256_storeu_si256((__m256i*)dst + 3, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 3), d3)));
+            _mm256_storeu_si256((__m256i*)dst + 4, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 4), d4)));
+            _mm256_storeu_si256((__m256i*)dst + 5, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 5), d5)));
+            _mm256_storeu_si256((__m256i*)dst + 6, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 6), d6)));
+            _mm256_storeu_si256((__m256i*)dst + 7, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 7), d7)));
+        }
+
+        const __m256i K32_PERM_LD = SIMD_MM256_SETR_EPI32(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1);
+
+        const __m256i K8_SHFL_VS = SIMD_MM256_SETR_EPI8(
+            0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1,
+            0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1);
+
+        const __m256i K8_SHFL_SH = SIMD_MM256_SETR_EPI8(
+            0x2, 0x3, -1, -1, 0x6, 0x7, -1, -1, 0xA, 0xB, -1, -1, -1, -1, -1, -1,
+            0x2, 0x3, -1, -1, 0x6, 0x7, -1, -1, 0xA, 0xB, -1, -1, -1, -1, -1, -1);
+
+        const __m256i K32_32 = SIMD_MM256_SET1_EPI32(32);
+
+#if defined(SIMD_X64_ENABLE)
+        SIMD_INLINE void WriteBits(uint8_t* data, size_t & pos, uint64_t & bitBuffer, size_t &bitCount, uint64_t shift, uint64_t value, uint64_t mask)
+        {
+            bitCount += shift;
+            assert(bitCount <= 64);
+            bitBuffer |= _pext_u64(value, mask) << (64 - bitCount);
+            while (bitCount >= 16)
+            {
+                uint8_t byte = uint8_t(bitBuffer >> 56);
+                data[pos++] = byte;
+                if (byte == 255)
+                    data[pos++] = 0;
+                byte = uint8_t(bitBuffer >> 48);
+                data[pos++] = byte;
+                if (byte == 255)
+                    data[pos++] = 0;
+                bitBuffer <<= 16;
+                bitCount -= 16;
+            }
+        }
+#endif
+
+        SIMD_INLINE void WriteBits(OutputMemoryStream& stream, const uint16_t bits[][2], size_t size)
+        {
+            size_t pos = stream.Pos();
+            stream.Reserve(pos + size * 2);
+            uint8_t* data = stream.Data();
+            size_t& bitCount = stream.BitCount();
+            size_t i = 0;
+#if defined(SIMD_X64_ENABLE)
+            uint64_t &bitBuffer = stream.BitBuffer();
+            size_t size12 = AlignLoAny(size, 12);
+            for (; i < size12; i += 12, bits += 12)
+            {
+                __m256i b0 = _mm256_permutevar8x32_epi32(_mm256_loadu_si256((__m256i*)(bits + 0)), K32_PERM_LD);
+                __m256i b1 = _mm256_permutevar8x32_epi32(_mm256_loadu_si256((__m256i*)(bits + 6)), K32_PERM_LD);
+                __m256i vs0 = _mm256_shuffle_epi8(b0, K8_SHFL_VS);
+                __m256i vs1 = _mm256_shuffle_epi8(b1, K8_SHFL_VS);
+                __m256i vv = Shuffle64i<0x0>(vs0, vs1);
+                __m256i ss = Shuffle64i<0xF>(vs0, vs1);
+                SIMD_ALIGNED(32) uint64_t value[4], mask[4], shift[4];
+                _mm256_storeu_si256((__m256i*)value, vv);
+                _mm256_storeu_si256((__m256i*)shift, _mm256_sad_epu8(ss, K_ZERO));
+                __m256i s0 = _mm256_sub_epi32(K32_32, _mm256_shuffle_epi8(b0, K8_SHFL_SH));
+                __m256i m0 = _mm256_srlv_epi32(K_INV_ZERO, s0);
+                __m256i s1 = _mm256_sub_epi32(K32_32, _mm256_shuffle_epi8(b1, K8_SHFL_SH));
+                __m256i m1 = _mm256_srlv_epi32(K_INV_ZERO, s1);
+                __m256i ms0 = _mm256_shuffle_epi8(m0, K8_SHFL_VS);
+                __m256i ms1 = _mm256_shuffle_epi8(m1, K8_SHFL_VS);
+                _mm256_storeu_si256((__m256i*)mask, Shuffle64i<0x0>(ms0, ms1));
+                WriteBits(data, pos, bitBuffer, bitCount, shift[0], value[0], mask[0]);
+                WriteBits(data, pos, bitBuffer, bitCount, shift[2], value[2], mask[2]);
+                WriteBits(data, pos, bitBuffer, bitCount, shift[1], value[1], mask[1]);
+                WriteBits(data, pos, bitBuffer, bitCount, shift[3], value[3], mask[3]);
+            }
+            if (bitCount >= 8)
+            {
+                assert(bitCount < 16);
+                uint8_t byte = uint8_t(bitBuffer >> 56);
+                data[pos++] = byte;
+                if (byte == 255)
+                    data[pos++] = 0;
+                bitBuffer <<= 8;
+                bitCount -= 8;
+            }
+            for (; i < size; ++i, ++bits)
+            {
+                bitCount += bits[0][1];
+                bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount);
+                while (bitCount >= 8)
+                {
+                    uint8_t byte = uint8_t(bitBuffer >> 56);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    bitBuffer <<= 8;
+                    bitCount -= 8;
+                }
+            }
+#else
+            uint32_t& bitBuffer = stream.BitBuffer();
+            for (; i < size; ++i, ++bits)
+            {
+                bitCount += bits[0][1];
+                bitBuffer |= uint32_t(bits[0][0]) << (32 - bitCount);
+                while (bitCount >= 8)
+                {
+                    uint8_t byte = uint8_t(bitBuffer >> 24);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    bitBuffer <<= 8;
+                    bitCount -= 8;
+                }
+            }
+#endif
+            stream.Seek(pos);
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+
+#ifdef SIMD_AVX512BW_ENABLE    
+    namespace Avx512bw
+    {
+        const __m512i K32_PERM_LD = SIMD_MM512_SETR_EPI32(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
+
+        const __m512i K8_SHFL_VS = SIMD_MM512_SETR_EPI8(
+            0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1,
+            0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1,
+            0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1,
+            0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1);
+
+        SIMD_INLINE void WriteBits(OutputMemoryStream& stream, const uint16_t bits[][2], size_t size)
+        {
+            size_t pos = stream.Pos();
+            stream.Reserve(pos + size * 2);
+            uint8_t* data = stream.Data();
+            size_t& bitCount = stream.BitCount();
+            size_t i = 0;
+#if defined(SIMD_X64_ENABLE)
+            uint64_t &bitBuffer = stream.BitBuffer();
+            size_t size24 = AlignLoAny(size, 24);
+            for (; i < size24; i += 24, bits += 24)
+            {
+                __m512i b0 = _mm512_permutexvar_epi32(K32_PERM_LD, _mm512_loadu_si512((__m512i*)(bits + 00)));
+                __m512i b1 = _mm512_permutexvar_epi32(K32_PERM_LD, _mm512_loadu_si512((__m512i*)(bits + 12)));
+                __m512i vs0 = _mm512_shuffle_epi8(b0, K8_SHFL_VS);
+                __m512i vs1 = _mm512_shuffle_epi8(b1, K8_SHFL_VS);
+                __m512i vv = Shuffle64i<0x00>(vs0, vs1);
+                __m512i ss = Shuffle64i<0xFF>(vs0, vs1);
+                SIMD_ALIGNED(64) uint64_t value[8], mask[8], shift[8];
+                _mm512_storeu_si512((__m512i*)value, vv);
+                _mm512_storeu_si512((__m512i*)shift, _mm512_sad_epu8(ss, K_ZERO));
+                _mm512_storeu_si512((__m512i*)mask, _mm512_srlv_epi16(K_INV_ZERO, _mm512_sub_epi16(K16_0010, ss)));
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[0], value[0], mask[0]);
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[2], value[2], mask[2]);
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[4], value[4], mask[4]);
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[6], value[6], mask[6]);
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[1], value[1], mask[1]);
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[3], value[3], mask[3]);
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[5], value[5], mask[5]);
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[7], value[7], mask[7]);
+            }
+            if (bitCount >= 8)
+            {
+                assert(bitCount < 16);
+                uint8_t byte = uint8_t(bitBuffer >> 56);
+                data[pos++] = byte;
+                if (byte == 255)
+                    data[pos++] = 0;
+                bitBuffer <<= 8;
+                bitCount -= 8;
+            }
+            for (; i < size; ++i, ++bits)
+            {
+                bitCount += bits[0][1];
+                bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount);
+                while (bitCount >= 8)
+                {
+                    uint8_t byte = uint8_t(bitBuffer >> 56);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    bitBuffer <<= 8;
+                    bitCount -= 8;
+                }
+            }
+#else
+            uint32_t& bitBuffer = stream.BitBuffer();
+            for (; i < size; ++i, ++bits)
+            {
+                bitCount += bits[0][1];
+                bitBuffer |= uint32_t(bits[0][0]) << (32 - bitCount);
+                while (bitCount >= 8)
+                {
+                    uint8_t byte = uint8_t(bitBuffer >> 24);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    bitBuffer <<= 8;
+                    bitCount -= 8;
+                }
+            }
+#endif
+            stream.Seek(pos);
+        }
+    }
+#endif// SIMD_AVX512BW_ENABLE
+
+#ifdef SIMD_NEON_ENABLE    
+    namespace Neon
+    {
+    }
+#endif// SIMD_NEON_ENABLE
+}
+
+#endif//__SimdImageSaveJpeg_h__
diff --git a/3rdparty/simdlib/Simd/SimdImageSavePng.h b/3rdparty/simdlib/Simd/SimdImageSavePng.h
new file mode 100644
index 0000000000..71efd1ca60
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdImageSavePng.h
@@ -0,0 +1,235 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdImageSavePng_h__
+#define __SimdImageSavePng_h__
+
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdLoad.h"
+
+#define SIMD_PNG_ZLIB_BIT_REV_TABLE
+
+namespace Simd
+{
+    namespace Base
+    {
+        extern const uint16_t ZlibLenC[30];
+        extern const uint8_t  ZlibLenEb[29];
+        extern const uint16_t ZlibDistC[31];
+        extern const uint8_t  ZlibDistEb[30];
+
+#if defined(SIMD_PNG_ZLIB_BIT_REV_TABLE)
+        const int ZlibBitRevShift = 9;
+        const int ZlibBitRevSize = 1 << ZlibBitRevShift;
+        extern int ZlibBitRevTable[ZlibBitRevSize];
+        SIMD_INLINE int ZlibBitRev(int bits, int count)
+        {
+            assert(bits < ZlibBitRevSize&& count <= ZlibBitRevShift);
+            return ZlibBitRevTable[bits] >> (ZlibBitRevShift - count);
+        }
+#else
+        SIMD_INLINE int ZlibBitRev(int bits, int count)
+        {
+            int rev = 0;
+            for (size_t b = 0; b < count; b++)
+            {
+                rev = (rev << 1) | (bits & 1);
+                bits >>= 1;
+            }
+            return rev;
+        }
+#endif
+
+        SIMD_INLINE uint32_t ZlibHash(const uint8_t* data)
+        {
+            uint32_t hash = data[0] + (data[1] << 8) + (data[2] << 16);
+            hash ^= hash << 3;
+            hash += hash >> 5;
+            hash ^= hash << 4;
+            hash += hash >> 17;
+            hash ^= hash << 25;
+            hash += hash >> 6;
+            return hash;
+        }
+
+        SIMD_INLINE void ZlibHuffA(int bits, int count, OutputMemoryStream& stream)
+        {
+            stream.WriteBits(ZlibBitRev(bits, count), count);
+        }
+
+        SIMD_INLINE void ZlibHuff1(int bits, OutputMemoryStream& stream)
+        {
+            ZlibHuffA(0x30 + bits, 8, stream);
+        }
+
+        SIMD_INLINE void ZlibHuff2(int bits, OutputMemoryStream& stream)
+        {
+            ZlibHuffA(0x190 + bits - 144, 9, stream);
+        }
+
+        SIMD_INLINE void ZlibHuff3(int bits, OutputMemoryStream& stream)
+        {
+            ZlibHuffA(0 + bits - 256, 7, stream);
+        }
+
+        SIMD_INLINE void ZlibHuff4(int bits, OutputMemoryStream& stream)
+        {
+            ZlibHuffA(0xc0 + bits - 280, 8, stream);
+        }
+
+        SIMD_INLINE void ZlibHuff(int bits, OutputMemoryStream& stream)
+        {
+            if (bits <= 143)
+                ZlibHuff1(bits, stream);
+            else if (bits <= 255)
+                ZlibHuff2(bits, stream);
+            else if (bits <= 279)
+                ZlibHuff3(bits, stream);
+            else
+                ZlibHuff4(bits, stream);
+        }
+
+        SIMD_INLINE void ZlibHuffB(int bits, OutputMemoryStream& stream)
+        {
+            if (bits <= 143)
+                ZlibHuff1(bits, stream);
+            else
+                ZlibHuff2(bits, stream);
+        }
+
+        SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit)
+        {
+            limit = Min(limit, 258);
+            int i = 0;
+#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+            int limit8 = limit & (~7);
+            for (; i < limit8; i += 8)
+                if (*(uint64_t*)(a + i) != *(uint64_t*)(b + i))
+                    break;
+#else
+            int limit4 = limit & (~3);
+            for (; i < limit4; i += 4)
+                if (*(uint32_t*)(a + i) != *(uint32_t*)(b + i))
+                    break;
+#endif
+            for (; i < limit; i += 1)
+                if (a[i] != b[i])
+                    break;
+            return i;
+        }
+
+        SIMD_INLINE uint8_t Paeth(int a, int b, int c)
+        {
+            int p = a + b - c, pa = abs(p - a), pb = abs(p - b), pc = abs(p - c);
+            if (pa <= pb && pa <= pc)
+                return uint8_t(a);
+            if (pb <= pc)
+                return uint8_t(b);
+            return uint8_t(c);
+        }
+    }
+
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit)
+        {
+            limit = Min(limit, 258);
+            int i = 0;
+            int limit16 = limit & (~15);
+            for (; i < limit16; i += 16)
+                if (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128((__m128i*)(a + i)), _mm_loadu_si128((__m128i*)(b + i)))) != 0xFFFF)
+                    break;
+#if defined(SIMD_X64_ENABLE)
+            int limit8 = limit & (~7);
+            for (; i < limit8; i += 8)
+                if (*(uint64_t*)(a + i) != *(uint64_t*)(b + i))
+                    break;
+#else
+            int limit4 = limit & (~3);
+            for (; i < limit4; i += 4)
+                if (*(uint32_t*)(a + i) != *(uint32_t*)(b + i))
+                    break;
+#endif
+            for (; i < limit; i += 1)
+                if (a[i] != b[i])
+                    break;
+            return i;
+        }
+    }
+#endif// SIMD_SSE41_ENABLE
+
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit)
+        {
+            limit = Min(limit, 258);
+            int i = 0;
+            for (; i < limit; i += 32)
+            {
+                __m256i _a = _mm256_loadu_si256((__m256i*)(a + i));
+                __m256i _b = _mm256_loadu_si256((__m256i*)(b + i));
+                uint32_t mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(_a, _b));
+                if (mask != 0xFFFFFFFF)
+                {
+                    i += _tzcnt_u32(~mask);
+                    break;
+                }
+            }
+            return Min(i, limit);
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+
+#ifdef SIMD_AVX512BW_ENABLE    
+    namespace Avx512bw
+    {
+        SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit)
+        {
+            limit = Min(limit, 258);
+            int i = 0;
+            for (; i < limit; i += 64)
+            {
+                __m512i _a = _mm512_loadu_si512(a + i);
+                __m512i _b = _mm512_loadu_si512(b + i);
+                uint64_t mask = _mm512_cmp_epi8_mask(_a, _b, _MM_CMPINT_NE);
+                if (mask != 0)
+                {
+                    i += (int)FirstNotZero64(mask);
+                    break;
+                }
+            }
+            return Min(i, limit);
+        }
+    }
+#endif// SIMD_AVX512BW_ENABLE
+
+#ifdef SIMD_NEON_ENABLE    
+    namespace Neon
+    {
+    }
+#endif// SIMD_NEON_ENABLE
+}
+
+#endif//__SimdImageSavePng_h__
diff --git a/3rdparty/simdlib/Simd/SimdLib.cpp b/3rdparty/simdlib/Simd/SimdLib.cpp
index 89718bb80e..c168701413 100755
--- a/3rdparty/simdlib/Simd/SimdLib.cpp
+++ b/3rdparty/simdlib/Simd/SimdLib.cpp
@@ -61,8 +61,10 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved)
 #include "Simd/SimdConst.h"
 #include "Simd/SimdLog.h"
 
-#include "Simd/SimdResizer.h"
 #include "Simd/SimdGaussianBlur.h"
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdResizer.h"
 
 #include "Simd/SimdBase.h"
 #include "Simd/SimdSse2.h"
@@ -451,6 +453,34 @@ SIMD_API void SimdGrayToBgra(const uint8_t * gray, size_t width, size_t height,
         Base::GrayToBgra(gray, width, height, grayStride, bgra, bgraStride, alpha);
 }
 
+SIMD_API uint8_t* SimdImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size)
+{
+    const static Simd::ImageSaveToMemoryPtr imageSaveToMemory = SIMD_FUNC3(ImageSaveToMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC);
+
+    return imageSaveToMemory(src, stride, width, height, format, file, quality, size);
+}
+
+SIMD_API SimdBool SimdImageSaveToFile(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char* path)
+{
+    const static Simd::ImageSaveToMemoryPtr imageSaveToMemory = SIMD_FUNC3(ImageSaveToMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC);
+
+    return ImageSaveToFile(imageSaveToMemory, src, stride, width, height, format, file, quality, path);
+}
+
+SIMD_API uint8_t* SimdImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+{
+    const static Simd::ImageLoadFromMemoryPtr imageLoadFromMemory = SIMD_FUNC3(ImageLoadFromMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC);
+
+    return imageLoadFromMemory(data, size, stride, width, height, format);
+}
+
+SIMD_API uint8_t* SimdImageLoadFromFile(const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+{
+    const static Simd::ImageLoadFromMemoryPtr imageLoadFromMemory = SIMD_FUNC3(ImageLoadFromMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC);
+
+    return ImageLoadFromFile(imageLoadFromMemory, path, stride, width, height, format);
+}
+
 SIMD_API void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride,
     size_t width, size_t height, uint8_t * bgr, size_t bgrStride)
 {
diff --git a/3rdparty/simdlib/Simd/SimdLib.h b/3rdparty/simdlib/Simd/SimdLib.h
index 4838b82261..5441805969 100755
--- a/3rdparty/simdlib/Simd/SimdLib.h
+++ b/3rdparty/simdlib/Simd/SimdLib.h
@@ -116,6 +116,27 @@ typedef enum
     SimdCpuInfoNeon, /*!< Availability of NEON (ARM). */
 } SimdCpuInfoType;
 
+/*! @ingroup c_types
+    Describes formats of image file. It is used in functions ::SimdImageSaveToMemory and ::SimdImageSaveToFile.
+*/
+typedef enum
+{
+    /*! An undefined image file format (format auto choice). */
+    SimdImageFileUndefined = 0,
+    /*! A PGM (Portable Gray Map) text (P2) image file format. */
+    SimdImageFilePgmTxt,
+    /*! A PGM (Portable Gray Map) binary (P5) image file format. */
+    SimdImageFilePgmBin,
+    /*! A PGM (Portable Pixel Map) text (P3) image file format. */
+    SimdImageFilePpmTxt,
+    /*! A PGM (Portable Pixel Map) binary (P6) image file format. */
+    SimdImageFilePpmBin,
+    /*! A PNG (Portable Network Graphics) image file format. */
+    SimdImageFilePng,
+    /*! A JPEG (Joint Photographic Experts Group) image file format. */
+    SimdImageFileJpeg,
+} SimdImageFileType;
+
 /*! @ingroup c_types
     Describes types of binary operation between two images performed by function ::SimdOperationBinary8u.
     Images must have the same format (unsigned 8-bit integer for every channel).
@@ -167,18 +188,6 @@ typedef enum
     SimdPixelFormatFloat,
     /*! A single channel 64-bit float point pixel format. */
     SimdPixelFormatDouble,
-    /*! A 8-bit Bayer pixel format (GRBG). */
-    SimdPixelFormatBayerGrbg,
-    /*! A 8-bit Bayer pixel format (GBRG). */
-    SimdPixelFormatBayerGbrg,
-    /*! A 8-bit Bayer pixel format (RGGB). */
-    SimdPixelFormatBayerRggb,
-    /*! A 8-bit Bayer pixel format (BGGR). */
-    SimdPixelFormatBayerBggr,
-    /*! A 24-bit (3 8-bit channels) HSV (Hue, Saturation, Value) pixel format. */
-    SimdPixelFormatHsv24,
-    /*! A 24-bit (3 8-bit channels) HSL (Hue, Saturation, Lightness) pixel format. */
-    SimdPixelFormatHsl24,
     /*! A 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */
     SimdPixelFormatRgb24,
     /*! A 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. */
@@ -753,6 +762,82 @@ extern "C"
     SIMD_API void SimdGrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride,
         uint8_t *bgra, size_t bgraStride, uint8_t alpha);
 
+    /*! @ingroup image_io
+
+        \fn uint8_t* SimdImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t * size);
+
+        \short Saves an image to memory in given image file format.
+
+        \param [in] src - a pointer to pixels data of input image. 
+        \param [in] stride - a row size of input image in bytes.
+        \param [in] width - a width of input image.
+        \param [in] height - a height of input image.
+        \param [in] format - a pixel format of input image. 
+            Supported pixel formats: ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32.
+        \param [in] file - a format of output image file. To auto choise format of output file set this parameter to ::SimdImageFileUndefined.
+        \param [in] quality - a parameter of compression quality (if file format supports it).
+        \param [out] size - a pointer to the size of output image file in bytes.
+        \return a pointer to memory buffer with output image file. 
+            It has to be deleted after use by function ::SimdFree. On error it returns NULL.
+    */
+    SIMD_API uint8_t* SimdImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t * size);
+
+    /*! @ingroup image_io
+
+        \fn SimdBool SimdImageSaveToFile(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char * path);
+
+        \short Saves an image to memory in given image file format.
+
+        \param [in] src - a pointer to pixels data of input image.
+        \param [in] stride - a row size of input image in bytes.
+        \param [in] width - a width of input image.
+        \param [in] height - a height of input image.
+        \param [in] format - a pixel format of input image. 
+            Supported pixel formats: ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32.
+        \param [in] file - a format of output image file. To auto choise format of output file set this parameter to ::SimdImageFileUndefined.
+        \param [in] quality - a parameter of compression quality (if file format supports it).
+        \param [in] path - a path to output image file.
+        \return result of the operation.
+    */
+    SIMD_API SimdBool SimdImageSaveToFile(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char * path);
+
+    /*! @ingroup image_io
+
+        \fn uint8_t* SimdImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format);
+
+        \short Loads an image from memory buffer.
+
+        \param [in] data - a pointer to memory buffer with input image file.
+        \param [in] size - a size of input image file in bytes.
+        \param [out] stride - a pointer to row size of output image in bytes.
+        \param [out] width - a pointer to width of output image.
+        \param [out] height - a pointer to height of output image.
+        \param [in, out] format - a pointer to pixel format of output image. 
+            Here you can set desired pixel format (it can be ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32).
+            Or set ::SimdPixelFormatNone and use pixel format of input image file.
+        \return a pointer to pixels data of output image. 
+            It has to be deleted after use by function ::SimdFree. On error it returns NULL.
+    */
+    SIMD_API uint8_t* SimdImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format);
+
+    /*! @ingroup image_io
+
+        \fn uint8_t* SimdImageLoadFromFile(const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format);
+
+        \short Loads an image from file.
+
+        \param [in] path - a path to input image file.
+        \param [out] stride - a pointer to row size of output image in bytes.
+        \param [out] width - a pointer to width of output image.
+        \param [out] height - a pointer to height of output image.
+        \param [in, out] format - a pointer to pixel format of output image.
+            Here you can set desired pixel format (it can be ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32).
+            Or set ::SimdPixelFormatNone and use pixel format of input image file.
+        \return a pointer to pixels data of output image.
+            It has to be deleted after use by function ::SimdFree. On error it returns NULL.
+    */
+    SIMD_API uint8_t* SimdImageLoadFromFile(const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format);
+
     /*! @ingroup other_conversion
 
         \fn void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride);
diff --git a/3rdparty/simdlib/Simd/SimdMath.h b/3rdparty/simdlib/Simd/SimdMath.h
index 0f7425f76e..f8c192a189 100755
--- a/3rdparty/simdlib/Simd/SimdMath.h
+++ b/3rdparty/simdlib/Simd/SimdMath.h
@@ -750,6 +750,11 @@ namespace Simd
             return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(lo), _mm256_castsi256_ps(hi), imm));
         }
 
+        template<int imm> SIMD_INLINE __m256i Shuffle64i(__m256i lo, __m256i hi)
+        {
+            return _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(lo), _mm256_castsi256_pd(hi), imm));
+        }
+
         template<int imm> SIMD_INLINE __m256 Permute4x64(__m256 a)
         {
             return _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(a), imm));
diff --git a/3rdparty/simdlib/Simd/SimdMemory.h b/3rdparty/simdlib/Simd/SimdMemory.h
index d7772ffa3c..f0fca8840a 100755
--- a/3rdparty/simdlib/Simd/SimdMemory.h
+++ b/3rdparty/simdlib/Simd/SimdMemory.h
@@ -35,6 +35,18 @@
 
 namespace Simd
 {
+    SIMD_INLINE size_t DivHi(size_t value, size_t divider)
+    {
+        return (value + divider - 1) / divider;
+    }
+
+    SIMD_INLINE size_t Pow2Hi(size_t value)
+    {
+        size_t pow2 = 1;
+        for (; pow2 < value; pow2 *= 2);
+        return pow2;
+    }
+
     SIMD_INLINE size_t AlignHiAny(size_t size, size_t align)
     {
         return (size + align - 1) / align * align;
@@ -108,6 +120,13 @@ namespace Simd
         return ptr;
     }
 
+    template<class T> T* Allocate(uint8_t*& buffer, size_t size, size_t align = SIMD_ALIGN)
+    {
+        T* ptr = (T*)buffer;
+        buffer = buffer + AlignHi(size * sizeof(T), align);
+        return ptr;
+    }
+
     SIMD_INLINE void Free(void * ptr)
     {
 #ifdef SIMD_NO_MANS_LAND
diff --git a/3rdparty/simdlib/Simd/SimdMemoryStream.h b/3rdparty/simdlib/Simd/SimdMemoryStream.h
new file mode 100644
index 0000000000..9665f33d63
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdMemoryStream.h
@@ -0,0 +1,510 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdMemoryStream_h__
+#define __SimdMemoryStream_h__
+
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdPerformance.h"
+
+namespace Simd
+{
+    class InputMemoryStream
+    {
+        const uint8_t* _data;
+        size_t _pos, _size, _bitCount;
+#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+        uint64_t _bitBuffer;
+#else
+        uint32_t _bitBuffer;
+#endif
+
+    public:
+        SIMD_INLINE InputMemoryStream(const uint8_t* data = NULL, size_t size = 0)
+        {
+            Init(data, size);
+        }
+
+        SIMD_INLINE void Init(const uint8_t* data, size_t size)
+        {
+            _pos = 0;
+            _data = data;
+            _size = size;
+            _bitBuffer = 0;
+            _bitCount = 0;
+        }
+
+        SIMD_INLINE bool Seek(size_t pos)
+        {
+            if (pos <= _size)
+            {
+                _pos = pos;
+                return true;
+            }
+            return false;
+        }
+
+        SIMD_INLINE size_t Size() const
+        {
+            return _size;
+        }
+
+        SIMD_INLINE const uint8_t* Data() const
+        {
+            return _data;
+        }
+
+        SIMD_INLINE size_t Pos() const
+        {
+            return _pos;
+        }
+
+        SIMD_INLINE const uint8_t* Current() const
+        {
+            return _data + _pos;
+        }
+
+        SIMD_INLINE bool Eof() const
+        {
+            return _pos >= _size;
+        }
+
+        SIMD_INLINE bool CanRead(size_t size) const
+        {
+            return _pos + size <= _size;
+        }
+        
+        SIMD_INLINE size_t Read(size_t size, void* data)
+        {
+            size = Min(_size - _pos, size);
+            memcpy(data, _data + _pos, size);
+            _pos += size;
+            return size;
+        }
+
+        template <class Value> SIMD_INLINE bool Read(Value & value)
+        {
+            return Read(sizeof(Value), &value) == sizeof(Value);
+        }
+
+        SIMD_INLINE bool Read8u(uint8_t & value)
+        {
+            if (_pos < _size)
+            {
+                value = _data[_pos++];
+                return true;
+            }
+            else
+                return false;
+        }
+
+        SIMD_INLINE bool Read16u(uint16_t& value)
+        {
+            if (_pos + 2 <= _size)
+            {
+                value = *(uint16_t*)(_data + _pos);
+                _pos += 2;
+                return true;
+            }
+            else
+                return false;
+        }
+
+        SIMD_INLINE bool Read32u(uint32_t& value)
+        {
+            if (_pos + 4 <= _size)
+            {
+                value = *(uint32_t*)(_data + _pos);
+                _pos += 4;
+                return true;
+            }
+            else
+                return false;
+        }
+
+        SIMD_INLINE bool ReadBe16u(uint16_t& value)
+        {
+            if (Read16u(value))
+            {
+#if !defined(SIMD_BIG_ENDIAN)
+                value =
+                    (value & 0x00FF) << 8 |
+                    (value & 0xFF00) >> 8;
+#endif
+                return true;
+            }
+            else
+                return false;
+        }
+
+        SIMD_INLINE bool ReadBe32u(uint32_t& value)
+        {
+            if (Read32u(value))
+            {
+#if !defined(SIMD_BIG_ENDIAN)
+                value =
+                    (value & 0x000000FF) << 24 |
+                    (value & 0x0000FF00) << 8 |
+                    (value & 0x00FF0000) >> 8 |
+                    (value & 0xFF000000) >> 24;
+#endif
+                return true;
+            }
+            else
+                return false;
+        }
+
+        template<class Unsigned> SIMD_INLINE bool ReadUnsigned(Unsigned& value)
+        {
+            if (!SkipGap())
+                return false;
+            value = 0;
+            while (!IsGap(_data[_pos]) && _pos < _size)
+            {
+                if (_data[_pos] >= '0' && _data[_pos] <= '9')
+                    value = value * 10 + Unsigned(_data[_pos] - '0');
+                else
+                    return false;
+                _pos++;
+            }
+            return true;
+        }
+
+        SIMD_INLINE bool Skip(size_t size)
+        {
+            if (_pos + size < _size)
+            {
+                _pos += size;
+                return true;
+            }
+            return false;
+        }
+
+        SIMD_INLINE bool SkipValue(uint8_t value)
+        {
+            while (_data[_pos] == value && _pos < _size)
+                _pos++;
+            return _pos < _size;
+        }
+
+        SIMD_INLINE bool SkipNotGap()
+        {
+            while (!IsGap(_data[_pos]) && _pos < _size)
+                _pos++;
+            return _pos < _size;
+        }        
+        
+        SIMD_INLINE bool SkipGap()
+        {
+            while (IsGap(_data[_pos]) && _pos < _size)
+                _pos++;
+            return _pos < _size;
+        }
+
+        static SIMD_INLINE bool IsGap(uint8_t value)
+        {
+            return value == ' ' || value == '\t' || value == '\n' || value == '\r';
+        }
+
+#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+        SIMD_INLINE uint64_t& BitBuffer()
+        {
+            return _bitBuffer;
+        }
+#else
+        SIMD_INLINE uint32_t& BitBuffer()
+        {
+            return _bitBuffer;
+        }
+#endif
+
+        SIMD_INLINE size_t& BitCount()
+        {
+            return _bitCount;
+        }
+
+        SIMD_INLINE void FillBits()
+        {
+            static const size_t canReadByte = (sizeof(_bitBuffer) - 1) * 8;
+            while (_bitCount <= canReadByte && _pos < _size)
+            {
+                _bitBuffer |= (size_t)_data[_pos++] << _bitCount;
+                _bitCount += 8;
+            }
+        }
+
+        SIMD_INLINE void ClearBits()
+        {
+            _pos -= _bitCount / 8;
+            _bitBuffer = 0;
+            _bitCount = 0;
+        }
+
+        SIMD_INLINE bool ReadBits(size_t & bits, size_t count)
+        {
+            if (_bitCount < count)
+                FillBits();
+            if (_bitCount < count)
+                return false;
+            bits = _bitBuffer & ((size_t(1) << count) - 1);
+            _bitBuffer >>= count;
+            _bitCount -= count;
+            return true;
+        }
+
+        SIMD_INLINE size_t ReadBits(size_t count)
+        {
+            if (_bitCount < count)
+                FillBits();
+            size_t bits = _bitBuffer & ((size_t(1) << count) - 1);
+            _bitBuffer >>= count;
+            _bitCount -= count;
+            return bits;
+        }
+    };
+
+    //-------------------------------------------------------------------------
+
+    class OutputMemoryStream
+    {
+        const size_t CAPACITY_MIN = 64;
+
+        uint8_t * _data;
+        size_t _pos, _size, _capacity, _bitCount;
+#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+        uint64_t _bitBuffer;
+#else
+        uint32_t _bitBuffer;
+#endif
+
+        SIMD_INLINE void Reset(bool owner)
+        {
+            if (_data && owner)
+                Free(_data);
+            _data = NULL;
+            _pos = 0;
+            _size = 0;
+            _capacity = 0;
+            _bitBuffer = 0;
+            _bitCount = 0;
+        }
+
+    public:
+        SIMD_INLINE OutputMemoryStream(size_t capacity = 0)
+        {
+            Reset(false);
+            if (capacity)
+                Reserve(capacity);
+        }
+
+        SIMD_INLINE ~OutputMemoryStream()
+        {
+            Reset(true);
+        }
+
+        SIMD_INLINE void Seek(size_t pos)
+        {
+            _pos = pos;
+            _size = Max(_size, _pos);
+            Reserve(_pos);
+        }
+
+        SIMD_INLINE size_t Pos() const
+        {
+            return _pos;
+        }
+
+        SIMD_INLINE size_t Size() const
+        {
+            return _size;
+        }
+
+        SIMD_INLINE size_t Capacity() const
+        {
+            return _capacity;
+        }
+
+        SIMD_INLINE uint8_t* Data()
+        {
+            return _data;
+        }
+
+        SIMD_INLINE const uint8_t * Data() const
+        {
+            return _data;
+        }
+
+        SIMD_INLINE uint8_t* Current()
+        {
+            return _data + _pos;
+        }
+
+        SIMD_INLINE const uint8_t* Current() const
+        {
+            return _data + _pos;
+        }
+
+        SIMD_INLINE void Write(const void * data, size_t size)
+        {
+            Reserve(_pos + size);
+            memcpy(_data + _pos, data, size);
+            _pos += size;
+            _size = Max(_size, _pos);
+        }
+
+        SIMD_INLINE bool Write(InputMemoryStream & input, size_t size)
+        {
+            if (input.CanRead(size))
+            {
+                Write(input.Current(), size);
+                input.Skip(size);
+                return true;
+            }
+            return false;
+        }
+
+        SIMD_INLINE bool WriteSelf(ptrdiff_t offset, size_t size)
+        {
+            if (offset < 0)
+                return false;
+            Reserve(_pos + size);
+            if (offset + size > _pos)
+            {
+                for (size_t i = 0; i < size; ++i)
+                    _data[_pos++] = _data[offset++];
+            }
+            else
+            {
+                memcpy(_data + _pos, _data + offset, size);
+                _pos += size;
+            }
+            _size = Max(_size, _pos);
+            return true;
+        }
+
+        template <class Value> SIMD_INLINE void Write(const Value& value)
+        {
+            Write(&value, sizeof(Value));
+        }
+
+        SIMD_INLINE void Write8u(uint8_t value)
+        {
+            Reserve(_pos + 1);
+            _data[_pos++] = value;
+            _size = Max(_size, _pos);
+        }
+
+        SIMD_INLINE void Write8u(uint8_t value, size_t count)
+        {
+            Reserve(_pos + count);
+            memset(_data + _pos, value, count);
+            _pos += count;
+            _size = Max(_size, _pos);
+        }
+
+        SIMD_INLINE void WriteBe32u(const uint32_t & value)
+        {
+#if defined(SIMD_BIG_ENDIAN)
+            Write<uint32_t>(value);
+#else
+            Write<uint32_t>(
+                (value & 0x000000FF) << 24 | 
+                (value & 0x0000FF00) << 8 |
+                (value & 0x00FF0000) >> 8 | 
+                (value & 0xFF000000) >> 24);
+#endif
+        }
+
+        SIMD_INLINE uint8_t* Release(size_t* size = NULL)
+        {
+            uint8_t* data = _data;
+            if(size)
+                *size = _size;
+            Reset(false);
+            return data;
+        }
+
+        SIMD_INLINE void Reserve(size_t size)
+        {
+            if (size > _capacity)
+            {
+                size_t capacity = Max(CAPACITY_MIN, Max(_capacity * 2, size));
+                uint8_t* data = (uint8_t*)Allocate(capacity, SIMD_ALIGN);
+                if (_data)
+                {
+                    memcpy(data, _data, _size);
+                    Free(_data);
+                }
+                _data = data;
+                _capacity = capacity;
+            }
+        }
+
+        SIMD_INLINE void WriteBits(const size_t bits, size_t count)
+        {
+            _bitBuffer |= (bits) << _bitCount;
+            _bitCount += count;
+            while (_bitCount >= 8)
+            {
+                Write8u((uint8_t)_bitBuffer);
+                _bitBuffer >>= 8;
+                _bitCount -= 8;
+            }
+        }
+
+        SIMD_INLINE void FlushBits()
+        {
+            while (_bitCount >= 8)
+            {
+                Write8u((uint8_t)_bitBuffer);
+                _bitBuffer >>= 8;
+                _bitCount -= 8;
+            }
+            if (_bitCount)
+            {
+                Write8u((uint8_t)_bitBuffer);
+                _bitBuffer = 0;
+                _bitCount = 0;
+            }
+        }
+
+#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+        SIMD_INLINE uint64_t & BitBuffer()
+        {
+            return _bitBuffer;
+        }
+#else
+        SIMD_INLINE uint32_t& BitBuffer()
+        {
+            return _bitBuffer;
+        }
+#endif
+
+        SIMD_INLINE size_t& BitCount()
+        {
+            return _bitCount;
+        }
+    };
+}
+
+#endif//__SimdMemoryStream_h__
diff --git a/3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp b/3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp
new file mode 100644
index 0000000000..61c5d90359
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp
@@ -0,0 +1,154 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdNeon.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_NEON_ENABLE    
+    namespace Neon
+    {
+        ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param)
+            : Base::ImagePgmTxtLoader(param)
+        {
+        }
+
+        void ImagePgmTxtLoader::SetConverters()
+        {
+            Base::ImagePgmTxtLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Neon::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Neon::GrayToBgra; break;
+                case SimdPixelFormatRgb24: _toAny = Neon::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Neon::GrayToBgra; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param)
+            : Base::ImagePgmBinLoader(param)
+        {
+        }
+
+        void ImagePgmBinLoader::SetConverters()
+        {
+            Base::ImagePgmBinLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Neon::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Neon::GrayToBgra; break;
+                case SimdPixelFormatRgb24: _toAny = Neon::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Neon::GrayToBgra; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param)
+            : Base::ImagePpmTxtLoader(param)
+        {
+        }
+
+        void ImagePpmTxtLoader::SetConverters()
+        {
+            Base::ImagePpmTxtLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Neon::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Neon::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Neon::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Neon::BgrToBgra; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param)
+            : Base::ImagePpmBinLoader(param)
+        {
+        }
+
+        void ImagePpmBinLoader::SetConverters()
+        {
+            Base::ImagePpmBinLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Neon::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Neon::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Neon::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Neon::BgrToBgra; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageLoader* CreateImageLoader(const ImageLoaderParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtLoader(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinLoader(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtLoader(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinLoader(param);
+            case SimdImageFilePng: return new Base::ImagePngLoader(param);
+            case SimdImageFileJpeg: return new Base::ImageJpegLoader(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+        {
+            ImageLoaderParam param(data, size, *format);
+            if (param.Validate())
+            {
+                Holder<ImageLoader> loader(CreateImageLoader(param));
+                if (loader)
+                {
+                    if (loader->FromStream())
+                        return loader->Release(stride, width, height, format);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_NEON_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdNeonImageSave.cpp b/3rdparty/simdlib/Simd/SimdNeonImageSave.cpp
new file mode 100644
index 0000000000..a0fbbd071a
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdNeonImageSave.cpp
@@ -0,0 +1,134 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdNeon.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_NEON_ENABLE    
+    namespace Neon
+    {
+        ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param)
+            : Base::ImagePgmTxtSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Neon::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Neon::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Neon::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Neon::RgbaToGray; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param)
+            : Base::ImagePgmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Neon::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Neon::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Neon::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Neon::RgbaToGray; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param)
+            : Base::ImagePpmTxtSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Neon::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Neon::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Neon::BgraToRgb; break;
+                case SimdPixelFormatRgba32: _convert = Neon::BgraToBgr; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param)
+            : Base::ImagePpmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Neon::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Neon::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Neon::BgraToRgb; break;
+                case SimdPixelFormatRgba32: _convert = Neon::BgraToBgr; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageSaver* CreateImageSaver(const ImageSaverParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinSaver(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinSaver(param);
+            case SimdImageFilePng: return new ImagePngSaver(param);
+            case SimdImageFileJpeg: return new Base::ImageJpegSaver(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size)
+        {
+            ImageSaverParam param(width, height, format, file, quality);
+            if (param.Validate())
+            {
+                Holder<ImageSaver> saver(CreateImageSaver(param));
+                if (saver)
+                {
+                    if (saver->ToStream(src, stride))
+                        return saver->Release(size);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_NEON_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdPerformance.h b/3rdparty/simdlib/Simd/SimdPerformance.h
new file mode 100644
index 0000000000..e695326a69
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdPerformance.h
@@ -0,0 +1,197 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdPerformance_h__
+#define __SimdPerformance_h__
+
+#include "Simd/SimdDefs.h"
+
+#include <string>
+#include <sstream>
+
+namespace Simd
+{
+    typedef std::string String;
+
+    template <class T> SIMD_INLINE String ToStr(const T & value)
+    {
+        std::stringstream ss;
+        ss << value;
+        return ss.str();
+    }
+}
+
+#if defined(SIMD_PERFORMANCE_STATISTIC) && (defined(NDEBUG) || defined(SIMD_PERF_STAT_IN_DEBUG))
+
+#include "Simd/SimdTime.h"
+
+#include <limits>
+#include <iostream>
+#include <iomanip>
+#include <memory>
+#include <map>
+#include <thread>
+#include <mutex>
+#include <algorithm>
+
+namespace Simd
+{
+    namespace Base
+    {
+        class PerformanceMeasurer
+        {
+            String	_name;
+            int64_t _start, _current, _total, _min, _max;
+            int64_t _count, _flop;
+            bool _entered, _paused;
+
+        public:
+            PerformanceMeasurer(const String& name = "Unknown", int64_t flop = 0);
+
+            PerformanceMeasurer(const PerformanceMeasurer& pm);
+
+            void Enter();
+
+            void Leave(bool pause = false);
+
+            String Statistic() const;
+
+            void Combine(const PerformanceMeasurer& other);
+
+        private:
+            double Average() const;
+            double GFlops() const;
+        };
+
+        class PerformanceMeasurerHolder
+        {
+            PerformanceMeasurer * _pm;
+
+        public:
+            SIMD_INLINE PerformanceMeasurerHolder(PerformanceMeasurer * pm, bool enter = true)
+                : _pm(pm)
+            {
+                if (_pm && enter)
+                    _pm->Enter();
+            }
+
+            SIMD_INLINE void Enter()
+            {
+                if (_pm)
+                    _pm->Enter();
+            }
+
+            SIMD_INLINE void Leave(bool pause)
+            {
+                if (_pm)
+                    _pm->Leave(pause);
+            }
+
+            SIMD_INLINE ~PerformanceMeasurerHolder()
+            {
+                if (_pm)
+                    _pm->Leave();
+            }
+        };
+
+        class PerformanceMeasurerStorage
+        {
+            typedef PerformanceMeasurer Pm;
+            typedef std::shared_ptr<Pm> PmPtr;
+            typedef std::map<String, PmPtr> FunctionMap;
+            typedef std::map<std::thread::id, FunctionMap> ThreadMap;
+
+            ThreadMap _map;
+            mutable std::mutex _mutex;
+            String _report;
+
+            SIMD_INLINE FunctionMap & ThisThread()
+            {
+                static thread_local FunctionMap * thread = NULL;
+                if (thread == NULL)
+                {
+                    std::lock_guard<std::mutex> lock(_mutex);
+                    thread = &_map[std::this_thread::get_id()];
+                }
+                return *thread;
+            }
+
+        public:
+            static PerformanceMeasurerStorage s_storage;
+
+            PerformanceMeasurerStorage()
+            {
+            }
+
+            SIMD_INLINE PerformanceMeasurer * Get(const String & name, int64_t flop = 0)
+            {
+                FunctionMap & thread = ThisThread();
+                PerformanceMeasurer * pm = NULL;
+                FunctionMap::iterator it = thread.find(name);
+                if (it == thread.end())
+                {
+                    pm = new PerformanceMeasurer(name, flop);
+                    thread[name].reset(pm);
+                }
+                else
+                    pm = it->second.get();
+                return pm;
+            }
+
+            SIMD_INLINE PerformanceMeasurer * Get(const String func, const String & desc, int64_t flop = 0)
+            {
+                return Get(func + "{ " + desc + " }", flop);
+            }
+
+            const char* PerformanceStatistic();
+        };
+    }
+}
+#define SIMD_PERF_FUNCF(flop) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)(Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, (int64_t)(flop)))
+#define SIMD_PERF_FUNC() SIMD_PERF_FUNCF(0)
+#define SIMD_PERF_BEGF(desc, flop) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)(Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc, (int64_t)(flop)))
+#define SIMD_PERF_BEG(desc) SIMD_PERF_BEGF(desc, 0)
+#define SIMD_PERF_IFF(cond, desc, flop) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)((cond) ? Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc, (int64_t)(flop)) : NULL)
+#define SIMD_PERF_IF(cond, desc) SIMD_PERF_IFF(cond, desc, 0)
+#define SIMD_PERF_END(desc) Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc)->Leave();
+#define SIMD_PERF_INITF(name, desc, flop) Simd::Base::PerformanceMeasurerHolder name(Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc, (int64_t)(flop)), false);
+#define SIMD_PERF_INIT(name, desc)  SIMD_PERF_INITF(name, desc, 0);
+#define SIMD_PERF_START(name) name.Enter(); 
+#define SIMD_PERF_PAUSE(name) name.Leave(true);
+#define SIMD_PERF_EXT(ext) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)((ext)->Perf(SIMD_FUNCTION)) 
+#else//SIMD_PERFORMANCE_STATISTIC
+#define SIMD_PERF_FUNCF(flop)
+#define SIMD_PERF_FUNC()
+#define SIMD_PERF_BEGF(desc, flop)
+#define SIMD_PERF_BEG(desc)
+#define SIMD_PERF_IFF(cond, desc, flop)
+#define SIMD_PERF_IF(cond, desc)
+#define SIMD_PERF_END(desc)
+#define SIMD_PERF_INITF(name, desc, flop)
+#define SIMD_PERF_INIT(name, desc)
+#define SIMD_PERF_START(name)
+#define SIMD_PERF_PAUSE(name)
+#define SIMD_PERF_EXT(ext)
+#endif//SIMD_PERFORMANCE_STATISTIC 
+
+#endif//__SimdPerformance_h__
diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp
new file mode 100644
index 0000000000..eca83c63ed
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp
@@ -0,0 +1,159 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdSse2.h"
+#include "Simd/SimdSse41.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param)
+            : Base::ImagePgmTxtLoader(param)
+        {
+        }
+
+        void ImagePgmTxtLoader::SetConverters()
+        {
+            Base::ImagePgmTxtLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Sse41::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Sse2::GrayToBgra; break;
+                case SimdPixelFormatRgb24: _toAny = Sse41::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Sse41::GrayToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param)
+            : Base::ImagePgmBinLoader(param)
+        {
+        }
+
+        void ImagePgmBinLoader::SetConverters()
+        {
+            Base::ImagePgmBinLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Sse41::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Sse2::GrayToBgra; break;
+                case SimdPixelFormatRgb24: _toAny = Sse41::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Sse41::GrayToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param)
+            : Base::ImagePpmTxtLoader(param)
+        {
+        }
+
+        void ImagePpmTxtLoader::SetConverters()
+        {
+            Base::ImagePpmTxtLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Sse41::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Sse41::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Sse41::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Sse41::BgrToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param)
+            : Base::ImagePpmBinLoader(param)
+        {
+        }
+
+        void ImagePpmBinLoader::SetConverters()
+        {
+            Base::ImagePpmBinLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Sse41::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Sse41::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Sse41::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Sse41::BgrToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageLoader* CreateImageLoader(const ImageLoaderParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtLoader(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinLoader(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtLoader(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinLoader(param);
+            case SimdImageFilePng: return new ImagePngLoader(param);
+            case SimdImageFileJpeg: return new Base::ImageJpegLoader(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+        {
+            ImageLoaderParam param(data, size, *format);
+            if (param.Validate())
+            {
+                std::unique_ptr<ImageLoader> loader(CreateImageLoader(param));
+                if (loader)
+                {
+                    if (loader->FromStream())
+                        return loader->Release(stride, width, height, format);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_SSE41_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp
new file mode 100644
index 0000000000..1ec6ca0118
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp
@@ -0,0 +1,1805 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdArray.h"
+#include "Simd/SimdCpu.h"
+#include "Simd/SimdBase.h"
+#include "Simd/SimdSse2.h"
+#include "Simd/SimdSse41.h"
+
+namespace Simd
+{
+#if defined(SIMD_SSE41_ENABLE) 
+    namespace Sse41
+    {
+        typedef unsigned char png_uc;
+        typedef unsigned short png_us;
+
+        typedef uint16_t png__uint16;
+        typedef uint32_t png__uint32;
+
+#define png_inline SIMD_INLINE
+#define PNG_ASSERT assert
+#define PNG_MALLOC(sz)           malloc(sz)
+#define PNG_REALLOC(p,newsz)     realloc(p,newsz)
+#define PNG_FREE(p)              free(p)
+#define PNG_REALLOC_SIZED(p,oldsz,newsz) PNG_REALLOC(p,newsz)
+#define STBIDEF static
+
+#ifdef _MSC_VER
+#define PNG_NOTUSED(v)  (void)(v)
+#else
+#define PNG_NOTUSED(v)  (void)sizeof(v)
+#endif
+
+#define PNG__BYTECAST(x)  ((png_uc) ((x) & 255))  // truncate int to byte without warnings
+#define PNG_MAX_DIMENSIONS (1 << 24)
+
+        static int png__err(const char* str, const char* stub)
+        {
+            return 0;
+        }
+
+#define png__errpuc(x,y)  ((unsigned char *)(size_t) (png__err(x,y)?NULL:NULL))
+
+        static void* png__malloc(size_t size)
+        {
+            return PNG_MALLOC(size);
+        }
+
+        typedef struct
+        {
+            int      (*read)  (void* user, char* data, int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
+            void     (*skip)  (void* user, int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+            int      (*eof)   (void* user);                       // returns nonzero if we are at end of file/data
+        } png_io_callbacks;
+
+        typedef struct
+        {
+            png__uint32 img_x, img_y;
+            int img_n, img_out_n;
+
+            png_io_callbacks io;
+            void* io_user_data;
+
+            int read_from_callbacks;
+            int buflen;
+            png_uc buffer_start[128];
+            int callback_already_read;
+
+            png_uc* img_buffer, * img_buffer_end;
+            png_uc* img_buffer_original, * img_buffer_original_end;
+        } png__context;
+
+        typedef struct
+        {
+            int bits_per_channel;
+            int num_channels;
+            int channel_order;
+        } png__result_info;
+
+        enum
+        {
+            PNG__SCAN_load = 0,
+            PNG__SCAN_type,
+            PNG__SCAN_header
+        };
+
+        enum
+        {
+            PNG_ORDER_RGB,
+            PNG_ORDER_BGR
+        };
+
+        static void png__rewind(png__context* s)
+        {
+            // conceptually rewind SHOULD rewind to the beginning of the stream,
+            // but we just rewind to the beginning of the initial buffer, because
+            // we only use it after doing 'test', which only ever looks at at most 92 bytes
+            s->img_buffer = s->img_buffer_original;
+            s->img_buffer_end = s->img_buffer_original_end;
+        }
+
+        static void png__refill_buffer(png__context* s)
+        {
+            int n = (s->io.read)(s->io_user_data, (char*)s->buffer_start, s->buflen);
+            s->callback_already_read += (int)(s->img_buffer - s->img_buffer_original);
+            if (n == 0) {
+                // at end of file, treat same as if from memory, but need to handle case
+                // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
+                s->read_from_callbacks = 0;
+                s->img_buffer = s->buffer_start;
+                s->img_buffer_end = s->buffer_start + 1;
+                *s->img_buffer = 0;
+            }
+            else {
+                s->img_buffer = s->buffer_start;
+                s->img_buffer_end = s->buffer_start + n;
+            }
+        }
+
+        png_inline static png_uc png__get8(png__context* s)
+        {
+            if (s->img_buffer < s->img_buffer_end)
+                return *s->img_buffer++;
+            if (s->read_from_callbacks) {
+                png__refill_buffer(s);
+                return *s->img_buffer++;
+            }
+            return 0;
+        }
+
+        static int png__get16be(png__context* s)
+        {
+            int z = png__get8(s);
+            return (z << 8) + png__get8(s);
+        }
+
+        static png__uint32 png__get32be(png__context* s)
+        {
+            png__uint32 z = png__get16be(s);
+            return (z << 16) + png__get16be(s);
+        }
+
+        png_inline static int png__at_eof(png__context* s)
+        {
+            if (s->io.read) {
+                if (!(s->io.eof)(s->io_user_data)) return 0;
+                // if feof() is true, check if buffer = end
+                // special case: we've only got the special 0 character at the end
+                if (s->read_from_callbacks == 0) return 1;
+            }
+
+            return s->img_buffer >= s->img_buffer_end;
+        }
+
+        static void png__skip(png__context* s, int n)
+        {
+            if (n == 0) return;  // already there!
+            if (n < 0) {
+                s->img_buffer = s->img_buffer_end;
+                return;
+            }
+            if (s->io.read) {
+                int blen = (int)(s->img_buffer_end - s->img_buffer);
+                if (blen < n) {
+                    s->img_buffer = s->img_buffer_end;
+                    (s->io.skip)(s->io_user_data, n - blen);
+                    return;
+                }
+            }
+            s->img_buffer += n;
+        }
+
+        static int png__getn(png__context* s, png_uc* buffer, int n)
+        {
+            if (s->io.read) {
+                int blen = (int)(s->img_buffer_end - s->img_buffer);
+                if (blen < n) {
+                    int res, count;
+
+                    memcpy(buffer, s->img_buffer, blen);
+
+                    count = (s->io.read)(s->io_user_data, (char*)buffer + blen, n - blen);
+                    res = (count == (n - blen));
+                    s->img_buffer = s->img_buffer_end;
+                    return res;
+                }
+            }
+
+            if (s->img_buffer + n <= s->img_buffer_end) {
+                memcpy(buffer, s->img_buffer, n);
+                s->img_buffer += n;
+                return 1;
+            }
+            else
+                return 0;
+        }
+
+        static int png__addsizes_valid(int a, int b)
+        {
+            if (b < 0) return 0;
+            // now 0 <= b <= INT_MAX, hence also
+            // 0 <= INT_MAX - b <= INTMAX.
+            // And "a + b <= INT_MAX" (which might overflow) is the
+            // same as a <= INT_MAX - b (no overflow)
+            return a <= INT_MAX - b;
+        }
+
+        // returns 1 if the product is valid, 0 on overflow.
+        // negative factors are considered invalid.
+        static int png__mul2sizes_valid(int a, int b)
+        {
+            if (a < 0 || b < 0) return 0;
+            if (b == 0) return 1; // mul-by-0 is always safe
+            // portable way to check for no overflows in a*b
+            return a <= INT_MAX / b;
+        }
+
+        // returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
+        static int png__mad2sizes_valid(int a, int b, int add)
+        {
+            return png__mul2sizes_valid(a, b) && png__addsizes_valid(a * b, add);
+        }
+
+        // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
+        static int png__mad3sizes_valid(int a, int b, int c, int add)
+        {
+            return png__mul2sizes_valid(a, b) && png__mul2sizes_valid(a * b, c) &&
+                png__addsizes_valid(a * b * c, add);
+        }
+
+        // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
+        static int png__mad4sizes_valid(int a, int b, int c, int d, int add)
+        {
+            return png__mul2sizes_valid(a, b) && png__mul2sizes_valid(a * b, c) &&
+                png__mul2sizes_valid(a * b * c, d) && png__addsizes_valid(a * b * c * d, add);
+        }
+
+        // mallocs with size overflow checking
+        static void* png__malloc_mad2(int a, int b, int add)
+        {
+            if (!png__mad2sizes_valid(a, b, add)) return NULL;
+            return png__malloc(a * b + add);
+        }
+
+        static void* png__malloc_mad3(int a, int b, int c, int add)
+        {
+            if (!png__mad3sizes_valid(a, b, c, add)) return NULL;
+            return png__malloc(a * b * c + add);
+        }
+
+        static void* png__malloc_mad4(int a, int b, int c, int d, int add)
+        {
+            if (!png__mad4sizes_valid(a, b, c, d, add)) return NULL;
+            return png__malloc(a * b * c * d + add);
+        }
+
+        static png_uc png__compute_y(int r, int g, int b)
+        {
+            return (png_uc)(((r * 77) + (g * 150) + (29 * b)) >> 8);
+        }
+
+        static unsigned char* png__convert_format(unsigned char* data, int img_n, int req_comp, unsigned int x, unsigned int y)
+        {
+            int i, j;
+            unsigned char* good;
+
+            if (req_comp == img_n) return data;
+            PNG_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+            good = (unsigned char*)png__malloc_mad3(req_comp, x, y, 0);
+            if (good == NULL) {
+                PNG_FREE(data);
+                return png__errpuc("outofmem", "Out of memory");
+            }
+
+            for (j = 0; j < (int)y; ++j) {
+                unsigned char* src = data + j * x * img_n;
+                unsigned char* dest = good + j * x * req_comp;
+
+#define PNG__COMBO(a,b)  ((a)*8+(b))
+#define PNG__CASE(a,b)   case PNG__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+                // convert source image with img_n components to one with req_comp components;
+                // avoid switch per pixel, so use switch per scanline and massive macros
+                switch (PNG__COMBO(img_n, req_comp)) {
+                    PNG__CASE(1, 2) { dest[0] = src[0]; dest[1] = 255; } break;
+                    PNG__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(1, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = 255; } break;
+                    PNG__CASE(2, 1) { dest[0] = src[0]; } break;
+                    PNG__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(2, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = src[1]; } break;
+                    PNG__CASE(3, 4) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; dest[3] = 255; } break;
+                    PNG__CASE(3, 1) { dest[0] = png__compute_y(src[0], src[1], src[2]); } break;
+                    PNG__CASE(3, 2) { dest[0] = png__compute_y(src[0], src[1], src[2]); dest[1] = 255; } break;
+                    PNG__CASE(4, 1) { dest[0] = png__compute_y(src[0], src[1], src[2]); } break;
+                    PNG__CASE(4, 2) { dest[0] = png__compute_y(src[0], src[1], src[2]); dest[1] = src[3]; } break;
+                    PNG__CASE(4, 3) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; } break;
+                default: PNG_ASSERT(0); PNG_FREE(data); PNG_FREE(good); return png__errpuc("unsupported", "Unsupported format conversion");
+                }
+#undef PNG__CASE
+            }
+
+            PNG_FREE(data);
+            return good;
+        }
+
+        static png__uint16 png__compute_y_16(int r, int g, int b)
+        {
+            return (png__uint16)(((r * 77) + (g * 150) + (29 * b)) >> 8);
+        }
+
+        static png__uint16* png__convert_format16(png__uint16* data, int img_n, int req_comp, unsigned int x, unsigned int y)
+        {
+            int i, j;
+            png__uint16* good;
+
+            if (req_comp == img_n) return data;
+            PNG_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+            good = (png__uint16*)png__malloc(req_comp * x * y * 2);
+            if (good == NULL) {
+                PNG_FREE(data);
+                return (png__uint16*)png__errpuc("outofmem", "Out of memory");
+            }
+
+            for (j = 0; j < (int)y; ++j) {
+                png__uint16* src = data + j * x * img_n;
+                png__uint16* dest = good + j * x * req_comp;
+
+#define PNG__COMBO(a,b)  ((a)*8+(b))
+#define PNG__CASE(a,b)   case PNG__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+                // convert source image with img_n components to one with req_comp components;
+                // avoid switch per pixel, so use switch per scanline and massive macros
+                switch (PNG__COMBO(img_n, req_comp)) {
+                    PNG__CASE(1, 2) { dest[0] = src[0]; dest[1] = 0xffff; } break;
+                    PNG__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(1, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = 0xffff; } break;
+                    PNG__CASE(2, 1) { dest[0] = src[0]; } break;
+                    PNG__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(2, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = src[1]; } break;
+                    PNG__CASE(3, 4) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; dest[3] = 0xffff; } break;
+                    PNG__CASE(3, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break;
+                    PNG__CASE(3, 2) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); dest[1] = 0xffff; } break;
+                    PNG__CASE(4, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break;
+                    PNG__CASE(4, 2) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); dest[1] = src[3]; } break;
+                    PNG__CASE(4, 3) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; } break;
+                default: PNG_ASSERT(0); PNG_FREE(data); PNG_FREE(good); return (png__uint16*)png__errpuc("unsupported", "Unsupported format conversion");
+                }
+#undef PNG__CASE
+            }
+
+            PNG_FREE(data);
+            return good;
+        }
+
+        // fast-way is faster to check than jpeg huffman, but slow way is slower
+#define PNG__ZFAST_BITS  9 // accelerate all cases in default tables
+#define PNG__ZFAST_MASK  ((1 << PNG__ZFAST_BITS) - 1)
+
+// zlib-style huffman encoding
+// (jpegs packs from left, zlib from right, so can't share code)
+        typedef struct
+        {
+            png__uint16 fast[1 << PNG__ZFAST_BITS];
+            png__uint16 firstcode[16];
+            int maxcode[17];
+            png__uint16 firstsymbol[16];
+            png_uc  size[288];
+            png__uint16 value[288];
+        } png__zhuffman;
+
+        png_inline static int png__bitreverse16(int n)
+        {
+            n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1);
+            n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2);
+            n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4);
+            n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8);
+            return n;
+        }
+
+        png_inline static int png__bit_reverse(int v, int bits)
+        {
+            PNG_ASSERT(bits <= 16);
+            // to bit reverse n bits, reverse 16 and shift
+            // e.g. 11 bits, bit reverse and shift away 5
+            return png__bitreverse16(v) >> (16 - bits);
+        }
+
+        static int png__zbuild_huffman(png__zhuffman* z, const png_uc* sizelist, int num)
+        {
+            int i, k = 0;
+            int code, next_code[16], sizes[17];
+
+            // DEFLATE spec for generating codes
+            memset(sizes, 0, sizeof(sizes));
+            memset(z->fast, 0, sizeof(z->fast));
+            for (i = 0; i < num; ++i)
+                ++sizes[sizelist[i]];
+            sizes[0] = 0;
+            for (i = 1; i < 16; ++i)
+                if (sizes[i] > (1 << i))
+                    return png__err("bad sizes", "Corrupt PNG");
+            code = 0;
+            for (i = 1; i < 16; ++i) {
+                next_code[i] = code;
+                z->firstcode[i] = (png__uint16)code;
+                z->firstsymbol[i] = (png__uint16)k;
+                code = (code + sizes[i]);
+                if (sizes[i])
+                    if (code - 1 >= (1 << i)) return png__err("bad codelengths", "Corrupt PNG");
+                z->maxcode[i] = code << (16 - i); // preshift for inner loop
+                code <<= 1;
+                k += sizes[i];
+            }
+            z->maxcode[16] = 0x10000; // sentinel
+            for (i = 0; i < num; ++i) {
+                int s = sizelist[i];
+                if (s) {
+                    int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
+                    png__uint16 fastv = (png__uint16)((s << 9) | i);
+                    z->size[c] = (png_uc)s;
+                    z->value[c] = (png__uint16)i;
+                    if (s <= PNG__ZFAST_BITS) {
+                        int j = png__bit_reverse(next_code[s], s);
+                        while (j < (1 << PNG__ZFAST_BITS)) {
+                            z->fast[j] = fastv;
+                            j += (1 << s);
+                        }
+                    }
+                    ++next_code[s];
+                }
+            }
+            return 1;
+        }
+
+        // zlib-from-memory implementation for PNG reading
+        //    because PNG allows splitting the zlib stream arbitrarily,
+        //    and it's annoying structurally to have PNG call ZLIB call PNG,
+        //    we require PNG read all the IDATs and combine them into a single
+        //    memory buffer
+
+        typedef struct
+        {
+            png_uc* zbuffer, * zbuffer_end;
+            int num_bits;
+            png__uint32 code_buffer;
+
+            char* zout;
+            char* zout_start;
+            char* zout_end;
+            int   z_expandable;
+
+            png__zhuffman z_length, z_distance;
+        } png__zbuf;
+
+        png_inline static int png__zeof(png__zbuf* z)
+        {
+            return (z->zbuffer >= z->zbuffer_end);
+        }
+
+        png_inline static png_uc png__zget8(png__zbuf* z)
+        {
+            return png__zeof(z) ? 0 : *z->zbuffer++;
+        }
+
+        static void png__fill_bits(png__zbuf* z)
+        {
+            do {
+                if (z->code_buffer >= (1U << z->num_bits)) {
+                    z->zbuffer = z->zbuffer_end;  /* treat this as EOF so we fail. */
+                    return;
+                }
+                z->code_buffer |= (unsigned int)png__zget8(z) << z->num_bits;
+                z->num_bits += 8;
+            } while (z->num_bits <= 24);
+        }
+
+        png_inline static unsigned int png__zreceive(png__zbuf* z, int n)
+        {
+            unsigned int k;
+            if (z->num_bits < n) png__fill_bits(z);
+            k = z->code_buffer & ((1 << n) - 1);
+            z->code_buffer >>= n;
+            z->num_bits -= n;
+            return k;
+        }
+
+        static int png__zhuffman_decode_slowpath(png__zbuf* a, png__zhuffman* z)
+        {
+            int b, s, k;
+            // not resolved by fast table, so compute it the slow way
+            // use jpeg approach, which requires MSbits at top
+            k = png__bit_reverse(a->code_buffer, 16);
+            for (s = PNG__ZFAST_BITS + 1; ; ++s)
+                if (k < z->maxcode[s])
+                    break;
+            if (s >= 16) return -1; // invalid code!
+            // code size is s, so:
+            b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s];
+            if (b >= sizeof(z->size)) return -1; // some data was corrupt somewhere!
+            if (z->size[b] != s) return -1;  // was originally an assert, but report failure instead.
+            a->code_buffer >>= s;
+            a->num_bits -= s;
+            return z->value[b];
+        }
+
+        png_inline static int png__zhuffman_decode(png__zbuf* a, png__zhuffman* z)
+        {
+            int b, s;
+            if (a->num_bits < 16) {
+                if (png__zeof(a)) {
+                    return -1;   /* report error for unexpected end of data. */
+                }
+                png__fill_bits(a);
+            }
+            b = z->fast[a->code_buffer & PNG__ZFAST_MASK];
+            if (b) {
+                s = b >> 9;
+                a->code_buffer >>= s;
+                a->num_bits -= s;
+                return b & 511;
+            }
+            return png__zhuffman_decode_slowpath(a, z);
+        }
+
+        static int png__zexpand(png__zbuf* z, char* zout, int n)  // need to make room for n bytes
+        {
+            char* q;
+            unsigned int cur, limit, old_limit;
+            z->zout = zout;
+            if (!z->z_expandable) return png__err("output buffer limit", "Corrupt PNG");
+            cur = (unsigned int)(z->zout - z->zout_start);
+            limit = old_limit = (unsigned)(z->zout_end - z->zout_start);
+            if (UINT_MAX - cur < (unsigned)n) return png__err("outofmem", "Out of memory");
+            while (cur + n > limit) {
+                if (limit > UINT_MAX / 2) return png__err("outofmem", "Out of memory");
+                limit *= 2;
+            }
+            q = (char*)PNG_REALLOC_SIZED(z->zout_start, old_limit, limit);
+            PNG_NOTUSED(old_limit);
+            if (q == NULL) return png__err("outofmem", "Out of memory");
+            z->zout_start = q;
+            z->zout = q + cur;
+            z->zout_end = q + limit;
+            return 1;
+        }
+
+        static const int png__zlength_base[31] = {
+           3,4,5,6,7,8,9,10,11,13,
+           15,17,19,23,27,31,35,43,51,59,
+           67,83,99,115,131,163,195,227,258,0,0 };
+
+        static const int png__zlength_extra[31] =
+        { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
+
+        static const int png__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
+        257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0 };
+
+        static const int png__zdist_extra[32] =
+        { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
+
+        static int png__parse_huffman_block(png__zbuf* a)
+        {
+            char* zout = a->zout;
+            for (;;) {
+                int z = png__zhuffman_decode(a, &a->z_length);
+                if (z < 256) {
+                    if (z < 0) return png__err("bad huffman code", "Corrupt PNG"); // error in huffman codes
+                    if (zout >= a->zout_end) {
+                        if (!png__zexpand(a, zout, 1)) return 0;
+                        zout = a->zout;
+                    }
+                    *zout++ = (char)z;
+                }
+                else {
+                    png_uc* p;
+                    int len, dist;
+                    if (z == 256) {
+                        a->zout = zout;
+                        return 1;
+                    }
+                    z -= 257;
+                    len = png__zlength_base[z];
+                    if (png__zlength_extra[z]) len += png__zreceive(a, png__zlength_extra[z]);
+                    z = png__zhuffman_decode(a, &a->z_distance);
+                    if (z < 0) return png__err("bad huffman code", "Corrupt PNG");
+                    dist = png__zdist_base[z];
+                    if (png__zdist_extra[z]) dist += png__zreceive(a, png__zdist_extra[z]);
+                    if (zout - a->zout_start < dist) return png__err("bad dist", "Corrupt PNG");
+                    if (zout + len > a->zout_end) {
+                        if (!png__zexpand(a, zout, len)) return 0;
+                        zout = a->zout;
+                    }
+                    p = (png_uc*)(zout - dist);
+                    if (dist == 1) { // run of one byte; common in images.
+                        png_uc v = *p;
+                        if (len) { do *zout++ = v; while (--len); }
+                    }
+                    else {
+                        if (len) { do *zout++ = *p++; while (--len); }
+                    }
+                }
+            }
+        }
+
+        static int png__compute_huffman_codes(png__zbuf* a)
+        {
+            static const png_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
+            png__zhuffman z_codelength;
+            png_uc lencodes[286 + 32 + 137];//padding for maximum single op
+            png_uc codelength_sizes[19];
+            int i, n;
+
+            int hlit = png__zreceive(a, 5) + 257;
+            int hdist = png__zreceive(a, 5) + 1;
+            int hclen = png__zreceive(a, 4) + 4;
+            int ntot = hlit + hdist;
+
+            memset(codelength_sizes, 0, sizeof(codelength_sizes));
+            for (i = 0; i < hclen; ++i) {
+                int s = png__zreceive(a, 3);
+                codelength_sizes[length_dezigzag[i]] = (png_uc)s;
+            }
+            if (!png__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
+
+            n = 0;
+            while (n < ntot) {
+                int c = png__zhuffman_decode(a, &z_codelength);
+                if (c < 0 || c >= 19) return png__err("bad codelengths", "Corrupt PNG");
+                if (c < 16)
+                    lencodes[n++] = (png_uc)c;
+                else {
+                    png_uc fill = 0;
+                    if (c == 16) {
+                        c = png__zreceive(a, 2) + 3;
+                        if (n == 0) return png__err("bad codelengths", "Corrupt PNG");
+                        fill = lencodes[n - 1];
+                    }
+                    else if (c == 17) {
+                        c = png__zreceive(a, 3) + 3;
+                    }
+                    else if (c == 18) {
+                        c = png__zreceive(a, 7) + 11;
+                    }
+                    else {
+                        return png__err("bad codelengths", "Corrupt PNG");
+                    }
+                    if (ntot - n < c) return png__err("bad codelengths", "Corrupt PNG");
+                    memset(lencodes + n, fill, c);
+                    n += c;
+                }
+            }
+            if (n != ntot) return png__err("bad codelengths", "Corrupt PNG");
+            if (!png__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
+            if (!png__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist)) return 0;
+            return 1;
+        }
+
+        static int png__parse_uncompressed_block(png__zbuf* a)
+        {
+            png_uc header[4];
+            int len, nlen, k;
+            if (a->num_bits & 7)
+                png__zreceive(a, a->num_bits & 7); // discard
+             // drain the bit-packed data into header
+            k = 0;
+            while (a->num_bits > 0) {
+                header[k++] = (png_uc)(a->code_buffer & 255); // suppress MSVC run-time check
+                a->code_buffer >>= 8;
+                a->num_bits -= 8;
+            }
+            if (a->num_bits < 0) return png__err("zlib corrupt", "Corrupt PNG");
+            // now fill header the normal way
+            while (k < 4)
+                header[k++] = png__zget8(a);
+            len = header[1] * 256 + header[0];
+            nlen = header[3] * 256 + header[2];
+            if (nlen != (len ^ 0xffff)) return png__err("zlib corrupt", "Corrupt PNG");
+            if (a->zbuffer + len > a->zbuffer_end) return png__err("read past buffer", "Corrupt PNG");
+            if (a->zout + len > a->zout_end)
+                if (!png__zexpand(a, a->zout, len)) return 0;
+            memcpy(a->zout, a->zbuffer, len);
+            a->zbuffer += len;
+            a->zout += len;
+            return 1;
+        }
+
+        static int png__parse_zlib_header(png__zbuf* a)
+        {
+            int cmf = png__zget8(a);
+            int cm = cmf & 15;
+            /* int cinfo = cmf >> 4; */
+            int flg = png__zget8(a);
+            if (png__zeof(a)) return png__err("bad zlib header", "Corrupt PNG"); // zlib spec
+            if ((cmf * 256 + flg) % 31 != 0) return png__err("bad zlib header", "Corrupt PNG"); // zlib spec
+            if (flg & 32) return png__err("no preset dict", "Corrupt PNG"); // preset dictionary not allowed in png
+            if (cm != 8) return png__err("bad compression", "Corrupt PNG"); // DEFLATE required for png
+            // window = 1 << (8 + cinfo)... but who cares, we fully buffer output
+            return 1;
+        }
+
+        static const png_uc png__zdefault_length[288] =
+        {
+           8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+           8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+           8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+           8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+           8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+           9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+           9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+           9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+           7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
+        };
+        static const png_uc png__zdefault_distance[32] =
+        {
+           5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+        };
+        /*
+        Init algorithm:
+        {
+           int i;   // use <= to match clearly with spec
+           for (i=0; i <= 143; ++i)     png__zdefault_length[i]   = 8;
+           for (   ; i <= 255; ++i)     png__zdefault_length[i]   = 9;
+           for (   ; i <= 279; ++i)     png__zdefault_length[i]   = 7;
+           for (   ; i <= 287; ++i)     png__zdefault_length[i]   = 8;
+
+           for (i=0; i <=  31; ++i)     png__zdefault_distance[i] = 5;
+        }
+        */
+
+        static int png__parse_zlib(png__zbuf* a, int parse_header)
+        {
+            int final, type;
+            if (parse_header)
+                if (!png__parse_zlib_header(a)) return 0;
+            a->num_bits = 0;
+            a->code_buffer = 0;
+            do {
+                final = png__zreceive(a, 1);
+                type = png__zreceive(a, 2);
+                if (type == 0) {
+                    if (!png__parse_uncompressed_block(a)) return 0;
+                }
+                else if (type == 3) {
+                    return 0;
+                }
+                else {
+                    if (type == 1) {
+                        // use fixed code lengths
+                        if (!png__zbuild_huffman(&a->z_length, png__zdefault_length, 288)) return 0;
+                        if (!png__zbuild_huffman(&a->z_distance, png__zdefault_distance, 32)) return 0;
+                    }
+                    else {
+                        if (!png__compute_huffman_codes(a)) return 0;
+                    }
+                    if (!png__parse_huffman_block(a)) return 0;
+                }
+            } while (!final);
+            return 1;
+        }
+
+        static int png__do_zlib(png__zbuf* a, char* obuf, int olen, int exp, int parse_header)
+        {
+            a->zout_start = obuf;
+            a->zout = obuf;
+            a->zout_end = obuf + olen;
+            a->z_expandable = exp;
+
+            return png__parse_zlib(a, parse_header);
+        }
+
+        STBIDEF char* png_zlib_decode_malloc_guesssize(const char* buffer, int len, int initial_size, int* outlen)
+        {
+            png__zbuf a;
+            char* p = (char*)png__malloc(initial_size);
+            if (p == NULL) return NULL;
+            a.zbuffer = (png_uc*)buffer;
+            a.zbuffer_end = (png_uc*)buffer + len;
+            if (png__do_zlib(&a, p, initial_size, 1, 1)) {
+                if (outlen) *outlen = (int)(a.zout - a.zout_start);
+                return a.zout_start;
+            }
+            else {
+                PNG_FREE(a.zout_start);
+                return NULL;
+            }
+        }
+
+        STBIDEF char* png_zlib_decode_malloc(char const* buffer, int len, int* outlen)
+        {
+            return png_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen);
+        }
+
+        STBIDEF char* png_zlib_decode_malloc_guesssize_headerflag(const char* buffer, int len, int initial_size, int* outlen, int parse_header)
+        {
+            png__zbuf a;
+            char* p = (char*)png__malloc(initial_size);
+            if (p == NULL) return NULL;
+            a.zbuffer = (png_uc*)buffer;
+            a.zbuffer_end = (png_uc*)buffer + len;
+            if (png__do_zlib(&a, p, initial_size, 1, parse_header)) {
+                if (outlen) *outlen = (int)(a.zout - a.zout_start);
+                return a.zout_start;
+            }
+            else {
+                PNG_FREE(a.zout_start);
+                return NULL;
+            }
+        }
+
+        STBIDEF int png_zlib_decode_buffer(char* obuffer, int olen, char const* ibuffer, int ilen)
+        {
+            png__zbuf a;
+            a.zbuffer = (png_uc*)ibuffer;
+            a.zbuffer_end = (png_uc*)ibuffer + ilen;
+            if (png__do_zlib(&a, obuffer, olen, 0, 1))
+                return (int)(a.zout - a.zout_start);
+            else
+                return -1;
+        }
+
+        STBIDEF char* png_zlib_decode_noheader_malloc(char const* buffer, int len, int* outlen)
+        {
+            png__zbuf a;
+            char* p = (char*)png__malloc(16384);
+            if (p == NULL) return NULL;
+            a.zbuffer = (png_uc*)buffer;
+            a.zbuffer_end = (png_uc*)buffer + len;
+            if (png__do_zlib(&a, p, 16384, 1, 0)) {
+                if (outlen) *outlen = (int)(a.zout - a.zout_start);
+                return a.zout_start;
+            }
+            else {
+                PNG_FREE(a.zout_start);
+                return NULL;
+            }
+        }
+
+        STBIDEF int png_zlib_decode_noheader_buffer(char* obuffer, int olen, const char* ibuffer, int ilen)
+        {
+            png__zbuf a;
+            a.zbuffer = (png_uc*)ibuffer;
+            a.zbuffer_end = (png_uc*)ibuffer + ilen;
+            if (png__do_zlib(&a, obuffer, olen, 0, 0))
+                return (int)(a.zout - a.zout_start);
+            else
+                return -1;
+        }
+
+
+        // public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
+        //    simple implementation
+        //      - only 8-bit samples
+        //      - no CRC checking
+        //      - allocates lots of intermediate memory
+        //        - avoids problem of streaming data between subsystems
+        //        - avoids explicit window management
+        //    performance
+        //      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
+
+        typedef struct
+        {
+            png__uint32 length;
+            png__uint32 type;
+        } png__pngchunk;
+
+        static png__pngchunk png__get_chunk_header(png__context* s)
+        {
+            png__pngchunk c;
+            c.length = png__get32be(s);
+            c.type = png__get32be(s);
+            return c;
+        }
+
+        static int png__check_png_header(png__context* s)
+        {
+            static const png_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
+            int i;
+            for (i = 0; i < 8; ++i)
+                if (png__get8(s) != png_sig[i]) return png__err("bad png sig", "Not a PNG");
+            return 1;
+        }
+
+        typedef struct
+        {
+            png__context* s;
+            png_uc* idata, * expanded, * out;
+            int depth;
+        } png__png;
+
+
+        enum {
+            PNG__F_none = 0,
+            PNG__F_sub = 1,
+            PNG__F_up = 2,
+            PNG__F_avg = 3,
+            PNG__F_paeth = 4,
+            // synthetic filters used for first scanline to avoid needing a dummy row of 0s
+            PNG__F_avg_first,
+            PNG__F_paeth_first
+        };
+
+        static png_uc first_row_filter[5] =
+        {
+           PNG__F_none,
+           PNG__F_sub,
+           PNG__F_none,
+           PNG__F_avg_first,
+           PNG__F_paeth_first
+        };
+
+        static int png__paeth(int a, int b, int c)
+        {
+            int p = a + b - c;
+            int pa = abs(p - a);
+            int pb = abs(p - b);
+            int pc = abs(p - c);
+            if (pa <= pb && pa <= pc) return a;
+            if (pb <= pc) return b;
+            return c;
+        }
+
+        static const png_uc png__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
+
+        // create the png data from post-deflated data
+        static int png__create_png_image_raw(png__png* a, png_uc* raw, png__uint32 raw_len, int out_n, png__uint32 x, png__uint32 y, int depth, int color)
+        {
+            int bytes = (depth == 16 ? 2 : 1);
+            png__context* s = a->s;
+            png__uint32 i, j, stride = x * out_n * bytes;
+            png__uint32 img_len, img_width_bytes;
+            int k;
+            int img_n = s->img_n; // copy it into a local for later
+
+            int output_bytes = out_n * bytes;
+            int filter_bytes = img_n * bytes;
+            int width = x;
+
+            PNG_ASSERT(out_n == s->img_n || out_n == s->img_n + 1);
+            a->out = (png_uc*)png__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
+            if (!a->out) return png__err("outofmem", "Out of memory");
+
+            if (!png__mad3sizes_valid(img_n, x, depth, 7)) return png__err("too large", "Corrupt PNG");
+            img_width_bytes = (((img_n * x * depth) + 7) >> 3);
+            img_len = (img_width_bytes + 1) * y;
+
+            // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
+            // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
+            // so just check for raw_len < img_len always.
+            if (raw_len < img_len) return png__err("not enough pixels", "Corrupt PNG");
+
+            for (j = 0; j < y; ++j) {
+                png_uc* cur = a->out + stride * j;
+                png_uc* prior;
+                int filter = *raw++;
+
+                if (filter > 4)
+                    return png__err("invalid filter", "Corrupt PNG");
+
+                if (depth < 8) {
+                    if (img_width_bytes > x) return png__err("invalid width", "Corrupt PNG");
+                    cur += x * out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
+                    filter_bytes = 1;
+                    width = img_width_bytes;
+                }
+                prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
+
+                // if first row, use special filter that doesn't sample previous row
+                if (j == 0) filter = first_row_filter[filter];
+
+                // handle first byte explicitly
+                for (k = 0; k < filter_bytes; ++k) {
+                    switch (filter) {
+                    case PNG__F_none: cur[k] = raw[k]; break;
+                    case PNG__F_sub: cur[k] = raw[k]; break;
+                    case PNG__F_up: cur[k] = PNG__BYTECAST(raw[k] + prior[k]); break;
+                    case PNG__F_avg: cur[k] = PNG__BYTECAST(raw[k] + (prior[k] >> 1)); break;
+                    case PNG__F_paeth: cur[k] = PNG__BYTECAST(raw[k] + png__paeth(0, prior[k], 0)); break;
+                    case PNG__F_avg_first: cur[k] = raw[k]; break;
+                    case PNG__F_paeth_first: cur[k] = raw[k]; break;
+                    }
+                }
+
+                if (depth == 8) {
+                    if (img_n != out_n)
+                        cur[img_n] = 255; // first pixel
+                    raw += img_n;
+                    cur += out_n;
+                    prior += out_n;
+                }
+                else if (depth == 16) {
+                    if (img_n != out_n) {
+                        cur[filter_bytes] = 255; // first pixel top byte
+                        cur[filter_bytes + 1] = 255; // first pixel bottom byte
+                    }
+                    raw += filter_bytes;
+                    cur += output_bytes;
+                    prior += output_bytes;
+                }
+                else {
+                    raw += 1;
+                    cur += 1;
+                    prior += 1;
+                }
+
+                // this is a little gross, so that we don't switch per-pixel or per-component
+                if (depth < 8 || img_n == out_n) {
+                    int nk = (width - 1) * filter_bytes;
+#define PNG__CASE(f) \
+             case f:     \
+                for (k=0; k < nk; ++k)
+                    switch (filter) {
+                        // "none" filter turns into a memcpy here; make that explicit.
+                    case PNG__F_none:         memcpy(cur, raw, nk); break;
+                        PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - filter_bytes]); } break;
+                        PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break;
+                        PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes])); } break;
+                        PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], 0, 0)); } break;
+                    }
+#undef PNG__CASE
+                    raw += nk;
+                }
+                else {
+                    PNG_ASSERT(img_n + 1 == out_n);
+#define PNG__CASE(f) \
+             case f:     \
+                for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
+                   for (k=0; k < filter_bytes; ++k)
+                    switch (filter) {
+                        PNG__CASE(PNG__F_none) { cur[k] = raw[k]; } break;
+                        PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - output_bytes]); } break;
+                        PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break;
+                        PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - output_bytes]) >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes])); } break;
+                        PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - output_bytes] >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], 0, 0)); } break;
+                    }
+#undef PNG__CASE
+
+                    // the loop above sets the high byte of the pixels' alpha, but for
+                    // 16 bit png files we also need the low byte set. we'll do that here.
+                    if (depth == 16) {
+                        cur = a->out + stride * j; // start at the beginning of the row again
+                        for (i = 0; i < x; ++i, cur += output_bytes) {
+                            cur[filter_bytes + 1] = 255;
+                        }
+                    }
+                }
+            }
+
+            // we make a separate pass to expand bits to pixels; for performance,
+            // this could run two scanlines behind the above code, so it won't
+            // intefere with filtering but will still be in the cache.
+            if (depth < 8) {
+                for (j = 0; j < y; ++j) {
+                    png_uc* cur = a->out + stride * j;
+                    png_uc* in = a->out + stride * j + x * out_n - img_width_bytes;
+                    // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
+                    // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
+                    png_uc scale = (color == 0) ? png__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
+
+                    // note that the final byte might overshoot and write more data than desired.
+                    // we can allocate enough data that this never writes out of memory, but it
+                    // could also overwrite the next scanline. can it overwrite non-empty data
+                    // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
+                    // so we need to explicitly clamp the final ones
+
+                    if (depth == 4) {
+                        for (k = x * img_n; k >= 2; k -= 2, ++in) {
+                            *cur++ = scale * ((*in >> 4));
+                            *cur++ = scale * ((*in) & 0x0f);
+                        }
+                        if (k > 0) *cur++ = scale * ((*in >> 4));
+                    }
+                    else if (depth == 2) {
+                        for (k = x * img_n; k >= 4; k -= 4, ++in) {
+                            *cur++ = scale * ((*in >> 6));
+                            *cur++ = scale * ((*in >> 4) & 0x03);
+                            *cur++ = scale * ((*in >> 2) & 0x03);
+                            *cur++ = scale * ((*in) & 0x03);
+                        }
+                        if (k > 0) *cur++ = scale * ((*in >> 6));
+                        if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03);
+                        if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03);
+                    }
+                    else if (depth == 1) {
+                        for (k = x * img_n; k >= 8; k -= 8, ++in) {
+                            *cur++ = scale * ((*in >> 7));
+                            *cur++ = scale * ((*in >> 6) & 0x01);
+                            *cur++ = scale * ((*in >> 5) & 0x01);
+                            *cur++ = scale * ((*in >> 4) & 0x01);
+                            *cur++ = scale * ((*in >> 3) & 0x01);
+                            *cur++ = scale * ((*in >> 2) & 0x01);
+                            *cur++ = scale * ((*in >> 1) & 0x01);
+                            *cur++ = scale * ((*in) & 0x01);
+                        }
+                        if (k > 0) *cur++ = scale * ((*in >> 7));
+                        if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01);
+                        if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01);
+                        if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01);
+                        if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01);
+                        if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01);
+                        if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
+                    }
+                    if (img_n != out_n) {
+                        int q;
+                        // insert alpha = 255
+                        cur = a->out + stride * j;
+                        if (img_n == 1) {
+                            for (q = x - 1; q >= 0; --q) {
+                                cur[q * 2 + 1] = 255;
+                                cur[q * 2 + 0] = cur[q];
+                            }
+                        }
+                        else {
+                            PNG_ASSERT(img_n == 3);
+                            for (q = x - 1; q >= 0; --q) {
+                                cur[q * 4 + 3] = 255;
+                                cur[q * 4 + 2] = cur[q * 3 + 2];
+                                cur[q * 4 + 1] = cur[q * 3 + 1];
+                                cur[q * 4 + 0] = cur[q * 3 + 0];
+                            }
+                        }
+                    }
+                }
+            }
+            else if (depth == 16) {
+                // force the image data from big-endian to platform-native.
+                // this is done in a separate pass due to the decoding relying
+                // on the data being untouched, but could probably be done
+                // per-line during decode if care is taken.
+                png_uc* cur = a->out;
+                png__uint16* cur16 = (png__uint16*)cur;
+
+                for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) {
+                    *cur16 = (cur[0] << 8) | cur[1];
+                }
+            }
+
+            return 1;
+        }
+
+        static int png__create_png_image(png__png* a, png_uc* image_data, png__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
+        {
+            int bytes = (depth == 16 ? 2 : 1);
+            int out_bytes = out_n * bytes;
+            png_uc* final;
+            int p;
+            if (!interlaced)
+                return png__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
+
+            // de-interlacing
+            final = (png_uc*)png__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+            for (p = 0; p < 7; ++p) {
+                int xorig[] = { 0,4,0,2,0,1,0 };
+                int yorig[] = { 0,0,4,0,2,0,1 };
+                int xspc[] = { 8,8,4,4,2,2,1 };
+                int yspc[] = { 8,8,8,4,4,2,2 };
+                int i, j, x, y;
+                // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
+                x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p];
+                y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p];
+                if (x && y) {
+                    png__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
+                    if (!png__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) {
+                        PNG_FREE(final);
+                        return 0;
+                    }
+                    for (j = 0; j < y; ++j) {
+                        for (i = 0; i < x; ++i) {
+                            int out_y = j * yspc[p] + yorig[p];
+                            int out_x = i * xspc[p] + xorig[p];
+                            memcpy(final + out_y * a->s->img_x * out_bytes + out_x * out_bytes,
+                                a->out + (j * x + i) * out_bytes, out_bytes);
+                        }
+                    }
+                    PNG_FREE(a->out);
+                    image_data += img_len;
+                    image_data_len -= img_len;
+                }
+            }
+            a->out = final;
+
+            return 1;
+        }
+
+        static int png__compute_transparency(png__png* z, png_uc tc[3], int out_n)
+        {
+            png__context* s = z->s;
+            png__uint32 i, pixel_count = s->img_x * s->img_y;
+            png_uc* p = z->out;
+
+            // compute color-based transparency, assuming we've
+            // already got 255 as the alpha value in the output
+            PNG_ASSERT(out_n == 2 || out_n == 4);
+
+            if (out_n == 2) {
+                for (i = 0; i < pixel_count; ++i) {
+                    p[1] = (p[0] == tc[0] ? 0 : 255);
+                    p += 2;
+                }
+            }
+            else {
+                for (i = 0; i < pixel_count; ++i) {
+                    if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+                        p[3] = 0;
+                    p += 4;
+                }
+            }
+            return 1;
+        }
+
+        static int png__compute_transparency16(png__png* z, png__uint16 tc[3], int out_n)
+        {
+            png__context* s = z->s;
+            png__uint32 i, pixel_count = s->img_x * s->img_y;
+            png__uint16* p = (png__uint16*)z->out;
+
+            // compute color-based transparency, assuming we've
+            // already got 65535 as the alpha value in the output
+            PNG_ASSERT(out_n == 2 || out_n == 4);
+
+            if (out_n == 2) {
+                for (i = 0; i < pixel_count; ++i) {
+                    p[1] = (p[0] == tc[0] ? 0 : 65535);
+                    p += 2;
+                }
+            }
+            else {
+                for (i = 0; i < pixel_count; ++i) {
+                    if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+                        p[3] = 0;
+                    p += 4;
+                }
+            }
+            return 1;
+        }
+
+        static int png__expand_png_palette(png__png* a, png_uc* palette, int len, int pal_img_n)
+        {
+            png__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
+            png_uc* p, * temp_out, * orig = a->out;
+
+            p = (png_uc*)png__malloc_mad2(pixel_count, pal_img_n, 0);
+            if (p == NULL) return png__err("outofmem", "Out of memory");
+
+            // between here and free(out) below, exitting would leak
+            temp_out = p;
+
+            if (pal_img_n == 3) {
+                for (i = 0; i < pixel_count; ++i) {
+                    int n = orig[i] * 4;
+                    p[0] = palette[n];
+                    p[1] = palette[n + 1];
+                    p[2] = palette[n + 2];
+                    p += 3;
+                }
+            }
+            else {
+                for (i = 0; i < pixel_count; ++i) {
+                    int n = orig[i] * 4;
+                    p[0] = palette[n];
+                    p[1] = palette[n + 1];
+                    p[2] = palette[n + 2];
+                    p[3] = palette[n + 3];
+                    p += 4;
+                }
+            }
+            PNG_FREE(a->out);
+            a->out = temp_out;
+
+            PNG_NOTUSED(len);
+
+            return 1;
+        }
+
+        static int png__unpremultiply_on_load = 0;
+        static int png__de_iphone_flag = 0;
+
+        STBIDEF void png_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
+        {
+            png__unpremultiply_on_load = flag_true_if_should_unpremultiply;
+        }
+
+        STBIDEF void png_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
+        {
+            png__de_iphone_flag = flag_true_if_should_convert;
+        }
+
+        static void png__de_iphone(png__png* z)
+        {
+            png__context* s = z->s;
+            png__uint32 i, pixel_count = s->img_x * s->img_y;
+            png_uc* p = z->out;
+
+            if (s->img_out_n == 3) {  // convert bgr to rgb
+                for (i = 0; i < pixel_count; ++i) {
+                    png_uc t = p[0];
+                    p[0] = p[2];
+                    p[2] = t;
+                    p += 3;
+                }
+            }
+            else {
+                PNG_ASSERT(s->img_out_n == 4);
+                if (png__unpremultiply_on_load) {
+                    // convert bgr to rgb and unpremultiply
+                    for (i = 0; i < pixel_count; ++i) {
+                        png_uc a = p[3];
+                        png_uc t = p[0];
+                        if (a) {
+                            png_uc half = a / 2;
+                            p[0] = (p[2] * 255 + half) / a;
+                            p[1] = (p[1] * 255 + half) / a;
+                            p[2] = (t * 255 + half) / a;
+                        }
+                        else {
+                            p[0] = p[2];
+                            p[2] = t;
+                        }
+                        p += 4;
+                    }
+                }
+                else {
+                    // convert bgr to rgb
+                    for (i = 0; i < pixel_count; ++i) {
+                        png_uc t = p[0];
+                        p[0] = p[2];
+                        p[2] = t;
+                        p += 4;
+                    }
+                }
+            }
+        }
+
+#define PNG__PNG_TYPE(a,b,c,d)  (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d))
+
+        static int png__parse_png_file(png__png* z, int scan, int req_comp)
+        {
+            png_uc palette[1024], pal_img_n = 0;
+            png_uc has_trans = 0, tc[3] = { 0 };
+            png__uint16 tc16[3];
+            png__uint32 ioff = 0, idata_limit = 0, i, pal_len = 0;
+            int first = 1, k, interlace = 0, color = 0, is_iphone = 0;
+            png__context* s = z->s;
+
+            z->expanded = NULL;
+            z->idata = NULL;
+            z->out = NULL;
+
+            if (!png__check_png_header(s)) return 0;
+
+            if (scan == PNG__SCAN_type) return 1;
+
+            for (;;) {
+                png__pngchunk c = png__get_chunk_header(s);
+                switch (c.type) {
+                case PNG__PNG_TYPE('C', 'g', 'B', 'I'):
+                    is_iphone = 1;
+                    png__skip(s, c.length);
+                    break;
+                case PNG__PNG_TYPE('I', 'H', 'D', 'R'): {
+                    int comp, filter;
+                    if (!first) return png__err("multiple IHDR", "Corrupt PNG");
+                    first = 0;
+                    if (c.length != 13) return png__err("bad IHDR len", "Corrupt PNG");
+                    s->img_x = png__get32be(s);
+                    s->img_y = png__get32be(s);
+                    if (s->img_y > PNG_MAX_DIMENSIONS) return png__err("too large", "Very large image (corrupt?)");
+                    if (s->img_x > PNG_MAX_DIMENSIONS) return png__err("too large", "Very large image (corrupt?)");
+                    z->depth = png__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return png__err("1/2/4/8/16-bit only", "PNG not supported: 1/2/4/8/16-bit only");
+                    color = png__get8(s);  if (color > 6)         return png__err("bad ctype", "Corrupt PNG");
+                    if (color == 3 && z->depth == 16)                  return png__err("bad ctype", "Corrupt PNG");
+                    if (color == 3) pal_img_n = 3; else if (color & 1) return png__err("bad ctype", "Corrupt PNG");
+                    comp = png__get8(s);  if (comp) return png__err("bad comp method", "Corrupt PNG");
+                    filter = png__get8(s);  if (filter) return png__err("bad filter method", "Corrupt PNG");
+                    interlace = png__get8(s); if (interlace > 1) return png__err("bad interlace method", "Corrupt PNG");
+                    if (!s->img_x || !s->img_y) return png__err("0-pixel image", "Corrupt PNG");
+                    if (!pal_img_n) {
+                        s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
+                        if ((1 << 30) / s->img_x / s->img_n < s->img_y) return png__err("too large", "Image too large to decode");
+                        if (scan == PNG__SCAN_header) return 1;
+                    }
+                    else {
+                        // if paletted, then pal_n is our final components, and
+                        // img_n is # components to decompress/filter.
+                        s->img_n = 1;
+                        if ((1 << 30) / s->img_x / 4 < s->img_y) return png__err("too large", "Corrupt PNG");
+                        // if SCAN_header, have to scan to see if we have a tRNS
+                    }
+                    break;
+                }
+
+                case PNG__PNG_TYPE('P', 'L', 'T', 'E'): {
+                    if (first) return png__err("first not IHDR", "Corrupt PNG");
+                    if (c.length > 256 * 3) return png__err("invalid PLTE", "Corrupt PNG");
+                    pal_len = c.length / 3;
+                    if (pal_len * 3 != c.length) return png__err("invalid PLTE", "Corrupt PNG");
+                    for (i = 0; i < pal_len; ++i) {
+                        palette[i * 4 + 0] = png__get8(s);
+                        palette[i * 4 + 1] = png__get8(s);
+                        palette[i * 4 + 2] = png__get8(s);
+                        palette[i * 4 + 3] = 255;
+                    }
+                    break;
+                }
+
+                case PNG__PNG_TYPE('t', 'R', 'N', 'S'): {
+                    if (first) return png__err("first not IHDR", "Corrupt PNG");
+                    if (z->idata) return png__err("tRNS after IDAT", "Corrupt PNG");
+                    if (pal_img_n) {
+                        if (scan == PNG__SCAN_header) { s->img_n = 4; return 1; }
+                        if (pal_len == 0) return png__err("tRNS before PLTE", "Corrupt PNG");
+                        if (c.length > pal_len) return png__err("bad tRNS len", "Corrupt PNG");
+                        pal_img_n = 4;
+                        for (i = 0; i < c.length; ++i)
+                            palette[i * 4 + 3] = png__get8(s);
+                    }
+                    else {
+                        if (!(s->img_n & 1)) return png__err("tRNS with alpha", "Corrupt PNG");
+                        if (c.length != (png__uint32)s->img_n * 2) return png__err("bad tRNS len", "Corrupt PNG");
+                        has_trans = 1;
+                        if (z->depth == 16) {
+                            for (k = 0; k < s->img_n; ++k) tc16[k] = (png__uint16)png__get16be(s); // copy the values as-is
+                        }
+                        else {
+                            for (k = 0; k < s->img_n; ++k) tc[k] = (png_uc)(png__get16be(s) & 255) * png__depth_scale_table[z->depth]; // non 8-bit images will be larger
+                        }
+                    }
+                    break;
+                }
+
+                case PNG__PNG_TYPE('I', 'D', 'A', 'T'): {
+                    if (first) return png__err("first not IHDR", "Corrupt PNG");
+                    if (pal_img_n && !pal_len) return png__err("no PLTE", "Corrupt PNG");
+                    if (scan == PNG__SCAN_header) { s->img_n = pal_img_n; return 1; }
+                    if ((int)(ioff + c.length) < (int)ioff) return 0;
+                    if (ioff + c.length > idata_limit) {
+                        png__uint32 idata_limit_old = idata_limit;
+                        png_uc* p;
+                        if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
+                        while (ioff + c.length > idata_limit)
+                            idata_limit *= 2;
+                        PNG_NOTUSED(idata_limit_old);
+                        p = (png_uc*)PNG_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return png__err("outofmem", "Out of memory");
+                        z->idata = p;
+                    }
+                    if (!png__getn(s, z->idata + ioff, c.length)) return png__err("outofdata", "Corrupt PNG");
+                    ioff += c.length;
+                    break;
+                }
+
+                case PNG__PNG_TYPE('I', 'E', 'N', 'D'): {
+                    png__uint32 raw_len, bpl;
+                    if (first) return png__err("first not IHDR", "Corrupt PNG");
+                    if (scan != PNG__SCAN_load) return 1;
+                    if (z->idata == NULL) return png__err("no IDAT", "Corrupt PNG");
+                    // initial guess for decoded data size to avoid unnecessary reallocs
+                    bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
+                    raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
+                    z->expanded = (png_uc*)png_zlib_decode_malloc_guesssize_headerflag((char*)z->idata, ioff, raw_len, (int*)&raw_len, !is_iphone);
+                    if (z->expanded == NULL) return 0; // zlib should set error
+                    PNG_FREE(z->idata); z->idata = NULL;
+                    if ((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || has_trans)
+                        s->img_out_n = s->img_n + 1;
+                    else
+                        s->img_out_n = s->img_n;
+                    if (!png__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
+                    if (has_trans) {
+                        if (z->depth == 16) {
+                            if (!png__compute_transparency16(z, tc16, s->img_out_n)) return 0;
+                        }
+                        else {
+                            if (!png__compute_transparency(z, tc, s->img_out_n)) return 0;
+                        }
+                    }
+                    if (is_iphone && png__de_iphone_flag && s->img_out_n > 2)
+                        png__de_iphone(z);
+                    if (pal_img_n) {
+                        // pal_img_n == 3 or 4
+                        s->img_n = pal_img_n; // record the actual colors we had
+                        s->img_out_n = pal_img_n;
+                        if (req_comp >= 3) s->img_out_n = req_comp;
+                        if (!png__expand_png_palette(z, palette, pal_len, s->img_out_n))
+                            return 0;
+                    }
+                    else if (has_trans) {
+                        // non-paletted image with tRNS -> source image has (constant) alpha
+                        ++s->img_n;
+                    }
+                    PNG_FREE(z->expanded); z->expanded = NULL;
+                    // end of PNG chunk, read and skip CRC
+                    png__get32be(s);
+                    return 1;
+                }
+
+                default:
+                    // if critical, fail
+                    if (first) return png__err("first not IHDR", "Corrupt PNG");
+                    if ((c.type & (1 << 29)) == 0) {
+#ifndef PNG_NO_FAILURE_STRINGS
+                        // not threadsafe
+                        static char invalid_chunk[] = "XXXX PNG chunk not known";
+                        invalid_chunk[0] = PNG__BYTECAST(c.type >> 24);
+                        invalid_chunk[1] = PNG__BYTECAST(c.type >> 16);
+                        invalid_chunk[2] = PNG__BYTECAST(c.type >> 8);
+                        invalid_chunk[3] = PNG__BYTECAST(c.type >> 0);
+#endif
+                        return png__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
+                    }
+                    png__skip(s, c.length);
+                    break;
+                }
+                // end of PNG chunk, read and skip CRC
+                png__get32be(s);
+            }
+        }
+
+        static void* png__do_png(png__png* p, int* x, int* y, int* n, int req_comp, png__result_info* ri)
+        {
+            void* result = NULL;
+            if (req_comp < 0 || req_comp > 4) return png__errpuc("bad req_comp", "Internal error");
+            if (png__parse_png_file(p, PNG__SCAN_load, req_comp)) {
+                if (p->depth <= 8)
+                    ri->bits_per_channel = 8;
+                else if (p->depth == 16)
+                    ri->bits_per_channel = 16;
+                else
+                    return png__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth");
+                result = p->out;
+                p->out = NULL;
+                if (req_comp && req_comp != p->s->img_out_n) {
+                    if (ri->bits_per_channel == 8)
+                        result = png__convert_format((unsigned char*)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+                    else
+                        result = png__convert_format16((png__uint16*)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+                    p->s->img_out_n = req_comp;
+                    if (result == NULL) return result;
+                }
+                *x = p->s->img_x;
+                *y = p->s->img_y;
+                if (n) *n = p->s->img_n;
+            }
+            PNG_FREE(p->out);      p->out = NULL;
+            PNG_FREE(p->expanded); p->expanded = NULL;
+            PNG_FREE(p->idata);    p->idata = NULL;
+
+            return result;
+        }
+
+        static void* png__png_load(png__context* s, int* x, int* y, int* comp, int req_comp, png__result_info* ri)
+        {
+            png__png p;
+            p.s = s;
+            return png__do_png(&p, x, y, comp, req_comp, ri);
+        }
+
+        static int png__png_test(png__context* s)
+        {
+            int r;
+            r = png__check_png_header(s);
+            png__rewind(s);
+            return r;
+        }
+
+        static int png__png_info_raw(png__png* p, int* x, int* y, int* comp)
+        {
+            if (!png__parse_png_file(p, PNG__SCAN_header, 0)) {
+                png__rewind(p->s);
+                return 0;
+            }
+            if (x) *x = p->s->img_x;
+            if (y) *y = p->s->img_y;
+            if (comp) *comp = p->s->img_n;
+            return 1;
+        }
+
+        static int png__png_info(png__context* s, int* x, int* y, int* comp)
+        {
+            png__png p;
+            p.s = s;
+            return png__png_info_raw(&p, x, y, comp);
+        }
+
+        static int png__png_is16(png__context* s)
+        {
+            png__png p;
+            p.s = s;
+            if (!png__png_info_raw(&p, NULL, NULL, NULL))
+                return 0;
+            if (p.depth != 16) {
+                png__rewind(p.s);
+                return 0;
+            }
+            return 1;
+        }
+
+        static void* png__load_main(png__context* s, int* x, int* y, int* comp, int req_comp, png__result_info* ri, int bpc)
+        {
+            memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
+            ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed
+            ri->channel_order = PNG_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
+            ri->num_channels = 0;
+
+            if (png__png_test(s))  return png__png_load(s, x, y, comp, req_comp, ri);
+
+            return png__errpuc("unknown image type", "Image not of any known type, or corrupt");
+        }
+
+        static png_uc* png__convert_16_to_8(png__uint16* orig, int w, int h, int channels)
+        {
+            int i;
+            int img_len = w * h * channels;
+            png_uc* reduced;
+
+            reduced = (png_uc*)png__malloc(img_len);
+            if (reduced == NULL) return png__errpuc("outofmem", "Out of memory");
+
+            for (i = 0; i < img_len; ++i)
+                reduced[i] = (png_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling
+
+            PNG_FREE(orig);
+            return reduced;
+        }
+
+        static unsigned char* png__load_and_postprocess_8bit(png__context* s, int* x, int* y, int* comp, int req_comp)
+        {
+            png__result_info ri;
+            void* result = png__load_main(s, x, y, comp, req_comp, &ri, 8);
+
+            if (result == NULL)
+                return NULL;
+
+            // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
+            PNG_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
+
+            if (ri.bits_per_channel != 8) {
+                result = png__convert_16_to_8((png__uint16*)result, *x, *y, req_comp == 0 ? *comp : req_comp);
+                ri.bits_per_channel = 8;
+            }
+
+            // @TODO: move png__convert_format to here
+
+            //if (png__vertically_flip_on_load) {
+            //    int channels = req_comp ? req_comp : *comp;
+            //    png__vertical_flip(result, *x, *y, channels * sizeof(png_uc));
+            //}
+
+            return (unsigned char*)result;
+        }
+
+        static void png__start_mem(png__context* s, png_uc const* buffer, int len)
+        {
+            s->io.read = NULL;
+            s->read_from_callbacks = 0;
+            s->callback_already_read = 0;
+            s->img_buffer = s->img_buffer_original = (png_uc*)buffer;
+            s->img_buffer_end = s->img_buffer_original_end = (png_uc*)buffer + len;
+        }
+
+        STBIDEF png_uc* png_load_from_memory(png_uc const* buffer, int len, int* x, int* y, int* comp, int req_comp)
+        {
+            png__context s;
+            png__start_mem(&s, buffer, len);
+            return png__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
+        }
+
+        //------------------------------------------------------------------------
+
+        static int png__stdio_read(void* user, char* data, int size)
+        {
+            InputMemoryStream* stream = (InputMemoryStream*)user;
+            return (int)stream->Read(size, data);
+        }
+
+        static void png__stdio_skip(void* user, int n)
+        {
+            InputMemoryStream* stream = (InputMemoryStream*)user;
+            stream->Skip(n);
+        }
+
+        static int png__stdio_eof(void* user)
+        {
+            InputMemoryStream* stream = (InputMemoryStream*)user;
+            return stream->Pos() == stream->Size() ? 1 : 0;
+        }
+
+
+        //---------------------------------------------------------------------
+
+        ImagePngLoader::ImagePngLoader(const ImageLoaderParam& param)
+            : Base::ImagePngLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatRgb24;
+        }
+
+        bool ImagePngLoader::FromStream()
+        {
+            const int req_comp = 4;
+            int x, y, comp;
+            png__context s;
+            s.io.eof = png__stdio_eof;
+            s.io.read = png__stdio_read;
+            s.io.skip = png__stdio_skip;
+            s.io_user_data = &_stream;
+            s.buflen = sizeof(s.buffer_start);
+            s.read_from_callbacks = 1;
+            s.callback_already_read = 0;
+            s.img_buffer = s.img_buffer_original = s.buffer_start;
+            png__refill_buffer(&s);
+            s.img_buffer_original_end = s.img_buffer_end;
+            png__result_info ri;
+            uint8_t* data = (uint8_t*)png__png_load(&s, &x, &y, &comp, req_comp, &ri);
+            if (data)
+            {
+                if (ri.bits_per_channel == 16)
+                {
+                    const uint16_t* src = (uint16_t*)data;
+                    size_t size = x * y * req_comp;
+                    uint8_t* dst = (uint8_t*)PNG_MALLOC(size);
+                    for (size_t i = 0; i < size; ++i)
+                        dst[i] = uint8_t(src[i] >> 8);
+                    PNG_FREE(data);
+                    data = dst;
+                }
+                size_t stride = 4 * x;
+                _image.Recreate(x, y, (Image::Format)_param.format);
+                if (x < A)
+                {
+                    switch (_param.format)
+                    {
+                    case SimdPixelFormatGray8:
+                        Base::RgbaToGray(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatBgr24:
+                        Base::BgraToRgb(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatBgra32:
+                        Base::BgraToRgba(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatRgb24:
+                        Base::BgraToBgr(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatRgba32:
+                        Base::Copy(data, stride, x, y, 4, _image.data, _image.stride);
+                        break;
+                    default:
+                        break;
+                    }
+                }
+                else
+                {
+                    switch (_param.format)
+                    {
+                    case SimdPixelFormatGray8:
+                        Sse2::RgbaToGray(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatBgr24:
+                        Sse41::BgraToRgb(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatBgra32:
+                        Sse41::BgraToRgba(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatRgb24:
+                        Sse41::BgraToBgr(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatRgba32:
+                        Base::Copy(data, stride, x, y, 4, _image.data, _image.stride);
+                        break;
+                    default:
+                        break;
+                    }
+                }
+                PNG_FREE(data);
+                return true;
+            }
+            return false;
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp
new file mode 100644
index 0000000000..da20b395c0
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp
@@ -0,0 +1,139 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdSse2.h"
+#include "Simd/SimdSse41.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param)
+            : Base::ImagePgmTxtSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Sse41::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Sse2::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Sse41::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Sse41::RgbaToGray; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param)
+            : Base::ImagePgmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Sse41::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Sse2::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Sse41::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Sse41::RgbaToGray; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param)
+            : Base::ImagePpmTxtSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Sse41::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Sse41::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Sse41::BgraToRgb; break;
+                case SimdPixelFormatRgba32: _convert = Sse41::BgraToBgr; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param)
+            : Base::ImagePpmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Sse41::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Sse41::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Sse41::BgraToRgb; break;
+                case SimdPixelFormatRgba32: _convert = Sse41::BgraToBgr; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageSaver* CreateImageSaver(const ImageSaverParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinSaver(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinSaver(param);
+            case SimdImageFilePng: return new ImagePngSaver(param);
+            case SimdImageFileJpeg: return new ImageJpegSaver(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size)
+        {
+            ImageSaverParam param(width, height, format, file, quality);
+            if (param.Validate())
+            {
+                std::unique_ptr<ImageSaver> saver(CreateImageSaver(param));
+                if (saver)
+                {
+                    if (saver->ToStream(src, stride))
+                        return saver->Release(size);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_SSE41_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp
new file mode 100644
index 0000000000..3a0a2079c1
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp
@@ -0,0 +1,431 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSaveJpeg.h"
+#include "Simd/SimdSse41.h"
+#include "Simd/SimdBase.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        SIMD_INLINE void JpegDctV(const float* src, size_t srcStride, float *dst, size_t dstStride)
+        {
+            for (int i = 0; i < 2; i++, src += 4, dst += 4)
+            {
+                __m128 d0 = _mm_loadu_ps(src + 0 * srcStride);
+                __m128 d1 = _mm_loadu_ps(src + 1 * srcStride);
+                __m128 d2 = _mm_loadu_ps(src + 2 * srcStride);
+                __m128 d3 = _mm_loadu_ps(src + 3 * srcStride);
+                __m128 d4 = _mm_loadu_ps(src + 4 * srcStride);
+                __m128 d5 = _mm_loadu_ps(src + 5 * srcStride);
+                __m128 d6 = _mm_loadu_ps(src + 6 * srcStride);
+                __m128 d7 = _mm_loadu_ps(src + 7 * srcStride);
+
+                __m128 tmp0 = _mm_add_ps(d0, d7);
+                __m128 tmp7 = _mm_sub_ps(d0, d7);
+                __m128 tmp1 = _mm_add_ps(d1, d6);
+                __m128 tmp6 = _mm_sub_ps(d1, d6);
+                __m128 tmp2 = _mm_add_ps(d2, d5);
+                __m128 tmp5 = _mm_sub_ps(d2, d5);
+                __m128 tmp3 = _mm_add_ps(d3, d4);
+                __m128 tmp4 = _mm_sub_ps(d3, d4);
+
+                __m128 tmp10 = _mm_add_ps(tmp0, tmp3);
+                __m128 tmp13 = _mm_sub_ps(tmp0, tmp3);
+                __m128 tmp11 = _mm_add_ps(tmp1, tmp2);
+                __m128 tmp12 = _mm_sub_ps(tmp1, tmp2);
+
+                d0 = _mm_add_ps(tmp10, tmp11);
+                d4 = _mm_sub_ps(tmp10, tmp11);
+
+                __m128 z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), _mm_set1_ps(0.707106781f));
+                d2 = _mm_add_ps(tmp13, z1);
+                d6 = _mm_sub_ps(tmp13, z1);
+
+                tmp10 = _mm_add_ps(tmp4, tmp5);
+                tmp11 = _mm_add_ps(tmp5, tmp6);
+                tmp12 = _mm_add_ps(tmp6, tmp7);
+
+                __m128 z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12),  _mm_set1_ps(0.382683433f));
+                __m128 z2 = _mm_add_ps(_mm_mul_ps(tmp10, _mm_set1_ps(0.541196100f)), z5);
+                __m128 z4 = _mm_add_ps(_mm_mul_ps(tmp12, _mm_set1_ps(1.306562965f)), z5);
+                __m128 z3 = _mm_mul_ps(tmp11, _mm_set1_ps(0.707106781f));
+
+                __m128 z11 = _mm_add_ps(tmp7, z3);
+                __m128 z13 = _mm_sub_ps(tmp7, z3);
+
+                _mm_storeu_ps(dst + 0 * dstStride, d0);
+                _mm_storeu_ps(dst + 1 * dstStride, _mm_add_ps(z11, z4));
+                _mm_storeu_ps(dst + 2 * dstStride, d2);
+                _mm_storeu_ps(dst + 3 * dstStride, _mm_sub_ps(z13, z2));
+                _mm_storeu_ps(dst + 4 * dstStride, d4);
+                _mm_storeu_ps(dst + 5 * dstStride, _mm_add_ps(z13, z2));
+                _mm_storeu_ps(dst + 6 * dstStride, d6);
+                _mm_storeu_ps(dst + 7 * dstStride, _mm_sub_ps(z11, z4));
+            }
+        }
+
+        SIMD_INLINE void JpegDctH(const float* src, size_t srcStride, const float * fdt, int* dst)
+        {
+            for (int i = 0; i < 2; i++, src += 4 * srcStride, fdt += 4, dst += 4)
+            {
+                __m128 tmp0, tmp1, tmp2, tmp3;
+                __m128 d0 = _mm_loadu_ps(src + 0 * srcStride);
+                __m128 d1 = _mm_loadu_ps(src + 1 * srcStride);
+                __m128 d2 = _mm_loadu_ps(src + 2 * srcStride);
+                __m128 d3 = _mm_loadu_ps(src + 3 * srcStride);
+                tmp0 = _mm_unpacklo_ps(d0, d2);
+                tmp1 = _mm_unpackhi_ps(d0, d2);
+                tmp2 = _mm_unpacklo_ps(d1, d3);
+                tmp3 = _mm_unpackhi_ps(d1, d3);
+                d0 = _mm_unpacklo_ps(tmp0, tmp2);
+                d1 = _mm_unpackhi_ps(tmp0, tmp2);
+                d2 = _mm_unpacklo_ps(tmp1, tmp3);
+                d3 = _mm_unpackhi_ps(tmp1, tmp3);
+
+                __m128 d4 = _mm_loadu_ps(src + 0 * srcStride + 4);
+                __m128 d5 = _mm_loadu_ps(src + 1 * srcStride + 4);
+                __m128 d6 = _mm_loadu_ps(src + 2 * srcStride + 4);
+                __m128 d7 = _mm_loadu_ps(src + 3 * srcStride + 4);
+                tmp0 = _mm_unpacklo_ps(d4, d6);
+                tmp1 = _mm_unpackhi_ps(d4, d6);
+                tmp2 = _mm_unpacklo_ps(d5, d7);
+                tmp3 = _mm_unpackhi_ps(d5, d7);
+                d4 = _mm_unpacklo_ps(tmp0, tmp2);
+                d5 = _mm_unpackhi_ps(tmp0, tmp2);
+                d6 = _mm_unpacklo_ps(tmp1, tmp3);
+                d7 = _mm_unpackhi_ps(tmp1, tmp3);
+
+                tmp0 = _mm_add_ps(d0, d7);
+                tmp1 = _mm_add_ps(d1, d6);
+                tmp2 = _mm_add_ps(d2, d5);
+                tmp3 = _mm_add_ps(d3, d4);
+                __m128 tmp7 = _mm_sub_ps(d0, d7);
+                __m128 tmp6 = _mm_sub_ps(d1, d6);
+                __m128 tmp5 = _mm_sub_ps(d2, d5);
+                __m128 tmp4 = _mm_sub_ps(d3, d4);
+
+                __m128 tmp10 = _mm_add_ps(tmp0, tmp3);
+                __m128 tmp13 = _mm_sub_ps(tmp0, tmp3);
+                __m128 tmp11 = _mm_add_ps(tmp1, tmp2);
+                __m128 tmp12 = _mm_sub_ps(tmp1, tmp2);
+
+                d0 = _mm_add_ps(tmp10, tmp11);
+                d4 = _mm_sub_ps(tmp10, tmp11);
+
+                __m128 z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), _mm_set1_ps(0.707106781f));
+                d2 = _mm_add_ps(tmp13, z1);
+                d6 = _mm_sub_ps(tmp13, z1);
+
+                tmp10 = _mm_add_ps(tmp4, tmp5);
+                tmp11 = _mm_add_ps(tmp5, tmp6);
+                tmp12 = _mm_add_ps(tmp6, tmp7);
+
+                __m128 z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12), _mm_set1_ps(0.382683433f));
+                __m128 z2 = _mm_add_ps(_mm_mul_ps(tmp10, _mm_set1_ps(0.541196100f)), z5);
+                __m128 z4 = _mm_add_ps(_mm_mul_ps(tmp12, _mm_set1_ps(1.306562965f)), z5);
+                __m128 z3 = _mm_mul_ps(tmp11, _mm_set1_ps(0.707106781f));
+
+                __m128 z11 = _mm_add_ps(tmp7, z3);
+                __m128 z13 = _mm_sub_ps(tmp7, z3);
+
+                d1 = _mm_add_ps(z11, z4);
+                d3 = _mm_sub_ps(z13, z2);
+                d5 = _mm_add_ps(z13, z2);
+                d7 = _mm_sub_ps(z11, z4);
+
+                _mm_storeu_si128((__m128i*)dst + 0x0, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 0), d0)));
+                _mm_storeu_si128((__m128i*)dst + 0x2, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 1), d1)));
+                _mm_storeu_si128((__m128i*)dst + 0x4, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 2), d2)));
+                _mm_storeu_si128((__m128i*)dst + 0x6, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 3), d3)));
+                _mm_storeu_si128((__m128i*)dst + 0x8, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 4), d4)));
+                _mm_storeu_si128((__m128i*)dst + 0xA, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 5), d5)));
+                _mm_storeu_si128((__m128i*)dst + 0xC, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 6), d6)));
+                _mm_storeu_si128((__m128i*)dst + 0xE, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 7), d7)));
+            }
+        }
+
+        static int JpegProcessDu(Base::BitBuf& bitBuf, float* CDU, int stride, const float* fdtbl, int DC, const uint16_t HTDC[256][2], const uint16_t HTAC[256][2])
+        {
+            JpegDctV(CDU, stride, CDU, stride);
+            SIMD_ALIGNED(16) int DUO[64], DU[64];
+            JpegDctH(CDU, stride, fdtbl, DUO);
+            for (int i = 0; i < 64; ++i)
+                DU[Base::JpegZigZagT[i]] = DUO[i];
+            int diff = DU[0] - DC;
+            if (diff == 0)
+                bitBuf.Push(HTDC[0]);
+            else
+            {
+                uint16_t bits[2];
+                Base::JpegCalcBits(diff, bits);
+                bitBuf.Push(HTDC[bits[1]]);
+                bitBuf.Push(bits);
+            }
+            int end0pos4 = 60;
+            for (; end0pos4 > 0 && _mm_testz_si128(_mm_loadu_si128((__m128i*)(DU + end0pos4)), Sse2::K_INV_ZERO); end0pos4 -= 4);
+            int end0pos = end0pos4 + 3;
+            for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos);
+            if (end0pos == 0)
+            {
+                bitBuf.Push(HTAC[0x00]);
+                return DU[0];
+            }
+            for (int i = 1; i <= end0pos; ++i)
+            {
+                int startpos = i;
+                for (; DU[i] == 0 && i <= end0pos; ++i);
+                int nrzeroes = i - startpos;
+                if (nrzeroes >= 16)
+                {
+                    int lng = nrzeroes >> 4;
+                    int nrmarker;
+                    for (nrmarker = 1; nrmarker <= lng; ++nrmarker)
+                        bitBuf.Push(HTAC[0xF0]);
+                    nrzeroes &= 15;
+                }
+                uint16_t bits[2];
+                Base::JpegCalcBits(DU[i], bits);
+                bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]);
+                bitBuf.Push(bits);
+            }
+            if (end0pos != 63)
+                bitBuf.Push(HTAC[0x00]);
+            return DU[0];
+        }
+
+        SIMD_INLINE void RgbToYuvInit(__m128 k[10])
+        {
+            k[0] = _mm_set1_ps(+0.29900f);
+            k[1] = _mm_set1_ps(+0.58700f);
+            k[2] = _mm_set1_ps(+0.11400f);
+            k[3] = _mm_set1_ps(-128.000f);
+            k[4] = _mm_set1_ps(-0.16874f);
+            k[5] = _mm_set1_ps(-0.33126f);
+            k[6] = _mm_set1_ps(+0.50000f);
+            k[7] = _mm_set1_ps(+0.50000f);
+            k[8] = _mm_set1_ps(-0.41869f);
+            k[9] = _mm_set1_ps(-0.08131f);
+        }
+
+        SIMD_INLINE void RgbToYuv(const uint8_t* r, const uint8_t* g, const uint8_t* b, int stride, int height, 
+            const __m128 k[10], float* y, float* u, float* v, int size)
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 4)
+                {
+                    __m128 _r = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(r + col))));
+                    __m128 _g = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(g + col))));
+                    __m128 _b = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(b + col))));
+                    _mm_storeu_ps(y + col, _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, k[0]), _mm_mul_ps(_g, k[1])), _mm_mul_ps(_b, k[2])), k[3]));
+                    //_mm_storeu_ps(y + col, _mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, _yr), _mm_mul_ps(_g, _yg)), _mm_add_ps(_mm_mul_ps(_b, _yb), _yt)));
+                    _mm_storeu_ps(u + col, _mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, k[4]), _mm_mul_ps(_g, k[5])), _mm_mul_ps(_b, k[6])));
+                    _mm_storeu_ps(v + col, _mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, k[7]), _mm_mul_ps(_g, k[8])), _mm_mul_ps(_b, k[9])));
+                }
+                if(++row < height)
+                    r += stride, g += stride, b += stride;
+                y += size, u += size, v += size;
+            }
+        }
+
+        SIMD_INLINE void GrayToY(const uint8_t* g, int stride, int height, const __m128 k[10], float* y, int size)
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 4)
+                {
+                    __m128 _g = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(g + col))));
+                    _mm_storeu_ps(y + col, _mm_add_ps(_g, k[3]));
+                }
+                if (++row < height)
+                    g += stride;
+                y += size;
+            }
+        }
+
+        SIMD_INLINE void SubUv(const float * src, float * dst)
+        {
+            __m128 _0_25 = _mm_set1_ps(0.25f), s0, s1;
+            for (int yy = 0; yy < 8; yy += 1)
+            {
+                s0 = _mm_add_ps(_mm_loadu_ps(src + 0), _mm_loadu_ps(src + 16));
+                s1 = _mm_add_ps(_mm_loadu_ps(src + 4), _mm_loadu_ps(src + 20));
+                _mm_storeu_ps(dst + 0, _mm_mul_ps(_mm_hadd_ps(s0, s1), _0_25));
+                s0 = _mm_add_ps(_mm_loadu_ps(src + 8), _mm_loadu_ps(src + 24));
+                s1 = _mm_add_ps(_mm_loadu_ps(src + 12), _mm_loadu_ps(src + 28));
+                _mm_storeu_ps(dst + 4, _mm_mul_ps(_mm_hadd_ps(s0, s1), _0_25));
+                src += 32;
+                dst += 8;
+            }
+        }
+
+        void JpegWriteBlockSubs(OutputMemoryStream& stream, int width, int height, const uint8_t* red,
+            const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3])
+        {
+            __m128 k[10];
+            RgbToYuvInit(k);
+            int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            int width16 = width& (~15);
+            bool gray = red == green && red == blue;
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 16)
+            {
+                int x = 0;
+                SIMD_ALIGNED(16) float Y[256], U[256], V[256];
+                SIMD_ALIGNED(16) float subU[64], subV[64];
+                for (; x < width16; x += 16)
+                {
+                    if (gray)
+                        GrayToY(red + x, stride, height - y, k, Y, 16);
+                    else
+                        RgbToYuv(red + x, green + x, blue + x, stride, height - y, k, Y, U, V, 16);
+                    DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        SubUv(U, subU);
+                        SubUv(V, subV);
+                        DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+                for (; x < width; x += 16)
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 16);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 16);
+                    DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        SubUv(U, subU);
+                        SubUv(V, subV);
+                        DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                }
+            }
+            Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+            bitBuf.Clear();
+        }
+
+        void JpegWriteBlockFull(OutputMemoryStream& stream, int width, int height, const uint8_t* red,
+            const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3])
+        {
+            __m128 k[10];
+            RgbToYuvInit(k);
+            int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            int width8 = width & (~7);
+            bool gray = red == green && red == blue;
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 8)
+            {
+                int x = 0;
+                SIMD_ALIGNED(16) float Y[64], U[64], V[64];
+                for (; x < width8; x += 8)
+                {
+                    if (gray)
+                        GrayToY(red + x, stride, height - y, k, Y, 8);
+                    else
+                        RgbToYuv(red + x, green + x, blue + x, stride, height - y, k, Y, U, V, 8);
+                    DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+                for (; x < width; x += 8)
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 8);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 8);
+                    DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                }
+            }
+            Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+            bitBuf.Clear();
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageJpegSaver::ImageJpegSaver(const ImageSaverParam& param)
+            : Base::ImageJpegSaver(param)
+        {
+        }
+
+        void ImageJpegSaver::Init()
+        {
+            InitParams(true);
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24:
+            case SimdPixelFormatRgb24:
+                _deintBgr = _param.width < 16 ? Base::DeinterleaveBgr : Sse41::DeinterleaveBgr;
+                break;
+            case SimdPixelFormatBgra32:
+            case SimdPixelFormatRgba32:
+                _deintBgra = _param.width < 16 ? Base::DeinterleaveBgra : Sse41::DeinterleaveBgra;
+                break;
+            default: 
+                break;
+            }
+            _writeBlock = _subSample ? JpegWriteBlockSubs : JpegWriteBlockFull;
+        }
+    }
+#endif// SIMD_SSE41_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp
new file mode 100644
index 0000000000..0e1c76b710
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp
@@ -0,0 +1,370 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSavePng.h"
+#include "Simd/SimdBase.h"
+#include "Simd/SimdSse41.h"
+#include "Simd/SimdExtract.h"
+
+namespace Simd
+{        
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        uint32_t ZlibAdler32(uint8_t* data, int size)
+        {
+            __m128i _i0 = _mm_setr_epi32(0, -1, -2, -3), _4 = _mm_set1_epi32(4);
+            uint32_t lo = 1, hi = 0;
+            for (int b = 0, n = (int)(size % 5552); b < size;)
+            {
+                int n4 = n & (~3), i = 0;
+                __m128i _i = _mm_add_epi32(_i0, _mm_set1_epi32(n));
+                __m128i _l = _mm_setzero_si128(), _h = _mm_setzero_si128();
+                for (; i < n4; i += 4)
+                {
+                    __m128i d = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(data + b + i)));
+                    _l = _mm_add_epi32(_l, d);
+                    _h = _mm_add_epi32(_h, _mm_mullo_epi32(d, _i));
+                    _i = _mm_sub_epi32(_i, _4);
+                }
+                int l = Sse2::ExtractInt32Sum(_l), h = Sse2::ExtractInt32Sum(_h);
+                for (; i < n; ++i)
+                {
+                    l += data[b + i];
+                    h += data[b + i]*(n - i);
+                }
+                hi = (hi + h + lo*n) % 65521;
+                lo = (lo + l) % 65521;
+                b += n;
+                n = 5552;
+            }
+            return (hi << 16) | lo;
+        }
+
+        void ZlibCompress(uint8_t* data, int size, int quality, OutputMemoryStream& stream)
+        {
+            const int ZHASH = 16384;
+            if (quality < 5)
+                quality = 5;
+            const int basket = quality * 2;
+            Array32i hashTable(ZHASH * basket);
+            memset(hashTable.data, -1, hashTable.RawSize());
+
+            stream.Write(uint8_t(0x78));
+            stream.Write(uint8_t(0x5e));
+            stream.WriteBits(1, 1);
+            stream.WriteBits(1, 2);
+
+            int i = 0, j;
+            while (i < size - 3)
+            {
+                int h = Base::ZlibHash(data + i) & (ZHASH - 1), best = 3;
+                uint8_t* bestLoc = 0;
+                int* hList = hashTable.data + h * basket;
+                for (j = 0; hList[j] != -1 && j < basket; ++j)
+                {
+                    if (hList[j] > i - 32768)
+                    {
+                        int d = ZlibCount(data + hList[j], data + i, size - i);
+                        if (d >= best)
+                        {
+                            best = d;
+                            bestLoc = data + hList[j];
+                        }
+                    }
+                }
+                if (j == basket)
+                {
+                    memcpy(hList, hList + quality, quality * sizeof(int));
+                    memset(hList + quality, -1, quality * sizeof(int));
+                    j = quality;
+                }
+                hList[j] = i;
+
+                if (bestLoc)
+                {
+                    h = Base::ZlibHash(data + i + 1) & (ZHASH - 1);
+                    int* hList = hashTable.data + h * basket;
+                    for (j = 0; hList[j] != -1 && j < basket; ++j)
+                    {
+                        if (hList[j] > i - 32767)
+                        {
+                            int e = ZlibCount(data + hList[j], data + i + 1, size - i - 1);
+                            if (e > best)
+                            {
+                                bestLoc = NULL;
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                if (bestLoc)
+                {
+                    int d = (int)(data + i - bestLoc);
+                    assert(d <= 32767 && best <= 258);
+                    for (j = 0; best > Base::ZlibLenC[j + 1] - 1; ++j);
+                    Base::ZlibHuff(j + 257, stream);
+                    if (Base::ZlibLenEb[j])
+                        stream.WriteBits(best - Base::ZlibLenC[j], Base::ZlibLenEb[j]);
+                    for (j = 0; d > Base::ZlibDistC[j + 1] - 1; ++j);
+                    stream.WriteBits(Base::ZlibBitRev(j, 5), 5);
+                    if (Base::ZlibDistEb[j])
+                        stream.WriteBits(d - Base::ZlibDistC[j], Base::ZlibDistEb[j]);
+                    i += best;
+                }
+                else
+                {
+                    Base::ZlibHuffB(data[i], stream);
+                    ++i;
+                }
+            }
+            for (; i < size; ++i)
+                Base::ZlibHuffB(data[i], stream);
+            Base::ZlibHuff(256, stream);
+            stream.FlushBits();
+            stream.WriteBe32u(ZlibAdler32(data, size));
+        }
+
+        uint32_t EncodeLine0(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size, A);
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src = _mm_loadu_si128((__m128i*)(src + i));
+                _mm_storeu_si128((__m128i*)(dst + i), _src);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_src)));
+            }
+            uint32_t sum = Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine1(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n));
+                __m128i _dst = _mm_sub_epi8(_src0, _src1);
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine2(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - stride));
+                __m128i _dst = _mm_sub_epi8(_src0, _src1);
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine3(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i] - (src[i - stride] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n));
+                __m128i _src2 = _mm_loadu_si128((__m128i*)(src + i - stride));
+                __m128i lo = _mm_srli_epi16(_mm_add_epi16(UnpackU8<0>(_src1), UnpackU8<0>(_src2)), 1);
+                __m128i hi = _mm_srli_epi16(_mm_add_epi16(UnpackU8<1>(_src1), UnpackU8<1>(_src2)), 1);
+                __m128i _dst = _mm_sub_epi8(_src0, _mm_packus_epi16(lo, hi));
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - ((src[i - n] + src[i - stride]) >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        SIMD_INLINE __m128i Paeth(__m128i a, __m128i b, __m128i c)
+        {
+            __m128i p = _mm_sub_epi16(_mm_add_epi16(a, b), c);
+            __m128i pa = _mm_abs_epi16(_mm_sub_epi16(p, a));
+            __m128i pb = _mm_abs_epi16(_mm_sub_epi16(p, b));
+            __m128i pc = _mm_abs_epi16(_mm_sub_epi16(p, c));
+            __m128i mbc = _mm_or_si128(_mm_cmpgt_epi16(pa, pb), _mm_cmpgt_epi16(pa, pc));
+            __m128i mc = _mm_cmpgt_epi16(pb, pc);
+            return _mm_blendv_epi8(a, _mm_blendv_epi8(b, c, mc), mbc);
+        }
+
+        uint32_t EncodeLine4(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = (int8_t)(src[i] - src[i - stride]);
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n));
+                __m128i _src2 = _mm_loadu_si128((__m128i*)(src + i - stride));
+                __m128i _src3 = _mm_loadu_si128((__m128i*)(src + i - stride - n));
+                __m128i lo = Paeth(UnpackU8<0>(_src1), UnpackU8<0>(_src2), UnpackU8<0>(_src3));
+                __m128i hi = Paeth(UnpackU8<1>(_src1), UnpackU8<1>(_src2), UnpackU8<1>(_src3));
+                __m128i _dst = _mm_sub_epi8(_src0, _mm_packus_epi16(lo, hi));
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - Base::Paeth(src[i - n], src[i - stride], src[i - stride - n]);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine5(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n));
+                __m128i lo = _mm_srli_epi16(UnpackU8<0>(_src1), 1);
+                __m128i hi = _mm_srli_epi16(UnpackU8<1>(_src1), 1);
+                __m128i _dst = _mm_sub_epi8(_src0, _mm_packus_epi16(lo, hi));
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - (src[i - n] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine6(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n));
+                __m128i _dst = _mm_sub_epi8(_src0, _src1);
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        ImagePngSaver::ImagePngSaver(const ImageSaverParam& param)
+            : Base::ImagePngSaver(param)
+        {
+            if (_param.format == SimdPixelFormatBgr24)
+                _convert = Sse41::BgrToRgb;
+            else if (_param.format == SimdPixelFormatBgra32)
+                _convert = Sse41::BgraToRgba;
+            _encode[0] = Sse41::EncodeLine0;
+            _encode[1] = Sse41::EncodeLine1;
+            _encode[2] = Sse41::EncodeLine2;
+            _encode[3] = Sse41::EncodeLine3;
+            _encode[4] = Sse41::EncodeLine4;
+            _encode[5] = Sse41::EncodeLine5;
+            _encode[6] = Sse41::EncodeLine6;
+            _compress = Sse41::ZlibCompress;
+        }
+    }
+#endif// SIMD_SSE41_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdView.hpp b/3rdparty/simdlib/Simd/SimdView.hpp
index 0c61a0e6e8..33629be94f 100755
--- a/3rdparty/simdlib/Simd/SimdView.hpp
+++ b/3rdparty/simdlib/Simd/SimdView.hpp
@@ -27,7 +27,6 @@
 #ifndef __SimdView_hpp__
 #define __SimdView_hpp__
 
-#include "Simd/SimdDefs.h"
 #include "Simd/SimdRectangle.hpp"
 #include "Simd/SimdAllocator.hpp"
 
@@ -493,34 +492,57 @@ namespace Simd
         /*!
             Loads image from file.
             
-            Supported formats:
-             - PGM(Portable Gray Map) text(P2) or binary(P5) (the file is loaded as 8-bit gray image).
-             - PPM(Portable Pixel Map) text(P3) or binary(P6) (the file is loaded as 32-bit BGRA image).
+            Supported formats are described by ::SimdImageFileType enumeration.
 
             \note PGM and PPM files with comments are not supported.
 
-            \param [in] path - a path to file with PGM or PPM image.
+            \param [in] path - a path to image file.
+            \param [in] format - a desired format of loaded image. 
+                Supported values are View::Gray8, View::Bgr24, View::Bgra32, View::Rgb24, View::Rgba32 and View::None.
+                Default value is View::None (loads image in native pixel format of image file).
             \return - a result of loading.
         */
-        bool Load(const std::string & path);
+        bool Load(const std::string & path, Format format = None);
+
+        /*!
+            Loads image from memory buffer.
+
+            Supported formats are described by ::SimdImageFileType enumeration.
+
+            \note PGM and PPM files with comments are not supported.
+
+            \param [in] src - a pointer to memory buffer.
+            \param [in] size - a buffer size.
+            \param [in] format - a desired format of loaded image.
+                Supported values are View::Gray8, View::Bgr24, View::Bgra32, View::Rgb24, View::Rgba32 and View::None.
+                Default value is View::None (loads image in native pixel format of image file).
+            \return - a result of loading.
+        */
+        bool Load(const uint8_t * src, size_t size, Format format = None);
 
         /*!
             Saves image to file.
  
-            Supported formats:
-             - PGM(Portable Gray Map) binary(P5) (this format is used in order to save 8-bit gray images).
-             - PPM(Portable Pixel Map) binary(P6) (this format is used in order to save 24-bit BGR and 32-bit BGRA images).
-
             \param [in] path - a path to file.
+            \param [in] type - a image file format. By default is equal to ::SimdImageFileUndefined (format auto choice).
+            \param [in] quality - a parameter of compression quality (if file format supports it).
             \return - a result of saving.
         */
-        bool Save(const std::string & path) const;
+        bool Save(const std::string & path, SimdImageFileType type = SimdImageFileUndefined, int quality = 100) const;
 
         /*!
-            Clear View structure (reset all fields) and free memory if it's owner
+            Clears View structure (reset all fields) and free memory if it's owner.
          */
         void Clear();
 
+        /*!
+            Releases pixel data and resets all fields.
+
+            \param [out] size - a pointer to the size of released pixel data. Can be NULL.
+            \return - a released pointer to pixel data. It must be deleted by function ::SimdFree.
+        */
+        uint8_t* Release(size_t* size = NULL);
+
     private:
         bool _owner;
     };
@@ -1027,6 +1049,7 @@ namespace Simd
         case Float:     return 4;
         case Double:    return 8;
         case Rgb24:     return 3;
+        case Rgba32:    return 4;
         default: assert(0); return 0;
         }
     }
@@ -1050,6 +1073,7 @@ namespace Simd
         case Float:     return 4;
         case Double:    return 8;
         case Rgb24:     return 1;
+        case Rgba32:    return 1;
         default: assert(0); return 0;
         }
     }
@@ -1073,6 +1097,7 @@ namespace Simd
         case Float:     return 1;
         case Double:    return 1;
         case Rgb24:     return 3;
+        case Rgba32:    return 4;
         default: assert(0); return 0;
         }
     }
@@ -1124,139 +1149,33 @@ namespace Simd
         std::swap((bool&)_owner, (bool&)other._owner);
     }
 
-    template <template<class> class A> SIMD_INLINE bool View<A>::Load(const std::string & path)
+    template <template<class> class A> SIMD_INLINE bool View<A>::Load(const std::string & path, Format format_)
     {
-        std::ifstream ifs(path.c_str(), std::ifstream::binary);
-        if (ifs.is_open())
-        {
-            std::string type;
-            ifs >> type;
-            if (type == "P2" || type == "P5")
-            {
-                size_t w, h, d;
-                ifs >> w >> h >> d;
-                if (d != 255)
-                    return false;
-                ifs.get();
-                Recreate(w, h, View<A>::Gray8);
-                if (type == "P2")
-                {
-                    for (size_t row = 0; row < height; ++row)
-                    {
-                        for (size_t col = 0; col < width; ++col)
-                        {
-                            int gray;
-                            ifs >> gray;
-                            data[row * stride + col] = (uint8_t)gray;
-                        }
-                    }
-                }
-                else
-                {
-                    for (size_t row = 0; row < height; ++row)
-                        ifs.read((char*)(data + row*stride), width);
-                }
-                return true;
-            }
-            if (type == "P3" || type == "P6")
-            {
-                size_t w, h, d;
-                ifs >> w >> h >> d;
-                if (d != 255)
-                    return false;
-                ifs.get();
-                Recreate(w, h, View<A>::Bgra32);
-                if (type == "P3")
-                {
-                    for (size_t row = 0; row < height; ++row)
-                    {
-                        uint8_t * bgra = data + row * stride;
-                        for (size_t col = 0; col < width; ++col, bgra += 4)
-                        {
-                            int blue, green, red;
-                            ifs >> red >> green >> blue;
-                            bgra[0] = (uint8_t)blue;
-                            bgra[1] = (uint8_t)green;
-                            bgra[2] = (uint8_t)red;
-                            bgra[3] = 0xFF;
-                        }
-                    }
-                }
-                else
-                {
-                    View buffer(width, 1, Bgr24);
-                    for (size_t row = 0; row < height; ++row)
-                    {
-                        ifs.read((char*)buffer.data, width*3);
-                        const uint8_t * rgb = buffer.data;
-                        uint8_t * bgra = data + row*stride;
-                        for (size_t col = 0; col < width; ++col, rgb += 3, bgra += 4)
-                        {
-                            bgra[0] = rgb[2];
-                            bgra[1] = rgb[1];
-                            bgra[2] = rgb[0];
-                            bgra[3] = 0xFF;
-                        }
-                    }
-                }
-                return true;
-            }
-        }
-        return false;
+        Clear();
+        (Format&)format = format_;
+        *(uint8_t**)&data = SimdImageLoadFromFile(path.c_str(), (size_t*)&stride, (size_t*)&width, (size_t*)&height, (SimdPixelFormatType*)&format);
+        if (data)
+            _owner = true;
+        else
+            (Format&)format = None;
+        return _owner;
     }
 
-    template <template<class> class A> SIMD_INLINE bool View<A>::Save(const std::string & path) const
+    template <template<class> class A> SIMD_INLINE bool View<A>::Load(const uint8_t * src, size_t size, Format format_)
     {
-        if (!(format == View<A>::Gray8 || format == View<A>::Bgr24 || format == View<A>::Bgra32))
-            return false;
-
-        std::ofstream ofs(path.c_str(), std::ofstream::binary);
-        if (ofs.is_open())
-        {
-            if (format == View<A>::Gray8)
-            {
-                ofs << "P5\n" << width << " " << height << "\n255\n";
-                for (size_t row = 0; row < height; ++row)
-                    ofs.write((const char*)(data + row*stride), width);
-            }
-            else if (format == View<A>::Bgr24)
-            {
-                ofs << "P6\n" << width << " " << height << "\n255\n";
-                View buffer(width, 1, Bgr24);
-                for (size_t row = 0; row < height; ++row)
-                {
-                    const uint8_t * bgr = data + row*stride;
-                    uint8_t * rgb = buffer.data;
-                    for (size_t col = 0; col < width; ++col, bgr += 3, rgb += 3)
-                    {
-                        rgb[0] = bgr[2];
-                        rgb[1] = bgr[1];
-                        rgb[2] = bgr[0];
-                    }
-                    ofs.write((const char*)(buffer.data), width*3);
-                }
-            }
-            else if (format == View<A>::Bgra32)
-            {
-                ofs << "P6\n" << width << " " << height << "\n255\n";
-                View buffer(width, 1, Bgr24);
-                for (size_t row = 0; row < height; ++row)
-                {
-                    const uint8_t * bgra = data + row*stride;
-                    uint8_t * rgb = buffer.data;
-                    for (size_t col = 0; col < width; ++col, bgra += 4, rgb += 3)
-                    {
-                        rgb[0] = bgra[2];
-                        rgb[1] = bgra[1];
-                        rgb[2] = bgra[0];
-                    }
-                    ofs.write((const char*)buffer.data, width * 3);
-                }
-            }
-            return true;
-        }
+        Clear();
+        (Format&)format = format_;
+        *(uint8_t**)&data = SimdImageLoadFromMemory(src, size, (size_t*)&stride, (size_t*)&width, (size_t*)&height, (SimdPixelFormatType*)&format);
+        if (data)
+            _owner = true;
         else
-            return false;
+            (Format&)format = None;
+        return _owner;
+    }
+
+    template <template<class> class A> SIMD_INLINE bool View<A>::Save(const std::string & path, SimdImageFileType type, int quality) const
+    {
+        return SimdImageSaveToFile(data, stride, width, height, (SimdPixelFormatType)format, type, quality, path.c_str()) == SimdTrue;
     }
 
     template <template<class> class A> SIMD_INLINE void View<A>::Clear()
@@ -1279,6 +1198,16 @@ namespace Simd
 #endif
     }
 
+    template <template<class> class A> SIMD_INLINE uint8_t* View<A>::Release(size_t* size)
+    {
+        uint8_t* released = data;
+        if (size)
+            *size = DataSize();
+        _owner = false;
+        Clear();
+        return released;
+    }
+
     // View utilities implementation:
 
     template <template<class> class A, class T> const T & At(const View<A> & view, size_t x, size_t y)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9adaac4edc..b2aa7a863e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -679,6 +679,8 @@ if(NOT USE_OPENCV AND (NOT USE_PNG OR NOT USE_JPEG))
 else()
   set(WITH_STBIMAGE OFF)
 endif()
+# TODO:
+set(WITH_STBIMAGE ON)
 
 VP_OPTION(WITH_CATCH2   ""           ""    "Use catch2" "" ON IF (VISP_CXX_STANDARD GREATER VISP_CXX_STANDARD_98))
 
diff --git a/modules/io/CMakeLists.txt b/modules/io/CMakeLists.txt
index 959ee1c9b6..949ec58aef 100644
--- a/modules/io/CMakeLists.txt
+++ b/modules/io/CMakeLists.txt
@@ -57,11 +57,21 @@ if(USE_PNG)
   add_definitions(${PNG_DEFINITIONS})
 endif()
 
-if(WITH_STBIMAGE)
+# TODO:
+#if(WITH_STBIMAGE)
   # stb_image is private
   include_directories(${STBIMAGE_INCLUDE_DIRS})
+#endif()
+
+if(WITH_CATCH2)
+  # catch2 is private
+  include_directories(${CATCH2_INCLUDE_DIRS})
 endif()
 
+# simdlib is always enabled since it contains fallback code to plain C++ code
+# Simd lib is private
+include_directories(${SIMDLIB_INCLUDE_DIRS})
+
 # OpenCV
 if(USE_OPENCV)
   # On win32 since OpenCV 2.4.7 and on OSX with OpenCV 2.4.10 we cannot use OpenCV_LIBS to set ViSP 3rd party libraries.
@@ -178,7 +188,7 @@ endif()
 vp_glob_module_sources()
 vp_module_include_directories(${opt_incs})
 vp_create_module(${opt_libs})
-vp_add_tests(DEPENDS_ON visp_features)
+vp_add_tests()
 
 vp_set_source_file_compile_flag(src/tools/vpParseArgv.cpp -Wno-strict-overflow)
 
diff --git a/modules/io/include/visp3/io/vpImageIo.h b/modules/io/include/visp3/io/vpImageIo.h
index d37cad48e3..11bd9aa766 100644
--- a/modules/io/include/visp3/io/vpImageIo.h
+++ b/modules/io/include/visp3/io/vpImageIo.h
@@ -144,6 +144,10 @@ class VISP_EXPORT vpImageIo
   static void readPNG(vpImage<unsigned char> &I, const std::string &filename);
   static void readPNG(vpImage<vpRGBa> &I, const std::string &filename);
 
+  //TODO:
+  static void readSimdlib(vpImage<vpRGBa> &I, const std::string &filename);
+  static void readStb(vpImage<vpRGBa> &I, const std::string &filename);
+
   static void writePFM(const vpImage<float> &I, const std::string &filename);
 
   static void writePGM(const vpImage<unsigned char> &I, const std::string &filename);
@@ -158,5 +162,9 @@ class VISP_EXPORT vpImageIo
 
   static void writePNG(const vpImage<unsigned char> &I, const std::string &filename);
   static void writePNG(const vpImage<vpRGBa> &I, const std::string &filename);
+
+  //TODO:
+  static void writeSimdlib(vpImage<vpRGBa> &I, const std::string &filename);
+  static void writeStb(vpImage<vpRGBa> &I, const std::string &filename);
 };
 #endif
diff --git a/modules/io/src/image/vpImageIo.cpp b/modules/io/src/image/vpImageIo.cpp
index ab290fa5f7..cc7799d158 100644
--- a/modules/io/src/image/vpImageIo.cpp
+++ b/modules/io/src/image/vpImageIo.cpp
@@ -62,6 +62,15 @@
 #include <png.h>
 #endif
 
+//TODO:
+#include <Simd/SimdLib.hpp>
+//TODO:
+#define STB_IMAGE_IMPLEMENTATION
+#include <stb_image.h>
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include <stb_image_write.h>
+
 #if !defined(VISP_HAVE_OPENCV)
 #if !defined(VISP_HAVE_JPEG) || !defined(VISP_HAVE_PNG)
 
@@ -2059,6 +2068,60 @@ void vpImageIo::readPNG(vpImage<vpRGBa> &I, const std::string &filename)
   fclose(file);
 }
 
+//TODO:
+void vpImageIo::readSimdlib(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  size_t stride = 0, width = 0, height = 0;
+  SimdPixelFormatType format = SimdPixelFormatRgba32;
+  uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format);
+  const bool copyData = false;
+  I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData);
+}
+
+void vpImageIo::readStb(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  int width = 0, height = 0, channels = 0;
+  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha);
+  if (image == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
+  }
+  I.init(reinterpret_cast<vpRGBa*>(image), static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
+  stbi_image_free(image);
+}
+
+inline bool ends_with(std::string const & value, std::string const & ending)
+{
+    if (ending.size() > value.size()) return false;
+    return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
+}
+
+void vpImageIo::writeSimdlib(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  if (ends_with(filename, ".png")) {
+    SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFilePng, 90, filename.c_str());
+  } else {
+    SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, 90, filename.c_str());
+  }
+}
+
+void vpImageIo::writeStb(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  if (ends_with(filename, ".png")) {
+    const int stride_in_bytes = static_cast<int>(4 * I.getWidth());
+    int res = stbi_write_png(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
+                             reinterpret_cast<void*>(I.bitmap), stride_in_bytes);
+    if (res == 0) {
+      throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str()));
+    }
+  } else {
+    int res = stbi_write_jpg(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
+                             reinterpret_cast<void*>(I.bitmap), 90);
+    if (res == 0) {
+      throw(vpImageException(vpImageException::ioError, "JEPG write error"));
+    }
+  }
+}
+
 #elif defined(VISP_HAVE_OPENCV)
 
 /*!
diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp
new file mode 100644
index 0000000000..ce0d416b70
--- /dev/null
+++ b/modules/io/test/perfImageLoadSave.cpp
@@ -0,0 +1,461 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Benchmark color image conversion.
+ *
+ *****************************************************************************/
+
+#include <visp3/core/vpConfig.h>
+
+#ifdef VISP_HAVE_CATCH2
+#define CATCH_CONFIG_ENABLE_BENCHMARKING
+#define CATCH_CONFIG_RUNNER
+#include <catch.hpp>
+
+#include <thread>
+#include <visp3/core/vpIoTools.h>
+#include <visp3/io/vpImageIo.h>
+
+static std::string ipath = vpIoTools::getViSPImagesDataPath();
+static std::string imagePathJpeg = vpIoTools::createFilePath(ipath, "Klimt/Klimt.jpeg");
+static std::string imagePathPng = vpIoTools::createFilePath(ipath, "Klimt/Klimt.png");
+static std::string imagePathPngBig = vpIoTools::createFilePath(ipath, "Klimt/test_image_resize.png");
+static int nThreads = 0;
+
+TEST_CASE("Benchmark Jpeg image loading", "[benchmark]") {
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::read()") {
+      vpImageIo::read(I, imagePathJpeg);
+      return I;
+    };
+  }
+
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::readSimdlib()") {
+      vpImageIo::readSimdlib(I, imagePathJpeg);
+      return I;
+    };
+  }
+
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::readStb()") {
+      vpImageIo::readStb(I, imagePathJpeg);
+      return I;
+    };
+  }
+}
+
+TEST_CASE("Benchmark Png image loading", "[benchmark]") {
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::read()") {
+      vpImageIo::read(I, imagePathPng);
+      return I;
+    };
+  }
+
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::readSimdlib()") {
+      vpImageIo::readSimdlib(I, imagePathPng);
+      return I;
+    };
+  }
+
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::readStb()") {
+      vpImageIo::readStb(I, imagePathPng);
+      return I;
+    };
+  }
+}
+
+TEST_CASE("Benchmark big Png image loading", "[benchmark]") {
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::read()") {
+      vpImageIo::read(I, imagePathPngBig);
+      return I;
+    };
+  }
+
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::readSimdlib()") {
+      vpImageIo::readSimdlib(I, imagePathPngBig);
+      return I;
+    };
+  }
+
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::readStb()") {
+      vpImageIo::readStb(I, imagePathPngBig);
+      return I;
+    };
+  }
+}
+
+TEST_CASE("Benchmark Jpeg image saving", "[benchmark]") {
+  vpImage<vpRGBa> I;
+  vpImageIo::read(I, imagePathJpeg);
+  {
+    const std::string filename = "/tmp/Klimt_ViSP.jpg";
+
+    BENCHMARK("vpImageIo::write()") {
+      vpImageIo::write(I, filename);
+      return I;
+    };
+  }
+
+  {
+    const std::string filename = "/tmp/Klimt_Simd.jpg";
+
+    BENCHMARK("vpImageIo::writeSimdlib()") {
+      vpImageIo::writeSimdlib(I, filename);
+      return I;
+    };
+  }
+
+  {
+    const std::string filename = "/tmp/Klimt_stb.jpg";
+
+    BENCHMARK("vpImageIo::writeStb()") {
+      vpImageIo::writeStb(I, filename);
+      return I;
+    };
+  }
+}
+
+TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") {
+  vpImage<vpRGBa> I;
+  vpImageIo::read(I, imagePathPngBig);
+  {
+    const std::string filename = "/tmp/Big_images_ViSP.jpg";
+
+    BENCHMARK("vpImageIo::write()") {
+      vpImageIo::write(I, filename);
+      return I;
+    };
+  }
+
+//  {
+//    const std::string filename = "/tmp/Big_images_Simd.jpg";
+
+//    BENCHMARK("vpImageIo::writeSimdlib()") {
+//      vpImageIo::writeSimdlib(I, filename);
+//      return I;
+//    };
+//  }
+
+  {
+    const std::string filename = "/tmp/Big_images_stb.jpg";
+
+    BENCHMARK("vpImageIo::writeStb()") {
+      vpImageIo::writeStb(I, filename);
+      return I;
+    };
+  }
+}
+
+TEST_CASE("Benchmark Png image saving", "[benchmark]") {
+  vpImage<vpRGBa> I;
+  vpImageIo::read(I, imagePathPng);
+  {
+    const std::string filename = "/tmp/Klimt_ViSP.png";
+
+    BENCHMARK("vpImageIo::write()") {
+      vpImageIo::write(I, filename);
+      return I;
+    };
+  }
+
+  {
+    const std::string filename = "/tmp/Klimt_Simd.png";
+
+    BENCHMARK("vpImageIo::writeSimdlib()") {
+      vpImageIo::writeSimdlib(I, filename);
+      return I;
+    };
+  }
+
+  {
+    const std::string filename = "/tmp/Klimt_stb.png";
+
+    BENCHMARK("vpImageIo::writeStb()") {
+      vpImageIo::writeStb(I, filename);
+      return I;
+    };
+  }
+}
+
+TEST_CASE("Benchmark big Png image saving", "[benchmark]") {
+  vpImage<vpRGBa> I;
+  vpImageIo::read(I, imagePathPngBig);
+  {
+    const std::string filename = "/tmp/Big_images_ViSP.png";
+
+    BENCHMARK("vpImageIo::write()") {
+      vpImageIo::write(I, filename);
+      return I;
+    };
+  }
+
+  {
+    const std::string filename = "/tmp/Big_images_Simd.png";
+
+    BENCHMARK("vpImageIo::writeSimdlib()") {
+      vpImageIo::writeSimdlib(I, filename);
+      return I;
+    };
+  }
+
+  {
+    const std::string filename = "/tmp/Big_images_stb.png";
+
+    BENCHMARK("vpImageIo::writeStb()") {
+      vpImageIo::writeStb(I, filename);
+      return I;
+    };
+  }
+}
+
+//TEST_CASE("Benchmark bgr to grayscale (ViSP)", "[benchmark]") {
+//  vpImage<vpRGBa> I;
+//  vpImageIo::read(I, imagePathColor);
+
+//  std::vector<unsigned char> bgr;
+//  common_tools::RGBaToBGR(I, bgr);
+
+//  vpImage<unsigned char> I_gray(I.getHeight(), I.getWidth());
+
+//  BENCHMARK("Benchmark bgr to grayscale (ViSP)") {
+//    vpImageConvert::BGRToGrey(bgr.data(),
+//                              I_gray.bitmap,
+//                              I.getWidth(), I.getHeight(),
+//                              false, nThreads);
+//    return I_gray;
+//  };
+
+//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101)
+//  SECTION("OpenCV Mat type")
+//  {
+//    cv::Mat img;
+//    vpImageConvert::convert(I, img);
+
+//    BENCHMARK("Benchmark bgr to grayscale (ViSP + OpenCV Mat type)") {
+//      vpImageConvert::convert(img, I_gray, false, nThreads);
+//      return I_gray;
+//    };
+//  }
+//#endif
+//}
+//#endif
+
+//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101)
+//TEST_CASE("Benchmark bgr to grayscale (OpenCV)", "[benchmark]") {
+//  cv::Mat img = cv::imread(imagePathColor);
+//  cv::Mat img_gray(img.size(), CV_8UC1);
+
+//  BENCHMARK("Benchmark bgr to grayscale (OpenCV)") {
+//    cv::cvtColor(img, img_gray, cv::COLOR_BGR2GRAY);
+//    return img_gray;
+//  };
+//}
+//#endif
+
+//// C++11 to be able to do bgr.data()
+//#if VISP_CXX_STANDARD >= VISP_CXX_STANDARD_11
+//TEST_CASE("Benchmark bgr to rgba (naive code)", "[benchmark]") {
+//  vpImage<vpRGBa> I;
+//  vpImageIo::read(I, imagePathColor);
+
+//  std::vector<unsigned char> bgr;
+//  common_tools::RGBaToBGR(I, bgr);
+
+//  vpImage<vpRGBa> I_bench(I.getHeight(), I.getWidth());
+//  BENCHMARK("Benchmark bgr to rgba (naive code)") {
+//    common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast<unsigned char*>(I_bench.bitmap),
+//                               I.getWidth(), I.getHeight(), false);
+//    return I_bench;
+//  };
+//}
+
+//TEST_CASE("Benchmark bgr to rgba (ViSP)", "[benchmark]") {
+//  vpImage<vpRGBa> I;
+//  vpImageIo::read(I, imagePathColor);
+
+//  std::vector<unsigned char> bgr;
+//  common_tools::RGBaToBGR(I, bgr);
+
+//  SECTION("Check BGR to RGBa conversion")
+//  {
+//    vpImage<vpRGBa> ref(I.getHeight(), I.getWidth());
+//    common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast<unsigned char*>(ref.bitmap),
+//                               I.getWidth(), I.getHeight(), false);
+//    vpImage<vpRGBa> rgba(I.getHeight(), I.getWidth());
+//    vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast<unsigned char *>(rgba.bitmap),
+//                              I.getWidth(), I.getHeight(), false);
+
+//    CHECK((rgba == ref));
+//  }
+
+//  vpImage<vpRGBa> I_rgba(I.getHeight(), I.getWidth());
+//  BENCHMARK("Benchmark bgr to rgba (ViSP)") {
+//    vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast<unsigned char *>(I_rgba.bitmap),
+//                              I.getWidth(), I.getHeight(), false);
+//    return I_rgba;
+//  };
+
+//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101)
+//  SECTION("OpenCV Mat type")
+//  {
+//    cv::Mat img;
+//    vpImageConvert::convert(I, img);
+
+//    BENCHMARK("Benchmark bgr to rgba (ViSP + OpenCV Mat type)") {
+//      vpImageConvert::convert(img, I_rgba);
+//      return I_rgba;
+//    };
+//  }
+//#endif
+//}
+
+//TEST_CASE("Benchmark bgra to rgba (naive code)", "[benchmark]") {
+//  vpImage<vpRGBa> I;
+//  vpImageIo::read(I, imagePathColor);
+
+//  std::vector<unsigned char> bgra;
+//  common_tools::RGBaToBGRa(I, bgra);
+
+//  vpImage<vpRGBa> I_bench(I.getHeight(), I.getWidth());
+//  BENCHMARK("Benchmark bgra to rgba (naive code)") {
+//    common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast<unsigned char*>(I_bench.bitmap),
+//                                I.getWidth(), I.getHeight(), false);
+//    return I_bench;
+//  };
+//}
+
+//TEST_CASE("Benchmark bgra to rgba (ViSP)", "[benchmark]") {
+//  vpImage<vpRGBa> I;
+//  vpImageIo::read(I, imagePathColor);
+
+//  std::vector<unsigned char> bgra;
+//  common_tools::RGBaToBGRa(I, bgra);
+
+//  SECTION("Check BGRa to RGBa conversion")
+//  {
+//    vpImage<vpRGBa> ref(I.getHeight(), I.getWidth());
+//    common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast<unsigned char*>(ref.bitmap),
+//                                I.getWidth(), I.getHeight(), false);
+//    vpImage<vpRGBa> rgba(I.getHeight(), I.getWidth());
+//    vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast<unsigned char *>(rgba.bitmap),
+//                               I.getWidth(), I.getHeight(), false);
+
+//    CHECK((rgba == ref));
+//  }
+//  vpImage<vpRGBa> I_rgba(I.getHeight(), I.getWidth());
+//  BENCHMARK("Benchmark bgra to rgba (ViSP)") {
+//    vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast<unsigned char *>(I_rgba.bitmap),
+//                               I.getWidth(), I.getHeight(), false);
+//    return I_rgba;
+//  };
+//}
+//#endif
+
+int main(int argc, char *argv[])
+{
+  Catch::Session session; // There must be exactly one instance
+
+  bool runBenchmark = false;
+  // Build a new parser on top of Catch's
+  using namespace Catch::clara;
+  auto cli = session.cli() // Get Catch's composite command line parser
+    | Opt(runBenchmark)    // bind variable to a new option, with a hint string
+    ["--benchmark"]        // the option names it will respond to
+    ("run benchmark?")     // description string for the help output
+    | Opt(imagePathJpeg, "imagePathColor")
+    ["--imagePathColor"]
+    ("Path to color image")
+    | Opt(imagePathPng, "imagePathColor")
+    ["--imagePathGray"]
+    ("Path to gray image")
+    | Opt(nThreads, "nThreads")
+    ["--nThreads"]
+    ("Number of threads");
+
+  // Now pass the new composite back to Catch so it uses that
+  session.cli(cli);
+
+  // Let Catch (using Clara) parse the command line
+  session.applyCommandLine(argc, argv);
+
+  if (runBenchmark) {
+//    vpImage<vpRGBa> I_color;
+//    vpImageIo::read(I_color, imagePathColor);
+//    std::cout << "imagePathColor:\n\t" << imagePathColor << "\n\t" << I_color.getWidth() << "x" << I_color.getHeight() << std::endl;
+
+//    vpImage<unsigned char> I_gray;
+//    vpImageIo::read(I_gray, imagePathGray);
+//    std::cout << "imagePathGray:\n\t" << imagePathGray << "\n\t" << I_gray.getWidth() << "x" << I_gray.getHeight() << std::endl;
+    std::cout << "nThreads: " << nThreads << " / available threads: " << std::thread::hardware_concurrency() << std::endl;
+
+    int numFailed = session.run();
+
+    // numFailed is clamped to 255 as some unices only use the lower 8 bits.
+    // This clamping has already been applied, so just return it here
+    // You can also do any post run clean-up here
+    return numFailed;
+  }
+
+  return EXIT_SUCCESS;
+}
+#else
+#include <iostream>
+
+int main()
+{
+  return 0;
+}
+#endif

From aad93cd76dbcf3b08d6ffc1a981499758a0235bf Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Thu, 4 Nov 2021 14:06:19 +0100
Subject: [PATCH 05/18] Fix issue when writing big Jpeg images.

---
 3rdparty/simdlib/Simd/SimdImageSaveJpeg.h |  5 +++--
 modules/io/test/perfImageLoadSave.cpp     | 14 +++++++-------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h
index d54164f7d4..f3d5f4a96c 100644
--- a/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h
+++ b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h
@@ -35,9 +35,9 @@ namespace Simd
     {
         struct BitBuf
         {
-            static const uint32_t capacity = 1024;
+            static const uint32_t capacity = 2048;
             uint32_t size;
-            uint16_t data[1024][2];
+            uint16_t data[capacity][2];
 
             SIMD_INLINE BitBuf()
                 : size(0) 
@@ -51,6 +51,7 @@ namespace Simd
 
             SIMD_INLINE bool Full(uint32_t tail = capacity / 2) const
             {
+                assert(size <= capacity);
                 return size + tail >= capacity;
             }
 
diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp
index ce0d416b70..8efe2c759e 100644
--- a/modules/io/test/perfImageLoadSave.cpp
+++ b/modules/io/test/perfImageLoadSave.cpp
@@ -180,14 +180,14 @@ TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") {
     };
   }
 
-//  {
-//    const std::string filename = "/tmp/Big_images_Simd.jpg";
+  {
+    const std::string filename = "/tmp/Big_images_Simd.jpg";
 
-//    BENCHMARK("vpImageIo::writeSimdlib()") {
-//      vpImageIo::writeSimdlib(I, filename);
-//      return I;
-//    };
-//  }
+    BENCHMARK("vpImageIo::writeSimdlib()") {
+      vpImageIo::writeSimdlib(I, filename);
+      return I;
+    };
+  }
 
   {
     const std::string filename = "/tmp/Big_images_stb.jpg";

From a70090ceda0d28077dde0217e6515a833c6e6b8a Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Wed, 17 Nov 2021 00:51:26 +0100
Subject: [PATCH 06/18] Experimental: wip code to try adding a backend system
 for image I/O.

---
 .../core/include/visp3/core/vpImageTools.h    |    8 +-
 modules/io/include/visp3/io/vpImageIo.h       |   34 +-
 .../io/src/image/private/vpImageIoBackend.h   |  104 +
 .../io/src/image/private/vpImageIoLibjpeg.cpp |  345 +++
 .../io/src/image/private/vpImageIoLibpng.cpp  |  615 +++++
 .../io/src/image/private/vpImageIoOpenCV.cpp  |  205 ++
 .../src/image/private/vpImageIoPortable.cpp   |  569 +++++
 .../io/src/image/private/vpImageIoSimd.cpp    |   87 +
 modules/io/src/image/private/vpImageIoStb.cpp |  121 +
 modules/io/src/image/vpImageIo.cpp            | 2112 ++---------------
 modules/io/test/perfImageLoadSave.cpp         |  171 +-
 11 files changed, 2286 insertions(+), 2085 deletions(-)
 create mode 100644 modules/io/src/image/private/vpImageIoBackend.h
 create mode 100644 modules/io/src/image/private/vpImageIoLibjpeg.cpp
 create mode 100644 modules/io/src/image/private/vpImageIoLibpng.cpp
 create mode 100644 modules/io/src/image/private/vpImageIoOpenCV.cpp
 create mode 100644 modules/io/src/image/private/vpImageIoPortable.cpp
 create mode 100644 modules/io/src/image/private/vpImageIoSimd.cpp
 create mode 100644 modules/io/src/image/private/vpImageIoStb.cpp

diff --git a/modules/core/include/visp3/core/vpImageTools.h b/modules/core/include/visp3/core/vpImageTools.h
index bf6e4a77f8..f12246e61d 100644
--- a/modules/core/include/visp3/core/vpImageTools.h
+++ b/modules/core/include/visp3/core/vpImageTools.h
@@ -1496,19 +1496,19 @@ void vpImageTools::warpLinear(const vpImage<Type> &src, const vpMatrix &T, vpIma
               const Type val01 = src[y_][x_ + 1];
               const Type val10 = src[y_ + 1][x_];
               const Type val11 = src[y_ + 1][x_ + 1];
-              const float col0 = lerp(val00, val01, s);
-              const float col1 = lerp(val10, val11, s);
+              const float col0 = lerp(static_cast<float>(val00), static_cast<float>(val01), s);
+              const float col1 = lerp(static_cast<float>(val10), static_cast<float>(val11), s);
               const float interp = lerp(col0, col1, t);
               dst[i][j] = vpMath::saturate<Type>(interp);
             } else if (y_ < static_cast<int>(src.getHeight()) - 1) {
               const Type val00 = src[y_][x_];
               const Type val10 = src[y_ + 1][x_];
-              const float interp = lerp(val00, val10, t);
+              const float interp = lerp(static_cast<float>(val00), static_cast<float>(val10), t);
               dst[i][j] = vpMath::saturate<Type>(interp);
             } else if (x_ < static_cast<int>(src.getWidth()) - 1) {
               const Type val00 = src[y_][x_];
               const Type val01 = src[y_][x_ + 1];
-              const float interp = lerp(val00, val01, s);
+              const float interp = lerp(static_cast<float>(val00), static_cast<float>(val01), s);
               dst[i][j] = vpMath::saturate<Type>(interp);
             } else {
               dst[i][j] = src[y_][x_];
diff --git a/modules/io/include/visp3/io/vpImageIo.h b/modules/io/include/visp3/io/vpImageIo.h
index 11bd9aa766..fa395e3882 100644
--- a/modules/io/include/visp3/io/vpImageIo.h
+++ b/modules/io/include/visp3/io/vpImageIo.h
@@ -124,6 +124,16 @@ class VISP_EXPORT vpImageIo
   static std::string getExtension(const std::string &filename);
 
 public:
+  //TODO:
+  // Image IO backend for only jpeg and png formats
+  enum vpImageIoBackendType {
+    IO_DEFAULT_BACKEND,
+    IO_LIB_BACKEND,
+    IO_OPENCV_BACKEND,
+    IO_SIMDLIB_BACKEND,
+    IO_STB_IMAGE_BACKEND
+  };
+
   static void read(vpImage<unsigned char> &I, const std::string &filename);
   static void read(vpImage<vpRGBa> &I, const std::string &filename);
 
@@ -138,15 +148,11 @@ class VISP_EXPORT vpImageIo
   static void readPPM(vpImage<unsigned char> &I, const std::string &filename);
   static void readPPM(vpImage<vpRGBa> &I, const std::string &filename);
 
-  static void readJPEG(vpImage<unsigned char> &I, const std::string &filename);
-  static void readJPEG(vpImage<vpRGBa> &I, const std::string &filename);
-
-  static void readPNG(vpImage<unsigned char> &I, const std::string &filename);
-  static void readPNG(vpImage<vpRGBa> &I, const std::string &filename);
+  static void readJPEG(vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
+  static void readJPEG(vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
 
-  //TODO:
-  static void readSimdlib(vpImage<vpRGBa> &I, const std::string &filename);
-  static void readStb(vpImage<vpRGBa> &I, const std::string &filename);
+  static void readPNG(vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
+  static void readPNG(vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
 
   static void writePFM(const vpImage<float> &I, const std::string &filename);
 
@@ -157,14 +163,10 @@ class VISP_EXPORT vpImageIo
   static void writePPM(const vpImage<unsigned char> &I, const std::string &filename);
   static void writePPM(const vpImage<vpRGBa> &I, const std::string &filename);
 
-  static void writeJPEG(const vpImage<unsigned char> &I, const std::string &filename);
-  static void writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename);
-
-  static void writePNG(const vpImage<unsigned char> &I, const std::string &filename);
-  static void writePNG(const vpImage<vpRGBa> &I, const std::string &filename);
+  static void writeJPEG(const vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
+  static void writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
 
-  //TODO:
-  static void writeSimdlib(vpImage<vpRGBa> &I, const std::string &filename);
-  static void writeStb(vpImage<vpRGBa> &I, const std::string &filename);
+  static void writePNG(const vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
+  static void writePNG(const vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
 };
 #endif
diff --git a/modules/io/src/image/private/vpImageIoBackend.h b/modules/io/src/image/private/vpImageIoBackend.h
new file mode 100644
index 0000000000..e1b434c030
--- /dev/null
+++ b/modules/io/src/image/private/vpImageIoBackend.h
@@ -0,0 +1,104 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Read/write images.
+ *
+ * Authors:
+ * Eric Marchand
+ *
+ *****************************************************************************/
+
+/*!
+  \file vpImageIo.h
+  \brief Read/write images
+*/
+
+#ifndef vpIMAGEIOBACKEND_H
+#define vpIMAGEIOBACKEND_H
+
+#include <visp3/core/vpImage.h>
+
+
+//
+void vp_writePFM(const vpImage<float> &I, const std::string &filename);
+void vp_writePGM(const vpImage<unsigned char> &I, const std::string &filename);
+void vp_writePGM(const vpImage<short> &I, const std::string &filename);
+void vp_writePGM(const vpImage<vpRGBa> &I, const std::string &filename);
+void vp_readPFM(vpImage<float> &I, const std::string &filename);
+void vp_readPGM(vpImage<unsigned char> &I, const std::string &filename);
+void vp_readPGM(vpImage<vpRGBa> &I, const std::string &filename);
+void vp_readPPM(vpImage<unsigned char> &I, const std::string &filename);
+void vp_readPPM(vpImage<vpRGBa> &I, const std::string &filename);
+void vp_writePPM(const vpImage<unsigned char> &I, const std::string &filename);
+void vp_writePPM(const vpImage<vpRGBa> &I, const std::string &filename);
+
+//
+void readJPEGLibjpeg(vpImage<unsigned char> &I, const std::string &filename);
+void readJPEGLibjpeg(vpImage<vpRGBa> &I, const std::string &filename);
+
+void writeJPEGLibjpeg(const vpImage<unsigned char> &I, const std::string &filename);
+void writeJPEGLibjpeg(const vpImage<vpRGBa> &I, const std::string &filename);
+
+//
+void readPNGLibpng(vpImage<unsigned char> &I, const std::string &filename);
+void readPNGLibpng(vpImage<vpRGBa> &I, const std::string &filename);
+
+void writePNGLibpng(const vpImage<unsigned char> &I, const std::string &filename);
+void writePNGLibpng(const vpImage<vpRGBa> &I, const std::string &filename);
+
+//
+void readOpenCV(vpImage<unsigned char> &I, const std::string &filename);
+void readOpenCV(vpImage<vpRGBa> &I, const std::string &filename);
+
+void writeOpenCV(const vpImage<unsigned char> &I, const std::string &filename);
+void writeOpenCV(const vpImage<vpRGBa> &I, const std::string &filename);
+
+//
+void readSimdlib(vpImage<unsigned char> &I, const std::string &filename);
+void readSimdlib(vpImage<vpRGBa> &I, const std::string &filename);
+
+void writeJPEGSimdlib(const vpImage<unsigned char> &I, const std::string &filename);
+void writeJPEGSimdlib(const vpImage<vpRGBa> &I, const std::string &filename);
+
+void writePNGSimdlib(const vpImage<unsigned char> &I, const std::string &filename);
+void writePNGSimdlib(const vpImage<vpRGBa> &I, const std::string &filename);
+
+//
+void readStb(vpImage<unsigned char> &I, const std::string &filename);
+void readStb(vpImage<vpRGBa> &I, const std::string &filename);
+
+void writeJPEGStb(const vpImage<unsigned char> &I, const std::string &filename);
+void writeJPEGStb(const vpImage<vpRGBa> &I, const std::string &filename);
+
+void writePNGStb(const vpImage<unsigned char> &I, const std::string &filename);
+void writePNGStb(const vpImage<vpRGBa> &I, const std::string &filename);
+
+#endif
diff --git a/modules/io/src/image/private/vpImageIoLibjpeg.cpp b/modules/io/src/image/private/vpImageIoLibjpeg.cpp
new file mode 100644
index 0000000000..99debb3021
--- /dev/null
+++ b/modules/io/src/image/private/vpImageIoLibjpeg.cpp
@@ -0,0 +1,345 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Read/write images.
+ *
+ * Authors:
+ * Eric Marchand
+ *
+ *****************************************************************************/
+
+/*!
+  \file vpImageIo.cpp
+  \brief Read/write images
+*/
+
+#include "vpImageIoBackend.h"
+#include <visp3/core/vpImageConvert.h>
+
+//TODO:
+#if defined(_WIN32)
+// Include WinSock2.h before windows.h to ensure that winsock.h is not
+// included by windows.h since winsock.h and winsock2.h are incompatible
+#include <WinSock2.h>
+#include <windows.h>
+#endif
+
+#if defined(VISP_HAVE_JPEG)
+#include <jerror.h>
+#include <jpeglib.h>
+#endif
+
+
+//--------------------------------------------------------------------------
+// JPEG
+//--------------------------------------------------------------------------
+
+#if defined(VISP_HAVE_JPEG)
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a JPEG file.
+
+  \param I : Image to save as a JPEG file.
+  \param filename : Name of the file containing the image.
+*/
+void writeJPEGLibjpeg(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  struct jpeg_compress_struct cinfo;
+  struct jpeg_error_mgr jerr;
+  FILE *file;
+
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_compress(&cinfo);
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "wb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str()));
+  }
+
+  unsigned int width = I.getWidth();
+  unsigned int height = I.getHeight();
+
+  jpeg_stdio_dest(&cinfo, file);
+
+  cinfo.image_width = width;
+  cinfo.image_height = height;
+  cinfo.input_components = 1;
+  cinfo.in_color_space = JCS_GRAYSCALE;
+  jpeg_set_defaults(&cinfo);
+
+  jpeg_start_compress(&cinfo, TRUE);
+
+  unsigned char *line;
+  line = new unsigned char[width];
+  unsigned char *input = (unsigned char *)I.bitmap;
+  while (cinfo.next_scanline < cinfo.image_height) {
+    for (unsigned int i = 0; i < width; i++) {
+      line[i] = *(input);
+      input++;
+    }
+    jpeg_write_scanlines(&cinfo, &line, 1);
+  }
+
+  jpeg_finish_compress(&cinfo);
+  jpeg_destroy_compress(&cinfo);
+  delete[] line;
+  fclose(file);
+}
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a JPEG file.
+
+  \param I : Image to save as a JPEG file.
+  \param filename : Name of the file containing the image.
+*/
+void writeJPEGLibjpeg(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+  struct jpeg_compress_struct cinfo;
+  struct jpeg_error_mgr jerr;
+  FILE *file;
+
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_compress(&cinfo);
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "wb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str()));
+  }
+
+  unsigned int width = I.getWidth();
+  unsigned int height = I.getHeight();
+
+  jpeg_stdio_dest(&cinfo, file);
+
+  cinfo.image_width = width;
+  cinfo.image_height = height;
+  cinfo.input_components = 3;
+  cinfo.in_color_space = JCS_RGB;
+  jpeg_set_defaults(&cinfo);
+
+  jpeg_start_compress(&cinfo, TRUE);
+
+  unsigned char *line;
+  line = new unsigned char[3 * width];
+  unsigned char *input = (unsigned char *)I.bitmap;
+  while (cinfo.next_scanline < cinfo.image_height) {
+    for (unsigned int i = 0; i < width; i++) {
+      line[i * 3] = *(input);
+      input++;
+      line[i * 3 + 1] = *(input);
+      input++;
+      line[i * 3 + 2] = *(input);
+      input++;
+      input++;
+    }
+    jpeg_write_scanlines(&cinfo, &line, 1);
+  }
+
+  jpeg_finish_compress(&cinfo);
+  jpeg_destroy_compress(&cinfo);
+  delete[] line;
+  fclose(file);
+}
+
+/*!
+  Read the contents of the JPEG file, allocate memory
+  for the corresponding gray level image, if necessary convert the data in
+  gray level, and set the bitmap whith the gray level data. That means that
+  the image \e I is a "black and white" rendering of the original image in \e
+  filename, as in a black and white photograph. If necessary, the quantization
+  formula used is \f$0,299 r + 0,587 g + 0,114 b\f$.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  \param I : Image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void readJPEGLibjpeg(vpImage<unsigned char> &I, const std::string &filename)
+{
+  struct jpeg_decompress_struct cinfo;
+  struct jpeg_error_mgr jerr;
+  FILE *file;
+
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_decompress(&cinfo);
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "rb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str()));
+  }
+
+  jpeg_stdio_src(&cinfo, file);
+  jpeg_read_header(&cinfo, TRUE);
+
+  unsigned int width = cinfo.image_width;
+  unsigned int height = cinfo.image_height;
+
+  if ((width != I.getWidth()) || (height != I.getHeight()))
+    I.resize(height, width);
+
+  jpeg_start_decompress(&cinfo);
+
+  unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components);
+  JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1);
+
+  if (cinfo.out_color_space == JCS_RGB) {
+    vpImage<vpRGBa> Ic(height, width);
+    unsigned char *output = (unsigned char *)Ic.bitmap;
+    while (cinfo.output_scanline < cinfo.output_height) {
+      jpeg_read_scanlines(&cinfo, buffer, 1);
+      for (unsigned int i = 0; i < width; i++) {
+        *(output++) = buffer[0][i * 3];
+        *(output++) = buffer[0][i * 3 + 1];
+        *(output++) = buffer[0][i * 3 + 2];
+        *(output++) = vpRGBa::alpha_default;
+      }
+    }
+    vpImageConvert::convert(Ic, I);
+  }
+
+  else if (cinfo.out_color_space == JCS_GRAYSCALE) {
+    while (cinfo.output_scanline < cinfo.output_height) {
+      unsigned int row = cinfo.output_scanline;
+      jpeg_read_scanlines(&cinfo, buffer, 1);
+      memcpy(I[row], buffer[0], rowbytes);
+    }
+  }
+
+  jpeg_finish_decompress(&cinfo);
+  jpeg_destroy_decompress(&cinfo);
+  fclose(file);
+}
+
+/*!
+  Read a JPEG file and initialize a scalar image.
+
+  Read the contents of the JPEG file, allocate
+  memory for the corresponding image, and set
+  the bitmap whith the content of
+  the file.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  If the file corresponds to a grayscaled image, a conversion is done to deal
+  with \e I which is a color image.
+
+  \param I : Color image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void readJPEGLibjpeg(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  struct jpeg_decompress_struct cinfo;
+  struct jpeg_error_mgr jerr;
+  FILE *file;
+
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_decompress(&cinfo);
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "rb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str()));
+  }
+
+  jpeg_stdio_src(&cinfo, file);
+
+  jpeg_read_header(&cinfo, TRUE);
+
+  unsigned int width = cinfo.image_width;
+  unsigned int height = cinfo.image_height;
+
+  if ((width != I.getWidth()) || (height != I.getHeight()))
+    I.resize(height, width);
+
+  jpeg_start_decompress(&cinfo);
+
+  unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components);
+  JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1);
+
+  if (cinfo.out_color_space == JCS_RGB) {
+    unsigned char *output = (unsigned char *)I.bitmap;
+    while (cinfo.output_scanline < cinfo.output_height) {
+      jpeg_read_scanlines(&cinfo, buffer, 1);
+      for (unsigned int i = 0; i < width; i++) {
+        *(output++) = buffer[0][i * 3];
+        *(output++) = buffer[0][i * 3 + 1];
+        *(output++) = buffer[0][i * 3 + 2];
+        *(output++) = vpRGBa::alpha_default;
+      }
+    }
+  }
+
+  else if (cinfo.out_color_space == JCS_GRAYSCALE) {
+    vpImage<unsigned char> Ig(height, width);
+
+    while (cinfo.output_scanline < cinfo.output_height) {
+      unsigned int row = cinfo.output_scanline;
+      jpeg_read_scanlines(&cinfo, buffer, 1);
+      memcpy(Ig[row], buffer[0], rowbytes);
+    }
+
+    vpImageConvert::convert(Ig, I);
+  }
+
+  jpeg_finish_decompress(&cinfo);
+  jpeg_destroy_decompress(&cinfo);
+  fclose(file);
+}
+#endif
diff --git a/modules/io/src/image/private/vpImageIoLibpng.cpp b/modules/io/src/image/private/vpImageIoLibpng.cpp
new file mode 100644
index 0000000000..e350e4260b
--- /dev/null
+++ b/modules/io/src/image/private/vpImageIoLibpng.cpp
@@ -0,0 +1,615 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Read/write images.
+ *
+ * Authors:
+ * Eric Marchand
+ *
+ *****************************************************************************/
+
+/*!
+  \file vpImageIo.cpp
+  \brief Read/write images
+*/
+
+#include "vpImageIoBackend.h"
+#include <visp3/core/vpImageConvert.h>
+
+//TODO:
+#if defined(_WIN32)
+// Include WinSock2.h before windows.h to ensure that winsock.h is not
+// included by windows.h since winsock.h and winsock2.h are incompatible
+#include <WinSock2.h>
+#include <windows.h>
+#endif
+
+#if defined(VISP_HAVE_PNG)
+#include <png.h>
+#endif
+
+
+//--------------------------------------------------------------------------
+// PNG
+//--------------------------------------------------------------------------
+
+#if defined(VISP_HAVE_PNG)
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a PNG file.
+
+  \param I : Image to save as a PNG file.
+  \param filename : Name of the file containing the image.
+*/
+void writePNGLibpng(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  FILE *file;
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "wb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str()));
+  }
+
+  /* create a png info struct */
+  png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+  if (!png_ptr) {
+    fclose(file);
+    vpERROR_TRACE("Error during png_create_write_struct()\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  png_infop info_ptr = png_create_info_struct(png_ptr);
+  if (!info_ptr) {
+    fclose(file);
+    png_destroy_write_struct(&png_ptr, NULL);
+    vpERROR_TRACE("Error during png_create_info_struct()\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  /* initialize the setjmp for returning properly after a libpng error occured
+   */
+  if (setjmp(png_jmpbuf(png_ptr))) {
+    fclose(file);
+    png_destroy_write_struct(&png_ptr, &info_ptr);
+    vpERROR_TRACE("Error during init_io\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  /* setup libpng for using standard C fwrite() function with our FILE pointer
+   */
+  png_init_io(png_ptr, file);
+
+  unsigned int width = I.getWidth();
+  unsigned int height = I.getHeight();
+  int bit_depth = 8;
+  int color_type = PNG_COLOR_TYPE_GRAY;
+  /* set some useful information from header */
+
+  if (setjmp(png_jmpbuf(png_ptr))) {
+    fclose(file);
+    png_destroy_write_struct(&png_ptr, &info_ptr);
+    vpERROR_TRACE("Error during write header\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
+               PNG_FILTER_TYPE_BASE);
+
+  png_write_info(png_ptr, info_ptr);
+
+  png_bytep *row_ptrs = new png_bytep[height];
+  for (unsigned int i = 0; i < height; i++)
+    row_ptrs[i] = new png_byte[width];
+
+  unsigned char *input = (unsigned char *)I.bitmap;
+
+  for (unsigned int i = 0; i < height; i++) {
+    png_byte *row = row_ptrs[i];
+    for (unsigned int j = 0; j < width; j++) {
+      row[j] = *(input);
+      input++;
+    }
+  }
+
+  png_write_image(png_ptr, row_ptrs);
+
+  png_write_end(png_ptr, NULL);
+
+  for (unsigned int j = 0; j < height; j++)
+    delete[] row_ptrs[j];
+
+  delete[] row_ptrs;
+
+  png_destroy_write_struct(&png_ptr, &info_ptr);
+
+  fclose(file);
+}
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a PNG file.
+
+  \param I : Image to save as a PNG file.
+  \param filename : Name of the file containing the image.
+*/
+void writePNGLibpng(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+  FILE *file;
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "wb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str()));
+  }
+
+  /* create a png info struct */
+  png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+  if (!png_ptr) {
+    fclose(file);
+    vpERROR_TRACE("Error during png_create_write_struct()\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  png_infop info_ptr = png_create_info_struct(png_ptr);
+  if (!info_ptr) {
+    fclose(file);
+    png_destroy_write_struct(&png_ptr, NULL);
+    vpERROR_TRACE("Error during png_create_info_struct()\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  /* initialize the setjmp for returning properly after a libpng error occured
+   */
+  if (setjmp(png_jmpbuf(png_ptr))) {
+    fclose(file);
+    png_destroy_write_struct(&png_ptr, &info_ptr);
+    vpERROR_TRACE("Error during init_io\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  /* setup libpng for using standard C fwrite() function with our FILE pointer
+   */
+  png_init_io(png_ptr, file);
+
+  unsigned int width = I.getWidth();
+  unsigned int height = I.getHeight();
+  int bit_depth = 8;
+  int color_type = PNG_COLOR_TYPE_RGB;
+  /* set some useful information from header */
+
+  if (setjmp(png_jmpbuf(png_ptr))) {
+    fclose(file);
+    png_destroy_write_struct(&png_ptr, &info_ptr);
+    vpERROR_TRACE("Error during write header\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
+               PNG_FILTER_TYPE_BASE);
+
+  png_write_info(png_ptr, info_ptr);
+
+  png_bytep *row_ptrs = new png_bytep[height];
+  for (unsigned int i = 0; i < height; i++)
+    row_ptrs[i] = new png_byte[3 * width];
+
+  unsigned char *input = (unsigned char *)I.bitmap;
+
+  for (unsigned int i = 0; i < height; i++) {
+    png_byte *row = row_ptrs[i];
+    for (unsigned int j = 0; j < width; j++) {
+      row[3 * j] = *(input);
+      input++;
+      row[3 * j + 1] = *(input);
+      input++;
+      row[3 * j + 2] = *(input);
+      input++;
+      input++;
+    }
+  }
+
+  png_write_image(png_ptr, row_ptrs);
+
+  png_write_end(png_ptr, NULL);
+
+  for (unsigned int j = 0; j < height; j++)
+    delete[] row_ptrs[j];
+
+  delete[] row_ptrs;
+
+  png_destroy_write_struct(&png_ptr, &info_ptr);
+
+  fclose(file);
+}
+
+/*!
+  Read the contents of the PNG file, allocate memory
+  for the corresponding gray level image, if necessary convert the data in
+  gray level, and set the bitmap whith the gray level data. That means that
+  the image \e I is a "black and white" rendering of the original image in \e
+  filename, as in a black and white photograph. If necessary, the quantization
+  formula used is \f$0,299 r + 0,587 g + 0,114 b\f$.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  \param I : Image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void readPNGLibpng(vpImage<unsigned char> &I, const std::string &filename)
+{
+  FILE *file;
+  png_byte magic[8];
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "rb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str()));
+  }
+
+  /* read magic number */
+  if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) {
+    fclose(file);
+    throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str()));
+  }
+
+  /* check for valid magic number */
+  if (png_sig_cmp(magic, 0, sizeof(magic))) {
+    fclose(file);
+    throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image",
+                           filename.c_str()));
+  }
+
+  /* create a png read struct */
+  // printf("version %s\n", PNG_LIBPNG_VER_STRING);
+  png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+  if (png_ptr == NULL) {
+    fprintf(stderr, "error: can't create a png read structure!\n");
+    fclose(file);
+    throw(vpImageException(vpImageException::ioError, "error reading png file"));
+  }
+
+  /* create a png info struct */
+  png_infop info_ptr = png_create_info_struct(png_ptr);
+  if (info_ptr == NULL) {
+    fprintf(stderr, "error: can't create a png info structure!\n");
+    fclose(file);
+    png_destroy_read_struct(&png_ptr, NULL, NULL);
+    throw(vpImageException(vpImageException::ioError, "error reading png file"));
+  }
+
+  /* initialize the setjmp for returning properly after a libpng error occured
+   */
+  if (setjmp(png_jmpbuf(png_ptr))) {
+    fclose(file);
+    png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
+    vpERROR_TRACE("Error during init io\n");
+    throw(vpImageException(vpImageException::ioError, "PNG read error"));
+  }
+
+  /* setup libpng for using standard C fread() function with our FILE pointer
+   */
+  png_init_io(png_ptr, file);
+
+  /* tell libpng that we have already read the magic number */
+  png_set_sig_bytes(png_ptr, sizeof(magic));
+
+  /* read png info */
+  png_read_info(png_ptr, info_ptr);
+
+  unsigned int width = png_get_image_width(png_ptr, info_ptr);
+  unsigned int height = png_get_image_height(png_ptr, info_ptr);
+
+  unsigned int bit_depth, channels, color_type;
+  /* get some useful information from header */
+  bit_depth = png_get_bit_depth(png_ptr, info_ptr);
+  channels = png_get_channels(png_ptr, info_ptr);
+  color_type = png_get_color_type(png_ptr, info_ptr);
+
+  /* convert index color images to RGB images */
+  if (color_type == PNG_COLOR_TYPE_PALETTE)
+    png_set_palette_to_rgb(png_ptr);
+
+  /* convert 1-2-4 bits grayscale images to 8 bits grayscale. */
+  if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8)
+    png_set_expand(png_ptr);
+
+  //  if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS))
+  //    png_set_tRNS_to_alpha (png_ptr);
+
+  if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA)
+    png_set_strip_alpha(png_ptr);
+
+  if (bit_depth == 16)
+    png_set_strip_16(png_ptr);
+  else if (bit_depth < 8)
+    png_set_packing(png_ptr);
+
+  /* update info structure to apply transformations */
+  png_read_update_info(png_ptr, info_ptr);
+
+  channels = png_get_channels(png_ptr, info_ptr);
+
+  if ((width != I.getWidth()) || (height != I.getHeight()))
+    I.resize(height, width);
+
+  png_bytep *rowPtrs = new png_bytep[height];
+
+  unsigned int stride = png_get_rowbytes(png_ptr, info_ptr);
+  unsigned char *data = new unsigned char[stride * height];
+
+  for (unsigned int i = 0; i < height; i++)
+    rowPtrs[i] = (png_bytep)data + (i * stride);
+
+  png_read_image(png_ptr, rowPtrs);
+
+  vpImage<vpRGBa> Ic(height, width);
+  unsigned char *output;
+
+  switch (channels) {
+  case 1:
+    output = (unsigned char *)I.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i];
+    }
+    break;
+
+  case 2:
+    output = (unsigned char *)I.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i * 2];
+    }
+    break;
+
+  case 3:
+    output = (unsigned char *)Ic.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i * 3];
+      *(output++) = data[i * 3 + 1];
+      *(output++) = data[i * 3 + 2];
+      *(output++) = vpRGBa::alpha_default;
+    }
+    vpImageConvert::convert(Ic, I);
+    break;
+
+  case 4:
+    output = (unsigned char *)Ic.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i * 4];
+      *(output++) = data[i * 4 + 1];
+      *(output++) = data[i * 4 + 2];
+      *(output++) = data[i * 4 + 3];
+    }
+    vpImageConvert::convert(Ic, I);
+    break;
+  }
+
+  delete[](png_bytep) rowPtrs;
+  delete[] data;
+  png_read_end(png_ptr, NULL);
+  png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
+  fclose(file);
+}
+
+/*!
+  Read a PNG file and initialize a scalar image.
+
+  Read the contents of the PNG file, allocate
+  memory for the corresponding image, and set
+  the bitmap whith the content of
+  the file.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  If the file corresponds to a grayscaled image, a conversion is done to deal
+  with \e I which is a color image.
+
+  \param I : Color image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void readPNGLibpng(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  FILE *file;
+  png_byte magic[8];
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "rb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str()));
+  }
+
+  /* read magic number */
+  if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) {
+    fclose(file);
+    throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str()));
+  }
+
+  /* check for valid magic number */
+  if (png_sig_cmp(magic, 0, sizeof(magic))) {
+    fclose(file);
+    throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image",
+                           filename.c_str()));
+  }
+
+  /* create a png read struct */
+  png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+  if (!png_ptr) {
+    fclose(file);
+    vpERROR_TRACE("Error during png_create_read_struct()\n");
+    throw(vpImageException(vpImageException::ioError, "PNG read error"));
+  }
+
+  /* create a png info struct */
+  png_infop info_ptr = png_create_info_struct(png_ptr);
+  if (!info_ptr) {
+    fclose(file);
+    png_destroy_read_struct(&png_ptr, NULL, NULL);
+    vpERROR_TRACE("Error during png_create_info_struct()\n");
+    throw(vpImageException(vpImageException::ioError, "PNG read error"));
+  }
+
+  /* initialize the setjmp for returning properly after a libpng error occured
+   */
+  if (setjmp(png_jmpbuf(png_ptr))) {
+    fclose(file);
+    png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
+    vpERROR_TRACE("Error during init io\n");
+    throw(vpImageException(vpImageException::ioError, "PNG read error"));
+  }
+
+  /* setup libpng for using standard C fread() function with our FILE pointer
+   */
+  png_init_io(png_ptr, file);
+
+  /* tell libpng that we have already read the magic number */
+  png_set_sig_bytes(png_ptr, sizeof(magic));
+
+  /* read png info */
+  png_read_info(png_ptr, info_ptr);
+
+  unsigned int width = png_get_image_width(png_ptr, info_ptr);
+  unsigned int height = png_get_image_height(png_ptr, info_ptr);
+
+  unsigned int bit_depth, channels, color_type;
+  /* get some useful information from header */
+  bit_depth = png_get_bit_depth(png_ptr, info_ptr);
+  channels = png_get_channels(png_ptr, info_ptr);
+  color_type = png_get_color_type(png_ptr, info_ptr);
+
+  /* convert index color images to RGB images */
+  if (color_type == PNG_COLOR_TYPE_PALETTE)
+    png_set_palette_to_rgb(png_ptr);
+
+  /* convert 1-2-4 bits grayscale images to 8 bits grayscale. */
+  if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8)
+    png_set_expand(png_ptr);
+
+  //  if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS))
+  //    png_set_tRNS_to_alpha (png_ptr);
+
+  if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA)
+    png_set_strip_alpha(png_ptr);
+
+  if (bit_depth == 16)
+    png_set_strip_16(png_ptr);
+  else if (bit_depth < 8)
+    png_set_packing(png_ptr);
+
+  /* update info structure to apply transformations */
+  png_read_update_info(png_ptr, info_ptr);
+
+  channels = png_get_channels(png_ptr, info_ptr);
+
+  if ((width != I.getWidth()) || (height != I.getHeight()))
+    I.resize(height, width);
+
+  png_bytep *rowPtrs = new png_bytep[height];
+
+  unsigned int stride = png_get_rowbytes(png_ptr, info_ptr);
+  unsigned char *data = new unsigned char[stride * height];
+
+  for (unsigned int i = 0; i < height; i++)
+    rowPtrs[i] = (png_bytep)data + (i * stride);
+
+  png_read_image(png_ptr, rowPtrs);
+
+  vpImage<unsigned char> Ig(height, width);
+  unsigned char *output;
+
+  switch (channels) {
+  case 1:
+    output = (unsigned char *)Ig.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i];
+    }
+    vpImageConvert::convert(Ig, I);
+    break;
+
+  case 2:
+    output = (unsigned char *)Ig.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i * 2];
+    }
+    vpImageConvert::convert(Ig, I);
+    break;
+
+  case 3:
+    output = (unsigned char *)I.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i * 3];
+      *(output++) = data[i * 3 + 1];
+      *(output++) = data[i * 3 + 2];
+      *(output++) = vpRGBa::alpha_default;
+    }
+    break;
+
+  case 4:
+    output = (unsigned char *)I.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i * 4];
+      *(output++) = data[i * 4 + 1];
+      *(output++) = data[i * 4 + 2];
+      *(output++) = data[i * 4 + 3];
+    }
+    break;
+  }
+
+  delete[](png_bytep) rowPtrs;
+  delete[] data;
+  png_read_end(png_ptr, NULL);
+  png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
+  fclose(file);
+}
+#endif
diff --git a/modules/io/src/image/private/vpImageIoOpenCV.cpp b/modules/io/src/image/private/vpImageIoOpenCV.cpp
new file mode 100644
index 0000000000..93b6a1ca1d
--- /dev/null
+++ b/modules/io/src/image/private/vpImageIoOpenCV.cpp
@@ -0,0 +1,205 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Read/write images.
+ *
+ * Authors:
+ * Eric Marchand
+ *
+ *****************************************************************************/
+
+/*!
+  \file vpImageIo.cpp
+  \brief Read/write images
+*/
+
+#include "vpImageIoBackend.h"
+
+//TODO:
+#ifdef VISP_HAVE_OPENCV
+#if (VISP_HAVE_OPENCV_VERSION >= 0x030000) // Require opencv >= 3.0.0
+#  include <opencv2/imgcodecs.hpp>
+#elif (VISP_HAVE_OPENCV_VERSION >= 0x020408) // Require opencv >= 2.4.8
+#  include <opencv2/core/core.hpp>
+#  include <opencv2/highgui/highgui.hpp>
+#  include <opencv2/imgproc/imgproc.hpp>
+#elif (VISP_HAVE_OPENCV_VERSION >= 0x020101) // Require opencv >= 2.1.1
+#  include <opencv2/core/core.hpp>
+#  include <opencv2/highgui/highgui.hpp>
+#  include <opencv2/highgui/highgui_c.h>
+#  include <opencv2/legacy/legacy.hpp>
+#else
+#  include <highgui.h>
+#endif
+#endif
+
+#include <visp3/core/vpImageConvert.h>
+
+
+#if defined(VISP_HAVE_OPENCV)
+
+/*!
+  Read the contents of the JPEG file, allocate memory
+  for the corresponding gray level image, if necessary convert the data in
+  gray level, and set the bitmap whith the gray level data. That means that
+  the image \e I is a "black and white" rendering of the original image in \e
+  filename, as in a black and white photograph. If necessary, the quantization
+  formula used is \f$0,299 r + 0,587 g + 0,114 b\f$.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  If EXIF information is embedded in the image file, the EXIF orientation is ignored.
+
+  \param I : Image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+
+*/
+void readOpenCV(vpImage<unsigned char> &I, const std::string &filename)
+{
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+#if VISP_HAVE_OPENCV_VERSION >= 0x030200
+    int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION;
+#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
+    int flags = cv::IMREAD_GRAYSCALE;
+#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
+    int flags = CV_LOAD_IMAGE_GRAYSCALE;
+#endif
+  cv::Mat Ip = cv::imread(filename.c_str(), flags);
+  if (!Ip.empty())
+    vpImageConvert::convert(Ip, I);
+  else
+    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
+#else
+  IplImage *Ip = NULL;
+  Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE);
+  if (Ip != NULL)
+    vpImageConvert::convert(Ip, I);
+  else
+    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
+  cvReleaseImage(&Ip);
+#endif
+}
+
+/*!
+  Read a JPEG file and initialize a scalar image.
+
+  Read the contents of the JPEG file, allocate
+  memory for the corresponding image, and set
+  the bitmap whith the content of
+  the file.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  If the file corresponds to a grayscaled image, a conversion is done to deal
+  with \e I which is a color image.
+
+  If EXIF information is embedded in the image file, the EXIF orientation is ignored.
+
+  \param I : Color image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void readOpenCV(vpImage<vpRGBa> &I, const std::string &filename)
+{
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+#if VISP_HAVE_OPENCV_VERSION >= 0x030200
+    int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION;
+#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
+    int flags = cv::IMREAD_GRAYSCALE;
+#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
+    int flags = CV_LOAD_IMAGE_GRAYSCALE;
+#endif
+  cv::Mat Ip = cv::imread(filename.c_str(), flags);
+  if (!Ip.empty())
+    vpImageConvert::convert(Ip, I);
+  else
+    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
+#else
+  IplImage *Ip = NULL;
+  Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_COLOR);
+  if (Ip != NULL)
+    vpImageConvert::convert(Ip, I);
+  else
+    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
+  cvReleaseImage(&Ip);
+#endif
+}
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a JPEG file.
+
+  \param I : Image to save as a JPEG file.
+  \param filename : Name of the file containing the image.
+*/
+void writeOpenCV(const vpImage<unsigned char> &I, const std::string &filename)
+{
+#if (VISP_HAVE_OPENCV_VERSION >= 0x020408)
+  cv::Mat Ip;
+  vpImageConvert::convert(I, Ip);
+  cv::imwrite(filename.c_str(), Ip);
+#else
+  IplImage *Ip = NULL;
+  vpImageConvert::convert(I, Ip);
+
+  cvSaveImage(filename.c_str(), Ip);
+
+  cvReleaseImage(&Ip);
+#endif
+}
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a JPEG file.
+
+  \param I : Image to save as a JPEG file.
+  \param filename : Name of the file containing the image.
+*/
+void writeOpenCV(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+#if (VISP_HAVE_OPENCV_VERSION >= 0x020408)
+  cv::Mat Ip;
+  vpImageConvert::convert(I, Ip);
+  cv::imwrite(filename.c_str(), Ip);
+#else
+  IplImage *Ip = NULL;
+  vpImageConvert::convert(I, Ip);
+
+  cvSaveImage(filename.c_str(), Ip);
+
+  cvReleaseImage(&Ip);
+#endif
+}
+
+#endif
diff --git a/modules/io/src/image/private/vpImageIoPortable.cpp b/modules/io/src/image/private/vpImageIoPortable.cpp
new file mode 100644
index 0000000000..0031e4c96a
--- /dev/null
+++ b/modules/io/src/image/private/vpImageIoPortable.cpp
@@ -0,0 +1,569 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Read/write images.
+ *
+ * Authors:
+ * Eric Marchand
+ *
+ *****************************************************************************/
+
+/*!
+  \file vpImageIo.cpp
+  \brief Read/write images
+*/
+
+#include "vpImageIoBackend.h"
+#include <visp3/core/vpIoTools.h>
+#include <visp3/core/vpImageConvert.h>
+
+//TODO:
+#if defined(_WIN32)
+// Include WinSock2.h before windows.h to ensure that winsock.h is not
+// included by windows.h since winsock.h and winsock2.h are incompatible
+#include <WinSock2.h>
+#include <windows.h>
+#endif
+
+
+void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w,
+                        unsigned int &h, unsigned int &maxval);
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+/*!
+ * Decode the PNM image header.
+ * \param filename[in] : File name.
+ * \param fd[in] : File desdcriptor.
+ * \param magic[in] : Magic number for identifying the file type.
+ * \param w[out] : Image width.
+ * \param h[out] : Image height.
+ * \param maxval[out] : Maximum pixel value.
+ */
+void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w,
+                        unsigned int &h, unsigned int &maxval)
+{
+  std::string line;
+  unsigned int nb_elt = 4, cpt_elt = 0;
+  while (cpt_elt != nb_elt) {
+    // Skip empty lines or lines starting with # (comment)
+    while (std::getline(fd, line) && (line.compare(0, 1, "#") == 0 || line.size() == 0)) {
+    }
+
+    if (fd.eof()) {
+      fd.close();
+      throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str()));
+    }
+
+    std::vector<std::string> header = vpIoTools::splitChain(line, std::string(" "));
+
+    if (header.size() == 0) {
+      fd.close();
+      throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str()));
+    }
+
+    if (cpt_elt == 0) { // decode magic
+      if (header[0].compare(0, magic.size(), magic) != 0) {
+        fd.close();
+        throw(vpImageException(vpImageException::ioError, "\"%s\" is not a PNM file with magic number %s",
+                               filename.c_str(), magic.c_str()));
+      }
+      cpt_elt++;
+      header.erase(header.begin(),
+                   header.begin() + 1); // erase first element that is processed
+    }
+    while (header.size()) {
+      if (cpt_elt == 1) { // decode width
+        std::istringstream ss(header[0]);
+        ss >> w;
+        cpt_elt++;
+        header.erase(header.begin(),
+                     header.begin() + 1); // erase first element that is processed
+      } else if (cpt_elt == 2) {          // decode height
+        std::istringstream ss(header[0]);
+        ss >> h;
+        cpt_elt++;
+        header.erase(header.begin(),
+                     header.begin() + 1); // erase first element that is processed
+      } else if (cpt_elt == 3) {          // decode maxval
+        std::istringstream ss(header[0]);
+        ss >> maxval;
+        cpt_elt++;
+        header.erase(header.begin(),
+                     header.begin() + 1); // erase first element that is processed
+      }
+    }
+  }
+}
+#endif
+
+//--------------------------------------------------------------------------
+// PFM
+//--------------------------------------------------------------------------
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function is built like portable gray pixmap (eg PGM P5) file.
+  but considers float image data.
+
+  \param I : Image to save as a (PFM P8) file.
+  \param filename : Name of the file containing the image.
+*/
+void vp_writePFM(const vpImage<float> &I, const std::string &filename)
+{
+  FILE *fd;
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot write PFM image: filename empty"));
+  }
+
+  fd = fopen(filename.c_str(), "wb");
+
+  if (fd == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PFM file \"%s\"", filename.c_str()));
+  }
+
+  // Write the head
+  fprintf(fd, "P8\n");                                 // Magic number
+  fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
+  fprintf(fd, "255\n");                                // Max level
+
+  // Write the bitmap
+  size_t ierr;
+  size_t nbyte = I.getWidth() * I.getHeight();
+
+  ierr = fwrite(I.bitmap, sizeof(float), nbyte, fd);
+  if (ierr != nbyte) {
+    fclose(fd);
+    throw(vpImageException(vpImageException::ioError, "Cannot save PFM file \"%s\": only %d bytes over %d saved ",
+                           filename.c_str(), ierr, nbyte));
+  }
+
+  fflush(fd);
+  fclose(fd);
+}
+
+//--------------------------------------------------------------------------
+// PGM
+//--------------------------------------------------------------------------
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a portable gray pixmap (PGM P5) file.
+
+  \param I : Image to save as a (PGM P5) file.
+  \param filename : Name of the file containing the image.
+*/
+void vp_writePGM(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  FILE *fd;
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty"));
+  }
+
+  fd = fopen(filename.c_str(), "wb");
+
+  if (fd == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str()));
+  }
+
+  // Write the head
+  fprintf(fd, "P5\n");                                 // Magic number
+  fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
+  fprintf(fd, "255\n");                                // Max level
+
+  // Write the bitmap
+  size_t ierr;
+  size_t nbyte = I.getWidth() * I.getHeight();
+
+  ierr = fwrite(I.bitmap, sizeof(unsigned char), nbyte, fd);
+  if (ierr != nbyte) {
+    fclose(fd);
+    throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %d over %d bytes saved",
+                           filename.c_str(), ierr, nbyte));
+  }
+
+  fflush(fd);
+  fclose(fd);
+}
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a portable gray pixmap (PGM P5) file.
+
+  \param I : Image to save as a (PGM P5) file.
+  \param filename : Name of the file containing the image.
+*/
+void vp_writePGM(const vpImage<short> &I, const std::string &filename)
+{
+  vpImage<unsigned char> Iuc;
+  unsigned int nrows = I.getHeight();
+  unsigned int ncols = I.getWidth();
+
+  Iuc.resize(nrows, ncols);
+
+  for (unsigned int i = 0; i < nrows * ncols; i++)
+    Iuc.bitmap[i] = (unsigned char)I.bitmap[i];
+
+  vp_writePGM(Iuc, filename);
+}
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a portable gray pixmap (PGM P5) file.
+  Color image is converted into a grayscale image.
+
+  \param I : Image to save as a (PGM P5) file.
+  \param filename : Name of the file containing the image.
+*/
+void vp_writePGM(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+
+  FILE *fd;
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty"));
+  }
+
+  fd = fopen(filename.c_str(), "wb");
+
+  if (fd == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str()));
+  }
+
+  // Write the head
+  fprintf(fd, "P5\n");                                 // Magic number
+  fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
+  fprintf(fd, "255\n");                                // Max level
+
+  // Write the bitmap
+  size_t ierr;
+  size_t nbyte = I.getWidth() * I.getHeight();
+
+  vpImage<unsigned char> Itmp;
+  vpImageConvert::convert(I, Itmp);
+
+  ierr = fwrite(Itmp.bitmap, sizeof(unsigned char), nbyte, fd);
+  if (ierr != nbyte) {
+    fclose(fd);
+    throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %d over %d bytes saved",
+                           filename.c_str(), ierr, nbyte));
+  }
+
+  fflush(fd);
+  fclose(fd);
+}
+
+/*!
+  Read a PFM P8 file and initialize a float image.
+
+  Read the contents of the portable gray pixmap (PFM P8) filename, allocate
+  memory for the corresponding image, and set the bitmap whith the content of
+  the file.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  \param I : Image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void vp_readPFM(vpImage<float> &I, const std::string &filename)
+{
+  unsigned int w = 0, h = 0, maxval = 0;
+  unsigned int w_max = 100000, h_max = 100000, maxval_max = 255;
+  std::string magic("P8");
+
+  std::ifstream fd(filename.c_str(), std::ios::binary);
+
+  // Open the filename
+  if (!fd.is_open()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str()));
+  }
+
+  vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval);
+
+  if (w > w_max || h > h_max) {
+    fd.close();
+    throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str()));
+  }
+  if (maxval > maxval_max) {
+    fd.close();
+    throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str()));
+  }
+
+  if ((h != I.getHeight()) || (w != I.getWidth())) {
+    I.resize(h, w);
+  }
+
+  unsigned int nbyte = I.getHeight() * I.getWidth();
+  fd.read((char *)I.bitmap, sizeof(float) * nbyte);
+  if (!fd) {
+    fd.close();
+    throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", fd.gcount(), nbyte,
+                           filename.c_str()));
+  }
+
+  fd.close();
+}
+
+/*!
+  Read a PGM P5 file and initialize a scalar image.
+
+  Read the contents of the portable gray pixmap (PGM P5) filename, allocate
+  memory for the corresponding image, and set the bitmap whith the content of
+  the file.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  \param I : Image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void vp_readPGM(vpImage<unsigned char> &I, const std::string &filename)
+{
+  unsigned int w = 0, h = 0, maxval = 0;
+  unsigned int w_max = 100000, h_max = 100000, maxval_max = 255;
+  std::string magic("P5");
+
+  std::ifstream fd(filename.c_str(), std::ios::binary);
+
+  // Open the filename
+  if (!fd.is_open()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str()));
+  }
+
+  vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval);
+
+  if (w > w_max || h > h_max) {
+    fd.close();
+    throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str()));
+  }
+  if (maxval > maxval_max) {
+    fd.close();
+    throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str()));
+  }
+
+  if ((h != I.getHeight()) || (w != I.getWidth())) {
+    I.resize(h, w);
+  }
+
+  unsigned int nbyte = I.getHeight() * I.getWidth();
+  fd.read((char *)I.bitmap, nbyte);
+  if (!fd) {
+    fd.close();
+    throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", fd.gcount(), nbyte,
+                           filename.c_str()));
+  }
+
+  fd.close();
+}
+
+/*!
+  Read a PGM P5 file and initialize a scalar image.
+
+  Read the contents of the portable gray pixmap (PGM P5) filename, allocate
+  memory for the corresponding image, and set the bitmap whith the content of
+  the file.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  The gray level image contained in the \e filename is converted in a
+  color image in \e I.
+
+  \param I : Color image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void vp_readPGM(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  vpImage<unsigned char> Itmp;
+
+  vp_readPGM(Itmp, filename);
+
+  vpImageConvert::convert(Itmp, I);
+}
+
+//--------------------------------------------------------------------------
+// PPM
+//--------------------------------------------------------------------------
+
+/*!
+  Read the contents of the portable pixmap (PPM P6) filename, allocate memory
+  for the corresponding gray level image, convert the data in gray level, and
+  set the bitmap whith the gray level data. That means that the image \e I is
+  a "black and white" rendering of the original image in \e filename, as in a
+  black and white photograph. The quantization formula used is \f$0,299 r +
+  0,587 g + 0,114 b\f$.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  \param I : Image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void vp_readPPM(vpImage<unsigned char> &I, const std::string &filename)
+{
+  vpImage<vpRGBa> Itmp;
+
+  vp_readPPM(Itmp, filename);
+
+  vpImageConvert::convert(Itmp, I);
+}
+
+/*!
+  Read the contents of the portable pixmap (PPM P6) filename,
+  allocate memory for the corresponding vpRGBa image.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  \param I : Image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void vp_readPPM(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  unsigned int w = 0, h = 0, maxval = 0;
+  unsigned int w_max = 100000, h_max = 100000, maxval_max = 255;
+  std::string magic("P6");
+
+  std::ifstream fd(filename.c_str(), std::ios::binary);
+
+  // Open the filename
+  if (!fd.is_open()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str()));
+  }
+
+  vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval);
+
+  if (w > w_max || h > h_max) {
+    fd.close();
+    throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str()));
+  }
+  if (maxval > maxval_max) {
+    fd.close();
+    throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str()));
+  }
+
+  if ((h != I.getHeight()) || (w != I.getWidth())) {
+    I.resize(h, w);
+  }
+
+  for (unsigned int i = 0; i < I.getHeight(); i++) {
+    for (unsigned int j = 0; j < I.getWidth(); j++) {
+      unsigned char rgb[3];
+      fd.read((char *)&rgb, 3);
+
+      if (!fd) {
+        fd.close();
+        throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"",
+                               (i * I.getWidth() + j) * 3 + fd.gcount(), I.getSize() * 3, filename.c_str()));
+      }
+
+      I[i][j].R = rgb[0];
+      I[i][j].G = rgb[1];
+      I[i][j].B = rgb[2];
+      I[i][j].A = vpRGBa::alpha_default;
+    }
+  }
+
+  fd.close();
+}
+
+/*!
+  Write the content of the bitmap in the file which name is given by \e
+  filename. This function writes a portable gray pixmap (PPM P6) file.
+  grayscale image is converted into a color image vpRGBa.
+
+  \param I : Image to save as a (PPM P6) file.
+  \param filename : Name of the file containing the image.
+*/
+void vp_writePPM(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  vpImage<vpRGBa> Itmp;
+
+  vpImageConvert::convert(I, Itmp);
+
+  vp_writePPM(Itmp, filename);
+}
+
+/*!
+  Write the content of the bitmap in the file which name is given by \e
+  filename. This function writes a portable gray pixmap (PPM P6) file.
+
+  \param I : Image to save as a (PPM P6) file.
+  \param filename : Name of the file containing the image.
+*/
+void vp_writePPM(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+  FILE *f;
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PPM file: filename empty"));
+  }
+
+  f = fopen(filename.c_str(), "wb");
+
+  if (f == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PPM file \"%s\"", filename.c_str()));
+  }
+
+  fprintf(f, "P6\n");                                 // Magic number
+  fprintf(f, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
+  fprintf(f, "%d\n", 255);                            // Max level
+
+  for (unsigned int i = 0; i < I.getHeight(); i++) {
+    for (unsigned int j = 0; j < I.getWidth(); j++) {
+      vpRGBa v = I[i][j];
+      unsigned char rgb[3];
+      rgb[0] = v.R;
+      rgb[1] = v.G;
+      rgb[2] = v.B;
+
+      size_t res = fwrite(&rgb, 1, 3, f);
+      if (res != 3) {
+        fclose(f);
+        throw(vpImageException(vpImageException::ioError, "cannot write file \"%s\"", filename.c_str()));
+      }
+    }
+  }
+
+  fflush(f);
+  fclose(f);
+}
diff --git a/modules/io/src/image/private/vpImageIoSimd.cpp b/modules/io/src/image/private/vpImageIoSimd.cpp
new file mode 100644
index 0000000000..40986bf743
--- /dev/null
+++ b/modules/io/src/image/private/vpImageIoSimd.cpp
@@ -0,0 +1,87 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Read/write images.
+ *
+ * Authors:
+ * Eric Marchand
+ *
+ *****************************************************************************/
+
+/*!
+  \file vpImageIo.cpp
+  \brief Read/write images
+*/
+
+#include "vpImageIoBackend.h"
+
+//TODO:
+#include <Simd/SimdLib.hpp>
+
+
+//TODO:
+void readSimdlib(vpImage<unsigned char> &I, const std::string &filename)
+{
+  size_t stride = 0, width = 0, height = 0;
+  SimdPixelFormatType format = SimdPixelFormatGray8;
+  uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format);
+  const bool copyData = false;
+  I.init(data, (unsigned int)height, (unsigned int)width, copyData);
+}
+
+void readSimdlib(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  size_t stride = 0, width = 0, height = 0;
+  SimdPixelFormatType format = SimdPixelFormatRgba32;
+  uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format);
+  const bool copyData = false;
+  I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData);
+}
+
+void writeJPEGSimdlib(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFileJpeg, 90, filename.c_str());
+}
+
+void writeJPEGSimdlib(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+  SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, 90, filename.c_str());
+}
+
+void writePNGSimdlib(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFileJpeg, 90, filename.c_str());
+}
+
+void writePNGSimdlib(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+  SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFilePng, 90, filename.c_str());
+}
diff --git a/modules/io/src/image/private/vpImageIoStb.cpp b/modules/io/src/image/private/vpImageIoStb.cpp
new file mode 100644
index 0000000000..97b453d841
--- /dev/null
+++ b/modules/io/src/image/private/vpImageIoStb.cpp
@@ -0,0 +1,121 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Read/write images.
+ *
+ * Authors:
+ * Eric Marchand
+ *
+ *****************************************************************************/
+
+/*!
+  \file vpImageIo.cpp
+  \brief Read/write images
+*/
+
+#include "vpImageIoBackend.h"
+
+//TODO:
+#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
+#  define VISP_HAVE_SSE2 1
+#endif
+
+#ifndef VISP_HAVE_SSE2
+#  define STBI_NO_SIMD
+#endif
+
+#define STB_IMAGE_IMPLEMENTATION
+#include <stb_image.h>
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include <stb_image_write.h>
+
+
+//TODO:
+void readStb(vpImage<unsigned char> &I, const std::string &filename)
+{
+  int width = 0, height = 0, channels = 0;
+  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_grey);
+  if (image == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
+  }
+  I.init(image, static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
+  stbi_image_free(image);
+}
+
+void readStb(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  int width = 0, height = 0, channels = 0;
+  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha);
+  if (image == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
+  }
+  I.init(reinterpret_cast<vpRGBa*>(image), static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
+  stbi_image_free(image);
+}
+
+void writeJPEGStb(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  int res = stbi_write_jpg(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_grey,
+                           reinterpret_cast<void*>(I.bitmap), 90);
+  if (res == 0) {
+    throw(vpImageException(vpImageException::ioError, "JEPG write error"));
+  }
+}
+
+void writeJPEGStb(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+  int res = stbi_write_jpg(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
+                           reinterpret_cast<void*>(I.bitmap), 90);
+  if (res == 0) {
+    throw(vpImageException(vpImageException::ioError, "JEPG write error"));
+  }
+}
+
+void writePNGStb(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  const int stride_in_bytes = static_cast<int>(I.getWidth());
+  int res = stbi_write_png(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_grey,
+                           reinterpret_cast<void*>(I.bitmap), stride_in_bytes);
+  if (res == 0) {
+    throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str()));
+  }
+}
+
+void writePNGStb(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+  const int stride_in_bytes = static_cast<int>(4 * I.getWidth());
+  int res = stbi_write_png(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
+                           reinterpret_cast<void*>(I.bitmap), stride_in_bytes);
+  if (res == 0) {
+    throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str()));
+  }
+}
diff --git a/modules/io/src/image/vpImageIo.cpp b/modules/io/src/image/vpImageIo.cpp
index cc7799d158..e8b221049e 100644
--- a/modules/io/src/image/vpImageIo.cpp
+++ b/modules/io/src/image/vpImageIo.cpp
@@ -46,119 +46,9 @@
 #include <visp3/core/vpIoTools.h>
 #include <visp3/io/vpImageIo.h>
 
-#if defined(_WIN32)
-// Include WinSock2.h before windows.h to ensure that winsock.h is not
-// included by windows.h since winsock.h and winsock2.h are incompatible
-#include <WinSock2.h>
-#include <windows.h>
-#endif
-
-#if defined(VISP_HAVE_JPEG)
-#include <jerror.h>
-#include <jpeglib.h>
-#endif
-
-#if defined(VISP_HAVE_PNG)
-#include <png.h>
-#endif
-
 //TODO:
-#include <Simd/SimdLib.hpp>
-//TODO:
-#define STB_IMAGE_IMPLEMENTATION
-#include <stb_image.h>
-
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-#include <stb_image_write.h>
-
-#if !defined(VISP_HAVE_OPENCV)
-#if !defined(VISP_HAVE_JPEG) || !defined(VISP_HAVE_PNG)
-
-#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
-#  define VISP_HAVE_SSE2 1
-#endif
-
-#ifndef VISP_HAVE_SSE2
-#  define STBI_NO_SIMD
-#endif
-
-#define STB_IMAGE_IMPLEMENTATION
-#include <stb_image.h>
+#include "private/vpImageIoBackend.h"
 
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-#include <stb_image_write.h>
-#endif
-#endif
-
-void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w,
-                        unsigned int &h, unsigned int &maxval);
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS
-/*!
- * Decode the PNM image header.
- * \param filename[in] : File name.
- * \param fd[in] : File desdcriptor.
- * \param magic[in] : Magic number for identifying the file type.
- * \param w[out] : Image width.
- * \param h[out] : Image height.
- * \param maxval[out] : Maximum pixel value.
- */
-void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w,
-                        unsigned int &h, unsigned int &maxval)
-{
-  std::string line;
-  unsigned int nb_elt = 4, cpt_elt = 0;
-  while (cpt_elt != nb_elt) {
-    // Skip empty lines or lines starting with # (comment)
-    while (std::getline(fd, line) && (line.compare(0, 1, "#") == 0 || line.size() == 0)) {
-    }
-
-    if (fd.eof()) {
-      fd.close();
-      throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str()));
-    }
-
-    std::vector<std::string> header = vpIoTools::splitChain(line, std::string(" "));
-
-    if (header.size() == 0) {
-      fd.close();
-      throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str()));
-    }
-
-    if (cpt_elt == 0) { // decode magic
-      if (header[0].compare(0, magic.size(), magic) != 0) {
-        fd.close();
-        throw(vpImageException(vpImageException::ioError, "\"%s\" is not a PNM file with magic number %s",
-                               filename.c_str(), magic.c_str()));
-      }
-      cpt_elt++;
-      header.erase(header.begin(),
-                   header.begin() + 1); // erase first element that is processed
-    }
-    while (header.size()) {
-      if (cpt_elt == 1) { // decode width
-        std::istringstream ss(header[0]);
-        ss >> w;
-        cpt_elt++;
-        header.erase(header.begin(),
-                     header.begin() + 1); // erase first element that is processed
-      } else if (cpt_elt == 2) {          // decode height
-        std::istringstream ss(header[0]);
-        ss >> h;
-        cpt_elt++;
-        header.erase(header.begin(),
-                     header.begin() + 1); // erase first element that is processed
-      } else if (cpt_elt == 3) {          // decode maxval
-        std::istringstream ss(header[0]);
-        ss >> maxval;
-        cpt_elt++;
-        header.erase(header.begin(),
-                     header.begin() + 1); // erase first element that is processed
-      }
-    }
-  }
-}
-#endif
 
 vpImageIo::vpImageFormatType vpImageIo::getFormat(const std::string &filename)
 {
@@ -271,18 +161,10 @@ void vpImageIo::read(vpImage<unsigned char> &I, const std::string &filename)
     readPPM(I, final_filename);
     break;
   case FORMAT_JPEG:
-#ifdef VISP_HAVE_JPEG
     readJPEG(I, final_filename);
-#else
-    try_opencv_reader = true;
-#endif
     break;
   case FORMAT_PNG:
-#if defined(VISP_HAVE_PNG)
     readPNG(I, final_filename);
-#else
-    try_opencv_reader = true;
-#endif
     break;
   case FORMAT_TIFF:
   case FORMAT_BMP:
@@ -297,39 +179,10 @@ void vpImageIo::read(vpImage<unsigned char> &I, const std::string &filename)
 
   if (try_opencv_reader) {
 #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-#if VISP_HAVE_OPENCV_VERSION >= 0x030200
-    int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
-    int flags = cv::IMREAD_GRAYSCALE;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
-    int flags = CV_LOAD_IMAGE_GRAYSCALE;
-#endif
-    // std::cout << "Use opencv to read the image" << std::endl;
-    cv::Mat cvI = cv::imread(final_filename, flags);
-    if (cvI.cols == 0 && cvI.rows == 0) {
-      std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported";
-      throw(vpImageException(vpImageException::ioError, message));
-    }
-    vpImageConvert::convert(cvI, I);
+    readOpenCV(I, filename);
 #else
-    switch (getFormat(final_filename)) {
-    case FORMAT_JPEG:
-      readJPEG(I, final_filename);
-      break;
-    case FORMAT_PNG:
-      readPNG(I, final_filename);
-      break;
-    case FORMAT_BMP:
-    case FORMAT_TIFF:
-    case FORMAT_DIB:
-    case FORMAT_PBM:
-    case FORMAT_RASTER:
-    case FORMAT_JPEG2000:
-    case FORMAT_UNKNOWN:
-    default:
-      std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported";
-      throw(vpImageException(vpImageException::ioError, message));
-    }
+    std::string message = "Cannot read file \"" + filename + "\": No backend able to support this image format";
+    throw(vpImageException(vpImageException::ioError, message));
 #endif
   }
 }
@@ -374,18 +227,10 @@ void vpImageIo::read(vpImage<vpRGBa> &I, const std::string &filename)
     readPPM(I, final_filename);
     break;
   case FORMAT_JPEG:
-#ifdef VISP_HAVE_JPEG
     readJPEG(I, final_filename);
-#else
-    try_opencv_reader = true;
-#endif
     break;
   case FORMAT_PNG:
-#if defined(VISP_HAVE_PNG)
     readPNG(I, final_filename);
-#else
-    try_opencv_reader = true;
-#endif
     break;
   case FORMAT_TIFF:
   case FORMAT_BMP:
@@ -400,39 +245,10 @@ void vpImageIo::read(vpImage<vpRGBa> &I, const std::string &filename)
 
   if (try_opencv_reader) {
 #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-#if VISP_HAVE_OPENCV_VERSION >= 0x030200
-    int flags = cv::IMREAD_COLOR | cv::IMREAD_IGNORE_ORIENTATION;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
-    int flags = cv::IMREAD_COLOR;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
-    int flags = CV_LOAD_IMAGE_COLOR;
-#endif
-    // std::cout << "Use opencv to read the image" << std::endl;
-    cv::Mat cvI = cv::imread(final_filename, flags);
-    if (cvI.cols == 0 && cvI.rows == 0) {
-      std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported";
-      throw(vpImageException(vpImageException::ioError, message));
-    }
-    vpImageConvert::convert(cvI, I);
+    readOpenCV(I, filename);
 #else
-    switch (getFormat(final_filename)) {
-    case FORMAT_JPEG:
-      readJPEG(I, final_filename);
-      break;
-    case FORMAT_PNG:
-      readPNG(I, final_filename);
-      break;
-    case FORMAT_BMP:
-    case FORMAT_TIFF:
-    case FORMAT_DIB:
-    case FORMAT_PBM:
-    case FORMAT_RASTER:
-    case FORMAT_JPEG2000:
-    case FORMAT_UNKNOWN:
-    default:
-      std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported";
-      throw(vpImageException(vpImageException::ioError, message));
-    }
+    std::string message = "Cannot read file \"" + filename + "\": No backend able to support this image format";
+    throw(vpImageException(vpImageException::ioError, message));
 #endif
   }
 }
@@ -463,18 +279,10 @@ void vpImageIo::write(const vpImage<unsigned char> &I, const std::string &filena
     writePPM(I, filename);
     break;
   case FORMAT_JPEG:
-#ifdef VISP_HAVE_JPEG
     writeJPEG(I, filename);
-#else
-    try_opencv_writer = true;
-#endif
     break;
   case FORMAT_PNG:
-#ifdef VISP_HAVE_PNG
     writePNG(I, filename);
-#else
-    try_opencv_writer = true;
-#endif
     break;
   case FORMAT_TIFF:
   case FORMAT_BMP:
@@ -488,30 +296,11 @@ void vpImageIo::write(const vpImage<unsigned char> &I, const std::string &filena
   }
 
   if (try_opencv_writer) {
-#if VISP_HAVE_OPENCV_VERSION >= 0x020100
-    // std::cout << "Use opencv to write the image" << std::endl;
-    cv::Mat cvI;
-    vpImageConvert::convert(I, cvI);
-    cv::imwrite(filename, cvI);
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    writeOpenCV(I, filename);
 #else
-    switch (getFormat(filename)) {
-    case FORMAT_JPEG:
-      writeJPEG(I, filename);
-      break;
-    case FORMAT_PNG:
-      writePNG(I, filename);
-      break;
-    case FORMAT_BMP:
-    case FORMAT_TIFF:
-    case FORMAT_DIB:
-    case FORMAT_PBM:
-    case FORMAT_RASTER:
-    case FORMAT_JPEG2000:
-    case FORMAT_UNKNOWN:
-    default:
-      vpCERROR << "Cannot write file: Image format not supported..." << std::endl;
-      throw(vpImageException(vpImageException::ioError, "Cannot write file: Image format not supported"));
-    }
+    std::string message = "Cannot write file \"" + filename + "\": No backend able to support this image format";
+    throw(vpImageException(vpImageException::ioError, message));
 #endif
   }
 }
@@ -542,18 +331,10 @@ void vpImageIo::write(const vpImage<vpRGBa> &I, const std::string &filename)
     writePPM(I, filename);
     break;
   case FORMAT_JPEG:
-#ifdef VISP_HAVE_JPEG
     writeJPEG(I, filename);
-#else
-    try_opencv_writer = true;
-#endif
     break;
   case FORMAT_PNG:
-#ifdef VISP_HAVE_PNG
     writePNG(I, filename);
-#else
-    try_opencv_writer = true;
-#endif
     break;
   case FORMAT_TIFF:
   case FORMAT_BMP:
@@ -567,1735 +348,250 @@ void vpImageIo::write(const vpImage<vpRGBa> &I, const std::string &filename)
   }
 
   if (try_opencv_writer) {
-#if VISP_HAVE_OPENCV_VERSION >= 0x020100
-    // std::cout << "Use opencv to write the image" << std::endl;
-    cv::Mat cvI;
-    vpImageConvert::convert(I, cvI);
-    cv::imwrite(filename, cvI);
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    writeOpenCV(I, filename);
 #else
-    switch (getFormat(filename)) {
-    case FORMAT_JPEG:
-      writeJPEG(I, filename);
-      break;
-    case FORMAT_PNG:
-      writePNG(I, filename);
-      break;
-    case FORMAT_BMP:
-    case FORMAT_TIFF:
-    case FORMAT_DIB:
-    case FORMAT_PBM:
-    case FORMAT_RASTER:
-    case FORMAT_JPEG2000:
-    case FORMAT_UNKNOWN:
-    default:
-      vpCERROR << "Cannot write file: Image format not supported..." << std::endl;
-      throw(vpImageException(vpImageException::ioError, "Cannot write file: Image format not supported"));
-  }
+    std::string message = "Cannot write file \"" + filename + "\": No backend able to support this image format";
+    throw(vpImageException(vpImageException::ioError, message));
 #endif
   }
 }
 
-//--------------------------------------------------------------------------
-// PFM
-//--------------------------------------------------------------------------
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function is built like portable gray pixmap (eg PGM P5) file.
-  but considers float image data.
-
-  \param I : Image to save as a (PFM P8) file.
-  \param filename : Name of the file containing the image.
-*/
-
-void vpImageIo::writePFM(const vpImage<float> &I, const std::string &filename)
+void vpImageIo::readJPEG(vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend)
 {
-  FILE *fd;
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot write PFM image: filename empty"));
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_JPEG)
+    readJPEGLibjpeg(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": Libjpeg backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    readOpenCV(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    readSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    readStb(I, filename);
   }
+}
 
-  fd = fopen(filename.c_str(), "wb");
-
-  if (fd == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PFM file \"%s\"", filename.c_str()));
+void vpImageIo::readJPEG(vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend)
+{
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_JPEG)
+    readJPEGLibjpeg(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": Libjpeg backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    readOpenCV(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    readSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    readStb(I, filename);
   }
+}
 
-  // Write the head
-  fprintf(fd, "P8\n");                                 // Magic number
-  fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
-  fprintf(fd, "255\n");                                // Max level
-
-  // Write the bitmap
-  size_t ierr;
-  size_t nbyte = I.getWidth() * I.getHeight();
-
-  ierr = fwrite(I.bitmap, sizeof(float), nbyte, fd);
-  if (ierr != nbyte) {
-    fclose(fd);
-    throw(vpImageException(vpImageException::ioError, "Cannot save PFM file \"%s\": only %d bytes over %d saved ",
-                           filename.c_str(), ierr, nbyte));
+void vpImageIo::readPNG(vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend)
+{
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_PNG)
+    readPNGLibpng(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": Libpng backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    readOpenCV(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    readSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    readStb(I, filename);
   }
-
-  fflush(fd);
-  fclose(fd);
 }
-//--------------------------------------------------------------------------
-// PGM
-//--------------------------------------------------------------------------
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a portable gray pixmap (PGM P5) file.
-
-  \param I : Image to save as a (PGM P5) file.
-  \param filename : Name of the file containing the image.
-*/
 
-void vpImageIo::writePGM(const vpImage<unsigned char> &I, const std::string &filename)
+void vpImageIo::readPNG(vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend)
 {
-
-  FILE *fd;
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty"));
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_PNG)
+    readPNGLibpng(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": Libpng backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    readOpenCV(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    readSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    readStb(I, filename);
   }
+}
 
-  fd = fopen(filename.c_str(), "wb");
-
-  if (fd == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str()));
+void vpImageIo::writeJPEG(const vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend)
+{
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_JPEG)
+    writeJPEGLibjpeg(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": Libjpeg backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    writeOpenCV(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    writeJPEGSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    writeJPEGStb(I, filename);
   }
+}
 
-  // Write the head
-  fprintf(fd, "P5\n");                                 // Magic number
-  fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
-  fprintf(fd, "255\n");                                // Max level
+void vpImageIo::writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend)
+{
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_JPEG)
+    writeJPEGLibjpeg(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": Libjpeg backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    writeOpenCV(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    writeJPEGSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    writeJPEGStb(I, filename);
+  }
+}
 
-  // Write the bitmap
-  size_t ierr;
-  size_t nbyte = I.getWidth() * I.getHeight();
+void vpImageIo::writePNG(const vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend)
+{
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_PNG)
+    writePNGLibpng(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": Libpng backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    writeOpenCV(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    writePNGSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    writePNGStb(I, filename);
+  }
+}
 
-  ierr = fwrite(I.bitmap, sizeof(unsigned char), nbyte, fd);
-  if (ierr != nbyte) {
-    fclose(fd);
-    throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %d over %d bytes saved",
-                           filename.c_str(), ierr, nbyte));
+void vpImageIo::writePNG(const vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend)
+{
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_PNG)
+    writePNGLibpng(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": Libpng backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    writeOpenCV(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    writePNGSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    writePNGStb(I, filename);
   }
+}
 
-  fflush(fd);
-  fclose(fd);
+void vpImageIo::writePFM(const vpImage<float> &I, const std::string &filename)
+{
+  vp_writePFM(I, filename);
 }
 
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a portable gray pixmap (PGM P5) file.
+void vpImageIo::writePGM(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  vp_writePGM(I, filename);
+}
 
-  \param I : Image to save as a (PGM P5) file.
-  \param filename : Name of the file containing the image.
-*/
 void vpImageIo::writePGM(const vpImage<short> &I, const std::string &filename)
 {
-  vpImage<unsigned char> Iuc;
-  unsigned int nrows = I.getHeight();
-  unsigned int ncols = I.getWidth();
-
-  Iuc.resize(nrows, ncols);
-
-  for (unsigned int i = 0; i < nrows * ncols; i++)
-    Iuc.bitmap[i] = (unsigned char)I.bitmap[i];
-
-  vpImageIo::writePGM(Iuc, filename);
+  vp_writePGM(I, filename);
 }
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a portable gray pixmap (PGM P5) file.
-  Color image is converted into a grayscale image.
-
-  \param I : Image to save as a (PGM P5) file.
-  \param filename : Name of the file containing the image.
-*/
 
 void vpImageIo::writePGM(const vpImage<vpRGBa> &I, const std::string &filename)
 {
-
-  FILE *fd;
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty"));
-  }
-
-  fd = fopen(filename.c_str(), "wb");
-
-  if (fd == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str()));
-  }
-
-  // Write the head
-  fprintf(fd, "P5\n");                                 // Magic number
-  fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
-  fprintf(fd, "255\n");                                // Max level
-
-  // Write the bitmap
-  size_t ierr;
-  size_t nbyte = I.getWidth() * I.getHeight();
-
-  vpImage<unsigned char> Itmp;
-  vpImageConvert::convert(I, Itmp);
-
-  ierr = fwrite(Itmp.bitmap, sizeof(unsigned char), nbyte, fd);
-  if (ierr != nbyte) {
-    fclose(fd);
-    throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %d over %d bytes saved",
-                           filename.c_str(), ierr, nbyte));
-  }
-
-  fflush(fd);
-  fclose(fd);
+  vp_writePGM(I, filename);
 }
 
-/*!
-  Read a PFM P8 file and initialize a float image.
-
-  Read the contents of the portable gray pixmap (PFM P8) filename, allocate
-  memory for the corresponding image, and set the bitmap whith the content of
-  the file.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-
-*/
-
 void vpImageIo::readPFM(vpImage<float> &I, const std::string &filename)
 {
-  unsigned int w = 0, h = 0, maxval = 0;
-  unsigned int w_max = 100000, h_max = 100000, maxval_max = 255;
-  std::string magic("P8");
-
-  std::ifstream fd(filename.c_str(), std::ios::binary);
-
-  // Open the filename
-  if (!fd.is_open()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str()));
-  }
-
-  vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval);
-
-  if (w > w_max || h > h_max) {
-    fd.close();
-    throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str()));
-  }
-  if (maxval > maxval_max) {
-    fd.close();
-    throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str()));
-  }
-
-  if ((h != I.getHeight()) || (w != I.getWidth())) {
-    I.resize(h, w);
-  }
-
-  unsigned int nbyte = I.getHeight() * I.getWidth();
-  fd.read((char *)I.bitmap, sizeof(float) * nbyte);
-  if (!fd) {
-    fd.close();
-    throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", fd.gcount(), nbyte,
-                           filename.c_str()));
-  }
-
-  fd.close();
+  vp_readPFM(I, filename);
 }
 
-/*!
-  Read a PGM P5 file and initialize a scalar image.
-
-  Read the contents of the portable gray pixmap (PGM P5) filename, allocate
-  memory for the corresponding image, and set the bitmap whith the content of
-  the file.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-*/
-
 void vpImageIo::readPGM(vpImage<unsigned char> &I, const std::string &filename)
 {
-  unsigned int w = 0, h = 0, maxval = 0;
-  unsigned int w_max = 100000, h_max = 100000, maxval_max = 255;
-  std::string magic("P5");
-
-  std::ifstream fd(filename.c_str(), std::ios::binary);
-
-  // Open the filename
-  if (!fd.is_open()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str()));
-  }
-
-  vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval);
-
-  if (w > w_max || h > h_max) {
-    fd.close();
-    throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str()));
-  }
-  if (maxval > maxval_max) {
-    fd.close();
-    throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str()));
-  }
-
-  if ((h != I.getHeight()) || (w != I.getWidth())) {
-    I.resize(h, w);
-  }
-
-  unsigned int nbyte = I.getHeight() * I.getWidth();
-  fd.read((char *)I.bitmap, nbyte);
-  if (!fd) {
-    fd.close();
-    throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", fd.gcount(), nbyte,
-                           filename.c_str()));
-  }
-
-  fd.close();
+  vp_readPGM(I, filename);
 }
 
-/*!
-  Read a PGM P5 file and initialize a scalar image.
-
-  Read the contents of the portable gray pixmap (PGM P5) filename, allocate
-  memory for the corresponding image, and set the bitmap whith the content of
-  the file.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  The gray level image contained in the \e filename is converted in a
-  color image in \e I.
-
-  \param I : Color image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-*/
-
 void vpImageIo::readPGM(vpImage<vpRGBa> &I, const std::string &filename)
 {
-  vpImage<unsigned char> Itmp;
+  vp_readPGM(I, filename);
+}
 
-  vpImageIo::readPGM(Itmp, filename);
+void vpImageIo::readPPM(vpImage<unsigned char> &I, const std::string &filename)
+{
+  vp_readPPM(I, filename);
+}
 
-  vpImageConvert::convert(Itmp, I);
+void vpImageIo::readPPM(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  vp_readPPM(I, filename);
 }
 
-//--------------------------------------------------------------------------
-// PPM
-//--------------------------------------------------------------------------
-
-/*!
-  Read the contents of the portable pixmap (PPM P6) filename, allocate memory
-  for the corresponding gray level image, convert the data in gray level, and
-  set the bitmap whith the gray level data. That means that the image \e I is
-  a "black and white" rendering of the original image in \e filename, as in a
-  black and white photograph. The quantization formula used is \f$0,299 r +
-  0,587 g + 0,114 b\f$.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-
-*/
-void vpImageIo::readPPM(vpImage<unsigned char> &I, const std::string &filename)
-{
-  vpImage<vpRGBa> Itmp;
-
-  vpImageIo::readPPM(Itmp, filename);
-
-  vpImageConvert::convert(Itmp, I);
-}
-
-/*!
-  Read the contents of the portable pixmap (PPM P6) filename,
-  allocate memory for the corresponding vpRGBa image.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::readPPM(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  unsigned int w = 0, h = 0, maxval = 0;
-  unsigned int w_max = 100000, h_max = 100000, maxval_max = 255;
-  std::string magic("P6");
-
-  std::ifstream fd(filename.c_str(), std::ios::binary);
-
-  // Open the filename
-  if (!fd.is_open()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str()));
-  }
-
-  vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval);
-
-  if (w > w_max || h > h_max) {
-    fd.close();
-    throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str()));
-  }
-  if (maxval > maxval_max) {
-    fd.close();
-    throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str()));
-  }
-
-  if ((h != I.getHeight()) || (w != I.getWidth())) {
-    I.resize(h, w);
-  }
-
-  for (unsigned int i = 0; i < I.getHeight(); i++) {
-    for (unsigned int j = 0; j < I.getWidth(); j++) {
-      unsigned char rgb[3];
-      fd.read((char *)&rgb, 3);
-
-      if (!fd) {
-        fd.close();
-        throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"",
-                               (i * I.getWidth() + j) * 3 + fd.gcount(), I.getSize() * 3, filename.c_str()));
-      }
-
-      I[i][j].R = rgb[0];
-      I[i][j].G = rgb[1];
-      I[i][j].B = rgb[2];
-      I[i][j].A = vpRGBa::alpha_default;
-    }
-  }
-
-  fd.close();
-}
-
-/*!
-  Write the content of the bitmap in the file which name is given by \e
-  filename. This function writes a portable gray pixmap (PPM P6) file.
-  grayscale image is converted into a color image vpRGBa.
-
-  \param I : Image to save as a (PPM P6) file.
-  \param filename : Name of the file containing the image.
-
-*/
-
 void vpImageIo::writePPM(const vpImage<unsigned char> &I, const std::string &filename)
 {
-  vpImage<vpRGBa> Itmp;
-
-  vpImageConvert::convert(I, Itmp);
-
-  vpImageIo::writePPM(Itmp, filename);
+  vp_writePPM(I, filename);
 }
 
-/*!
-  Write the content of the bitmap in the file which name is given by \e
-  filename. This function writes a portable gray pixmap (PPM P6) file.
-
-  \param I : Image to save as a (PPM P6) file.
-  \param filename : Name of the file containing the image.
-*/
 void vpImageIo::writePPM(const vpImage<vpRGBa> &I, const std::string &filename)
 {
-  FILE *f;
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PPM file: filename empty"));
-  }
-
-  f = fopen(filename.c_str(), "wb");
-
-  if (f == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PPM file \"%s\"", filename.c_str()));
-  }
-
-  fprintf(f, "P6\n");                                 // Magic number
-  fprintf(f, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
-  fprintf(f, "%d\n", 255);                            // Max level
-
-  for (unsigned int i = 0; i < I.getHeight(); i++) {
-    for (unsigned int j = 0; j < I.getWidth(); j++) {
-      vpRGBa v = I[i][j];
-      unsigned char rgb[3];
-      rgb[0] = v.R;
-      rgb[1] = v.G;
-      rgb[2] = v.B;
-
-      size_t res = fwrite(&rgb, 1, 3, f);
-      if (res != 3) {
-        fclose(f);
-        throw(vpImageException(vpImageException::ioError, "cannot write file \"%s\"", filename.c_str()));
-      }
-    }
-  }
-
-  fflush(f);
-  fclose(f);
-}
-
-//--------------------------------------------------------------------------
-// JPEG
-//--------------------------------------------------------------------------
-
-#if defined(VISP_HAVE_JPEG)
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a JPEG file.
-
-  \param I : Image to save as a JPEG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writeJPEG(const vpImage<unsigned char> &I, const std::string &filename)
-{
-  struct jpeg_compress_struct cinfo;
-  struct jpeg_error_mgr jerr;
-  FILE *file;
-
-  cinfo.err = jpeg_std_error(&jerr);
-  jpeg_create_compress(&cinfo);
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "wb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str()));
-  }
-
-  unsigned int width = I.getWidth();
-  unsigned int height = I.getHeight();
-
-  jpeg_stdio_dest(&cinfo, file);
-
-  cinfo.image_width = width;
-  cinfo.image_height = height;
-  cinfo.input_components = 1;
-  cinfo.in_color_space = JCS_GRAYSCALE;
-  jpeg_set_defaults(&cinfo);
-
-  jpeg_start_compress(&cinfo, TRUE);
-
-  unsigned char *line;
-  line = new unsigned char[width];
-  unsigned char *input = (unsigned char *)I.bitmap;
-  while (cinfo.next_scanline < cinfo.image_height) {
-    for (unsigned int i = 0; i < width; i++) {
-      line[i] = *(input);
-      input++;
-    }
-    jpeg_write_scanlines(&cinfo, &line, 1);
-  }
-
-  jpeg_finish_compress(&cinfo);
-  jpeg_destroy_compress(&cinfo);
-  delete[] line;
-  fclose(file);
-}
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a JPEG file.
-
-  \param I : Image to save as a JPEG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename)
-{
-  struct jpeg_compress_struct cinfo;
-  struct jpeg_error_mgr jerr;
-  FILE *file;
-
-  cinfo.err = jpeg_std_error(&jerr);
-  jpeg_create_compress(&cinfo);
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "wb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str()));
-  }
-
-  unsigned int width = I.getWidth();
-  unsigned int height = I.getHeight();
-
-  jpeg_stdio_dest(&cinfo, file);
-
-  cinfo.image_width = width;
-  cinfo.image_height = height;
-  cinfo.input_components = 3;
-  cinfo.in_color_space = JCS_RGB;
-  jpeg_set_defaults(&cinfo);
-
-  jpeg_start_compress(&cinfo, TRUE);
-
-  unsigned char *line;
-  line = new unsigned char[3 * width];
-  unsigned char *input = (unsigned char *)I.bitmap;
-  while (cinfo.next_scanline < cinfo.image_height) {
-    for (unsigned int i = 0; i < width; i++) {
-      line[i * 3] = *(input);
-      input++;
-      line[i * 3 + 1] = *(input);
-      input++;
-      line[i * 3 + 2] = *(input);
-      input++;
-      input++;
-    }
-    jpeg_write_scanlines(&cinfo, &line, 1);
-  }
-
-  jpeg_finish_compress(&cinfo);
-  jpeg_destroy_compress(&cinfo);
-  delete[] line;
-  fclose(file);
-}
-
-/*!
-  Read the contents of the JPEG file, allocate memory
-  for the corresponding gray level image, if necessary convert the data in
-  gray level, and set the bitmap whith the gray level data. That means that
-  the image \e I is a "black and white" rendering of the original image in \e
-  filename, as in a black and white photograph. If necessary, the quantization
-  formula used is \f$0,299 r + 0,587 g + 0,114 b\f$.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-
-*/
-void vpImageIo::readJPEG(vpImage<unsigned char> &I, const std::string &filename)
-{
-  struct jpeg_decompress_struct cinfo;
-  struct jpeg_error_mgr jerr;
-  FILE *file;
-
-  cinfo.err = jpeg_std_error(&jerr);
-  jpeg_create_decompress(&cinfo);
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "rb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str()));
-  }
-
-  jpeg_stdio_src(&cinfo, file);
-  jpeg_read_header(&cinfo, TRUE);
-
-  unsigned int width = cinfo.image_width;
-  unsigned int height = cinfo.image_height;
-
-  if ((width != I.getWidth()) || (height != I.getHeight()))
-    I.resize(height, width);
-
-  jpeg_start_decompress(&cinfo);
-
-  unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components);
-  JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1);
-
-  if (cinfo.out_color_space == JCS_RGB) {
-    vpImage<vpRGBa> Ic(height, width);
-    unsigned char *output = (unsigned char *)Ic.bitmap;
-    while (cinfo.output_scanline < cinfo.output_height) {
-      jpeg_read_scanlines(&cinfo, buffer, 1);
-      for (unsigned int i = 0; i < width; i++) {
-        *(output++) = buffer[0][i * 3];
-        *(output++) = buffer[0][i * 3 + 1];
-        *(output++) = buffer[0][i * 3 + 2];
-        *(output++) = vpRGBa::alpha_default;
-      }
-    }
-    vpImageConvert::convert(Ic, I);
-  }
-
-  else if (cinfo.out_color_space == JCS_GRAYSCALE) {
-    while (cinfo.output_scanline < cinfo.output_height) {
-      unsigned int row = cinfo.output_scanline;
-      jpeg_read_scanlines(&cinfo, buffer, 1);
-      memcpy(I[row], buffer[0], rowbytes);
-    }
-  }
-
-  jpeg_finish_decompress(&cinfo);
-  jpeg_destroy_decompress(&cinfo);
-  fclose(file);
-}
-
-/*!
-  Read a JPEG file and initialize a scalar image.
-
-  Read the contents of the JPEG file, allocate
-  memory for the corresponding image, and set
-  the bitmap whith the content of
-  the file.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  If the file corresponds to a grayscaled image, a conversion is done to deal
-  with \e I which is a color image.
-
-  \param I : Color image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::readJPEG(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  struct jpeg_decompress_struct cinfo;
-  struct jpeg_error_mgr jerr;
-  FILE *file;
-
-  cinfo.err = jpeg_std_error(&jerr);
-  jpeg_create_decompress(&cinfo);
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "rb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str()));
-  }
-
-  jpeg_stdio_src(&cinfo, file);
-
-  jpeg_read_header(&cinfo, TRUE);
-
-  unsigned int width = cinfo.image_width;
-  unsigned int height = cinfo.image_height;
-
-  if ((width != I.getWidth()) || (height != I.getHeight()))
-    I.resize(height, width);
-
-  jpeg_start_decompress(&cinfo);
-
-  unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components);
-  JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1);
-
-  if (cinfo.out_color_space == JCS_RGB) {
-    unsigned char *output = (unsigned char *)I.bitmap;
-    while (cinfo.output_scanline < cinfo.output_height) {
-      jpeg_read_scanlines(&cinfo, buffer, 1);
-      for (unsigned int i = 0; i < width; i++) {
-        *(output++) = buffer[0][i * 3];
-        *(output++) = buffer[0][i * 3 + 1];
-        *(output++) = buffer[0][i * 3 + 2];
-        *(output++) = vpRGBa::alpha_default;
-      }
-    }
-  }
-
-  else if (cinfo.out_color_space == JCS_GRAYSCALE) {
-    vpImage<unsigned char> Ig(height, width);
-
-    while (cinfo.output_scanline < cinfo.output_height) {
-      unsigned int row = cinfo.output_scanline;
-      jpeg_read_scanlines(&cinfo, buffer, 1);
-      memcpy(Ig[row], buffer[0], rowbytes);
-    }
-
-    vpImageConvert::convert(Ig, I);
-  }
-
-  jpeg_finish_decompress(&cinfo);
-  jpeg_destroy_decompress(&cinfo);
-  fclose(file);
-}
-
-#elif defined(VISP_HAVE_OPENCV)
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a JPEG file.
-
-  \param I : Image to save as a JPEG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writeJPEG(const vpImage<unsigned char> &I, const std::string &filename)
-{
-#if (VISP_HAVE_OPENCV_VERSION >= 0x020408)
-  cv::Mat Ip;
-  vpImageConvert::convert(I, Ip);
-  cv::imwrite(filename.c_str(), Ip);
-#else
-  IplImage *Ip = NULL;
-  vpImageConvert::convert(I, Ip);
-
-  cvSaveImage(filename.c_str(), Ip);
-
-  cvReleaseImage(&Ip);
-#endif
-}
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a JPEG file.
-
-  \param I : Image to save as a JPEG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename)
-{
-#if (VISP_HAVE_OPENCV_VERSION >= 0x020408)
-  cv::Mat Ip;
-  vpImageConvert::convert(I, Ip);
-  cv::imwrite(filename.c_str(), Ip);
-#else
-  IplImage *Ip = NULL;
-  vpImageConvert::convert(I, Ip);
-
-  cvSaveImage(filename.c_str(), Ip);
-
-  cvReleaseImage(&Ip);
-#endif
-}
-
-/*!
-  Read the contents of the JPEG file, allocate memory
-  for the corresponding gray level image, if necessary convert the data in
-  gray level, and set the bitmap whith the gray level data. That means that
-  the image \e I is a "black and white" rendering of the original image in \e
-  filename, as in a black and white photograph. If necessary, the quantization
-  formula used is \f$0,299 r + 0,587 g + 0,114 b\f$.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  If EXIF information is embedded in the image file, the EXIF orientation is ignored.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-
-*/
-void vpImageIo::readJPEG(vpImage<unsigned char> &I, const std::string &filename)
-{
-#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-#if VISP_HAVE_OPENCV_VERSION >= 0x030200
-    int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
-    int flags = cv::IMREAD_GRAYSCALE;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
-    int flags = CV_LOAD_IMAGE_GRAYSCALE;
-#endif
-  cv::Mat Ip = cv::imread(filename.c_str(), flags);
-  if (!Ip.empty())
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-#else
-  IplImage *Ip = NULL;
-  Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE);
-  if (Ip != NULL)
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-  cvReleaseImage(&Ip);
-#endif
-}
-
-/*!
-  Read a JPEG file and initialize a scalar image.
-
-  Read the contents of the JPEG file, allocate
-  memory for the corresponding image, and set
-  the bitmap whith the content of
-  the file.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  If the file corresponds to a grayscaled image, a conversion is done to deal
-  with \e I which is a color image.
-
-  If EXIF information is embedded in the image file, the EXIF orientation is ignored.
-
-  \param I : Color image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::readJPEG(vpImage<vpRGBa> &I, const std::string &filename)
-{
-#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-#if VISP_HAVE_OPENCV_VERSION >= 0x030200
-    int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
-    int flags = cv::IMREAD_GRAYSCALE;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
-    int flags = CV_LOAD_IMAGE_GRAYSCALE;
-#endif
-  cv::Mat Ip = cv::imread(filename.c_str(), flags);
-  if (!Ip.empty())
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-#else
-  IplImage *Ip = NULL;
-  Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_COLOR);
-  if (Ip != NULL)
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-  cvReleaseImage(&Ip);
-#endif
-}
-#else
-void vpImageIo::readJPEG(vpImage<unsigned char> &I, const std::string &filename)
-{
-  int width = 0, height = 0, channels = 0;
-  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_grey);
-  if (image == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
-  }
-  I.init(image, static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
-  stbi_image_free(image);
-}
-void vpImageIo::readJPEG(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  int width = 0, height = 0, channels = 0;
-  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha);
-  if (image == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
-  }
-  I.init(reinterpret_cast<vpRGBa*>(image), static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
-  stbi_image_free(image);
-}
-void vpImageIo::writeJPEG(const vpImage<unsigned char> &I, const std::string &filename)
-{
-  int res = stbi_write_jpg(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_grey,
-                           reinterpret_cast<void*>(I.bitmap), 90);
-  if (res == 0) {
-    throw(vpImageException(vpImageException::ioError, "JPEG write error"));
-  }
-}
-void vpImageIo::writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename)
-{
-  int res = stbi_write_jpg(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
-                           reinterpret_cast<void*>(I.bitmap), 90);
-  if (res == 0) {
-    throw(vpImageException(vpImageException::ioError, "JEPG write error"));
-  }
+  vp_writePPM(I, filename);
 }
-#endif
-
-//--------------------------------------------------------------------------
-// PNG
-//--------------------------------------------------------------------------
-
-#if defined(VISP_HAVE_PNG)
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a PNG file.
-
-  \param I : Image to save as a PNG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writePNG(const vpImage<unsigned char> &I, const std::string &filename)
-{
-  FILE *file;
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "wb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str()));
-  }
-
-  /* create a png info struct */
-  png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
-  if (!png_ptr) {
-    fclose(file);
-    vpERROR_TRACE("Error during png_create_write_struct()\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  png_infop info_ptr = png_create_info_struct(png_ptr);
-  if (!info_ptr) {
-    fclose(file);
-    png_destroy_write_struct(&png_ptr, NULL);
-    vpERROR_TRACE("Error during png_create_info_struct()\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  /* initialize the setjmp for returning properly after a libpng error occured
-   */
-  if (setjmp(png_jmpbuf(png_ptr))) {
-    fclose(file);
-    png_destroy_write_struct(&png_ptr, &info_ptr);
-    vpERROR_TRACE("Error during init_io\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  /* setup libpng for using standard C fwrite() function with our FILE pointer
-   */
-  png_init_io(png_ptr, file);
-
-  unsigned int width = I.getWidth();
-  unsigned int height = I.getHeight();
-  int bit_depth = 8;
-  int color_type = PNG_COLOR_TYPE_GRAY;
-  /* set some useful information from header */
-
-  if (setjmp(png_jmpbuf(png_ptr))) {
-    fclose(file);
-    png_destroy_write_struct(&png_ptr, &info_ptr);
-    vpERROR_TRACE("Error during write header\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
-               PNG_FILTER_TYPE_BASE);
-
-  png_write_info(png_ptr, info_ptr);
-
-  png_bytep *row_ptrs = new png_bytep[height];
-  for (unsigned int i = 0; i < height; i++)
-    row_ptrs[i] = new png_byte[width];
-
-  unsigned char *input = (unsigned char *)I.bitmap;
-
-  for (unsigned int i = 0; i < height; i++) {
-    png_byte *row = row_ptrs[i];
-    for (unsigned int j = 0; j < width; j++) {
-      row[j] = *(input);
-      input++;
-    }
-  }
-
-  png_write_image(png_ptr, row_ptrs);
-
-  png_write_end(png_ptr, NULL);
-
-  for (unsigned int j = 0; j < height; j++)
-    delete[] row_ptrs[j];
-
-  delete[] row_ptrs;
-
-  png_destroy_write_struct(&png_ptr, &info_ptr);
-
-  fclose(file);
-}
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a PNG file.
-
-  \param I : Image to save as a PNG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writePNG(const vpImage<vpRGBa> &I, const std::string &filename)
-{
-  FILE *file;
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "wb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str()));
-  }
-
-  /* create a png info struct */
-  png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
-  if (!png_ptr) {
-    fclose(file);
-    vpERROR_TRACE("Error during png_create_write_struct()\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  png_infop info_ptr = png_create_info_struct(png_ptr);
-  if (!info_ptr) {
-    fclose(file);
-    png_destroy_write_struct(&png_ptr, NULL);
-    vpERROR_TRACE("Error during png_create_info_struct()\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  /* initialize the setjmp for returning properly after a libpng error occured
-   */
-  if (setjmp(png_jmpbuf(png_ptr))) {
-    fclose(file);
-    png_destroy_write_struct(&png_ptr, &info_ptr);
-    vpERROR_TRACE("Error during init_io\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  /* setup libpng for using standard C fwrite() function with our FILE pointer
-   */
-  png_init_io(png_ptr, file);
-
-  unsigned int width = I.getWidth();
-  unsigned int height = I.getHeight();
-  int bit_depth = 8;
-  int color_type = PNG_COLOR_TYPE_RGB;
-  /* set some useful information from header */
-
-  if (setjmp(png_jmpbuf(png_ptr))) {
-    fclose(file);
-    png_destroy_write_struct(&png_ptr, &info_ptr);
-    vpERROR_TRACE("Error during write header\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
-               PNG_FILTER_TYPE_BASE);
-
-  png_write_info(png_ptr, info_ptr);
-
-  png_bytep *row_ptrs = new png_bytep[height];
-  for (unsigned int i = 0; i < height; i++)
-    row_ptrs[i] = new png_byte[3 * width];
-
-  unsigned char *input = (unsigned char *)I.bitmap;
-  ;
-
-  for (unsigned int i = 0; i < height; i++) {
-    png_byte *row = row_ptrs[i];
-    for (unsigned int j = 0; j < width; j++) {
-      row[3 * j] = *(input);
-      input++;
-      row[3 * j + 1] = *(input);
-      input++;
-      row[3 * j + 2] = *(input);
-      input++;
-      input++;
-    }
-  }
-
-  png_write_image(png_ptr, row_ptrs);
-
-  png_write_end(png_ptr, NULL);
-
-  for (unsigned int j = 0; j < height; j++)
-    delete[] row_ptrs[j];
-
-  delete[] row_ptrs;
-
-  png_destroy_write_struct(&png_ptr, &info_ptr);
-
-  fclose(file);
-}
-
-/*!
-  Read the contents of the PNG file, allocate memory
-  for the corresponding gray level image, if necessary convert the data in
-  gray level, and set the bitmap whith the gray level data. That means that
-  the image \e I is a "black and white" rendering of the original image in \e
-  filename, as in a black and white photograph. If necessary, the quantization
-  formula used is \f$0,299 r + 0,587 g + 0,114 b\f$.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-
-*/
-void vpImageIo::readPNG(vpImage<unsigned char> &I, const std::string &filename)
-{
-  FILE *file;
-  png_byte magic[8];
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "rb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str()));
-  }
-
-  /* read magic number */
-  if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) {
-    fclose(file);
-    throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str()));
-  }
-
-  /* check for valid magic number */
-  if (png_sig_cmp(magic, 0, sizeof(magic))) {
-    fclose(file);
-    throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image",
-                           filename.c_str()));
-  }
-
-  /* create a png read struct */
-  // printf("version %s\n", PNG_LIBPNG_VER_STRING);
-  png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
-  if (png_ptr == NULL) {
-    fprintf(stderr, "error: can't create a png read structure!\n");
-    fclose(file);
-    throw(vpImageException(vpImageException::ioError, "error reading png file"));
-  }
-
-  /* create a png info struct */
-  png_infop info_ptr = png_create_info_struct(png_ptr);
-  if (info_ptr == NULL) {
-    fprintf(stderr, "error: can't create a png info structure!\n");
-    fclose(file);
-    png_destroy_read_struct(&png_ptr, NULL, NULL);
-    throw(vpImageException(vpImageException::ioError, "error reading png file"));
-  }
-
-  /* initialize the setjmp for returning properly after a libpng error occured
-   */
-  if (setjmp(png_jmpbuf(png_ptr))) {
-    fclose(file);
-    png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
-    vpERROR_TRACE("Error during init io\n");
-    throw(vpImageException(vpImageException::ioError, "PNG read error"));
-  }
-
-  /* setup libpng for using standard C fread() function with our FILE pointer
-   */
-  png_init_io(png_ptr, file);
-
-  /* tell libpng that we have already read the magic number */
-  png_set_sig_bytes(png_ptr, sizeof(magic));
-
-  /* read png info */
-  png_read_info(png_ptr, info_ptr);
-
-  unsigned int width = png_get_image_width(png_ptr, info_ptr);
-  unsigned int height = png_get_image_height(png_ptr, info_ptr);
-
-  unsigned int bit_depth, channels, color_type;
-  /* get some useful information from header */
-  bit_depth = png_get_bit_depth(png_ptr, info_ptr);
-  channels = png_get_channels(png_ptr, info_ptr);
-  color_type = png_get_color_type(png_ptr, info_ptr);
-
-  /* convert index color images to RGB images */
-  if (color_type == PNG_COLOR_TYPE_PALETTE)
-    png_set_palette_to_rgb(png_ptr);
-
-  /* convert 1-2-4 bits grayscale images to 8 bits grayscale. */
-  if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8)
-    png_set_expand(png_ptr);
-
-  //  if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS))
-  //    png_set_tRNS_to_alpha (png_ptr);
-
-  if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA)
-    png_set_strip_alpha(png_ptr);
-
-  if (bit_depth == 16)
-    png_set_strip_16(png_ptr);
-  else if (bit_depth < 8)
-    png_set_packing(png_ptr);
-
-  /* update info structure to apply transformations */
-  png_read_update_info(png_ptr, info_ptr);
-
-  channels = png_get_channels(png_ptr, info_ptr);
-
-  if ((width != I.getWidth()) || (height != I.getHeight()))
-    I.resize(height, width);
-
-  png_bytep *rowPtrs = new png_bytep[height];
-
-  unsigned int stride = png_get_rowbytes(png_ptr, info_ptr);
-  unsigned char *data = new unsigned char[stride * height];
-
-  for (unsigned int i = 0; i < height; i++)
-    rowPtrs[i] = (png_bytep)data + (i * stride);
-
-  png_read_image(png_ptr, rowPtrs);
-
-  vpImage<vpRGBa> Ic(height, width);
-  unsigned char *output;
-
-  switch (channels) {
-  case 1:
-    output = (unsigned char *)I.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i];
-    }
-    break;
-
-  case 2:
-    output = (unsigned char *)I.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i * 2];
-    }
-    break;
-
-  case 3:
-    output = (unsigned char *)Ic.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i * 3];
-      *(output++) = data[i * 3 + 1];
-      *(output++) = data[i * 3 + 2];
-      *(output++) = vpRGBa::alpha_default;
-    }
-    vpImageConvert::convert(Ic, I);
-    break;
-
-  case 4:
-    output = (unsigned char *)Ic.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i * 4];
-      *(output++) = data[i * 4 + 1];
-      *(output++) = data[i * 4 + 2];
-      *(output++) = data[i * 4 + 3];
-    }
-    vpImageConvert::convert(Ic, I);
-    break;
-  }
-
-  delete[](png_bytep) rowPtrs;
-  delete[] data;
-  png_read_end(png_ptr, NULL);
-  png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
-  fclose(file);
-}
-
-/*!
-  Read a PNG file and initialize a scalar image.
-
-  Read the contents of the PNG file, allocate
-  memory for the corresponding image, and set
-  the bitmap whith the content of
-  the file.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  If the file corresponds to a grayscaled image, a conversion is done to deal
-  with \e I which is a color image.
-
-  \param I : Color image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::readPNG(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  FILE *file;
-  png_byte magic[8];
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "rb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str()));
-  }
-
-  /* read magic number */
-  if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) {
-    fclose(file);
-    throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str()));
-  }
-
-  /* check for valid magic number */
-  if (png_sig_cmp(magic, 0, sizeof(magic))) {
-    fclose(file);
-    throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image",
-                           filename.c_str()));
-  }
-
-  /* create a png read struct */
-  png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
-  if (!png_ptr) {
-    fclose(file);
-    vpERROR_TRACE("Error during png_create_read_struct()\n");
-    throw(vpImageException(vpImageException::ioError, "PNG read error"));
-  }
-
-  /* create a png info struct */
-  png_infop info_ptr = png_create_info_struct(png_ptr);
-  if (!info_ptr) {
-    fclose(file);
-    png_destroy_read_struct(&png_ptr, NULL, NULL);
-    vpERROR_TRACE("Error during png_create_info_struct()\n");
-    throw(vpImageException(vpImageException::ioError, "PNG read error"));
-  }
-
-  /* initialize the setjmp for returning properly after a libpng error occured
-   */
-  if (setjmp(png_jmpbuf(png_ptr))) {
-    fclose(file);
-    png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
-    vpERROR_TRACE("Error during init io\n");
-    throw(vpImageException(vpImageException::ioError, "PNG read error"));
-  }
-
-  /* setup libpng for using standard C fread() function with our FILE pointer
-   */
-  png_init_io(png_ptr, file);
-
-  /* tell libpng that we have already read the magic number */
-  png_set_sig_bytes(png_ptr, sizeof(magic));
-
-  /* read png info */
-  png_read_info(png_ptr, info_ptr);
-
-  unsigned int width = png_get_image_width(png_ptr, info_ptr);
-  unsigned int height = png_get_image_height(png_ptr, info_ptr);
-
-  unsigned int bit_depth, channels, color_type;
-  /* get some useful information from header */
-  bit_depth = png_get_bit_depth(png_ptr, info_ptr);
-  channels = png_get_channels(png_ptr, info_ptr);
-  color_type = png_get_color_type(png_ptr, info_ptr);
-
-  /* convert index color images to RGB images */
-  if (color_type == PNG_COLOR_TYPE_PALETTE)
-    png_set_palette_to_rgb(png_ptr);
-
-  /* convert 1-2-4 bits grayscale images to 8 bits grayscale. */
-  if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8)
-    png_set_expand(png_ptr);
-
-  //  if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS))
-  //    png_set_tRNS_to_alpha (png_ptr);
-
-  if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA)
-    png_set_strip_alpha(png_ptr);
-
-  if (bit_depth == 16)
-    png_set_strip_16(png_ptr);
-  else if (bit_depth < 8)
-    png_set_packing(png_ptr);
-
-  /* update info structure to apply transformations */
-  png_read_update_info(png_ptr, info_ptr);
-
-  channels = png_get_channels(png_ptr, info_ptr);
-
-  if ((width != I.getWidth()) || (height != I.getHeight()))
-    I.resize(height, width);
-
-  png_bytep *rowPtrs = new png_bytep[height];
-
-  unsigned int stride = png_get_rowbytes(png_ptr, info_ptr);
-  unsigned char *data = new unsigned char[stride * height];
-
-  for (unsigned int i = 0; i < height; i++)
-    rowPtrs[i] = (png_bytep)data + (i * stride);
-
-  png_read_image(png_ptr, rowPtrs);
-
-  vpImage<unsigned char> Ig(height, width);
-  unsigned char *output;
-
-  switch (channels) {
-  case 1:
-    output = (unsigned char *)Ig.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i];
-    }
-    vpImageConvert::convert(Ig, I);
-    break;
-
-  case 2:
-    output = (unsigned char *)Ig.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i * 2];
-    }
-    vpImageConvert::convert(Ig, I);
-    break;
-
-  case 3:
-    output = (unsigned char *)I.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i * 3];
-      *(output++) = data[i * 3 + 1];
-      *(output++) = data[i * 3 + 2];
-      *(output++) = vpRGBa::alpha_default;
-    }
-    break;
-
-  case 4:
-    output = (unsigned char *)I.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i * 4];
-      *(output++) = data[i * 4 + 1];
-      *(output++) = data[i * 4 + 2];
-      *(output++) = data[i * 4 + 3];
-    }
-    break;
-  }
-
-  delete[](png_bytep) rowPtrs;
-  delete[] data;
-  png_read_end(png_ptr, NULL);
-  png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
-  fclose(file);
-}
-
-//TODO:
-void vpImageIo::readSimdlib(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  size_t stride = 0, width = 0, height = 0;
-  SimdPixelFormatType format = SimdPixelFormatRgba32;
-  uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format);
-  const bool copyData = false;
-  I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData);
-}
-
-void vpImageIo::readStb(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  int width = 0, height = 0, channels = 0;
-  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha);
-  if (image == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
-  }
-  I.init(reinterpret_cast<vpRGBa*>(image), static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
-  stbi_image_free(image);
-}
-
-inline bool ends_with(std::string const & value, std::string const & ending)
-{
-    if (ending.size() > value.size()) return false;
-    return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
-}
-
-void vpImageIo::writeSimdlib(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  if (ends_with(filename, ".png")) {
-    SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFilePng, 90, filename.c_str());
-  } else {
-    SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, 90, filename.c_str());
-  }
-}
-
-void vpImageIo::writeStb(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  if (ends_with(filename, ".png")) {
-    const int stride_in_bytes = static_cast<int>(4 * I.getWidth());
-    int res = stbi_write_png(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
-                             reinterpret_cast<void*>(I.bitmap), stride_in_bytes);
-    if (res == 0) {
-      throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str()));
-    }
-  } else {
-    int res = stbi_write_jpg(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
-                             reinterpret_cast<void*>(I.bitmap), 90);
-    if (res == 0) {
-      throw(vpImageException(vpImageException::ioError, "JEPG write error"));
-    }
-  }
-}
-
-#elif defined(VISP_HAVE_OPENCV)
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a PNG file.
-
-  \param I : Image to save as a PNG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writePNG(const vpImage<unsigned char> &I, const std::string &filename)
-{
-#if (VISP_HAVE_OPENCV_VERSION >= 0x020408)
-  cv::Mat Ip;
-  vpImageConvert::convert(I, Ip);
-  cv::imwrite(filename.c_str(), Ip);
-#else
-  IplImage *Ip = NULL;
-  vpImageConvert::convert(I, Ip);
-
-  cvSaveImage(filename.c_str(), Ip);
-
-  cvReleaseImage(&Ip);
-#endif
-}
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a PNG file.
-
-  \param I : Image to save as a PNG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writePNG(const vpImage<vpRGBa> &I, const std::string &filename)
-{
-#if (VISP_HAVE_OPENCV_VERSION >= 0x020408)
-  cv::Mat Ip;
-  vpImageConvert::convert(I, Ip);
-  cv::imwrite(filename.c_str(), Ip);
-#else
-  IplImage *Ip = NULL;
-  vpImageConvert::convert(I, Ip);
-
-  cvSaveImage(filename.c_str(), Ip);
-
-  cvReleaseImage(&Ip);
-#endif
-}
-
-/*!
-  Read the contents of the PNG file, allocate memory
-  for the corresponding gray level image, if necessary convert the data in
-  gray level, and set the bitmap whith the gray level data. That means that
-  the image \e I is a "black and white" rendering of the original image in \e
-  filename, as in a black and white photograph. If necessary, the quantization
-  formula used is \f$0,299 r + 0,587 g + 0,114 b\f$.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  If EXIF information is embedded in the image file, the EXIF orientation is ignored.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-
-*/
-void vpImageIo::readPNG(vpImage<unsigned char> &I, const std::string &filename)
-{
-#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-#if VISP_HAVE_OPENCV_VERSION >= 0x030200
-    int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
-    int flags = cv::IMREAD_GRAYSCALE;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
-    int flags = CV_LOAD_IMAGE_GRAYSCALE;
-#endif
-  cv::Mat Ip = cv::imread(filename.c_str(), flags);
-  if (!Ip.empty())
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-#else
-  IplImage *Ip = NULL;
-  Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE);
-  if (Ip != NULL)
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-  cvReleaseImage(&Ip);
-#endif
-}
-
-/*!
-  Read a PNG file and initialize a scalar image.
-
-  Read the contents of the PNG file, allocate
-  memory for the corresponding image, and set
-  the bitmap whith the content of
-  the file.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  If the file corresponds to a grayscaled image, a conversion is done to deal
-  with \e I which is a color image.
-
-  If EXIF information is embedded in the image file, the EXIF orientation is ignored.
-
-  \param I : Color image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::readPNG(vpImage<vpRGBa> &I, const std::string &filename)
-{
-#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-#if VISP_HAVE_OPENCV_VERSION >= 0x030200
-    int flags = cv::IMREAD_COLOR | cv::IMREAD_IGNORE_ORIENTATION;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
-    int flags = cv::IMREAD_COLOR;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
-    int flags = CV_LOAD_IMAGE_COLOR;
-#endif
-  cv::Mat Ip = cv::imread(filename.c_str(), flags);
-  if (!Ip.empty())
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-#else
-  IplImage *Ip = NULL;
-  Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_COLOR);
-  if (Ip != NULL)
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-  cvReleaseImage(&Ip);
-#endif
-}
-#else
-void vpImageIo::readPNG(vpImage<unsigned char> &I, const std::string &filename)
-{
-  int width = 0, height = 0, channels = 0;
-  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_grey);
-  if (image == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
-  }
-  I.init(image, static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
-  stbi_image_free(image);
-}
-void vpImageIo::readPNG(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  int width = 0, height = 0, channels = 0;
-  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha);
-  if (image == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
-  }
-  I.init(reinterpret_cast<vpRGBa*>(image), static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
-  stbi_image_free(image);
-}
-void vpImageIo::writePNG(const vpImage<unsigned char> &I, const std::string &filename)
-{
-  const int stride_in_bytes = static_cast<int>(I.getWidth());
-  int res = stbi_write_png(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_grey,
-                           reinterpret_cast<void*>(I.bitmap), stride_in_bytes);
-  if (res == 0) {
-    throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str()));
-  }
-}
-void vpImageIo::writePNG(const vpImage<vpRGBa> &I, const std::string &filename)
-{
-  const int stride_in_bytes = static_cast<int>(4 * I.getWidth());
-  int res = stbi_write_png(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
-                           reinterpret_cast<void*>(I.bitmap), stride_in_bytes);
-  if (res == 0) {
-    throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str()));
-  }
-}
-#endif
diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp
index 8efe2c759e..3bf19a465e 100644
--- a/modules/io/test/perfImageLoadSave.cpp
+++ b/modules/io/test/perfImageLoadSave.cpp
@@ -64,7 +64,7 @@ TEST_CASE("Benchmark Jpeg image loading", "[benchmark]") {
     vpImage<vpRGBa> I;
 
     BENCHMARK("vpImageIo::readSimdlib()") {
-      vpImageIo::readSimdlib(I, imagePathJpeg);
+      vpImageIo::readJPEG(I, imagePathJpeg, vpImageIo::IO_SIMDLIB_BACKEND);
       return I;
     };
   }
@@ -73,7 +73,7 @@ TEST_CASE("Benchmark Jpeg image loading", "[benchmark]") {
     vpImage<vpRGBa> I;
 
     BENCHMARK("vpImageIo::readStb()") {
-      vpImageIo::readStb(I, imagePathJpeg);
+      vpImageIo::readJPEG(I, imagePathJpeg, vpImageIo::IO_STB_IMAGE_BACKEND);
       return I;
     };
   }
@@ -93,7 +93,7 @@ TEST_CASE("Benchmark Png image loading", "[benchmark]") {
     vpImage<vpRGBa> I;
 
     BENCHMARK("vpImageIo::readSimdlib()") {
-      vpImageIo::readSimdlib(I, imagePathPng);
+      vpImageIo::readPNG(I, imagePathPng, vpImageIo::IO_SIMDLIB_BACKEND);
       return I;
     };
   }
@@ -102,7 +102,7 @@ TEST_CASE("Benchmark Png image loading", "[benchmark]") {
     vpImage<vpRGBa> I;
 
     BENCHMARK("vpImageIo::readStb()") {
-      vpImageIo::readStb(I, imagePathPng);
+      vpImageIo::readPNG(I, imagePathPng, vpImageIo::IO_STB_IMAGE_BACKEND);
       return I;
     };
   }
@@ -122,7 +122,7 @@ TEST_CASE("Benchmark big Png image loading", "[benchmark]") {
     vpImage<vpRGBa> I;
 
     BENCHMARK("vpImageIo::readSimdlib()") {
-      vpImageIo::readSimdlib(I, imagePathPngBig);
+      vpImageIo::readPNG(I, imagePathPngBig, vpImageIo::IO_SIMDLIB_BACKEND);
       return I;
     };
   }
@@ -131,7 +131,7 @@ TEST_CASE("Benchmark big Png image loading", "[benchmark]") {
     vpImage<vpRGBa> I;
 
     BENCHMARK("vpImageIo::readStb()") {
-      vpImageIo::readStb(I, imagePathPngBig);
+      vpImageIo::readPNG(I, imagePathPngBig, vpImageIo::IO_STB_IMAGE_BACKEND);
       return I;
     };
   }
@@ -153,7 +153,7 @@ TEST_CASE("Benchmark Jpeg image saving", "[benchmark]") {
     const std::string filename = "/tmp/Klimt_Simd.jpg";
 
     BENCHMARK("vpImageIo::writeSimdlib()") {
-      vpImageIo::writeSimdlib(I, filename);
+      vpImageIo::writeJPEG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND);
       return I;
     };
   }
@@ -162,7 +162,7 @@ TEST_CASE("Benchmark Jpeg image saving", "[benchmark]") {
     const std::string filename = "/tmp/Klimt_stb.jpg";
 
     BENCHMARK("vpImageIo::writeStb()") {
-      vpImageIo::writeStb(I, filename);
+      vpImageIo::writeJPEG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND);
       return I;
     };
   }
@@ -184,7 +184,7 @@ TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") {
     const std::string filename = "/tmp/Big_images_Simd.jpg";
 
     BENCHMARK("vpImageIo::writeSimdlib()") {
-      vpImageIo::writeSimdlib(I, filename);
+      vpImageIo::writeJPEG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND);
       return I;
     };
   }
@@ -193,7 +193,7 @@ TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") {
     const std::string filename = "/tmp/Big_images_stb.jpg";
 
     BENCHMARK("vpImageIo::writeStb()") {
-      vpImageIo::writeStb(I, filename);
+      vpImageIo::writeJPEG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND);
       return I;
     };
   }
@@ -215,7 +215,7 @@ TEST_CASE("Benchmark Png image saving", "[benchmark]") {
     const std::string filename = "/tmp/Klimt_Simd.png";
 
     BENCHMARK("vpImageIo::writeSimdlib()") {
-      vpImageIo::writeSimdlib(I, filename);
+      vpImageIo::writePNG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND);
       return I;
     };
   }
@@ -224,7 +224,7 @@ TEST_CASE("Benchmark Png image saving", "[benchmark]") {
     const std::string filename = "/tmp/Klimt_stb.png";
 
     BENCHMARK("vpImageIo::writeStb()") {
-      vpImageIo::writeStb(I, filename);
+      vpImageIo::writePNG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND);
       return I;
     };
   }
@@ -246,7 +246,7 @@ TEST_CASE("Benchmark big Png image saving", "[benchmark]") {
     const std::string filename = "/tmp/Big_images_Simd.png";
 
     BENCHMARK("vpImageIo::writeSimdlib()") {
-      vpImageIo::writeSimdlib(I, filename);
+      vpImageIo::writePNG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND);
       return I;
     };
   }
@@ -255,155 +255,12 @@ TEST_CASE("Benchmark big Png image saving", "[benchmark]") {
     const std::string filename = "/tmp/Big_images_stb.png";
 
     BENCHMARK("vpImageIo::writeStb()") {
-      vpImageIo::writeStb(I, filename);
+      vpImageIo::writePNG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND);
       return I;
     };
   }
 }
 
-//TEST_CASE("Benchmark bgr to grayscale (ViSP)", "[benchmark]") {
-//  vpImage<vpRGBa> I;
-//  vpImageIo::read(I, imagePathColor);
-
-//  std::vector<unsigned char> bgr;
-//  common_tools::RGBaToBGR(I, bgr);
-
-//  vpImage<unsigned char> I_gray(I.getHeight(), I.getWidth());
-
-//  BENCHMARK("Benchmark bgr to grayscale (ViSP)") {
-//    vpImageConvert::BGRToGrey(bgr.data(),
-//                              I_gray.bitmap,
-//                              I.getWidth(), I.getHeight(),
-//                              false, nThreads);
-//    return I_gray;
-//  };
-
-//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101)
-//  SECTION("OpenCV Mat type")
-//  {
-//    cv::Mat img;
-//    vpImageConvert::convert(I, img);
-
-//    BENCHMARK("Benchmark bgr to grayscale (ViSP + OpenCV Mat type)") {
-//      vpImageConvert::convert(img, I_gray, false, nThreads);
-//      return I_gray;
-//    };
-//  }
-//#endif
-//}
-//#endif
-
-//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101)
-//TEST_CASE("Benchmark bgr to grayscale (OpenCV)", "[benchmark]") {
-//  cv::Mat img = cv::imread(imagePathColor);
-//  cv::Mat img_gray(img.size(), CV_8UC1);
-
-//  BENCHMARK("Benchmark bgr to grayscale (OpenCV)") {
-//    cv::cvtColor(img, img_gray, cv::COLOR_BGR2GRAY);
-//    return img_gray;
-//  };
-//}
-//#endif
-
-//// C++11 to be able to do bgr.data()
-//#if VISP_CXX_STANDARD >= VISP_CXX_STANDARD_11
-//TEST_CASE("Benchmark bgr to rgba (naive code)", "[benchmark]") {
-//  vpImage<vpRGBa> I;
-//  vpImageIo::read(I, imagePathColor);
-
-//  std::vector<unsigned char> bgr;
-//  common_tools::RGBaToBGR(I, bgr);
-
-//  vpImage<vpRGBa> I_bench(I.getHeight(), I.getWidth());
-//  BENCHMARK("Benchmark bgr to rgba (naive code)") {
-//    common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast<unsigned char*>(I_bench.bitmap),
-//                               I.getWidth(), I.getHeight(), false);
-//    return I_bench;
-//  };
-//}
-
-//TEST_CASE("Benchmark bgr to rgba (ViSP)", "[benchmark]") {
-//  vpImage<vpRGBa> I;
-//  vpImageIo::read(I, imagePathColor);
-
-//  std::vector<unsigned char> bgr;
-//  common_tools::RGBaToBGR(I, bgr);
-
-//  SECTION("Check BGR to RGBa conversion")
-//  {
-//    vpImage<vpRGBa> ref(I.getHeight(), I.getWidth());
-//    common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast<unsigned char*>(ref.bitmap),
-//                               I.getWidth(), I.getHeight(), false);
-//    vpImage<vpRGBa> rgba(I.getHeight(), I.getWidth());
-//    vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast<unsigned char *>(rgba.bitmap),
-//                              I.getWidth(), I.getHeight(), false);
-
-//    CHECK((rgba == ref));
-//  }
-
-//  vpImage<vpRGBa> I_rgba(I.getHeight(), I.getWidth());
-//  BENCHMARK("Benchmark bgr to rgba (ViSP)") {
-//    vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast<unsigned char *>(I_rgba.bitmap),
-//                              I.getWidth(), I.getHeight(), false);
-//    return I_rgba;
-//  };
-
-//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101)
-//  SECTION("OpenCV Mat type")
-//  {
-//    cv::Mat img;
-//    vpImageConvert::convert(I, img);
-
-//    BENCHMARK("Benchmark bgr to rgba (ViSP + OpenCV Mat type)") {
-//      vpImageConvert::convert(img, I_rgba);
-//      return I_rgba;
-//    };
-//  }
-//#endif
-//}
-
-//TEST_CASE("Benchmark bgra to rgba (naive code)", "[benchmark]") {
-//  vpImage<vpRGBa> I;
-//  vpImageIo::read(I, imagePathColor);
-
-//  std::vector<unsigned char> bgra;
-//  common_tools::RGBaToBGRa(I, bgra);
-
-//  vpImage<vpRGBa> I_bench(I.getHeight(), I.getWidth());
-//  BENCHMARK("Benchmark bgra to rgba (naive code)") {
-//    common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast<unsigned char*>(I_bench.bitmap),
-//                                I.getWidth(), I.getHeight(), false);
-//    return I_bench;
-//  };
-//}
-
-//TEST_CASE("Benchmark bgra to rgba (ViSP)", "[benchmark]") {
-//  vpImage<vpRGBa> I;
-//  vpImageIo::read(I, imagePathColor);
-
-//  std::vector<unsigned char> bgra;
-//  common_tools::RGBaToBGRa(I, bgra);
-
-//  SECTION("Check BGRa to RGBa conversion")
-//  {
-//    vpImage<vpRGBa> ref(I.getHeight(), I.getWidth());
-//    common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast<unsigned char*>(ref.bitmap),
-//                                I.getWidth(), I.getHeight(), false);
-//    vpImage<vpRGBa> rgba(I.getHeight(), I.getWidth());
-//    vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast<unsigned char *>(rgba.bitmap),
-//                               I.getWidth(), I.getHeight(), false);
-
-//    CHECK((rgba == ref));
-//  }
-//  vpImage<vpRGBa> I_rgba(I.getHeight(), I.getWidth());
-//  BENCHMARK("Benchmark bgra to rgba (ViSP)") {
-//    vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast<unsigned char *>(I_rgba.bitmap),
-//                               I.getWidth(), I.getHeight(), false);
-//    return I_rgba;
-//  };
-//}
-//#endif
-
 int main(int argc, char *argv[])
 {
   Catch::Session session; // There must be exactly one instance

From 122e936a257d8caf21a75e23a651d1122d93e77d Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Wed, 3 Nov 2021 01:06:57 +0100
Subject: [PATCH 07/18] Update Simd lib to 4.9.107 version.

---
 3rdparty/simdlib/CMakeLists.txt               |  16 +-
 .../Simd/{SimdSse1.h => SimdAlignment.h}      | 113 +++--
 3rdparty/simdlib/Simd/SimdAllocator.hpp       |   6 +-
 3rdparty/simdlib/Simd/SimdArray.h             |  30 +-
 3rdparty/simdlib/Simd/SimdAvx1.h              |   9 +-
 ...SimdBaseRgbaToGray.cpp => SimdAvx1Cpu.cpp} |  45 +-
 3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp     |  14 +-
 3rdparty/simdlib/Simd/SimdAvx2.h              |  22 +-
 3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp   |  43 +-
 3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp   |  61 ++-
 3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp    |  10 +-
 3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp   |  74 ---
 3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp   | 149 ++++++
 3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp  |  56 ++-
 3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp  |  72 ---
 3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp         |  68 +++
 .../simdlib/Simd/SimdAvx2Deinterleave.cpp     |  59 ++-
 .../simdlib/Simd/SimdAvx2GaussianBlur.cpp     |   3 +-
 3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp      |   4 +-
 .../simdlib/Simd/SimdAvx2ReduceGray2x2.cpp    |   6 +-
 .../simdlib/Simd/SimdAvx2ReduceGray3x3.cpp    |   4 +-
 .../simdlib/Simd/SimdAvx2ReduceGray4x4.cpp    |   4 +-
 .../simdlib/Simd/SimdAvx2ReduceGray5x5.cpp    |   6 +-
 .../simdlib/Simd/SimdAvx2ResizeBilinear.cpp   |   4 +-
 3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp     |  23 +-
 3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp   |  92 ----
 3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp  |  97 ----
 3rdparty/simdlib/Simd/SimdBase.h              |  18 +-
 3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp   |  20 +-
 3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp   |  15 +-
 3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp    |   4 +-
 3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp   |  80 ---
 3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp   |  37 +-
 3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp  |  15 +-
 3rdparty/simdlib/Simd/SimdBaseCpu.cpp         | 234 +++++++++
 .../simdlib/Simd/SimdBaseDeinterleave.cpp     |  43 +-
 .../simdlib/Simd/SimdBaseGaussianBlur.cpp     |   2 +-
 3rdparty/simdlib/Simd/SimdBaseResizer.cpp     | 243 ++++++++-
 3rdparty/simdlib/Simd/SimdConfig.h            |  10 +-
 3rdparty/simdlib/Simd/SimdConst.h             |  70 +--
 3rdparty/simdlib/Simd/SimdConversion.h        |  55 +--
 3rdparty/simdlib/Simd/SimdCopyPixel.h         |  17 +
 3rdparty/simdlib/Simd/SimdCpu.h               | 101 +++-
 3rdparty/simdlib/Simd/SimdDefs.h              |  80 +--
 3rdparty/simdlib/Simd/SimdEnable.h            | 415 +---------------
 3rdparty/simdlib/Simd/SimdExp.h               | 176 ++++++-
 3rdparty/simdlib/Simd/SimdExtract.h           |  22 +-
 3rdparty/simdlib/Simd/SimdFrame.hpp           |  98 +++-
 3rdparty/simdlib/Simd/SimdInit.h              |  35 +-
 3rdparty/simdlib/Simd/SimdLib.cpp             | 279 ++++++-----
 3rdparty/simdlib/Simd/SimdLib.h               | 239 +++++----
 3rdparty/simdlib/Simd/SimdLib.hpp             | 465 +++++++++++++++++-
 3rdparty/simdlib/Simd/SimdLoad.h              | 277 +----------
 3rdparty/simdlib/Simd/SimdLoadBlock.h         | 251 ++++++++++
 3rdparty/simdlib/Simd/SimdLog.h               |  28 +-
 3rdparty/simdlib/Simd/SimdMath.h              |  47 +-
 3rdparty/simdlib/Simd/SimdMemory.h            | 104 ++--
 3rdparty/simdlib/Simd/SimdNeon.h              |  20 +-
 3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp   |  45 +-
 3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp   |  63 ++-
 3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp    |  10 +-
 3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp   |  81 ---
 3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp   |  83 +++-
 3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp  |  41 +-
 3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp  |  78 ---
 .../simdlib/Simd/SimdNeonDeinterleave.cpp     |  79 ++-
 .../simdlib/Simd/SimdNeonGaussianBlur.cpp     |   1 +
 3rdparty/simdlib/Simd/SimdNeonResizer.cpp     |   8 +-
 3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp   |  71 ---
 3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp  |  71 ---
 3rdparty/simdlib/Simd/SimdPixel.hpp           | 200 +++++++-
 3rdparty/simdlib/Simd/SimdPow.h               |   2 +-
 3rdparty/simdlib/Simd/SimdResizer.h           | 148 ++++--
 3rdparty/simdlib/Simd/SimdResizerCommon.h     |  97 ++++
 3rdparty/simdlib/Simd/SimdRuntime.h           |  34 +-
 3rdparty/simdlib/Simd/SimdSet.h               |   8 +-
 3rdparty/simdlib/Simd/SimdSse1Resizer.cpp     | 129 -----
 3rdparty/simdlib/Simd/SimdSse2.h              |   8 +-
 3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp  |  54 +-
 ...SimdBaseBgraToRgba.cpp => SimdSse2Cpu.cpp} |  44 +-
 .../simdlib/Simd/SimdSse2GaussianBlur3x3.cpp  |   3 +-
 3rdparty/simdlib/Simd/SimdSse2Resizer.cpp     |   8 +-
 3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp   |  75 ---
 3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp  |  96 ----
 3rdparty/simdlib/Simd/SimdSse41.h             |  76 +++
 ...e3BgrToBgra.cpp => SimdSse41BgrToBgra.cpp} | 185 ++++---
 ...e3BgrToGray.cpp => SimdSse41BgrToGray.cpp} | 241 +++++----
 ...sse3BgrToRgb.cpp => SimdSse41BgrToRgb.cpp} | 163 +++---
 ...e3BgraToBgr.cpp => SimdSse41BgraToBgr.cpp} | 257 ++++++----
 ...SimdBaseRgbToGray.cpp => SimdSse41Cpu.cpp} |  46 +-
 ...terleave.cpp => SimdSse41Deinterleave.cpp} |  60 ++-
 .../simdlib/Simd/SimdSse41GaussianBlur.cpp    |   3 +-
 ...ur3x3.cpp => SimdSse41GaussianBlur3x3.cpp} |  12 +-
 ...e3GrayToBgr.cpp => SimdSse41GrayToBgr.cpp} | 147 +++---
 ...Interleave.cpp => SimdSse41Interleave.cpp} |  11 +-
 ...imdSsse3Reduce.cpp => SimdSse41Reduce.cpp} | 401 ++++++++-------
 ...Gray2x2.cpp => SimdSse41ReduceGray2x2.cpp} | 189 ++++---
 ...Gray4x4.cpp => SimdSse41ReduceGray4x4.cpp} |  11 +-
 ...linear.cpp => SimdSse41ResizeBilinear.cpp} |   9 +-
 3rdparty/simdlib/Simd/SimdSse41Resizer.cpp    | 311 +++++++++++-
 3rdparty/simdlib/Simd/SimdSsse3.h             |  77 ---
 3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp  |  74 ---
 3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp |  73 ---
 .../simdlib/Simd/SimdSsse3CustomFunctions.cpp |  69 ---
 3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp    | 350 -------------
 3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp  |  93 ----
 3rdparty/simdlib/Simd/SimdStore.h             |  45 +-
 3rdparty/simdlib/Simd/SimdStream.h            |  21 +-
 3rdparty/simdlib/Simd/SimdUpdate.h            |  17 +-
 3rdparty/simdlib/Simd/SimdVersion.h           |   2 +-
 3rdparty/simdlib/Simd/SimdView.hpp            |   6 +-
 modules/core/src/image/vpImageConvert.cpp     |   4 +-
 112 files changed, 5013 insertions(+), 4067 deletions(-)
 rename 3rdparty/simdlib/Simd/{SimdSse1.h => SimdAlignment.h} (53%)
 mode change 100644 => 100755
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAllocator.hpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdArray.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx1.h
 rename 3rdparty/simdlib/Simd/{SimdBaseRgbaToGray.cpp => SimdAvx1Cpu.cpp} (57%)
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp
 create mode 100755 3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBase.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseCpu.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdBaseResizer.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdConfig.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdConst.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdConversion.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdCopyPixel.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdCpu.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdDefs.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdEnable.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdExp.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdExtract.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdFrame.hpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdInit.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLib.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLib.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLib.hpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLoad.h
 create mode 100755 3rdparty/simdlib/Simd/SimdLoadBlock.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdLog.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdMath.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdMemory.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeon.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdNeonResizer.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdPixel.hpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdPow.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdResizer.h
 create mode 100755 3rdparty/simdlib/Simd/SimdResizerCommon.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdRuntime.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSet.h
 delete mode 100644 3rdparty/simdlib/Simd/SimdSse1Resizer.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp
 rename 3rdparty/simdlib/Simd/{SimdBaseBgraToRgba.cpp => SimdSse2Cpu.cpp} (62%)
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse2Resizer.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp
 create mode 100755 3rdparty/simdlib/Simd/SimdSse41.h
 rename 3rdparty/simdlib/Simd/{SimdSsse3BgrToBgra.cpp => SimdSse41BgrToBgra.cpp} (57%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdSsse3BgrToGray.cpp => SimdSse41BgrToGray.cpp} (56%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdSsse3BgrToRgb.cpp => SimdSse41BgrToRgb.cpp} (84%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdSsse3BgraToBgr.cpp => SimdSse41BgraToBgr.cpp} (53%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdBaseRgbToGray.cpp => SimdSse41Cpu.cpp} (54%)
 rename 3rdparty/simdlib/Simd/{SimdSsse3Deinterleave.cpp => SimdSse41Deinterleave.cpp} (74%)
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp
 rename 3rdparty/simdlib/Simd/{SimdSsse3GaussianBlur3x3.cpp => SimdSse41GaussianBlur3x3.cpp} (95%)
 rename 3rdparty/simdlib/Simd/{SimdSsse3GrayToBgr.cpp => SimdSse41GrayToBgr.cpp} (92%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdSsse3Interleave.cpp => SimdSse41Interleave.cpp} (96%)
 rename 3rdparty/simdlib/Simd/{SimdSsse3Reduce.cpp => SimdSse41Reduce.cpp} (96%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdSsse3ReduceGray2x2.cpp => SimdSse41ReduceGray2x2.cpp} (94%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdSsse3ReduceGray4x4.cpp => SimdSse41ReduceGray4x4.cpp} (96%)
 mode change 100644 => 100755
 rename 3rdparty/simdlib/Simd/{SimdSsse3ResizeBilinear.cpp => SimdSse41ResizeBilinear.cpp} (98%)
 mode change 100644 => 100755
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdSse41Resizer.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3.h
 delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp
 delete mode 100644 3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdStore.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdStream.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdUpdate.h
 mode change 100644 => 100755 3rdparty/simdlib/Simd/SimdView.hpp

diff --git a/3rdparty/simdlib/CMakeLists.txt b/3rdparty/simdlib/CMakeLists.txt
index e6880b3800..dc6d111aae 100644
--- a/3rdparty/simdlib/CMakeLists.txt
+++ b/3rdparty/simdlib/CMakeLists.txt
@@ -109,23 +109,11 @@ if(X86 OR X86_64)
     file(GLOB_RECURSE SIMD_BASE_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdBase*.cpp)
     set_source_files_properties(${SIMD_BASE_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS}")
 
-    file(GLOB_RECURSE SIMD_SSE1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse1*.cpp)
-    set_source_files_properties(${SIMD_SSE1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG}")
-
     file(GLOB_RECURSE SIMD_SSE2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse2*.cpp)
-    set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE2_FLAG}")
-
-    file(GLOB_RECURSE SIMD_SSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse3*.cpp)
-    set_source_files_properties(${SIMD_SSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG}")
-
-    file(GLOB_RECURSE SIMD_SSSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSsse3*.cpp)
-    set_source_files_properties(${SIMD_SSSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSSE3_FLAG}")
+    set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG} ${SSE2_FLAG}")
 
     file(GLOB_RECURSE SIMD_SSE41_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse41*.cpp)
-    set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_1_FLAG}")
-
-    file(GLOB_RECURSE SIMD_SSE42_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse42*.cpp)
-    set_source_files_properties(${SIMD_SSE42_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}")
+    set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG} ${SSSE3_FLAG} ${SSE4_1_FLAG} ${SSE4_2_FLAG}")
 
     file(GLOB_RECURSE SIMD_AVX1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx1*.cpp)
     set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}")
diff --git a/3rdparty/simdlib/Simd/SimdSse1.h b/3rdparty/simdlib/Simd/SimdAlignment.h
old mode 100644
new mode 100755
similarity index 53%
rename from 3rdparty/simdlib/Simd/SimdSse1.h
rename to 3rdparty/simdlib/Simd/SimdAlignment.h
index e258d50ab3..9789cbb9e7
--- a/3rdparty/simdlib/Simd/SimdSse1.h
+++ b/3rdparty/simdlib/Simd/SimdAlignment.h
@@ -1,40 +1,73 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#ifndef __SimdSse_h__
-#define __SimdSse_h__
-
-#include "Simd/SimdDefs.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
-    {
-        void SquaredDifferenceSum32f(const float * a, const float * b, size_t size, float * sum);
-
-        void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum);
-    }
-#endif// SIMD_SSE_ENABLE
-}
-#endif//__SimdSse_h__
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdAlignment_h__
+#define __SimdAlignment_h__
+
+#include "Simd/SimdEnable.h"
+
+namespace Simd
+{
+    SIMD_INLINE size_t GetAlignment()
+    {
+#ifdef SIMD_AVX2_ENABLE
+        if (Avx2::Enable)
+            return sizeof(__m256i);
+        else
+#endif
+#ifdef SIMD_AVX_ENABLE
+        if (Avx::Enable)
+            return sizeof(__m256);
+        else
+#endif
+#ifdef SIMD_SSE41_ENABLE
+        if (Sse41::Enable)
+            return sizeof(__m128i);
+        else
+#endif
+#ifdef SIMD_SSE2_ENABLE
+        if (Sse2::Enable)
+            return sizeof(__m128i);
+        else
+#endif
+#ifdef SIMD_NEON_ENABLE
+        if (Neon::Enable)
+            return sizeof(uint8x16_t);
+        else
+#endif
+            return sizeof(void *);
+    }
+
+    extern const size_t ALIGNMENT;
+
+    SIMD_INLINE size_t Alignment()
+    {
+#if defined(WIN32)
+        return GetAlignment();
+#else
+        return ALIGNMENT;
+#endif
+    }
+}
+
+#endif//__SimdAlignment_h__
diff --git a/3rdparty/simdlib/Simd/SimdAllocator.hpp b/3rdparty/simdlib/Simd/SimdAllocator.hpp
old mode 100644
new mode 100755
index cd65f196f4..8ee548e5ae
--- a/3rdparty/simdlib/Simd/SimdAllocator.hpp
+++ b/3rdparty/simdlib/Simd/SimdAllocator.hpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -125,8 +125,8 @@ namespace Simd
         */
         static SIMD_INLINE size_t Alignment()
         {
-#if defined(__SimdEnable_h__) && defined(WIN32)
-            return Simd::ALIGNMENT;
+#if defined(__SimdAlignment_h__) && defined(WIN32)
+            return Simd::Alignment();
 #else
             return SimdAlignment();
 #endif
diff --git a/3rdparty/simdlib/Simd/SimdArray.h b/3rdparty/simdlib/Simd/SimdArray.h
old mode 100644
new mode 100755
index 30e793080f..2f7f1bbbe0
--- a/3rdparty/simdlib/Simd/SimdArray.h
+++ b/3rdparty/simdlib/Simd/SimdArray.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -57,15 +57,28 @@ namespace Simd
                 }
                 *(size_t*)&size = size_;
                 if (size_)
-                    *(T**)&data = (T*)Simd::Allocate(size * sizeof(T), align);
+                    *(T**)&data = (T*)Simd::Allocate(RawSize(), align);
             }
             if (clear)
                 Clear();
         }
 
+        SIMD_INLINE void Assign(const T * src, size_t size_)
+        {
+            Resize(size_, src == NULL);
+            if(src)
+                memcpy(data, src, RawSize());
+        }
+
         SIMD_INLINE void Clear()
         {
-            ::memset(data, 0, size * sizeof(T));
+            memset(data, 0, RawSize());
+        }
+
+        SIMD_INLINE void Swap(const Array & array)
+        {
+            Simd::Swap((T*&)data, (T*&)(array.data));
+            Simd::Swap((size_t&)size, (size_t&)(array.size));
         }
 
         SIMD_INLINE T & operator[] (size_t i)
@@ -77,12 +90,19 @@ namespace Simd
         {
             return data[i];
         }
+
+        SIMD_INLINE size_t RawSize() const
+        {
+            return size * sizeof(T);
+        }
     };
 
+    typedef Array<int8_t> Array8i;
     typedef Array<uint8_t> Array8u;
     typedef Array<int16_t> Array16i;
     typedef Array<uint16_t> Array16u;
     typedef Array<int32_t> Array32i;
+    typedef Array<uint32_t> Array32u;
     typedef Array<float> Array32f;
 
 #if defined(__GNUC__) && __GNUC__ >= 6
@@ -90,8 +110,8 @@ namespace Simd
 #pragma GCC diagnostic ignored "-Wignored-attributes"
 #endif
 
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         typedef Array<__m128> Array128f;
     }
diff --git a/3rdparty/simdlib/Simd/SimdAvx1.h b/3rdparty/simdlib/Simd/SimdAvx1.h
old mode 100644
new mode 100755
index 25c070c459..48df913c02
--- a/3rdparty/simdlib/Simd/SimdAvx1.h
+++ b/3rdparty/simdlib/Simd/SimdAvx1.h
@@ -1,8 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
-*               2019-2019 Facundo Galan.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -22,8 +21,8 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#ifndef __SimdAvx1_h__
-#define __SimdAvx1_h__
+#ifndef __SimdAvx_h__
+#define __SimdAvx_h__
 
 #include "Simd/SimdDefs.h"
 
@@ -36,4 +35,4 @@ namespace Simd
     }
 #endif// SIMD_AVX_ENABLE
 }
-#endif//__SimdAvx1_h__
+#endif//__SimdAvx_h__
diff --git a/3rdparty/simdlib/Simd/SimdBaseRgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx1Cpu.cpp
similarity index 57%
rename from 3rdparty/simdlib/Simd/SimdBaseRgbaToGray.cpp
rename to 3rdparty/simdlib/Simd/SimdAvx1Cpu.cpp
index 22d37b17ee..9d9cbb29d3 100644
--- a/3rdparty/simdlib/Simd/SimdBaseRgbaToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx1Cpu.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,23 +21,46 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "Simd/SimdConversion.h"
+#include "Simd/SimdEnable.h"
+#include "Simd/SimdCpu.h"
+
+#if defined(_MSC_VER)
+#include <windows.h>
+#endif
 
 namespace Simd
 {
-    namespace Base
+#ifdef SIMD_AVX_ENABLE
+    namespace Avx
     {
-        void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride)
+        SIMD_INLINE bool SupportedByCPU()
         {
-            for (size_t row = 0; row < height; ++row)
+            return
+                Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) &&
+                Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::AVX);
+        }
+
+        SIMD_INLINE bool SupportedByOS()
+        {
+#if defined(_MSC_VER)
+            __try
             {
-                const uint8_t * pRgba = rgba + row*rgbaStride;
-                uint8_t * pGray = gray + row*grayStride;
-                for (const uint8_t *pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgba += 4)
-                {
-                    *pGray = RgbToGray(pRgba[0], pRgba[1], pRgba[2]);
-                }
+                __m256d value = _mm256_set1_pd(1.0);// try to execute of AVX instructions;
+                return true;
             }
+            __except (EXCEPTION_EXECUTE_HANDLER)
+            {
+                return false;
+            }
+#else
+            return true;
+#endif
+        }
+
+        bool GetEnable()
+        {
+            return SupportedByCPU() && SupportedByOS();
         }
     }
+#endif
 }
diff --git a/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp b/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp
old mode 100644
new mode 100755
index e409c17ff1..319c609408
--- a/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx1Resizer.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -42,7 +42,7 @@ namespace Simd
             float * pbx[2] = { _bx[0].data, _bx[1].data };
             int32_t prev = -2;
             size_t rsa = AlignLo(rs, Avx::F);
-            size_t rsh = AlignLo(rs, Sse::F);
+            size_t rsh = AlignLo(rs, Sse2::F);
             for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
             {
                 float fy1 = _ay[dy];
@@ -78,10 +78,10 @@ namespace Simd
                             __m256 m1 = _mm256_mul_ps(fx1, _mm256_shuffle_ps(s0145, s2367, 0xDD));
                             _mm256_store_ps(pb + dx, _mm256_add_ps(m0, m1));
                         }
-                        for (; dx < rsh; dx += Sse::F)
+                        for (; dx < rsh; dx += Sse2::F)
                         {
-                            __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]);
-                            __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]);
+                            __m128 s01 = Sse2::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]);
+                            __m128 s23 = Sse2::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]);
                             __m128 fx1 = _mm_load_ps(_ax.data + dx);
                             __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1);
                             __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88));
@@ -128,7 +128,7 @@ namespace Simd
                     __m256 m1 = _mm256_mul_ps(_mm256_load_ps(pbx[1] + dx), _fy1);
                     _mm256_storeu_ps(dst + dx, _mm256_add_ps(m0, m1));
                 }
-                for (; dx < rsh; dx += Sse::F)
+                for (; dx < rsh; dx += Sse2::F)
                 {
                     __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0));
                     __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1));
@@ -144,7 +144,7 @@ namespace Simd
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
         {
             ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m256));
-            if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp))
+            if (param.IsFloatBilinear())
                 return new ResizerFloatBilinear(param);
             else
                 return Sse41::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
diff --git a/3rdparty/simdlib/Simd/SimdAvx2.h b/3rdparty/simdlib/Simd/SimdAvx2.h
old mode 100644
new mode 100755
index 46d3b2d547..f5957b26c1
--- a/3rdparty/simdlib/Simd/SimdAvx2.h
+++ b/3rdparty/simdlib/Simd/SimdAvx2.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
 *               2019-2019 Facundo Galan.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -32,24 +32,22 @@ namespace Simd
 #ifdef SIMD_AVX2_ENABLE
     namespace Avx2
     {
+        void BgraToBgr(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* bgr, size_t bgrStride);
+
         void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride);
 
-        void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride);
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride);
+
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride);
 
         void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
             const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
         void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
-        void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha);
-
-        void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride);
-
         void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride);
 
-        void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride);
-
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
+        void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride);
 
         void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride);
 
@@ -87,6 +85,12 @@ namespace Simd
         void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
 
+        void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride);
+
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride);
+
         void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
     }
diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp
old mode 100644
new mode 100755
index b1f9ef8417..ffb4828e98
--- a/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2BgrToBgra.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -67,6 +67,8 @@ namespace Simd
                 BgrToBgra<false>(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
         }
 
+        //---------------------------------------------------------------------
+
         template <bool align> SIMD_INLINE void Bgr48pToBgra32(uint8_t * bgra,
             const uint8_t * blue, const uint8_t * green, const uint8_t * red, size_t offset, __m256i alpha)
         {
@@ -117,6 +119,45 @@ namespace Simd
             else
                 Bgr48pToBgra32<false>(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha);
         }
+
+        //---------------------------------------------------------------------
+
+        template <bool align> SIMD_INLINE void RgbToBgra(const uint8_t* rgb, uint8_t* bgra, __m256i alpha)
+        {
+            Store<align>((__m256i*)bgra + 0, RgbToBgra<false>(Load<align>((__m256i*)(rgb + 0)), alpha));
+            Store<align>((__m256i*)bgra + 1, RgbToBgra<false>(Load<false>((__m256i*)(rgb + 24)), alpha));
+            Store<align>((__m256i*)bgra + 2, RgbToBgra<false>(Load<false>((__m256i*)(rgb + 48)), alpha));
+            Store<align>((__m256i*)bgra + 3, RgbToBgra<true >(Load<align>((__m256i*)(rgb + 64)), alpha));
+        }
+
+        template <bool align> void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+
+            __m256i _alpha = _mm256_slli_si256(_mm256_set1_epi32(alpha), 3);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    RgbToBgra<align>(rgb + 3 * col, bgra + 4 * col, _alpha);
+                if (width != alignedWidth)
+                    RgbToBgra<false>(rgb + 3 * (width - A), bgra + 4 * (width - A), _alpha);
+                rgb += rgbStride;
+                bgra += bgraStride;
+            }
+        }
+
+        void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride))
+                RgbToBgra<true>(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+            else
+                RgbToBgra<false>(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+        }
     }
 #else
     // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToBgra.cpp.o) has no symbols
diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp
old mode 100644
new mode 100755
index d40b0f0cc6..7b922e7025
--- a/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2BgrToGray.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -45,7 +45,7 @@ namespace Simd
         {
             const __m256i lo = PackI32ToI16(BgraToGray32(bgra[0]), BgraToGray32(bgra[1]));
             const __m256i hi = PackI32ToI16(BgraToGray32(bgra[2]), BgraToGray32(bgra[3]));
-            return PackU16ToU8(lo, hi);
+            return PackI16ToU8(lo, hi);
         }
 
         template <bool align> SIMD_INLINE __m256i BgrToGray(const uint8_t * bgr)
@@ -84,6 +84,63 @@ namespace Simd
             else
                 BgrToGray<false>(bgr, width, height, bgrStride, gray, grayStride);
         }
+
+
+        //---------------------------------------------------------------------
+
+        const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
+
+        SIMD_INLINE __m256i RgbaToGray32(__m256i rgba)
+        {
+            const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF);
+            const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF);
+            const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_ROUND), _mm256_madd_epi16(r0b0, K16_RED_BLUE));
+            return _mm256_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT);
+        }
+
+        SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4])
+        {
+            const __m256i lo = PackI32ToI16(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
+            const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
+            return PackI16ToU8(lo, hi);
+        }
+
+        template <bool align> SIMD_INLINE __m256i RgbToGray(const uint8_t* rgb)
+        {
+            __m256i rgba[4];
+            rgba[0] = BgrToBgra<false>(Load<align>((__m256i*)(rgb + 0)), K32_01000000);
+            rgba[1] = BgrToBgra<false>(Load<false>((__m256i*)(rgb + 24)), K32_01000000);
+            rgba[2] = BgrToBgra<false>(Load<false>((__m256i*)(rgb + 48)), K32_01000000);
+            rgba[3] = BgrToBgra<true>(Load<align>((__m256i*)(rgb + 64)), K32_01000000);
+            return RgbaToGray(rgba);
+        }
+
+        template <bool align> void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    Store<align>((__m256i*)(gray + col), RgbToGray<align>(rgb + 3 * col));
+                if (width != alignedWidth)
+                    Store<false>((__m256i*)(gray + width - A), RgbToGray<false>(rgb + 3 * (width - A)));
+                rgb += rgbStride;
+                gray += grayStride;
+            }
+        }
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+        {
+            if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride))
+                RgbToGray<true>(rgb, width, height, rgbStride, gray, grayStride);
+            else
+                RgbToGray<false>(rgb, width, height, rgbStride, gray, grayStride);
+        }
     }
 #else
     // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToGray.cpp.o) has no symbols
diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp
old mode 100644
new mode 100755
index 2daae1e7df..a64ed8035e
--- a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2BgrToRgb.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -64,7 +64,7 @@ namespace Simd
                 _mm256_shuffle_epi8(p1, K8_SHFL_2P1)), _mm256_shuffle_epi8(p2, K8_SHFL_2P2)));
         }
 
-        template <bool align> void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
+        template <bool align> void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride)
         {
             assert(width >= A);
             if (align)
@@ -85,12 +85,12 @@ namespace Simd
             }
         }
 
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
+        void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride)
         {
             if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride))
-                BgrToRgb<true>(bgr, bgrStride, width, height, rgb, rgbStride);
+                BgrToRgb<true>(bgr, width, height, bgrStride, rgb, rgbStride);
             else
-                BgrToRgb<false>(bgr, bgrStride, width, height, rgb, rgbStride);
+                BgrToRgb<false>(bgr, width, height, bgrStride, rgb, rgbStride);
         }
     }
 #else
diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp
deleted file mode 100644
index a4f9efdb2f..0000000000
--- a/3rdparty/simdlib/Simd/SimdAvx2BgrToRgba.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdConversion.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX2_ENABLE
-    namespace Avx2
-    {
-        template <bool align> SIMD_INLINE void BgrToRgba(const uint8_t * bgr, uint8_t * rgba, __m256i alpha)
-        {
-            Store<align>((__m256i*)rgba + 0, BgrToRgba<false>(Load<align>((__m256i*)(bgr + 0)), alpha));
-            Store<align>((__m256i*)rgba + 1, BgrToRgba<false>(Load<false>((__m256i*)(bgr + 24)), alpha));
-            Store<align>((__m256i*)rgba + 2, BgrToRgba<false>(Load<false>((__m256i*)(bgr + 48)), alpha));
-            Store<align>((__m256i*)rgba + 3, BgrToRgba<true >(Load<align>((__m256i*)(bgr + 64)), alpha));
-        }
-
-        template <bool align> void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgba) && Aligned(rgbaStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            __m256i _alpha = _mm256_slli_si256(_mm256_set1_epi32(alpha), 3);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    BgrToRgba<align>(bgr + 3 * col, rgba + 4 * col, _alpha);
-                if (width != alignedWidth)
-                    BgrToRgba<false>(bgr + 3 * (width - A), rgba + 4 * (width - A), _alpha);
-                bgr += bgrStride;
-                rgba += rgbaStride;
-            }
-        }
-
-        void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha)
-        {
-            if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgrToRgba<true>(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-            else
-                BgrToRgba<false>(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToRgba.cpp.o) has no symbols
-    void dummy_SimdAvx2BgrToRgba(){};
-#endif//SIMD_AVX2_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp
new file mode 100755
index 0000000000..aac574d71c
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdAvx2BgraToBgr.cpp
@@ -0,0 +1,149 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdStore.h"
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdConst.h"
+
+namespace Simd
+{
+#ifdef SIMD_AVX2_ENABLE  
+    namespace Avx2
+    {
+        template <bool align> SIMD_INLINE __m256i BgraToBgr(const uint8_t* bgra)
+        {
+            __m256i _bgra = Load<align>((__m256i*)bgra);
+            return _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(_bgra, K8_SHUFFLE_BGRA_TO_BGR), K32_PERMUTE_BGRA_TO_BGR);
+        }
+
+        template <bool align> void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
+        {
+            assert(width >= F);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride));
+
+            size_t widthF = AlignLo(width, F);
+            if (width == widthF)
+                widthF -= F;
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < widthF; col += F)
+                    Store<false>((__m256i*)(bgr + 3 * col), BgraToBgr<align>(bgra + 4 * col));
+                if (width != widthF)
+                    Store24<false>(bgr + 3 * (width - F), BgraToBgr<false>(bgra + 4 * (width - F)));
+                bgra += bgraStride;
+                bgr += bgrStride;
+            }
+        }
+
+        void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride))
+                BgraToBgr<true>(bgra, width, height, bgraStride, bgr, bgrStride);
+            else
+                BgraToBgr<false>(bgra, width, height, bgraStride, bgr, bgrStride);
+        }
+
+        //---------------------------------------------------------------------
+
+        const __m256i K8_SHUFFLE_BGRA_TO_RGB = SIMD_MM256_SETR_EPI8(
+            0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1,
+            0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1);
+
+        template <bool align> SIMD_INLINE __m256i BgraToRgb(const uint8_t* bgra)
+        {
+            __m256i _bgra = Load<align>((__m256i*)bgra);
+            return _mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(_bgra, K8_SHUFFLE_BGRA_TO_RGB), K32_PERMUTE_BGRA_TO_BGR);
+        }
+
+        template <bool align> void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
+        {
+            assert(width >= F);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t widthF = AlignLo(width, F);
+            if (width == widthF)
+                widthF -= F;
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < widthF; col += F)
+                    Store<false>((__m256i*)(rgb + 3 * col), BgraToRgb<align>(bgra + 4 * col));
+                if (width != widthF)
+                    Store24<false>(rgb + 3 * (width - F), BgraToRgb<false>(bgra + 4 * (width - F)));
+                bgra += bgraStride;
+                rgb += rgbStride;
+            }
+        }
+
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride))
+                BgraToRgb<true>(bgra, width, height, bgraStride, rgb, rgbStride);
+            else
+                BgraToRgb<false>(bgra, width, height, bgraStride, rgb, rgbStride);
+        }
+
+        //---------------------------------------------------------------------
+
+        const __m256i K8_BGRA_TO_RGBA = SIMD_MM256_SETR_EPI8(
+            0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF,
+            0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF);
+
+        template <bool align> SIMD_INLINE void BgraToRgba(const uint8_t* bgra, uint8_t* rgba)
+        {
+            Store<align>((__m256i*)rgba, _mm256_shuffle_epi8(Load<align>((__m256i*)bgra), K8_BGRA_TO_RGBA));
+        }
+
+        template <bool align> void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride));
+
+            size_t size = width * 4;
+            size_t sizeA = AlignLo(size, A);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t i = 0; i < size; i += A)
+                    BgraToRgba<align>(bgra + i, rgba + i);
+                if (size != sizeA)
+                    BgraToRgba<false>(bgra + size - sizeA, rgba + size - sizeA);
+                bgra += bgraStride;
+                rgba += rgbaStride;
+            }
+        }
+
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride))
+                BgraToRgba<true>(bgra, width, height, bgraStride, rgba, rgbaStride);
+            else
+                BgraToRgba<false>(bgra, width, height, bgraStride, rgba, rgbaStride);
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp
old mode 100644
new mode 100755
index f203fcae78..7082801956
--- a/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2BgraToGray.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -46,7 +46,7 @@ namespace Simd
         {
             const __m256i lo = PackI32ToI16(BgraToGray32(bgra[0]), BgraToGray32(bgra[1]));
             const __m256i hi = PackI32ToI16(BgraToGray32(bgra[2]), BgraToGray32(bgra[3]));
-            return PackU16ToU8(lo, hi);
+            return PackI16ToU8(lo, hi);
         }
 
         template <bool align> SIMD_INLINE void Load(const uint8_t* p, __m256i a[4])
@@ -89,6 +89,58 @@ namespace Simd
             else
                 BgraToGray<false>(bgra, width, height, bgraStride, gray, grayStride);
         }
+
+        //---------------------------------------------------------------------
+
+        const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
+
+        SIMD_INLINE __m256i RgbaToGray32(__m256i rgba)
+        {
+            const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF);
+            const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF);
+            const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_0000), _mm256_madd_epi16(r0b0, K16_RED_BLUE));
+            return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT);
+        }
+
+        SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4])
+        {
+            const __m256i lo = PackI32ToI16(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
+            const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
+            return PackI16ToU8(lo, hi);
+        }
+
+        template <bool align> void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            __m256i a[4];
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                {
+                    Load<align>(rgba + 4 * col, a);
+                    Store<align>((__m256i*)(gray + col), RgbaToGray(a));
+                }
+                if (alignedWidth != width)
+                {
+                    Load<false>(rgba + 4 * (width - A), a);
+                    Store<false>((__m256i*)(gray + width - A), RgbaToGray(a));
+                }
+                rgba += rgbaStride;
+                gray += grayStride;
+            }
+        }
+
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+        {
+            if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride))
+                RgbaToGray<true>(rgba, width, height, rgbaStride, gray, grayStride);
+            else
+                RgbaToGray<false>(rgba, width, height, rgbaStride, gray, grayStride);
+        }
     }
 #else
     // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgraToGray.cpp.o) has no symbols
diff --git a/3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp b/3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp
deleted file mode 100644
index d64f184cbf..0000000000
--- a/3rdparty/simdlib/Simd/SimdAvx2BgraToRgba.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdConversion.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX2_ENABLE
-    namespace Avx2
-    {
-        template <bool align> SIMD_INLINE void BgraToRgba(const uint8_t * bgra, uint8_t * rgba)
-        {
-            Store<align>((__m256i*)rgba + 0, BgraToRgba(Load<align>((__m256i*)(bgra + 0))));
-            Store<align>((__m256i*)rgba + 1, BgraToRgba(Load<align>((__m256i*)(bgra + 32))));
-            Store<align>((__m256i*)rgba + 2, BgraToRgba(Load<align>((__m256i*)(bgra + 64))));
-            Store<align>((__m256i*)rgba + 3, BgraToRgba(Load<align>((__m256i*)(bgra + 96))));
-        }
-
-        template <bool align> void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    BgraToRgba<align>(bgra + 4 * col, rgba + 4 * col);
-                if (width != alignedWidth)
-                    BgraToRgba<false>(bgra + 4 * (width - A), rgba + 4 * (width - A));
-                bgra += bgraStride;
-                rgba += rgbaStride;
-            }
-        }
-
-        void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride)
-        {
-            if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride))
-                BgraToRgba<true>(bgra, width, height, bgraStride, rgba, rgbaStride);
-            else
-                BgraToRgba<false>(bgra, width, height, bgraStride, rgba, rgbaStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2BgrToRgba.cpp.o) has no symbols
-    void dummy_SimdAvx2BgraToRgba(){};
-#endif//SIMD_AVX2_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp b/3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp
new file mode 100644
index 0000000000..778b11803a
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdAvx2Cpu.cpp
@@ -0,0 +1,68 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdEnable.h"
+#include "Simd/SimdCpu.h"
+
+#if defined(_MSC_VER)
+#include <windows.h>
+#endif
+
+namespace Simd
+{
+#ifdef SIMD_AVX2_ENABLE
+    namespace Avx2
+    {
+        SIMD_INLINE bool SupportedByCPU()
+        {
+            return
+                Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) &&
+                Base::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX2) &&
+                Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::FMA) &&
+                Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::F16C);
+        }
+
+        SIMD_INLINE bool SupportedByOS()
+        {
+#if defined(_MSC_VER)
+            __try
+            {
+                __m256i value = _mm256_abs_epi8(_mm256_set1_epi8(1));// try to execute of AVX2 instructions;
+                return true;
+            }
+            __except (EXCEPTION_EXECUTE_HANDLER)
+            {
+                return false;
+            }
+#else
+            return true;
+#endif
+        }
+
+        bool GetEnable()
+        {
+            return SupportedByCPU() && SupportedByOS();
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp b/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp
old mode 100644
new mode 100755
index 762d0f37ba..2bf5741a35
--- a/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2Deinterleave.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -69,13 +69,15 @@ namespace Simd
                 DeinterleaveBgr<false>(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride);
         }
 
+        //---------------------------------------------------------------------
+
         const __m256i K8_SHUFFLE_BGRA = SIMD_MM256_SETR_EPI8(
             0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF,
             0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF);
 
         const __m256i K32_PERMUTE_BGRA = SIMD_MM256_SETR_EPI32(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7);
 
-        template <bool align> SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset)
+        template <bool align, bool alpha> SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset)
         {
             __m256i _bgra[4];
             _bgra[0] = _mm256_shuffle_epi8(Load<align>((__m256i*)bgra + 0), K8_SHUFFLE_BGRA);
@@ -93,39 +95,58 @@ namespace Simd
             __m256i rraa1 = _mm256_unpackhi_epi32(_bgra[2], _bgra[3]);
 
             Store<align>((__m256i*)(r + offset), _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(rraa0, rraa1), K32_PERMUTE_BGRA));
-            Store<align>((__m256i*)(a + offset), _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(rraa0, rraa1), K32_PERMUTE_BGRA));
+            if(alpha)
+                Store<align>((__m256i*)(a + offset), _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(rraa0, rraa1), K32_PERMUTE_BGRA));
         }
 
-        template <bool align> void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height,
-            uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride)
+        template <bool align> void DeinterleaveBgra(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height,
+            uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride)
         {
             assert(width >= A);
             if (align)
             {
                 assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride));
-                assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride));
+                assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL));
             }
 
             size_t alignedWidth = AlignLo(width, A);
 
-            for (size_t row = 0; row < height; ++row)
+            if (a)
             {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    DeinterleaveBgra<align>(bgra + col * 4, b, g, r, a, col);
-                if (width != alignedWidth)
-                    DeinterleaveBgra<false>(bgra + 4 * (width - A), b, g, r, a, width - A);
-                bgra += bgraStride;
-                b += bStride;
-                g += gStride;
-                r += rStride;
-                a += aStride;
+                for (size_t row = 0; row < height; ++row)
+                {
+                    for (size_t col = 0; col < alignedWidth; col += A)
+                        DeinterleaveBgra<align, true>(bgra + col * 4, b, g, r, a, col);
+                    if (width != alignedWidth)
+                        DeinterleaveBgra<false, true>(bgra + 4 * (width - A), b, g, r, a, width - A);
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
+                    a += aStride;
+                }
+            }
+            else
+            {
+                for (size_t row = 0; row < height; ++row)
+                {
+                    for (size_t col = 0; col < alignedWidth; col += A)
+                        DeinterleaveBgra<align, false>(bgra + col * 4, b, g, r, NULL, col);
+                    if (width != alignedWidth)
+                        DeinterleaveBgra<false, false>(bgra + 4 * (width - A), b, g, r, NULL, width - A);
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
+                }
             }
         }
 
-        void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height,
-            uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride)
+        void DeinterleaveBgra(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height,
+            uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride)
         {
-            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride))
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) &&
+                Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL))
                 DeinterleaveBgra<true>(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
             else
                 DeinterleaveBgra<false>(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
diff --git a/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp b/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp
old mode 100644
new mode 100755
index 243663a169..beefb55410
--- a/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2GaussianBlur.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -22,6 +22,7 @@
 * SOFTWARE.
 */
 #include "Simd/SimdMemory.h"
+#include "Simd/SimdLoadBlock.h"
 #include "Simd/SimdStore.h"
 #include "Simd/SimdGaussianBlur.h"
 #include "Simd/SimdExtract.h"
diff --git a/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp b/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp
old mode 100644
new mode 100755
index ca40f5a347..5a85a27334
--- a/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2Reduce.cpp
@@ -42,7 +42,7 @@ namespace Simd
                 _mm256_and_si256(_mm256_srli_si256(s01, 1), K16_00FF),
                 _mm256_and_si256(s11, K16_00FF),
                 _mm256_and_si256(_mm256_srli_si256(s11, 1), K16_00FF));
-            return PackU16ToU8(lo, hi);
+            return PackI16ToU8(lo, hi);
         }
 #else
         SIMD_INLINE __m256i Average16(const __m256i & s0, const __m256i & s1)
@@ -52,7 +52,7 @@ namespace Simd
 
         SIMD_INLINE __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11)
         {
-            return PackU16ToU8(Average16(s00, s10), Average16(s01, s11));
+            return PackI16ToU8(Average16(s00, s10), Average16(s01, s11));
         }
 #endif
 
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp
old mode 100644
new mode 100755
index c4ee30e989..d7caad1571
--- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray2x2.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2018 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -42,7 +42,7 @@ namespace Simd
                 _mm256_and_si256(_mm256_srli_si256(s01, 1), K16_00FF),
                 _mm256_and_si256(s11, K16_00FF),
                 _mm256_and_si256(_mm256_srli_si256(s11, 1), K16_00FF));
-            return PackU16ToU8(lo, hi);
+            return PackI16ToU8(lo, hi);
         }
 #else
         SIMD_INLINE __m256i Average16(const __m256i & s0, const __m256i & s1)
@@ -52,7 +52,7 @@ namespace Simd
 
         SIMD_INLINE __m256i Average8(const __m256i & s00, const __m256i & s01, const __m256i & s10, const __m256i & s11)
         {
-            return PackU16ToU8(Average16(s00, s10), Average16(s01, s11));
+            return PackI16ToU8(Average16(s00, s10), Average16(s01, s11));
         }
 #endif
 
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp
old mode 100644
new mode 100755
index 34b4a91ecb..71f36b978f
--- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray3x3.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -78,7 +78,7 @@ namespace Simd
 
         template <bool compensation> SIMD_INLINE __m256i ReduceRow(const __m256i lo[3], const __m256i hi[3])
         {
-            return PackU16ToU8(
+            return PackI16ToU8(
                 DivideBy16<compensation>(BinomialSum16(lo[0], lo[1], lo[2])),
                 DivideBy16<compensation>(BinomialSum16(hi[0], hi[1], hi[2])));
         }
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp
old mode 100644
new mode 100755
index bf732178ed..cea41815d3
--- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray4x4.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -119,7 +119,7 @@ namespace Simd
         {
             __m256i lo = ReduceRow16<align>(buffer, offset);
             __m256i hi = ReduceRow16<align>(buffer, offset + HA);
-            return PackU16ToU8(lo, hi);
+            return PackI16ToU8(lo, hi);
         }
 
         template <bool even> void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp
old mode 100644
new mode 100755
index 96771d8aee..fe2ebbd3cf
--- a/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2ReduceGray5x5.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -120,14 +120,14 @@ namespace Simd
         {
             const __m256i lo = MainRowX5x5<align, compensation>(buffer.dst + offset);
             const __m256i hi = MainRowX5x5<align, compensation>(buffer.dst + offset + HA);
-            return _mm256_and_si256(PackU16ToU8(lo, hi), K16_00FF);
+            return _mm256_and_si256(PackI16ToU8(lo, hi), K16_00FF);
         }
 
         template <bool align, bool compensation> SIMD_INLINE void MainRowX5x5(Buffer & buffer, size_t offset, uint8_t * dst)
         {
             __m256i lo = MainRowX5x5<align, compensation>(buffer, offset);
             __m256i hi = MainRowX5x5<align, compensation>(buffer, offset + A);
-            Store<false>((__m256i*)dst, PackU16ToU8(lo, hi));
+            Store<false>((__m256i*)dst, PackI16ToU8(lo, hi));
         }
 
         template <bool align, bool compensation> void ReduceGray5x5(
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp b/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp
old mode 100644
new mode 100755
index f00b174cb2..53c9cdc9f8
--- a/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2ResizeBilinear.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -273,7 +273,7 @@ namespace Simd
         {
             __m256i lo = InterpolateY<align>((__m256i*)bx0 + 0, (__m256i*)bx1 + 0, alpha);
             __m256i hi = InterpolateY<align>((__m256i*)bx0 + 1, (__m256i*)bx1 + 1, alpha);
-            Store<false>((__m256i*)dst, PackU16ToU8(lo, hi));
+            Store<false>((__m256i*)dst, PackI16ToU8(lo, hi));
         }
 
         template <size_t channelCount> void ResizeBilinear(
diff --git a/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp b/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp
old mode 100644
new mode 100755
index ab739b7aa9..d75c24989d
--- a/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp
+++ b/3rdparty/simdlib/Simd/SimdAvx2Resizer.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -23,6 +23,7 @@
 */
 #include "Simd/SimdMemory.h"
 #include "Simd/SimdResizer.h"
+#include "Simd/SimdResizerCommon.h"
 #include "Simd/SimdStore.h"
 #include "Simd/SimdSet.h"
 #include "Simd/SimdUpdate.h"
@@ -33,7 +34,7 @@ namespace Simd
     namespace Avx2
     {
         ResizerByteBilinear::ResizerByteBilinear(const ResParam & param)
-            : Ssse3::ResizerByteBilinear(param)
+            : Sse41::ResizerByteBilinear(param)
         {
         }
 
@@ -223,7 +224,7 @@ namespace Simd
         {
             __m256i lo = ResizerByteBilinearInterpolateY<align>((__m256i*)bx0 + 0, (__m256i*)bx1 + 0, alpha);
             __m256i hi = ResizerByteBilinearInterpolateY<align>((__m256i*)bx0 + 1, (__m256i*)bx1 + 1, alpha);
-            Store<false>((__m256i*)dst, PackU16ToU8(lo, hi));
+            Store<false>((__m256i*)dst, PackI16ToU8(lo, hi));
         }
 
         template<size_t N> void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride)
@@ -523,7 +524,7 @@ namespace Simd
             float * pbx[2] = { _bx[0].data, _bx[1].data };
             int32_t prev = -2;
             size_t rsa = AlignLo(rs, Avx::F);
-            size_t rsh = AlignLo(rs, Sse::F);
+            size_t rsh = AlignLo(rs, Sse2::F);
             for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
             {
                 float fy1 = _ay[dy];
@@ -560,10 +561,10 @@ namespace Simd
                             __m256 s1 = _mm256_shuffle_ps(s0145, s2367, 0xDD);
                             _mm256_store_ps(pb + dx, _mm256_fmadd_ps(s0, fx0, _mm256_mul_ps(s1, fx1)));
                         }
-                        for (; dx < rsh; dx += Sse::F)
+                        for (; dx < rsh; dx += Sse2::F)
                         {
-                            __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]);
-                            __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]);
+                            __m128 s01 = Sse2::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]);
+                            __m128 s23 = Sse2::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]);
                             __m128 fx1 = _mm_load_ps(_ax.data + dx);
                             __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1);
                             __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88));
@@ -625,7 +626,7 @@ namespace Simd
                     __m256 b1 = _mm256_load_ps(pbx[1] + dx);
                     _mm256_storeu_ps(dst + dx, _mm256_fmadd_ps(b0, _fy0, _mm256_mul_ps(b1, _fy1)));
                 }
-                for (; dx < rsh; dx += Sse::F)
+                for (; dx < rsh; dx += Sse2::F)
                 {
                     __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0));
                     __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1));
@@ -641,11 +642,11 @@ namespace Simd
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
         {
             ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m256i));
-            if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && dstX >= A)
+            if (param.IsByteBilinear() && dstX >= A)
                 return new ResizerByteBilinear(param);
-            else if (type == SimdResizeChannelByte && method == SimdResizeMethodArea)
+            else if (param.IsByteArea())
                 return new ResizerByteArea(param);
-            else if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp))
+            else if (param.IsFloatBilinear())
                 return new ResizerFloatBilinear(param);
             else
                 return Avx::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
diff --git a/3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp
deleted file mode 100644
index 1533d99dfb..0000000000
--- a/3rdparty/simdlib/Simd/SimdAvx2RgbToGray.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-#include "Simd/SimdConversion.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX2_ENABLE
-    namespace Avx2
-    {
-        const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
-        const __m256i K16_GREEN_ROUND = SIMD_MM256_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM);
-
-        SIMD_INLINE __m256i RgbaToGray32(__m256i rgba)
-        {
-            const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF);
-            const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF);
-            const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_ROUND), _mm256_madd_epi16(r0b0, K16_RED_BLUE));
-            return _mm256_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT);
-        }
-
-        SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4])
-        {
-            const __m256i lo = PackI32ToI16(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
-            const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
-            return PackU16ToU8(lo, hi);
-        }
-
-        template <bool align> SIMD_INLINE __m256i RgbToGray(const uint8_t * rgb)
-        {
-            __m256i rgba[4];
-            rgba[0] = BgrToBgra<false>(Load<align>((__m256i*)(rgb + 0)), K32_01000000);
-            rgba[1] = BgrToBgra<false>(Load<false>((__m256i*)(rgb + 24)), K32_01000000);
-            rgba[2] = BgrToBgra<false>(Load<false>((__m256i*)(rgb + 48)), K32_01000000);
-            rgba[3] = BgrToBgra<true>(Load<align>((__m256i*)(rgb + 64)), K32_01000000);
-            return RgbaToGray(rgba);
-        }
-
-        template <bool align> void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    Store<align>((__m256i*)(gray + col), RgbToGray<align>(rgb + 3 * col));
-                if (width != alignedWidth)
-                    Store<false>((__m256i*)(gray + width - A), RgbToGray<false>(rgb + 3 * (width - A)));
-                rgb += rgbStride;
-                gray += grayStride;
-            }
-        }
-
-        void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride)
-        {
-            if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride))
-                RgbToGray<true>(rgb, width, height, rgbStride, gray, grayStride);
-            else
-                RgbToGray<false>(rgb, width, height, rgbStride, gray, grayStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2RgbToGray.cpp.o) has no symbols
-    void dummy_SimdAvx2RgbToGray(){};
-#endif//SIMD_AVX2_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp
deleted file mode 100644
index d28cb39832..0000000000
--- a/3rdparty/simdlib/Simd/SimdAvx2RgbaToGray.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdConversion.h"
-
-namespace Simd
-{
-#ifdef SIMD_AVX2_ENABLE
-    namespace Avx2
-    {
-        const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
-        const __m256i K16_GREEN_0000 = SIMD_MM256_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, 0x0000);
-        const __m256i K32_ROUND_TERM = SIMD_MM256_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM);
-
-        SIMD_INLINE __m256i RgbaToGray32(__m256i rgba)
-        {
-            const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF);
-            const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF);
-            const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_0000), _mm256_madd_epi16(r0b0, K16_RED_BLUE));
-            return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT);
-        }
-
-        SIMD_INLINE __m256i RgbaToGray(__m256i rgba[4])
-        {
-            const __m256i lo = PackI32ToI16(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
-            const __m256i hi = PackI32ToI16(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
-            return PackU16ToU8(lo, hi);
-        }
-
-        template <bool align> SIMD_INLINE void Load(const uint8_t* p, __m256i a[4])
-        {
-            a[0] = Load<align>((__m256i*)p + 0);
-            a[1] = Load<align>((__m256i*)p + 1);
-            a[2] = Load<align>((__m256i*)p + 2);
-            a[3] = Load<align>((__m256i*)p + 3);
-        }
-
-        template <bool align> void RgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-            __m256i a[4];
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                {
-                    Load<align>(rgba + 4 * col, a);
-                    Store<align>((__m256i*)(gray + col), RgbaToGray(a));
-                }
-                if (alignedWidth != width)
-                {
-                    Load<false>(rgba + 4 * (width - A), a);
-                    Store<false>((__m256i*)(gray + width - A), RgbaToGray(a));
-                }
-                rgba += rgbaStride;
-                gray += grayStride;
-            }
-        }
-
-        void RgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride)
-        {
-            if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride))
-                RgbaToGray<true>(rgba, width, height, rgbaStride, gray, grayStride);
-            else
-                RgbaToGray<false>(rgba, width, height, rgbaStride, gray, grayStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdAvx2RgbaToGray.cpp.o) has no symbols
-    void dummy_SimdAvx2RgbaToGray(){};
-#endif// SIMD_AVX2_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdBase.h b/3rdparty/simdlib/Simd/SimdBase.h
old mode 100644
new mode 100755
index 57d654751e..998a7b7cbe
--- a/3rdparty/simdlib/Simd/SimdBase.h
+++ b/3rdparty/simdlib/Simd/SimdBase.h
@@ -38,7 +38,9 @@ namespace Simd
 
         void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride);
 
-        void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride);
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride);
+
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride);
 
         void BgrToBgra(const uint8_t * bgr, size_t size, uint8_t * bgra, bool fillAlpha, bool lastRow, uint8_t alpha);
 
@@ -47,15 +49,9 @@ namespace Simd
         void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
             const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
-        void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha);
-
-        void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride);
-
         void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride);
 
-        void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride);
-
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
+        void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride);
 
         void Copy(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, uint8_t * dst, size_t dstStride);
 
@@ -104,6 +100,12 @@ namespace Simd
         void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
 
+        void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride);
+
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride);
+
         void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
 
diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp
old mode 100644
new mode 100755
index b909ee9d20..b5b8140dbe
--- a/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseBgrToBgra.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -106,5 +106,23 @@ namespace Simd
                 bgra += bgraStride;
             }
         }
+
+        void RgbToBgra(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+        {
+            size_t rgbGap = rgbStride - width * 3;
+            size_t bgraGap = bgraStride - width * 4;
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < width; ++col, rgb += 3, bgra += 4)
+                {
+                    bgra[0] = rgb[2];
+                    bgra[1] = rgb[1];
+                    bgra[2] = rgb[0];
+                    bgra[3] = alpha;
+                }
+                rgb += rgbGap;
+                bgra += bgraGap;
+            }
+        }
     }
 }
diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp
old mode 100644
new mode 100755
index e6fa81ddb1..26f7bf171b
--- a/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseBgrToGray.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -39,5 +39,18 @@ namespace Simd
                 }
             }
         }
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+        {
+            for (size_t row = 0; row < height; ++row)
+            {
+                const uint8_t* pRgb = rgb + row * rgbStride;
+                uint8_t* pGray = gray + row * grayStride;
+                for (const uint8_t* pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgb += 3)
+                {
+                    *pGray = BgrToGray(pRgb[2], pRgb[1], pRgb[0]);
+                }
+            }
+        }
     }
 }
diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp
old mode 100644
new mode 100755
index d508115a64..ece4ffc97f
--- a/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseBgrToRgb.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@ namespace Simd
 {
     namespace Base
     {
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
+        void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride)
         {
             size_t size = width * 3;
             for (size_t row = 0; row < height; ++row)
diff --git a/3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp b/3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp
deleted file mode 100644
index b7003c067b..0000000000
--- a/3rdparty/simdlib/Simd/SimdBaseBgrToRgba.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdDefs.h"
-#include <algorithm>
-
-namespace Simd
-{
-    namespace Base
-    {
-        void BgrToRgba(const uint8_t *bgr, size_t size, uint8_t *rgba, bool fillAlpha, bool lastRow, uint8_t alpha)
-        {
-            if (fillAlpha)
-            {
-#ifdef SIMD_BIG_ENDIAN
-                const int32_t alphaMask = alpha;
-#else
-                const int32_t alphaMask = alpha << 24;
-#endif
-                for (size_t i = (lastRow ? 1 : 0); i < size; ++i, bgr += 3, rgba += 4)
-                {
-                    *(int32_t*)rgba = (*(int32_t*)bgr) | alphaMask;
-                    std::swap(rgba[0], rgba[2]);
-                }
-                if (lastRow)
-                {
-                    rgba[0] = bgr[2];
-                    rgba[1] = bgr[1];
-                    rgba[2] = bgr[0];
-                    rgba[3] = alpha;
-                }
-            }
-            else
-            {
-                for (size_t i = (lastRow ? 1 : 0); i < size; ++i, bgr += 3, rgba += 4)
-                {
-                    *(int32_t*)rgba = (*(int32_t*)bgr);
-                    std::swap(rgba[0], rgba[2]);
-                }
-                if (lastRow)
-                {
-                    rgba[0] = bgr[2];
-                    rgba[1] = bgr[1];
-                    rgba[2] = bgr[0];
-                }
-            }
-        }
-
-        void BgrToRgba(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *rgba, size_t bgraStride, uint8_t alpha)
-        {
-            for (size_t row = 1; row < height; ++row)
-            {
-                BgrToRgba(bgr, width, rgba, true, false, alpha);
-                bgr += bgrStride;
-                rgba += bgraStride;
-            }
-            BgrToRgba(bgr, width, rgba, true, true, alpha);
-        }
-    }
-}
diff --git a/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp
old mode 100644
new mode 100755
index 8d3b1bbc6c..6ee5d55355
--- a/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseBgraToBgr.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -51,5 +51,40 @@ namespace Simd
             }
             BgraToBgr(bgra, width, bgr, true);
         }
+
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
+        {
+            size_t bgraGap = bgraStride - width * 4;
+            size_t rgbGap = rgbStride - width * 3;
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < width; ++col, bgra += 4, rgb += 3)
+                {
+                    rgb[2] = bgra[0];
+                    rgb[1] = bgra[1];
+                    rgb[0] = bgra[2];
+                }
+                bgra += bgraGap;
+                rgb += rgbGap;
+            }
+        }
+
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
+        {
+            size_t bgraGap = bgraStride - width * 4;
+            size_t rgbaGap = rgbaStride - width * 4;
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < width; ++col, bgra += 4, rgba += 4)
+                {
+                    rgba[2] = bgra[0];
+                    rgba[1] = bgra[1];
+                    rgba[0] = bgra[2];
+                    rgba[3] = bgra[3];
+                }
+                bgra += bgraGap;
+                rgba += rgbaGap;
+            }
+        }
     }
 }
diff --git a/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp b/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp
old mode 100644
new mode 100755
index 3d855e749e..16fba3e7ce
--- a/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseBgraToGray.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -39,5 +39,18 @@ namespace Simd
                 }
             }
         }
+
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+        {
+            for (size_t row = 0; row < height; ++row)
+            {
+                const uint8_t* pRgba = rgba + row * rgbaStride;
+                uint8_t* pGray = gray + row * grayStride;
+                for (const uint8_t* pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgba += 4)
+                {
+                    *pGray = BgrToGray(pRgba[2], pRgba[1], pRgba[0]);
+                }
+            }
+        }
     }
 }
diff --git a/3rdparty/simdlib/Simd/SimdBaseCpu.cpp b/3rdparty/simdlib/Simd/SimdBaseCpu.cpp
new file mode 100644
index 0000000000..77fc5718df
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseCpu.cpp
@@ -0,0 +1,234 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdCpu.h"
+
+#include <vector>
+#include <thread>
+#include <sstream>
+#include <iostream>
+
+#if defined(_MSC_VER)
+
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <intrin.h>
+
+#elif defined(__GNUC__)
+#include <unistd.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
+#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
+#include <cpuid.h>
+#endif
+
+#if defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)
+#include <fcntl.h>
+#include <sys/auxv.h>
+#if defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)
+#include <asm/hwcap.h>
+#endif
+#endif
+
+#else
+# error Do not know how to detect CPU info
+#endif
+
+namespace Simd
+{
+    namespace Base
+    {
+#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
+        bool CheckBit(Cpuid::Level level, Cpuid::Register index, Cpuid::Bit bit)
+        {
+            unsigned int registers[4] = { 0, 0, 0, 0 };
+#if defined(_MSC_VER)
+            __cpuid((int*)registers, level);
+#elif (defined __GNUC__)
+            if (__get_cpuid_max(0, NULL) < level)
+                return false;
+            __cpuid_count(level, 0, 
+                registers[Cpuid::Eax], 
+                registers[Cpuid::Ebx], 
+                registers[Cpuid::Ecx], 
+                registers[Cpuid::Edx]);
+#else
+#error Do not know how to detect CPU info!
+#endif
+            return (registers[index] & bit) == bit;
+        }
+#endif//defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
+
+#if defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE))
+        bool CheckBit(int at, int bit)
+        {
+            bool result = false;
+            int file = ::open("/proc/self/auxv", O_RDONLY);
+            if (file < 0)
+                return false;
+            const ssize_t size = 64;
+            unsigned long buffer[size];
+            for (ssize_t count = size; count == size;)
+            {
+                count = ::read(file, buffer, sizeof(buffer)) / sizeof(unsigned long);
+                for (int i = 0; i < count; i += 2)
+                {
+                    if (buffer[i] == (unsigned)at)
+                    {
+                        result = !!(buffer[i + 1] & bit);
+                        count = 0;
+                    }
+                    if (buffer[i] == AT_NULL)
+                        count = 0;
+                }
+            }
+            ::close(file);
+            return result;
+        }
+#endif//defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE))
+
+        size_t CpuThreadNumber()
+        {
+            return std::thread::hardware_concurrency();
+        }
+
+#if defined(_MSC_VER)
+        typedef SYSTEM_LOGICAL_PROCESSOR_INFORMATION Info;
+
+        void GetLogicalProcessorInformation(std::vector<Info> & info)
+        {
+            DWORD size = 0;
+            ::GetLogicalProcessorInformation(0, &size); 
+            info.resize(size / sizeof(Info));
+            ::GetLogicalProcessorInformation(info.data(), &size);
+        }
+
+        size_t CpuSocketNumber()
+        {
+            std::vector<Info> info;
+            GetLogicalProcessorInformation(info);
+            size_t number = 0;
+            for (size_t i = 0; i < info.size(); ++i)
+                if (info[i].Relationship == ::RelationNumaNode)
+                    number++;
+            return number;
+        }            
+
+        size_t CpuCoreNumber()
+        {
+            std::vector<Info> info;
+            GetLogicalProcessorInformation(info);
+            size_t number = 0;
+            for (size_t i = 0; i < info.size(); ++i)
+                if (info[i].Relationship == ::RelationProcessorCore)
+                    number++;
+            return number;
+        }
+
+        size_t CpuCacheSize(size_t level)
+        {
+            std::vector<Info> info;
+            GetLogicalProcessorInformation(info);
+            for (size_t i = 0; i < info.size(); ++i)
+                if (info[i].Relationship == ::RelationCache && info[i].Cache.Level == level && (info[i].Cache.Type == ::CacheData || info[i].Cache.Type == CacheUnified))
+                    return info[i].Cache.Size;
+            return 0;
+        }
+#elif defined(__GNUC__)
+        size_t CpuSocketNumber()
+        {
+            uint32_t number = 0;
+            ::FILE * p = ::popen("lscpu -b -p=Socket | grep -v '^#' | sort -u | wc -l", "r");
+            if (p)
+            {
+                char buffer[PATH_MAX];
+                while (::fgets(buffer, PATH_MAX, p));
+                number = ::atoi(buffer);
+                ::pclose(p);
+            }
+            return number;
+        }
+
+        size_t CpuCoreNumber()
+        {
+            uint32_t number = 0;
+            ::FILE * p = ::popen("lscpu -b -p=Core | grep -v '^#' | sort -u | wc -l", "r");
+            if (p)
+            {
+                char buffer[PATH_MAX];
+                while (::fgets(buffer, PATH_MAX, p));
+                number = ::atoi(buffer);
+                ::pclose(p);
+            }
+            return number;
+        }
+
+        SIMD_INLINE size_t CorrectIfZero(size_t value, size_t otherwise)
+        {
+            return value ? value : otherwise;
+        }
+
+#if defined(_SC_LEVEL1_DCACHE_SIZE) && defined(_SC_LEVEL2_CACHE_SIZE) && defined(_SC_LEVEL3_CACHE_SIZE)
+        size_t CpuCacheSize(size_t level)
+        {
+            switch (level)
+            {
+            case 1: return CorrectIfZero(::sysconf(_SC_LEVEL1_DCACHE_SIZE), 32 * 1024);
+            case 2: return CorrectIfZero(::sysconf(_SC_LEVEL2_CACHE_SIZE), 256 * 1024);
+            case 3: return CorrectIfZero(::sysconf(_SC_LEVEL3_CACHE_SIZE), 2048 * 1024);
+            default:
+                return 0;
+            }
+        }
+#else
+        size_t CpuCacheSize(size_t level)
+        {
+            switch (level)
+            {
+            case 1: return 32 * 1024;
+            case 2: return 256 * 1024;
+            case 3: return 2048 * 1024;
+            default:
+                return 0;
+            }
+        }
+#endif
+
+#else
+#error This platform is unsupported!
+#endif
+    }
+
+    namespace Cpu
+    {
+        const size_t SOCKET_NUMBER = Base::CpuSocketNumber();
+        const size_t CORE_NUMBER = Base::CpuCoreNumber();
+        const size_t THREAD_NUMBER = Base::CpuThreadNumber();
+        const size_t L1_CACHE_SIZE = Base::CpuCacheSize(1);
+        const size_t L2_CACHE_SIZE = Base::CpuCacheSize(2);
+        const size_t L3_CACHE_SIZE = Base::CpuCacheSize(3);
+    }
+}
diff --git a/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp b/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp
old mode 100644
new mode 100755
index ecb22ed4b0..366ce1bc0e
--- a/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseDeinterleave.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -48,20 +48,39 @@ namespace Simd
         void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height,
             uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride)
         {
-            for (size_t row = 0; row < height; ++row)
+            if (a)
             {
-                for (size_t col = 0, offset = 0; col < width; ++col, offset += 4)
+                for (size_t row = 0; row < height; ++row)
                 {
-                    b[col] = bgra[offset + 0];
-                    g[col] = bgra[offset + 1];
-                    r[col] = bgra[offset + 2];
-                    a[col] = bgra[offset + 3];
+                    for (size_t col = 0, offset = 0; col < width; ++col, offset += 4)
+                    {
+                        b[col] = bgra[offset + 0];
+                        g[col] = bgra[offset + 1];
+                        r[col] = bgra[offset + 2];
+                        a[col] = bgra[offset + 3];
+                    }
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
+                    a += aStride;
+                }
+            }
+            else
+            {
+                for (size_t row = 0; row < height; ++row)
+                {
+                    for (size_t col = 0, offset = 0; col < width; ++col, offset += 4)
+                    {
+                        b[col] = bgra[offset + 0];
+                        g[col] = bgra[offset + 1];
+                        r[col] = bgra[offset + 2];
+                    }
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
                 }
-                bgra += bgraStride;
-                b += bStride;
-                g += gStride;
-                r += rStride;
-                a += aStride;
             }
         }
     }
diff --git a/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp b/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp
old mode 100644
new mode 100755
index 560b9d3cb9..1394d919e1
--- a/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseGaussianBlur.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
diff --git a/3rdparty/simdlib/Simd/SimdBaseResizer.cpp b/3rdparty/simdlib/Simd/SimdBaseResizer.cpp
old mode 100644
new mode 100755
index 9585a4f1ac..b8c08d2b92
--- a/3rdparty/simdlib/Simd/SimdBaseResizer.cpp
+++ b/3rdparty/simdlib/Simd/SimdBaseResizer.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -23,6 +23,7 @@
 */
 #include "Simd/SimdMemory.h"
 #include "Simd/SimdResizer.h"
+#include "Simd/SimdCopyPixel.h"
 
 namespace Simd
 {
@@ -132,8 +133,6 @@ namespace Simd
         ResizerByteArea::ResizerByteArea(const ResParam & param)
             : Resizer(param)
         {
-            double scale = Simd::Max(float(_param.srcW) / _param.dstW, float(_param.srcH) / _param.dstH);
-
             _ay.Resize(_param.dstH + 1);
             _iy.Resize(_param.dstH + 1);
             EstimateParams(_param.srcH, _param.dstH, Base::AREA_RANGE, _ay.data, _iy.data);
@@ -234,28 +233,173 @@ namespace Simd
 
         //---------------------------------------------------------------------
 
+        ResizerShortBilinear::ResizerShortBilinear(const ResParam& param)
+            : Resizer(param)
+        {
+            _ay.Resize(_param.dstH, false, _param.align);
+            _iy.Resize(_param.dstH, false, _param.align);
+            EstimateIndexAlpha(_param.srcH, _param.dstH, 1, _iy.data, _ay.data);
+            size_t rs = _param.dstW * _param.channels;
+            _ax.Resize(rs, false, _param.align);
+            _ix.Resize(rs, false, _param.align);
+            EstimateIndexAlpha(_param.srcW, _param.dstW, _param.channels, _ix.data, _ax.data);
+            _bx[0].Resize(rs, false, _param.align);
+            _bx[1].Resize(rs, false, _param.align);
+        }
+
+        void ResizerShortBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t* indices, float* alphas)
+        {
+            float scale = (float)srcSize / dstSize;
+            for (size_t i = 0; i < dstSize; ++i)
+            {
+                float alpha = (float)((i + 0.5f) * scale - 0.5f);
+                ptrdiff_t index = (ptrdiff_t)::floor(alpha);
+                alpha -= index;
+                if (index < 0)
+                {
+                    index = 0;
+                    alpha = 0;
+                }
+                if (index > (ptrdiff_t)srcSize - 2)
+                {
+                    index = srcSize - 2;
+                    alpha = 1;
+                }
+                for (size_t c = 0; c < channels; c++)
+                {
+                    size_t offset = i * channels + c;
+                    indices[offset] = (int32_t)(channels * index + c);
+                    alphas[offset] = alpha;
+                }
+            }
+        }
+
+        void ResizerShortBilinear::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
+        {
+            Run((const uint16_t*)src, srcStride / sizeof(uint16_t), (uint16_t*)dst, dstStride / sizeof(uint16_t));
+        }
+
+        template<size_t N> void ResizerShortBilinear::RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride)
+        {
+            size_t rs = _param.dstW * N;
+            float* pbx[2] = { _bx[0].data, _bx[1].data };
+            int32_t prev = -2;
+            for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
+            {
+                float fy1 = _ay[dy];
+                float fy0 = 1.0f - fy1;
+                int32_t sy = _iy[dy];
+                int32_t k = 0;
+                if (sy == prev)
+                    k = 2;
+                else if (sy == prev + 1)
+                {
+                    Swap(pbx[0], pbx[1]);
+                    k = 1;
+                }
+                prev = sy;
+                for (; k < 2; k++)
+                {
+                    float* pb = pbx[k];
+                    const uint16_t* ps = src + (sy + k) * srcStride;
+                    for (size_t dx = 0; dx < rs; dx++)
+                    {
+                        int32_t sx = _ix[dx];
+                        float fx = _ax[dx];
+                        pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + N] * fx;
+                    }
+                }
+                for (size_t dx = 0; dx < rs; dx++)
+                    dst[dx] = Round(pbx[0][dx] * fy0 + pbx[1][dx] * fy1);
+            }
+        }
+
+        template<size_t N> void ResizerShortBilinear::RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride)
+        {
+            size_t rs = _param.dstW * N;
+            for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
+            {
+                float fy1 = _ay[dy];
+                float fy0 = 1.0f - fy1;
+                int32_t sy = _iy[dy];
+                const uint16_t* ps0 = src + (sy + 0) * srcStride;
+                const uint16_t* ps1 = src + (sy + 1) * srcStride;
+                for (size_t dx = 0; dx < rs; dx++)
+                {
+                    int32_t sx = _ix[dx];
+                    float fx1 = _ax[dx];
+                    float fx0 = 1.0f - fx1;
+                    float r0 = ps0[sx] * fx0 + ps0[sx + N] * fx1;
+                    float r1 = ps1[sx] * fx0 + ps1[sx + N] * fx1;
+                    dst[dx] = Round(r0 * fy0 + r1 * fy1);
+                }
+            }
+        }
+
+        void ResizerShortBilinear::Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride)
+        {
+            bool sparse = _param.dstH * 2.0 <= _param.srcH;
+            switch (_param.channels)
+            {
+            case 1: sparse ? RunS<1>(src, srcStride, dst, dstStride) : RunB<1>(src, srcStride, dst, dstStride); return;
+            case 2: sparse ? RunS<2>(src, srcStride, dst, dstStride) : RunB<2>(src, srcStride, dst, dstStride); return;
+            case 3: sparse ? RunS<3>(src, srcStride, dst, dstStride) : RunB<3>(src, srcStride, dst, dstStride); return;
+            case 4: sparse ? RunS<4>(src, srcStride, dst, dstStride) : RunB<4>(src, srcStride, dst, dstStride); return;
+            default:
+                assert(0);
+            }
+        }
+
+        //---------------------------------------------------------------------
+
         ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param)
             : Resizer(param)
         {
             _ay.Resize(_param.dstH, false, _param.align);
             _iy.Resize(_param.dstH, false, _param.align);
-            EstimateIndexAlpha(_param.srcH, _param.dstH, 1, _param.method == SimdResizeMethodCaffeInterp, _iy.data, _ay.data);
+            EstimateIndexAlpha(_param.srcH, _param.dstH, 1, _iy.data, _ay.data);
             size_t rs = _param.dstW * _param.channels;
             _ax.Resize(rs, false, _param.align);
             _ix.Resize(rs, false, _param.align);
-            EstimateIndexAlpha(_param.srcW, _param.dstW, _param.channels, _param.method == SimdResizeMethodCaffeInterp, _ix.data, _ax.data);
+            EstimateIndexAlpha(_param.srcW, _param.dstW, _param.channels, _ix.data, _ax.data);
             _bx[0].Resize(rs, false, _param.align);
             _bx[1].Resize(rs, false, _param.align);
         }
 
-        void ResizerFloatBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, bool caffeInterp, int32_t * indices, float * alphas)
+        void ResizerFloatBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t * indices, float * alphas)
         {
-            if (caffeInterp)
+            if (_param.method == SimdResizeMethodBilinear)
+            {
+                float scale = (float)srcSize / dstSize;
+                for (size_t i = 0; i < dstSize; ++i)
+                {
+                    float alpha = (float)((i + 0.5f) * scale - 0.5f);
+                    ptrdiff_t index = (ptrdiff_t)::floor(alpha);
+                    alpha -= index;
+                    if (index < 0)
+                    {
+                        index = 0;
+                        alpha = 0;
+                    }
+                    if (index > (ptrdiff_t)srcSize - 2)
+                    {
+                        index = srcSize - 2;
+                        alpha = 1;
+                    }
+                    for (size_t c = 0; c < channels; c++)
+                    {
+                        size_t offset = i * channels + c;
+                        indices[offset] = (int32_t)(channels * index + c);
+                        alphas[offset] = alpha;
+                    }
+                }
+            }            
+            else if (_param.method == SimdResizeMethodCaffeInterp)
             {
                 float scale = dstSize > 1 ? float(srcSize - 1) / float(dstSize - 1) : 0.0f;
                 for (size_t i = 0; i < dstSize; ++i)
                 {
-                    float alpha = float(i)*scale;
+                    float alpha = float(i) * scale;
                     ptrdiff_t index = (ptrdiff_t)::floor(alpha);
                     alpha -= index;
                     if (index > (ptrdiff_t)srcSize - 2)
@@ -266,17 +410,17 @@ namespace Simd
                     for (size_t c = 0; c < channels; c++)
                     {
                         size_t offset = i * channels + c;
-                        indices[offset] = (int32_t)(channels*index + c);
+                        indices[offset] = (int32_t)(channels * index + c);
                         alphas[offset] = alpha;
                     }
                 }
             }
-            else
+            else if (_param.method == SimdResizeMethodInferenceEngineInterp)
             {
                 float scale = (float)srcSize / dstSize;
                 for (size_t i = 0; i < dstSize; ++i)
                 {
-                    float alpha = (float)((i + 0.5f)*scale - 0.5f);
+                    float alpha = float(i) * scale;
                     ptrdiff_t index = (ptrdiff_t)::floor(alpha);
                     alpha -= index;
                     if (index < 0)
@@ -284,7 +428,7 @@ namespace Simd
                         index = 0;
                         alpha = 0;
                     }
-                    if (index >(ptrdiff_t)srcSize - 2)
+                    if (index > (ptrdiff_t)srcSize - 2)
                     {
                         index = srcSize - 2;
                         alpha = 1;
@@ -292,11 +436,13 @@ namespace Simd
                     for (size_t c = 0; c < channels; c++)
                     {
                         size_t offset = i * channels + c;
-                        indices[offset] = (int32_t)(channels*index + c);
+                        indices[offset] = (int32_t)(channels * index + c);
                         alphas[offset] = alpha;
                     }
                 }
             }
+            else
+                assert(0);
         }
 
         void ResizerFloatBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride)
@@ -346,15 +492,80 @@ namespace Simd
 
         //---------------------------------------------------------------------
 
+        ResizerNearest::ResizerNearest(const ResParam& param)
+            : Resizer(param)
+        {
+            _pixelSize = _param.PixelSize();
+            _iy.Resize(_param.dstH, false, _param.align);
+            EstimateIndex(_param.srcH, _param.dstH, 1, _iy.data);
+            _ix.Resize(_param.dstW, false, _param.align);
+            EstimateIndex(_param.srcW, _param.dstW, _pixelSize, _ix.data);
+        }
+
+        void ResizerNearest::EstimateIndex(size_t srcSize, size_t dstSize, size_t pixelSize, int32_t* indices)
+        {
+            float scale = (float)srcSize / dstSize;
+            for (size_t i = 0; i < dstSize; ++i)
+            {
+                float alpha = (i + 0.5f) * scale;
+                int index = RestrictRange((int)::floor(alpha), 0, (int)srcSize - 1);
+                indices[i] = (int)(index * pixelSize);
+            }
+        }
+
+        void ResizerNearest::Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
+        {
+            for (size_t dy = 0; dy < _param.dstH; dy++)
+            {
+                const uint8_t* srcRow = src + _iy[dy] * srcStride;
+                for (size_t dx = 0, offset = 0; dx < _param.dstW; dx++, offset += _pixelSize)
+                    memcpy(dst + offset, srcRow + _ix[dx], _pixelSize);
+                dst += dstStride;
+            }
+        }
+
+        template<size_t N> void ResizerNearest::Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
+        {
+            for (size_t dy = 0; dy < _param.dstH; dy++)
+            {
+                const uint8_t * srcRow = src + _iy[dy] * srcStride;
+                for (size_t dx = 0, offset = 0; dx < _param.dstW; dx++, offset += N)
+                    CopyPixel<N>(srcRow + _ix[dx], dst + offset);
+                dst += dstStride;
+            }
+        }
+
+        void ResizerNearest::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
+        {
+            switch (_pixelSize)
+            {
+            case 1: Resize<1>(src, srcStride, dst, dstStride); break;
+            case 2: Resize<2>(src, srcStride, dst, dstStride); break;
+            case 3: Resize<3>(src, srcStride, dst, dstStride); break;
+            case 4: Resize<4>(src, srcStride, dst, dstStride); break;
+            case 6: Resize<6>(src, srcStride, dst, dstStride); break;
+            case 8: Resize<8>(src, srcStride, dst, dstStride); break;
+            case 12: Resize<12>(src, srcStride, dst, dstStride); break;
+            default:
+                Resize(src, srcStride, dst, dstStride);
+            }
+        }
+
+        //---------------------------------------------------------------------
+
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
         {
             ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(void*));
-            if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear)
+            if (param.IsByteBilinear())
                 return new ResizerByteBilinear(param);
-            else  if (type == SimdResizeChannelByte && method == SimdResizeMethodArea)
+            else if (param.IsByteArea())
                 return new ResizerByteArea(param);
-            else if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp))
+            else if (param.IsShortBilinear())
+                return new ResizerShortBilinear(param);
+            else if (param.IsFloatBilinear())
                 return new ResizerFloatBilinear(param);
+            else if (param.IsNearest())
+                return new ResizerNearest(param);
             else
                 return NULL;
         }
diff --git a/3rdparty/simdlib/Simd/SimdConfig.h b/3rdparty/simdlib/Simd/SimdConfig.h
old mode 100644
new mode 100755
index 8e328e2495..22c7fdd8e6
--- a/3rdparty/simdlib/Simd/SimdConfig.h
+++ b/3rdparty/simdlib/Simd/SimdConfig.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -24,18 +24,10 @@
 #ifndef __SimdConfig_h__
 #define __SimdConfig_h__
 
-//#define SIMD_SSE_DISABLE
-
 //#define SIMD_SSE2_DISABLE
 
-//#define SIMD_SSE3_DISABLE
-
-//#define SIMD_SSSE3_DISABLE
-
 //#define SIMD_SSE41_DISABLE
 
-//#define SIMD_SSE42_DISABLE
-
 //#define SIMD_AVX_DISABLE
 
 //#define SIMD_AVX2_DISABLE
diff --git a/3rdparty/simdlib/Simd/SimdConst.h b/3rdparty/simdlib/Simd/SimdConst.h
old mode 100644
new mode 100755
index 38e217d6ca..e18c1b90d0
--- a/3rdparty/simdlib/Simd/SimdConst.h
+++ b/3rdparty/simdlib/Simd/SimdConst.h
@@ -76,25 +76,13 @@ namespace Simd
         const int DIVISION_BY_9_FACTOR = (1 << DIVISION_BY_9_SHIFT) / 9;
     }
 
-#ifdef SIMD_SSE_ENABLE    
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE    
+    namespace Sse2
     {
         const size_t F = sizeof(__m128) / sizeof(float);
         const size_t DF = 2 * F;
         const size_t QF = 4 * F;
         const size_t HF = F / 2;
-    }
-#endif// SIMD_SSE_ENABLE
-
-#ifdef SIMD_SSE2_ENABLE    
-    namespace Sse2
-    {
-        using namespace Sse;
-#if defined(_MSC_VER) && _MSC_VER >= 1700  && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug
-        using Sse::F;
-        using Sse::DF;
-        using Sse::QF;
-#endif
 
         const size_t A = sizeof(__m128i);
         const size_t DA = 2 * A;
@@ -128,6 +116,7 @@ namespace Simd
         const __m128i K16_0020 = SIMD_MM_SET1_EPI16(0x0020);
         const __m128i K16_0080 = SIMD_MM_SET1_EPI16(0x0080);
         const __m128i K16_00FF = SIMD_MM_SET1_EPI16(0x00FF);
+        const __m128i K16_0101 = SIMD_MM_SET1_EPI16(0x0101);
         const __m128i K16_FF00 = SIMD_MM_SET1_EPI16(0xFF00);
 
         const __m128i K32_00000001 = SIMD_MM_SET1_EPI32(0x00000001);
@@ -138,6 +127,7 @@ namespace Simd
         const __m128i K32_0000FFFF = SIMD_MM_SET1_EPI32(0x0000FFFF);
         const __m128i K32_00010000 = SIMD_MM_SET1_EPI32(0x00010000);
         const __m128i K32_01000000 = SIMD_MM_SET1_EPI32(0x01000000);
+        const __m128i K32_00FF0000 = SIMD_MM_SET1_EPI32(0x00FF0000);
         const __m128i K32_00FFFFFF = SIMD_MM_SET1_EPI32(0x00FFFFFF);
         const __m128i K32_FFFFFF00 = SIMD_MM_SET1_EPI32(0xFFFFFF00);
 
@@ -162,22 +152,15 @@ namespace Simd
     }
 #endif// SIMD_SSE2_ENABLE
 
-#ifdef SIMD_SSE3_ENABLE    
-    namespace Sse3
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
     {
         using namespace Sse2;
 #if defined(_MSC_VER) && _MSC_VER >= 1700  && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug      
-        using Sse::F;
-        using Sse::DF;
-        using Sse::QF;
+        using Sse2::F;
+        using Sse2::DF;
+        using Sse2::QF;
 #endif
-    }
-#endif// SIMD_SSE3_ENABLE
-
-#ifdef SIMD_SSSE3_ENABLE    
-    namespace Ssse3
-    {
-        using namespace Sse3;
 
         const __m128i K8_SHUFFLE_GRAY_TO_BGR0 = SIMD_MM_SETR_EPI8(0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5);
         const __m128i K8_SHUFFLE_GRAY_TO_BGR1 = SIMD_MM_SETR_EPI8(0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xA, 0xA);
@@ -207,27 +190,8 @@ namespace Simd
         const __m128i K8_SHUFFLE_BGR1_TO_RED = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1);
         const __m128i K8_SHUFFLE_BGR2_TO_RED = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF);
     }
-#endif// SIMD_SSSE3_ENABLE
-
-#ifdef SIMD_SSE41_ENABLE    
-    namespace Sse41
-    {
-        using namespace Ssse3;
-#if defined(_MSC_VER) && _MSC_VER >= 1700  && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug      
-        using Sse::F;
-        using Sse::DF;
-        using Sse::QF;
-#endif
-    }
 #endif// SIMD_SSE41_ENABLE
 
-#ifdef SIMD_SSE42_ENABLE    
-    namespace Sse42
-    {
-        using namespace Sse41;
-    }
-#endif// SIMD_SSE42_ENABLE
-
 #ifdef SIMD_AVX_ENABLE    
     namespace Avx
     {
@@ -282,6 +246,7 @@ namespace Simd
         const __m256i K16_0020 = SIMD_MM256_SET1_EPI16(0x0020);
         const __m256i K16_0080 = SIMD_MM256_SET1_EPI16(0x0080);
         const __m256i K16_00FF = SIMD_MM256_SET1_EPI16(0x00FF);
+        const __m256i K16_0101 = SIMD_MM256_SET1_EPI16(0x0101);
         const __m256i K16_FF00 = SIMD_MM256_SET1_EPI16(0xFF00);
 
         const __m256i K32_00000001 = SIMD_MM256_SET1_EPI32(0x00000001);
@@ -292,6 +257,7 @@ namespace Simd
         const __m256i K32_0000FFFF = SIMD_MM256_SET1_EPI32(0x0000FFFF);
         const __m256i K32_00010000 = SIMD_MM256_SET1_EPI32(0x00010000);
         const __m256i K32_01000000 = SIMD_MM256_SET1_EPI32(0x01000000);
+        const __m256i K32_00FF0000 = SIMD_MM256_SET1_EPI32(0x00FF0000);
         const __m256i K32_FFFFFF00 = SIMD_MM256_SET1_EPI32(0xFFFFFF00);
 
         const __m256i K16_Y_ADJUST = SIMD_MM256_SET1_EPI16(Base::Y_ADJUST);
@@ -311,6 +277,8 @@ namespace Simd
 
         const __m256i K16_DIVISION_BY_9_FACTOR = SIMD_MM256_SET1_EPI16(Base::DIVISION_BY_9_FACTOR);
 
+        const __m256i K64_00000000FFFFFFFF = SIMD_MM256_SET2_EPI32(0xFFFFFFFF, 0);
+
         const __m256i K8_SHUFFLE_0 = SIMD_MM256_SETR_EPI8(
             0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
             0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0);
@@ -389,11 +357,11 @@ namespace Simd
             -1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1,
             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF);
 
-        const __m256i K8_BGRA_TO_BGR_SHUFFLE = SIMD_MM256_SETR_EPI8(
+        const __m256i K8_BGR_TO_BGRA_SHUFFLE = SIMD_MM256_SETR_EPI8(
             0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1,
             0x4, 0x5, 0x6, -1, 0x7, 0x8, 0x9, -1, 0xA, 0xB, 0xC, -1, 0xD, 0xE, 0xF, -1);
 
-        const __m256i K8_BGRA_TO_RGB_SHUFFLE = SIMD_MM256_SETR_EPI8(
+        const __m256i K8_RGB_TO_BGRA_SHUFFLE = SIMD_MM256_SETR_EPI8(
             0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1,
             0x6, 0x5, 0x4, -1, 0x9, 0x8, 0x7, -1, 0xC, 0xB, 0xA, -1, 0xF, 0xE, 0xD, -1);
 
@@ -402,6 +370,12 @@ namespace Simd
             0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF);
 
         const __m256i K32_TWO_UNPACK_PERMUTE = SIMD_MM256_SETR_EPI32(0, 2, 4, 6, 1, 3, 5, 7);
+
+        const __m256i K8_SHUFFLE_BGRA_TO_BGR = SIMD_MM256_SETR_EPI8(
+            0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1,
+            0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1);
+
+        const __m256i K32_PERMUTE_BGRA_TO_BGR = SIMD_MM256_SETR_EPI32(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, -1, -1);
     }
 #endif// SIMD_AVX2_ENABLE
 
@@ -459,8 +433,10 @@ namespace Simd
         const uint32x4_t K32_000000FF = SIMD_VEC_SET1_EPI32(0x000000FF);
         const uint32x4_t K32_0000FFFF = SIMD_VEC_SET1_EPI32(0x0000FFFF);
         const uint32x4_t K32_00010000 = SIMD_VEC_SET1_EPI32(0x00010000);
+        const uint32x4_t K32_00FF0000 = SIMD_VEC_SET1_EPI32(0x00FF0000);
         const uint32x4_t K32_01000000 = SIMD_VEC_SET1_EPI32(0x01000000);
         const uint32x4_t K32_08080800 = SIMD_VEC_SET1_EPI32(0x08080800);
+        const uint32x4_t K32_FF000000 = SIMD_VEC_SET1_EPI32(0xFF000000);
         const uint32x4_t K32_FFFFFF00 = SIMD_VEC_SET1_EPI32(0xFFFFFF00);
         const uint32x4_t K32_FFFFFFFF = SIMD_VEC_SET1_EPI32(0xFFFFFFFF);
         const uint32x4_t K32_0123 = SIMD_VEC_SETR_EPI32(0, 1, 2, 3);
diff --git a/3rdparty/simdlib/Simd/SimdConversion.h b/3rdparty/simdlib/Simd/SimdConversion.h
old mode 100644
new mode 100755
index e0601a9f61..5f8f0a0b9b
--- a/3rdparty/simdlib/Simd/SimdConversion.h
+++ b/3rdparty/simdlib/Simd/SimdConversion.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
 *               2014-2015 Antonenka Mikhail.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -38,16 +38,10 @@ namespace Simd
             return (BLUE_TO_GRAY_WEIGHT*blue + GREEN_TO_GRAY_WEIGHT * green +
                 RED_TO_GRAY_WEIGHT * red + BGR_TO_GRAY_ROUND_TERM) >> BGR_TO_GRAY_AVERAGING_SHIFT;
         }
-
-        SIMD_INLINE int RgbToGray(int red, int green, int blue)
-        {
-            return (BLUE_TO_GRAY_WEIGHT*blue + GREEN_TO_GRAY_WEIGHT * green +
-                RED_TO_GRAY_WEIGHT * red + BGR_TO_GRAY_ROUND_TERM) >> BGR_TO_GRAY_AVERAGING_SHIFT;
-        }
     }
 
-#ifdef SIMD_SSSE3_ENABLE    
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
     {
         template <int index> __m128i InterleaveBgr(__m128i blue, __m128i green, __m128i red);
 
@@ -99,7 +93,7 @@ namespace Simd
                         _mm_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_RED)));
         }
     }
-#endif//SIMD_SSSE3_ENABLE
+#endif
 
 #ifdef SIMD_AVX2_ENABLE    
     namespace Avx2
@@ -181,41 +175,24 @@ namespace Simd
 
         template<> SIMD_INLINE __m256i BgrToBgra<false>(const __m256i & bgr, const __m256i & alpha)
         {
-            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0x94), K8_BGRA_TO_BGR_SHUFFLE), alpha);
+            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0x94), K8_BGR_TO_BGRA_SHUFFLE), alpha);
         }
 
         template<> SIMD_INLINE __m256i BgrToBgra<true>(const __m256i & bgr, const __m256i & alpha)
         {
-            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0xE9), K8_BGRA_TO_BGR_SHUFFLE), alpha);
-        }
-
-        template<bool tail> __m256i BgrToRgba(const __m256i & bgr, const __m256i & alpha);
-
-        template<> SIMD_INLINE __m256i BgrToRgba<false>(const __m256i & bgr, const __m256i & alpha)
-        {
-            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0x94), K8_BGRA_TO_RGB_SHUFFLE), alpha);
-        }
-
-        template<> SIMD_INLINE __m256i BgrToRgba<true>(const __m256i & bgr, const __m256i & alpha)
-        {
-            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0xE9), K8_BGRA_TO_RGB_SHUFFLE), alpha);
-        }
-
-        SIMD_INLINE __m256i BgraToRgba(const __m256i & bgra)
-        {
-            return _mm256_shuffle_epi8(bgra, K8_BGRA_TO_RGBA_SHUFFLE);
+            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(bgr, 0xE9), K8_BGR_TO_BGRA_SHUFFLE), alpha);
         }
 
         template<bool tail> __m256i RgbToBgra(const __m256i & rgb, const __m256i & alpha);
 
         template<> SIMD_INLINE __m256i RgbToBgra<false>(const __m256i & rgb, const __m256i & alpha)
         {
-            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0x94), K8_BGRA_TO_RGB_SHUFFLE), alpha);
+            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0x94), K8_RGB_TO_BGRA_SHUFFLE), alpha);
         }
 
         template<> SIMD_INLINE __m256i RgbToBgra<true>(const __m256i & rgb, const __m256i & alpha)
         {
-            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0xE9), K8_BGRA_TO_RGB_SHUFFLE), alpha);
+            return _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(rgb, 0xE9), K8_RGB_TO_BGRA_SHUFFLE), alpha);
         }
     }
 #endif// SIMD_AVX2_ENABLE
@@ -236,8 +213,20 @@ namespace Simd
 
         template <int part> SIMD_INLINE int32x4_t BgrToU(uint16x8_t blue, uint16x8_t green, uint16x8_t red)
         {
-            return vshrq_n_s32(vmlal_s16(vmlal_s16(vmlal_s16(K32_BGR_TO_YUV_ROUND_TERM, (int16x4_t)Half<part>(blue), K16_BLUE_TO_U_WEIGHT),
-                (int16x4_t)Half<part>(green), K16_GREEN_TO_U_WEIGHT), (int16x4_t)Half<part>(red), K16_RED_TO_U_WEIGHT), Base::BGR_TO_YUV_AVERAGING_SHIFT);
+            return vshrq_n_s32(vmlal_s16(vmlal_s16(vmlal_s16(K32_BGR_TO_YUV_ROUND_TERM, vreinterpret_s16_u16(Half<part>(blue)), K16_BLUE_TO_U_WEIGHT),
+                vreinterpret_s16_u16(Half<part>(green)), K16_GREEN_TO_U_WEIGHT), vreinterpret_s16_u16(Half<part>(red)), K16_RED_TO_U_WEIGHT), Base::BGR_TO_YUV_AVERAGING_SHIFT);
+        }
+
+        SIMD_INLINE int16x8_t BgrToU(uint16x8_t blue, uint16x8_t green, uint16x8_t red)
+        {
+            return vaddq_s16(K16_UV_ADJUST, PackI32(BgrToU<0>(blue, green, red), BgrToU<1>(blue, green, red)));
+        }
+
+        SIMD_INLINE uint8x16_t BgrToU(uint8x16_t blue, uint8x16_t green, uint8x16_t red)
+        {
+            return PackSaturatedI16(
+                BgrToU(UnpackU8<0>(blue), UnpackU8<0>(green), UnpackU8<0>(red)),
+                BgrToU(UnpackU8<1>(blue), UnpackU8<1>(green), UnpackU8<1>(red)));
         }
     }
 #endif// SIMD_NEON_ENABLE
diff --git a/3rdparty/simdlib/Simd/SimdCopyPixel.h b/3rdparty/simdlib/Simd/SimdCopyPixel.h
old mode 100644
new mode 100755
index 6f113e4c39..a5539eba35
--- a/3rdparty/simdlib/Simd/SimdCopyPixel.h
+++ b/3rdparty/simdlib/Simd/SimdCopyPixel.h
@@ -56,6 +56,23 @@ namespace Simd
         {
             ((uint32_t*)dst)[0] = ((uint32_t*)src)[0];
         }
+
+        template<> SIMD_INLINE void CopyPixel<6>(const uint8_t* src, uint8_t* dst)
+        {
+            ((uint32_t*)dst)[0] = ((uint32_t*)src)[0];
+            ((uint16_t*)dst)[2] = ((uint16_t*)src)[2];
+        }
+
+        template<> SIMD_INLINE void CopyPixel<8>(const uint8_t* src, uint8_t* dst)
+        {
+            ((uint64_t*)dst)[0] = ((uint64_t*)src)[0];
+        }
+
+        template<> SIMD_INLINE void CopyPixel<12>(const uint8_t* src, uint8_t* dst)
+        {
+            ((uint64_t*)dst)[0] = ((uint64_t*)src)[0];
+            ((uint32_t*)dst)[2] = ((uint32_t*)src)[2];
+        }
     }
 }
 
diff --git a/3rdparty/simdlib/Simd/SimdCpu.h b/3rdparty/simdlib/Simd/SimdCpu.h
old mode 100644
new mode 100755
index adaf916462..b10d9fa98f
--- a/3rdparty/simdlib/Simd/SimdCpu.h
+++ b/3rdparty/simdlib/Simd/SimdCpu.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -28,8 +28,103 @@
 
 namespace Simd
 {
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
+    namespace Cpuid
+    {
+        // See http://www.sandpile.org/x86/cpuid.htm for additional information.
+        enum Level
+        {
+            Ordinary = 1,
+            Extended = 7,
+        };
+
+        enum Register
+        {
+            Eax = 0,
+            Ebx = 1,
+            Ecx = 2,
+            Edx = 3,
+        };
+
+        enum Bit
+        {
+            // Ordinary:
+            // Edx:
+            SSE = 1 << 25,
+            SSE2 = 1 << 26,
+
+            // Ecx:
+            SSE3 = 1 << 0,
+            SSSE3 = 1 << 9,
+            FMA = 1 << 12,
+            SSE41 = 1 << 19,
+            SSE42 = 1 << 20,
+            OSXSAVE = 1 << 27,
+            AVX = 1 << 28,
+            F16C = 1 << 29,
+
+            // Extended:
+            // Ebx:
+            AVX2 = 1 << 5,
+            AVX512F = 1 << 16,
+            AVX512DQ = 1 << 17,
+            AVX512CD = 1 << 28,
+            AVX512BW = 1 << 30,
+            AVX512VL = 1 << 31,
+
+            // Ecx:
+            AVX512VBMI = 1 << 1,
+            AVX512VNNI = 1 << 11,
+        };
+    }
+#endif//defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
+
+    namespace Cpu
+    {
+        extern const size_t SOCKET_NUMBER;
+        extern const size_t CORE_NUMBER;
+        extern const size_t THREAD_NUMBER;
+        extern const size_t L1_CACHE_SIZE;
+        extern const size_t L2_CACHE_SIZE;
+        extern const size_t L3_CACHE_SIZE;
+    }
+
+    namespace Base
+    {
+#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
+        bool CheckBit(Cpuid::Level level, Cpuid::Register index, Cpuid::Bit bit);
+#endif
+
+#if defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE))
+        bool CheckBit(int at, int bit);
+#endif
+
+        size_t CpuSocketNumber();
+
+        size_t CpuCoreNumber();
+
+        size_t CpuThreadNumber();
+
+        size_t CpuCacheSize(size_t level);
+
+        SIMD_INLINE size_t AlgCacheL1()
+        {
+            return Cpu::L1_CACHE_SIZE;
+        }
+
+        SIMD_INLINE size_t AlgCacheL2()
+        {
+            return Cpu::L3_CACHE_SIZE ? Cpu::L2_CACHE_SIZE : Cpu::L2_CACHE_SIZE * Cpu::SOCKET_NUMBER / Cpu::CORE_NUMBER;
+        }
+
+        SIMD_INLINE size_t AlgCacheL3()
+        {
+            return Cpu::L3_CACHE_SIZE ? Cpu::L3_CACHE_SIZE * Cpu::SOCKET_NUMBER / Cpu::CORE_NUMBER : Cpu::L2_CACHE_SIZE;
+        }
+    }
+
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         const unsigned int SCR_FTZ = 1 << 15;
         const unsigned int SCR_DAZ = 1 << 6;
diff --git a/3rdparty/simdlib/Simd/SimdDefs.h b/3rdparty/simdlib/Simd/SimdDefs.h
old mode 100644
new mode 100755
index c2b9274ed4..97d8f06ad6
--- a/3rdparty/simdlib/Simd/SimdDefs.h
+++ b/3rdparty/simdlib/Simd/SimdDefs.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -37,10 +37,24 @@
 #include <math.h>
 #include <cmath>
 
+#if defined(SIMD_SSE2_DISABLE) && !defined(SIMD_SSE41_DISABLE)
+#define SIMD_SSE41_DISABLE
+#endif
+
+#if defined(SIMD_SSE41_DISABLE) && !defined(SIMD_AVX_DISABLE)
+#define SIMD_AVX_DISABLE
+#endif
+
+#if defined(SIMD_AVX_DISABLE) && !defined(SIMD_AVX2_DISABLE)
+#define SIMD_AVX2_DISABLE
+#endif
+
 #if defined(_MSC_VER) && defined(_MSC_FULL_VER)
 
 #define SIMD_ALIGNED(x) __declspec(align(x))
 
+#define SIMD_NOINLINE __declspec(noinline)
+
 #ifdef _M_IX86
 #define SIMD_X86_ENABLE
 #endif
@@ -55,30 +69,14 @@
 
 #if defined(SIMD_X64_ENABLE) || defined(SIMD_X86_ENABLE)
 
-#if !defined(SIMD_SSE_DISABLE) && _MSC_VER >= 1200
-#define SIMD_SSE_ENABLE
-#endif
-
 #if !defined(SIMD_SSE2_DISABLE) && _MSC_VER >= 1300
 #define SIMD_SSE2_ENABLE
 #endif
 
-#if !defined(SIMD_SSE3_DISABLE) && _MSC_VER >= 1500
-#define SIMD_SSE3_ENABLE
-#endif
-
-#if !defined(SIMD_SSSE3_DISABLE) && _MSC_VER >= 1500
-#define SIMD_SSSE3_ENABLE
-#endif
-
 #if !defined(SIMD_SSE41_DISABLE) && _MSC_VER >= 1500
 #define SIMD_SSE41_ENABLE
 #endif
 
-#if !defined(SIMD_SSE42_DISABLE) && _MSC_VER >= 1500
-#define SIMD_SSE42_ENABLE
-#endif
-
 #if !defined(SIMD_AVX_DISABLE) && _MSC_FULL_VER >= 160040219
 #define SIMD_AVX_ENABLE
 #endif
@@ -88,7 +86,7 @@
 #endif
 
 #if defined(NDEBUG) && _MSC_VER >= 1700 && _MSC_VER < 1900
-#define SIMD_MADDUBS_ERROR // Visual Studio 2012/2013 release mode compiler bug in function _mm256_maddubs_epi16:
+#define SIMD_MADDUBS_ERROR // Visual Studio 2012/2013 release mode compiler bug in function _mm256_maddubs_epi16.
 #endif
 
 #if defined(NDEBUG) && _MSC_VER == 1914
@@ -123,6 +121,8 @@
 
 #define SIMD_ALIGNED(x) __attribute__ ((aligned(x)))
 
+#define SIMD_NOINLINE __attribute__ ((noinline))
+
 #ifdef __i386__
 #define SIMD_X86_ENABLE
 #endif
@@ -159,36 +159,16 @@
 #define SIMD_ARM64_ENABLE
 #endif
 
-#if defined __mips__
-#define SIMD_MIPS_ENABLE
-#endif
-
 #if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
 
-#if !defined(SIMD_SSE_DISABLE) && defined(__SSE__)
-#define SIMD_SSE_ENABLE
-#endif
-
-#if !defined(SIMD_SSE2_DISABLE) && defined(__SSE2__)
+#if !defined(SIMD_SSE2_DISABLE) && defined(__SSE__) && defined(__SSE2__)
 #define SIMD_SSE2_ENABLE
 #endif
 
-#if !defined(SIMD_SSE3_DISABLE) && defined(__SSE3__)
-#define SIMD_SSE3_ENABLE
-#endif
-
-#if !defined(SIMD_SSSE3_DISABLE) && defined(__SSSE3__)
-#define SIMD_SSSE3_ENABLE
-#endif
-
-#if !defined(SIMD_SSE41_DISABLE) && defined(__SSE4_1__)
+#if !defined(SIMD_SSE41_DISABLE) && defined(__SSE3__) && defined(__SSSE3__) && defined(__SSE4_1__) && defined(__SSE4_2__)
 #define SIMD_SSE41_ENABLE
 #endif
 
-#if !defined(SIMD_SSE42_DISABLE) && defined(__SSE4_2__)
-#define SIMD_SSE42_ENABLE
-#endif
-
 #if !defined(SIMD_AVX_DISABLE) && defined(__AVX__)
 #define SIMD_AVX_ENABLE
 #endif
@@ -239,27 +219,11 @@
 
 #endif
 
-#ifdef SIMD_SSE_ENABLE
-#include <xmmintrin.h>
-#endif
-
 #ifdef SIMD_SSE2_ENABLE
 #include <emmintrin.h>
 #endif
 
-#ifdef SIMD_SSE3_ENABLE
-# include <pmmintrin.h>
-#endif
-
-#ifdef SIMD_SSSE3_ENABLE
-#include <tmmintrin.h>
-#endif
-
 #ifdef SIMD_SSE41_ENABLE
-#include <smmintrin.h>
-#endif
-
-#ifdef SIMD_SSE42_ENABLE
 #include <nmmintrin.h>
 #endif
 
@@ -273,10 +237,10 @@
 
 #if defined(SIMD_AVX_ENABLE) || defined(SIMD_AVX2_ENABLE)
 #define SIMD_ALIGN 32
-#elif defined(SIMD_SSE_ENABLE) || defined(SIMD_SSE2_ENABLE) || defined(SIMD_SSE3_ENABLE)  || defined(SIMD_SSSE3_ENABLE) || defined(SIMD_SSE41_ENABLE) || defined(SIMD_SSE42_ENABLE) \
+#elif defined(SIMD_SSE2_ENABLE) || defined(SIMD_SSE41_ENABLE) \
     || defined(SIMD_NEON_ENABLE)
 #define SIMD_ALIGN 16
-#elif defined (SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+#elif defined (SIMD_X64_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM64_ENABLE)
 #define SIMD_ALIGN 8
 #else
 #define SIMD_ALIGN 4
diff --git a/3rdparty/simdlib/Simd/SimdEnable.h b/3rdparty/simdlib/Simd/SimdEnable.h
old mode 100644
new mode 100755
index 6c79eb0d94..a501daf8ad
--- a/3rdparty/simdlib/Simd/SimdEnable.h
+++ b/3rdparty/simdlib/Simd/SimdEnable.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -62,455 +62,74 @@
 
 namespace Simd
 {
-#if defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
-    namespace Cpuid
-    {
-        // See http://www.sandpile.org/x86/cpuid.htm for additional information.
-        enum Level
-        {
-            Ordinary = 1,
-            Extended = 7,
-        };
-
-        enum Register
-        {
-            Eax = 0,
-            Ebx = 1,
-            Ecx = 2,
-            Edx = 3,
-        };
-
-        enum Bit
-        {
-            // Ordinary:
-            // Edx:
-            SSE = 1 << 25,
-            SSE2 = 1 << 26,
-
-            // Ecx:
-            SSE3 = 1 << 0,
-            SSSE3 = 1 << 9,
-            FMA = 1 << 12,
-            SSE41 = 1 << 19,
-            SSE42 = 1 << 20,
-            OSXSAVE = 1 << 27,
-            AVX = 1 << 28,
-            F16C = 1 << 29,
-
-            // Extended:
-            // Ebx:
-            AVX2 = 1 << 5,
-            AVX512F = 1 << 16,
-            AVX512BW = 1 << 30,
-
-            // Ecx:
-            AVX512VBMI = 1 << 1,
-        };
-
-        SIMD_INLINE bool CheckBit(Level level, Register index, Bit bit)
-        {
-            unsigned int registers[4] = { 0, 0, 0, 0 };
-#if defined(_MSC_VER)
-            __cpuid((int*)registers, level);
-#elif (defined __GNUC__)
-            if (__get_cpuid_max(0, NULL) < level)
-                return false;
-            __cpuid_count(level, 0, registers[Eax], registers[Ebx], registers[Ecx], registers[Edx]);
-#else
-#error Do not know how to detect CPU info!
-#endif
-            return (registers[index] & bit) == bit;
-        }
-    }
-#endif//defined(SIMD_X86_ENABLE) || defined(SIMD_X64_ENABLE)
-
-#if !defined(__APPLE__) // not macOS, iOS
-#if defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE))
-    namespace CpuInfo
-    {
-        SIMD_INLINE bool CheckBit(int at, int bit)
-        {
-            bool result = false;
-            int file = ::open("/proc/self/auxv", O_RDONLY);
-            if (file < 0)
-                return false;
-            const ssize_t size = 64;
-            unsigned long buffer[size];
-            for (ssize_t count = size; count == size;)
-            {
-                count = ::read(file, buffer, sizeof(buffer)) / sizeof(unsigned long);
-                for (int i = 0; i < count; i += 2)
-                {
-                    if (buffer[i] == (unsigned)at)
-                    {
-                        result = !!(buffer[i + 1] & bit);
-                        count = 0;
-                    }
-                    if (buffer[i] == AT_NULL)
-                        count = 0;
-                }
-            }
-            ::close(file);
-            return result;
-        }
-    }
-#endif//defined(__GNUC__) && (defined(SIMD_PPC_ENABLE) || defined(SIMD_PPC64_ENABLE) || defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE))
-#endif//(TARGET_OS_IOS == 0) not iOS
-
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
-    {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Edx, Cpuid::SSE);
-        }
-
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                __m128 value = _mm_set1_ps(1.0f);// try to execute of SSE instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
-    }
-#endif
-
 #ifdef SIMD_SSE2_ENABLE
     namespace Sse2
     {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Edx, Cpuid::SSE2);
-        }
+        bool GetEnable();
 
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                __m128d value = _mm_set1_pd(1.0);// try to execute of SSE2 instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
-    }
-#endif
-
-#ifdef SIMD_SSE3_ENABLE
-    namespace Sse3
-    {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE3);
-        }
-
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                __m128 value = _mm_hadd_ps(_mm_set1_ps(1.0f), _mm_set1_ps(2.0f)); //try to execute of SSE3 instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
-    }
-#endif
-
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSSE3);
-        }
-
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                __m128i value = _mm_abs_epi8(_mm_set1_epi8(-1)); //try to execute of SSSE3 instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
+        const bool Enable = GetEnable();
     }
 #endif
 
 #ifdef SIMD_SSE41_ENABLE
     namespace Sse41
     {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE41);
-        }
-
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                int value = _mm_testz_si128(_mm_set1_epi8(0), _mm_set1_epi8(-1)); // try to execute of SSE41 instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
-    }
-#endif
-
-#ifdef SIMD_SSE42_ENABLE
-    namespace Sse42
-    {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE42);
-        }
+        bool GetEnable();
 
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                uint32_t value = _mm_crc32_u8(0, 1); // try to execute of SSE42 instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
+        const bool Enable = GetEnable();
     }
 #endif
 
 #ifdef SIMD_AVX_ENABLE
     namespace Avx
     {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return
-                Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) &&
-                Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::AVX);
-        }
-
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                __m256d value = _mm256_set1_pd(1.0);// try to execute of AVX instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
+        bool GetEnable();
 
-        const bool Enable = SupportedByCPU() && SupportedByOS();
+        const bool Enable = GetEnable();
     }
 #endif
 
 #ifdef SIMD_AVX2_ENABLE
     namespace Avx2
     {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-            return
-                Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::OSXSAVE) &&
-                Cpuid::CheckBit(Cpuid::Extended, Cpuid::Ebx, Cpuid::AVX2) &&
-                Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::FMA) &&
-                Cpuid::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::F16C);
-        }
+        bool GetEnable();
 
-        SIMD_INLINE bool SupportedByOS()
-        {
-#if defined(_MSC_VER)
-            __try
-            {
-                __m256i value = _mm256_abs_epi8(_mm256_set1_epi8(1));// try to execute of AVX2 instructions;
-                return true;
-            }
-            __except (EXCEPTION_EXECUTE_HANDLER)
-            {
-                return false;
-            }
-#else
-            return true;
-#endif
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
+        const bool Enable = GetEnable();
     }
 #endif
 
 #ifdef SIMD_NEON_ENABLE
     namespace Neon
     {
-        SIMD_INLINE bool SupportedByCPU()
-        {
-#if defined(_MSC_VER)
-            return true;
-#elif defined(__GNUC__)
-#if defined(SIMD_ARM64_ENABLE) || (TARGET_OS_IOS != 0) // iOS
-            return true;
-#else
-            return CpuInfo::CheckBit(AT_HWCAP, HWCAP_NEON);
-#endif
-#else
-#error Do not know how to detect NEON support!
-#endif
-        }
+        bool GetEnable();
 
-        SIMD_INLINE bool SupportedByOS()
-        {
-            return true;
-        }
-
-        const bool Enable = SupportedByCPU() && SupportedByOS();
+        const bool Enable = GetEnable();
     }
 #endif
-
-    SIMD_INLINE size_t Alignment()
-    {
-#ifdef SIMD_AVX2_ENABLE
-        if (Avx2::Enable)
-            return sizeof(__m256i);
-        else
-#endif
-#ifdef SIMD_AVX_ENABLE
-        if (Avx::Enable)
-            return sizeof(__m256);
-        else
-#endif
-#ifdef SIMD_SSE41_ENABLE
-        if (Sse41::Enable)
-            return sizeof(__m128i);
-        else
-#endif
-#ifdef SIMD_SSSE3_ENABLE
-        if (Ssse3::Enable)
-            return sizeof(__m128i);
-        else
-#endif
-#ifdef SIMD_SSE2_ENABLE
-        if (Sse2::Enable)
-            return sizeof(__m128i);
-        else
-#endif
-#ifdef SIMD_SSE_ENABLE
-        if (Sse::Enable)
-            return sizeof(__m128);
-        else
-#endif
-#ifdef SIMD_NEON_ENABLE
-        if (Neon::Enable)
-            return sizeof(uint8x16_t);
-        else
-#endif
-            return sizeof(void *);
-    }
-
-    const size_t ALIGNMENT = Alignment();
 }
 
 #define SIMD_BASE_FUNC(func) Simd::Base::func
 
-#ifdef SIMD_SSE_ENABLE
-#define SIMD_SSE_FUNC(func) Simd::Sse::Enable ? Simd::Sse::func :
-#else
-#define SIMD_SSE_FUNC(func)
-#endif
-
 #ifdef SIMD_SSE2_ENABLE
-#define SIMD_SSE2_FUNC(func) Simd::Sse2::Enable ? Simd::Sse2::func :
-#else
-#define SIMD_SSE2_FUNC(func)
-#endif
-
-#ifdef SIMD_SSE3_ENABLE
-#define SIMD_SSE3_FUNC(func) Simd::Sse3::Enable ? Simd::Sse3::func :
+#define SIMD_SSE2_FUNC(func) Simd::Sse2::Enable ? Simd::Sse2::func : 
 #else
-#define SIMD_SSE3_FUNC(func)
-#endif
-
-#ifdef SIMD_SSSE3_ENABLE
-#define SIMD_SSSE3_FUNC(func) Simd::Ssse3::Enable ? Simd::Ssse3::func :
-#else
-#define SIMD_SSSE3_FUNC(func)
+#define SIMD_SSE2_FUNC(func) 
 #endif
 
 #ifdef SIMD_SSE41_ENABLE
-#define SIMD_SSE41_FUNC(func) Simd::Sse41::Enable ? Simd::Sse41::func :
-#else
-#define SIMD_SSE41_FUNC(func)
-#endif
-
-#ifdef SIMD_SSE42_ENABLE
-#define SIMD_SSE42_FUNC(func) Simd::Sse42::Enable ? Simd::Sse42::func :
+#define SIMD_SSE41_FUNC(func) Simd::Sse41::Enable ? Simd::Sse41::func : 
 #else
-#define SIMD_SSE42_FUNC(func)
+#define SIMD_SSE41_FUNC(func) 
 #endif
 
 #ifdef SIMD_AVX_ENABLE
-#define SIMD_AVX_FUNC(func) Simd::Avx::Enable ? Simd::Avx::func :
+#define SIMD_AVX_FUNC(func) Simd::Avx::Enable ? Simd::Avx::func : 
 #else
 #define SIMD_AVX_FUNC(func)
 #endif
 
 #ifdef SIMD_AVX2_ENABLE
-#define SIMD_AVX2_FUNC(func) Simd::Avx2::Enable ? Simd::Avx2::func :
+#define SIMD_AVX2_FUNC(func) Simd::Avx2::Enable ? Simd::Avx2::func : 
 #else
 #define SIMD_AVX2_FUNC(func)
 #endif
diff --git a/3rdparty/simdlib/Simd/SimdExp.h b/3rdparty/simdlib/Simd/SimdExp.h
old mode 100644
new mode 100755
index 3bfbc3f8f5..1600275b23
--- a/3rdparty/simdlib/Simd/SimdExp.h
+++ b/3rdparty/simdlib/Simd/SimdExp.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -34,6 +34,11 @@ namespace Simd
         {
             return ::expf(value);
         }
+
+        SIMD_INLINE float Log(float value)
+        {
+            return ::logf(value);
+        }
     }
 
 #ifdef SIMD_SSE2_ENABLE    
@@ -107,20 +112,20 @@ namespace Simd
                 __m128 exp = Exp2(_mm_mul_ps(_k, value));
                 __m128 neg = _mm_mul_ps(alpha, _mm_sub_ps(exp, _1_0));
                 __m128 mask = _mm_cmpgt_ps(_mm_setzero_ps(), value);
-                return Sse::Combine(mask, neg, value);
+                return Combine(mask, neg, value);
             }
         };
 
         namespace Detail
         {
-            SIMD_INLINE __m128 Poly5(__m128 x)
+            SIMD_INLINE __m128 Poly5(__m128 x, float a, float b, float c, float d, float e, float f)
             {
-                __m128 p = _mm_set1_ps(1.8775767e-3f);
-                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(8.9893397e-3f));
-                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(5.5826318e-2f));
-                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(2.4015361e-1f));
-                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(6.9315308e-1f));
-                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(9.9999994e-1f));
+                __m128 p = _mm_set1_ps(f);
+                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(e));
+                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(d));
+                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(c));
+                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(b));
+                p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(a));
                 return p;
             }
 
@@ -130,9 +135,19 @@ namespace Simd
                 __m128i ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
                 __m128 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
                 __m128 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
-                __m128 expfpart = Poly5(fpart);
+                __m128 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
                 return _mm_mul_ps(expipart, expfpart);
             }
+
+            SIMD_INLINE __m128 Log2(__m128 x)
+            {
+                __m128 _1 = _mm_set1_ps(1.0f);
+                __m128i i = _mm_castps_si128(x);
+                __m128 e = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, _mm_set1_epi32(0x7F800000)), 23), _mm_set1_epi32(127)));
+                __m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, _mm_set1_epi32(0x007FFFFF))), _1);
+                __m128 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+                return _mm_add_ps(_mm_mul_ps(p, _mm_sub_ps(m, _1)), e);
+            }
         }
 
         SIMD_INLINE __m128 Exponent(__m128 value)
@@ -145,7 +160,36 @@ namespace Simd
             __m128 exp = Exponent(value);
             __m128 neg = _mm_mul_ps(alpha, _mm_sub_ps(exp, _mm_set1_ps(1.0f)));
             __m128 mask = _mm_cmpgt_ps(_mm_setzero_ps(), value);
-            return Sse::Combine(mask, neg, value);
+            return Combine(mask, neg, value);
+        }
+
+        SIMD_INLINE __m128 Logarithm(__m128 value)
+        {
+            return _mm_mul_ps(_mm_set1_ps(0.693147181f), Detail::Log2(value));
+        }
+
+        SIMD_INLINE __m128 Mish(__m128 value, __m128 threshold)
+        {
+            __m128 _1 = _mm_set1_ps(1.0f);
+            __m128 mish = _mm_add_ps(Exponent(value), _1);
+            mish = _mm_add_ps(_mm_mul_ps(mish, mish), _1);
+            mish = _mm_mul_ps(value, _mm_sub_ps(_1, _mm_div_ps(_mm_set1_ps(2.0f), mish)));
+            return Combine(_mm_cmpgt_ps(threshold, value), mish, value);
+        }
+
+        SIMD_INLINE __m128 Softplus(__m128 value, __m128 beta, __m128 threshold)
+        {
+            __m128 exp = Exponent(_mm_mul_ps(value, beta));
+            __m128 log = Logarithm(_mm_add_ps(_mm_set1_ps(1.0f), exp));
+            __m128 mask = _mm_cmpgt_ps(threshold, value);
+            return Combine(mask, _mm_div_ps(log, beta), value);
+        }
+
+        SIMD_INLINE __m128 Tanh(__m128 value)
+        {
+            __m128 _1 = _mm_set1_ps(1.0f);
+            __m128 exp = Detail::Exp2(_mm_mul_ps(_mm_set1_ps(2.88539008f), value));
+            return _mm_div_ps(_mm_sub_ps(exp, _1), _mm_add_ps(_1, exp));
         }
     }
 #endif //SIMD_SSE2_ENABLE   
@@ -227,14 +271,14 @@ namespace Simd
 
         namespace Detail
         {
-            SIMD_INLINE __m256 Poly5(__m256 x)
+            SIMD_INLINE __m256 Poly5(__m256 x, float a, float b, float c, float d, float e, float f)
             {
-                __m256 p = _mm256_set1_ps(1.8775767e-3f);
-                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(8.9893397e-3f));
-                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(5.5826318e-2f));
-                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(2.4015361e-1f));
-                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(6.9315308e-1f));
-                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(9.9999994e-1f));
+                __m256 p = _mm256_set1_ps(f);
+                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(e));
+                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(d));
+                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(c));
+                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(b));
+                p = _mm256_add_ps(_mm256_mul_ps(x, p), _mm256_set1_ps(a));
                 return p;
             }
 
@@ -244,9 +288,19 @@ namespace Simd
                 __m256i ipart = _mm256_cvtps_epi32(_mm256_sub_ps(x, _mm256_set1_ps(0.5f)));
                 __m256 fpart = _mm256_sub_ps(x, _mm256_cvtepi32_ps(ipart));
                 __m256 expipart = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(ipart, _mm256_set1_epi32(127)), 23));
-                __m256 expfpart = Poly5(fpart);
+                __m256 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
                 return _mm256_mul_ps(expipart, expfpart);
             }
+
+            SIMD_INLINE __m256 Log2(__m256 x)
+            {
+                __m256 _1 = _mm256_set1_ps(1.0f);
+                __m256i i = _mm256_castps_si256(x);
+                __m256 e = _mm256_cvtepi32_ps(_mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(i, _mm256_set1_epi32(0x7F800000)), 23), _mm256_set1_epi32(127)));
+                __m256 m = _mm256_or_ps(_mm256_castsi256_ps(_mm256_and_si256(i, _mm256_set1_epi32(0x007FFFFF))), _1);
+                __m256 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+                return _mm256_add_ps(_mm256_mul_ps(p, _mm256_sub_ps(m, _1)), e);
+            }
         }
 
         SIMD_INLINE __m256 Exponent(__m256 value)
@@ -261,6 +315,35 @@ namespace Simd
             __m256 mask = _mm256_cmp_ps(_mm256_setzero_ps(), value, _CMP_GT_OS);
             return _mm256_blendv_ps(value, neg, mask);
         }
+
+        SIMD_INLINE __m256 Logarithm(__m256 value)
+        {
+            return _mm256_mul_ps(_mm256_set1_ps(0.693147181f), Detail::Log2(value));
+        }
+
+        SIMD_INLINE __m256 Mish(__m256 value, __m256 threshold)
+        {
+            __m256 _1 = _mm256_set1_ps(1.0f);
+            __m256 mish = _mm256_add_ps(Exponent(value), _1);
+            mish = Fmadd<true>(mish, mish, _1);
+            mish = _mm256_mul_ps(value, _mm256_sub_ps(_1, _mm256_div_ps(_mm256_set1_ps(2.0f), mish)));
+            return _mm256_blendv_ps(value, mish, _mm256_cmp_ps(threshold, value, _CMP_GT_OS));
+        }
+
+        SIMD_INLINE __m256 Softplus(__m256 value, __m256 beta, __m256 threshold)
+        {
+            __m256 exp = Exponent(_mm256_mul_ps(value, beta));
+            __m256 log = Logarithm(_mm256_add_ps(_mm256_set1_ps(1.0f), exp));
+            __m256 mask = _mm256_cmp_ps(threshold, value, _CMP_GT_OS);
+            return _mm256_blendv_ps(value, _mm256_div_ps(log, beta), mask);
+        }
+
+        SIMD_INLINE __m256 Tanh(__m256 value)
+        {
+            __m256 _1 = _mm256_set1_ps(1.0f);
+            __m256 exp = Detail::Exp2(_mm256_mul_ps(_mm256_set1_ps(2.88539008f), value));
+            return _mm256_div_ps(_mm256_sub_ps(exp, _1), _mm256_add_ps(_1, exp));
+        }
     }
 #endif //SIMD_AVX2_ENABLE
 
@@ -341,14 +424,14 @@ namespace Simd
 
         namespace Detail
         {
-            SIMD_INLINE float32x4_t Poly5(float32x4_t x)
+            SIMD_INLINE float32x4_t Poly5(float32x4_t x, float a, float b, float c, float d, float e, float f)
             {
-                float32x4_t p = vdupq_n_f32(1.8775767e-3f);
-                p = vmlaq_f32(vdupq_n_f32(8.9893397e-3f), x, p);
-                p = vmlaq_f32(vdupq_n_f32(5.5826318e-2f), x, p);
-                p = vmlaq_f32(vdupq_n_f32(2.4015361e-1f), x, p);
-                p = vmlaq_f32(vdupq_n_f32(6.9315308e-1f), x, p);
-                p = vmlaq_f32(vdupq_n_f32(9.9999994e-1f), x, p);
+                float32x4_t p = vdupq_n_f32(f);
+                p = vmlaq_f32(vdupq_n_f32(e), x, p);
+                p = vmlaq_f32(vdupq_n_f32(d), x, p);
+                p = vmlaq_f32(vdupq_n_f32(c), x, p);
+                p = vmlaq_f32(vdupq_n_f32(b), x, p);
+                p = vmlaq_f32(vdupq_n_f32(a), x, p);
                 return p;
             }
 
@@ -358,9 +441,19 @@ namespace Simd
                 int32x4_t ipart = vcvtq_s32_f32(vsubq_f32(x, vdupq_n_f32(0.5f)));
                 float32x4_t fpart = vsubq_f32(x, vcvtq_f32_s32(ipart));
                 float32x4_t expipart = vreinterpretq_f32_s32(vshlq_n_s32(vaddq_s32(ipart, vdupq_n_s32(127)), 23));
-                float32x4_t expfpart = Poly5(fpart);
+                float32x4_t expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
                 return vmulq_f32(expipart, expfpart);
             }
+
+            SIMD_INLINE float32x4_t Log2(float32x4_t x)
+            {
+                float32x4_t _1 = vdupq_n_f32(1.0f);
+                int32x4_t i = vreinterpretq_s32_f32(x);
+                float32x4_t e = vcvtq_f32_s32(vsubq_s32(vshrq_n_s32(vandq_s32(i, vdupq_n_s32(0x7F800000)), 23), vdupq_n_s32(127)));
+                float32x4_t m = Or(vreinterpretq_f32_s32(vandq_s32(i, vdupq_n_s32(0x007FFFFF))), _1);
+                float32x4_t p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+                return vaddq_f32(vmulq_f32(p, vsubq_f32(m, _1)), e);
+            }
         }
 
         SIMD_INLINE float32x4_t Exponent(float32x4_t value)
@@ -375,6 +468,35 @@ namespace Simd
             uint32x4_t mask = vcgtq_f32(vdupq_n_f32(0.0f), value);
             return vbslq_f32(mask, neg, value);
         }
+
+        SIMD_INLINE float32x4_t Logarithm(float32x4_t value)
+        {
+            return vmulq_f32(vdupq_n_f32(0.693147181f), Detail::Log2(value));
+        }
+
+        template<int iter> SIMD_INLINE float32x4_t Mish(float32x4_t value, float32x4_t threshold)
+        {
+            float32x4_t _1 = vdupq_n_f32(1.0f);
+            float32x4_t mish = vaddq_f32(Exponent(value), _1);
+            mish = Fmadd<true>(mish, mish, _1);
+            mish = vmulq_f32(value, vsubq_f32(_1, Div<iter>(vdupq_n_f32(2.0f), mish)));
+            return vbslq_f32(vcgtq_f32(threshold, value), mish, value);
+        }
+
+        template<int iter> SIMD_INLINE float32x4_t Softplus(float32x4_t value, float32x4_t beta, float32x4_t threshold)
+        {
+            float32x4_t exp = Exponent(vmulq_f32(value, beta));
+            float32x4_t log = Logarithm(vaddq_f32(vdupq_n_f32(1.0f), exp));
+            uint32x4_t mask = vcgtq_f32(threshold, value);
+            return vbslq_f32(mask, Div<iter>(log, beta), value);
+        }
+
+        template<int iter> SIMD_INLINE float32x4_t Tanh(float32x4_t value)
+        {
+            float32x4_t _1 = vdupq_n_f32(1.0f);
+            float32x4_t exp = Detail::Exp2(vmulq_f32(vdupq_n_f32(2.88539008f), value));
+            return Div<iter>(vsubq_f32(exp, _1), vaddq_f32(_1, exp));
+        }
     }
 #endif //SIMD_NEON_ENABLE
 }
diff --git a/3rdparty/simdlib/Simd/SimdExtract.h b/3rdparty/simdlib/Simd/SimdExtract.h
old mode 100644
new mode 100755
index d0d8184d7c..e30a0c85e5
--- a/3rdparty/simdlib/Simd/SimdExtract.h
+++ b/3rdparty/simdlib/Simd/SimdExtract.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -28,8 +28,8 @@
 
 namespace Simd
 {
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         SIMD_INLINE float ExtractValue(__m128 a, int i)
         {
@@ -44,12 +44,7 @@ namespace Simd
             _mm_store_ps(_a, a);
             return _a[0] + _a[1] + _a[2] + _a[3];
         }
-    }
-#endif//SIMD_SSE_ENABLE
 
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
         template <int index> SIMD_INLINE int ExtractInt8(__m128i a)
         {
             return _mm_extract_epi16(_mm_srli_si128(a, index & 0x1), index >> 1) & 0xFF;
@@ -90,8 +85,8 @@ namespace Simd
     }
 #endif// SIMD_SSE2_ENABLE
 
-#ifdef SIMD_SSE3_ENABLE
-    namespace Sse3
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
     {
         SIMD_INLINE float ExtractSum(__m128 a)
         {
@@ -103,7 +98,7 @@ namespace Simd
             return _mm_hadd_ps(_mm_hadd_ps(a[0], a[1]), _mm_hadd_ps(a[2], a[3]));
         }
     }
-#endif//SIMD_SSE3_ENABLE
+#endif//SIMD_SSE41_ENABLE
 
 #ifdef SIMD_AVX_ENABLE
     namespace Avx
@@ -199,6 +194,11 @@ namespace Simd
             return vgetq_lane_u32(a, 0) + vgetq_lane_u32(a, 1) + vgetq_lane_u32(a, 2) + vgetq_lane_u32(a, 3);
         }
 
+        SIMD_INLINE int32_t ExtractSum32s(const int32x4_t& a)
+        {
+            return vgetq_lane_s32(a, 0) + vgetq_lane_s32(a, 1) + vgetq_lane_s32(a, 2) + vgetq_lane_s32(a, 3);
+        }
+
         SIMD_INLINE uint64_t ExtractSum64u(const uint64x2_t & a)
         {
             return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
diff --git a/3rdparty/simdlib/Simd/SimdFrame.hpp b/3rdparty/simdlib/Simd/SimdFrame.hpp
old mode 100644
new mode 100755
index 53cc33879d..45b0b6022a
--- a/3rdparty/simdlib/Simd/SimdFrame.hpp
+++ b/3rdparty/simdlib/Simd/SimdFrame.hpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
 *               2014-2019 Antonenka Mikhail,
 *               2019-2019 Artur Voronkov.
 *
@@ -58,6 +58,10 @@ namespace Simd
             Bgr24,
             /*! One plane 8-bit gray pixel format. */
             Gray8,
+            /*! One plane 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */
+            Rgb24,
+            /*! One plane 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. */
+            Rgba32,
         };
 
         const size_t width; /*!< \brief A width of the frame. */
@@ -373,6 +377,8 @@ namespace Simd
         case View<A>::Gray8: (Format&)format = Gray8; break;
         case View<A>::Bgr24: (Format&)format = Bgr24; break;
         case View<A>::Bgra32: (Format&)format = Bgra32; break;
+        case View<A>::Rgb24: (Format&)format = Rgb24; break;
+        case View<A>::Rgba32: (Format&)format = Rgba32; break;
         default:
             assert(0);
         }
@@ -420,6 +426,14 @@ namespace Simd
         case Gray8:
             planes[0] = View<A>(width, height, stride0, View<A>::Gray8, data0);
             break;
+        case Rgb24:
+            planes[0] = View<A>(width, height, stride0, View<A>::Rgb24, data0);
+            break;
+        case Rgba32:
+            planes[0] = View<A>(width, height, stride0, View<A>::Rgba32, data0);
+            break;
+        default:
+            assert(0);
         }
     }
 
@@ -494,6 +508,14 @@ namespace Simd
         case Gray8:
             planes[0].Recreate(width, height, View<A>::Gray8);
             break;
+        case Rgb24:
+            planes[0].Recreate(width, height, View<A>::Rgb24);
+            break;
+        case Rgba32:
+            planes[0].Recreate(width, height, View<A>::Rgba32);
+            break;
+        default:
+            assert(0);
         }
     }
 
@@ -591,6 +613,8 @@ namespace Simd
         case Bgra32:  return 1;
         case Bgr24:   return 1;
         case Gray8:   return 1;
+        case Rgb24:   return 1;
+        case Rgba32:  return 1;
         default: assert(0); return 0;
         }
     }
@@ -648,6 +672,12 @@ namespace Simd
             case Frame<A>::Gray8:
                 BgraToGray(src.planes[0], dst.planes[0]);
                 break;
+            case Frame<A>::Rgb24:
+                BgraToRgb(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Rgba32:
+                BgraToRgba(src.planes[0], dst.planes[0]);
+                break;
             default:
                 assert(0);
             }
@@ -662,6 +692,12 @@ namespace Simd
             case Frame<A>::Gray8:
                 BgrToGray(src.planes[0], dst.planes[0]);
                 break;
+            case Frame<A>::Rgb24:
+                BgrToRgb(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Rgba32:
+                BgrToRgba(src.planes[0], dst.planes[0]);
+                break;
             default:
                 assert(0);
             }
@@ -676,11 +712,71 @@ namespace Simd
             case Frame<A>::Bgr24:
                 GrayToBgr(src.planes[0], dst.planes[0]);
                 break;
+            case Frame<A>::Rgb24:
+                GrayToRgb(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Rgba32:
+                GrayToRgba(src.planes[0], dst.planes[0]);
+                break;
             default:
                 assert(0);
             }
             break;
 
+        case Frame<A>::Rgb24:
+            switch (dst.format)
+            {
+            case Frame<A>::Bgra32:
+                RgbToBgra(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Gray8:
+                RgbToGray(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Bgr24:
+                RgbToBgr(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Rgba32:
+                RgbToRgba(src.planes[0], dst.planes[0]);
+                break;
+            default:
+                assert(0);
+            }
+
+        case Frame<A>::Rgba32:
+            switch (dst.format)
+            {
+            case Frame<A>::Nv12:
+            {
+                View<A> bgr(src.Size(), View<A>::Bgr24);
+                RgbaToBgr(src.planes[0], bgr);
+                View<A> u(src.Size(), View<A>::Gray8), v(src.Size(), View<A>::Gray8);
+                BgrToYuv420p(bgr, dst.planes[0], u, v);
+                InterleaveUv(u, v, dst.planes[1]);
+                break;
+            }
+            case Frame<A>::Yuv420p:
+            {
+                View<A> bgr(src.Size(), View<A>::Bgr24);
+                RgbaToBgr(src.planes[0], bgr);
+                BgrToYuv420p(bgr, dst.planes[0], dst.planes[1], dst.planes[2]);
+                break;
+            }
+            case Frame<A>::Bgra32:
+                RgbaToBgra(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Gray8:
+                RgbaToGray(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Bgr24:
+                RgbaToBgr(src.planes[0], dst.planes[0]);
+                break;
+            case Frame<A>::Rgb24:
+                RgbaToRgb(src.planes[0], dst.planes[0]);
+                break;
+            default:
+                assert(0);
+            }
+
         default:
             assert(0);
         }
diff --git a/3rdparty/simdlib/Simd/SimdInit.h b/3rdparty/simdlib/Simd/SimdInit.h
old mode 100644
new mode 100755
index 179e61bdb4..707ea4c8bc
--- a/3rdparty/simdlib/Simd/SimdInit.h
+++ b/3rdparty/simdlib/Simd/SimdInit.h
@@ -28,7 +28,22 @@
 
 namespace Simd
 {
-#if defined(_MSC_VER) && (defined(SIMD_X64_ENABLE) || defined(SIMD_X86_ENABLE))
+
+#if defined(_MSC_VER) && !defined(__clang__) && (defined(SIMD_X64_ENABLE) || defined(SIMD_X86_ENABLE))
+
+#define SIMD_INIT_AS_CHAR
+
+#elif defined(__GNUC__) || defined(__clang__) || (defined(_MSC_VER) && defined(SIMD_NEON_ENABLE))
+
+#define SIMD_INIT_AS_LONGLONG
+
+#else
+
+#error This platform is unsupported!
+
+#endif
+
+#if defined(SIMD_INIT_AS_CHAR)
 
     template <class T> SIMD_INLINE char GetChar(T value, size_t index)
     {
@@ -50,7 +65,7 @@ namespace Simd
 	Simd::GetChar(int64_t(a), 4), Simd::GetChar(int64_t(a), 5), \
 	Simd::GetChar(int64_t(a), 6), Simd::GetChar(int64_t(a), 7)
 
-#elif defined(__GNUC__) || (defined(_MSC_VER) && defined(SIMD_NEON_ENABLE))
+#elif defined(SIMD_INIT_AS_LONGLONG)
 
 #define SIMD_CHAR_AS_LONGLONG(a) (((long long)a) & 0xFF)
 
@@ -94,11 +109,15 @@ namespace Simd
 #define SIMD_LL_SET2_EPI32(a, b) \
     SIMD_INT_AS_LONGLONG(a) | (SIMD_INT_AS_LONGLONG(b) << 32)
 
-#endif//defined(__GNUC__) || (defined(_MSC_VER) && defined(SIMD_NEON_ENABLE))
+#else
+
+#error This platform is unsupported!
+
+#endif
 
 #if defined(SIMD_SSE2_ENABLE)
 
-#if defined(_MSC_VER)
+#if defined(SIMD_INIT_AS_CHAR)
 
 #define SIMD_MM_SET1_EPI8(a) \
     {SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \
@@ -148,7 +167,7 @@ namespace Simd
 #define SIMD_MM_SETR_EPI64(a0, a1) \
     {SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1)}
 
-#elif defined(__GNUC__)
+#elif defined(SIMD_INIT_AS_LONGLONG)
 
 #define SIMD_MM_SET1_EPI8(a) \
     {SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a)}
@@ -192,7 +211,7 @@ namespace Simd
 
 #if defined(SIMD_AVX2_ENABLE)
 
-#if defined(_MSC_VER)
+#if defined(SIMD_INIT_AS_CHAR)
 
 #define SIMD_MM256_SET1_EPI8(a) \
 	{SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), SIMD_AS_CHAR(a), \
@@ -263,7 +282,7 @@ namespace Simd
 #define SIMD_MM256_SETR_EPI64(a0, a1, a2, a3) \
     {SIMD_AS_8CHARS(a0), SIMD_AS_8CHARS(a1), SIMD_AS_8CHARS(a2), SIMD_AS_8CHARS(a3)}
 
-#elif defined(__GNUC__)
+#elif defined(SIMD_INIT_AS_LONGLONG)
 
 #define SIMD_MM256_SET1_EPI8(a) \
     {SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a), \
@@ -310,7 +329,7 @@ namespace Simd
 #define SIMD_MM256_SETR_EPI64(a0, a1, a2, a3) \
     {a0, a1, a2, a3}
 
-#endif// defined(_MSC_VER) || defined(__GNUC__)
+#endif
 
 #endif// SIMD_AVX2_ENABLE
 
diff --git a/3rdparty/simdlib/Simd/SimdLib.cpp b/3rdparty/simdlib/Simd/SimdLib.cpp
old mode 100644
new mode 100755
index eb181ec376..b1cac8b1ba
--- a/3rdparty/simdlib/Simd/SimdLib.cpp
+++ b/3rdparty/simdlib/Simd/SimdLib.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
 *               2014-2018 Antonenka Mikhail,
 *               2018-2018 Radchenko Andrey,
 *               2019-2019 Facundo Galan.
@@ -55,18 +55,18 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved)
 #include "Simd/SimdLib.h"
 
 #include "Simd/SimdMemory.h"
+#include "Simd/SimdCpu.h"
 #include "Simd/SimdEnable.h"
+#include "Simd/SimdAlignment.h"
 #include "Simd/SimdConst.h"
-#include "Simd/SimdCpu.h"
 #include "Simd/SimdLog.h"
 
 #include "Simd/SimdResizer.h"
 #include "Simd/SimdGaussianBlur.h"
 
 #include "Simd/SimdBase.h"
-#include "Simd/SimdSse1.h"
 #include "Simd/SimdSse2.h"
-#include "Simd/SimdSsse3.h"
+#include "Simd/SimdSse41.h"
 #include "Simd/SimdAvx1.h"
 #include "Simd/SimdAvx2.h"
 #include "Simd/SimdNeon.h"
@@ -75,6 +75,11 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved)
 #include "Simd/SimdVersion.h"
 #endif
 
+namespace Simd
+{
+    const size_t ALIGNMENT = GetAlignment();
+}
+
 SIMD_API const char * SimdVersion()
 {
     return SIMD_VERSION;
@@ -118,9 +123,9 @@ SIMD_API void SimdRelease(void * context)
 
 SIMD_API SimdBool SimdGetFastMode()
 {
-#ifdef SIMD_SSE_ENABLE
-    if (Sse::Enable)
-        return Sse::GetFastMode();
+#ifdef SIMD_SSE2_ENABLE
+    if (Sse2::Enable)
+        return Sse2::GetFastMode();
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
@@ -133,9 +138,9 @@ SIMD_API SimdBool SimdGetFastMode()
 
 SIMD_API void SimdSetFastMode(SimdBool value)
 {
-#ifdef SIMD_SSE_ENABLE
-    if (Sse::Enable)
-        Sse::SetFastMode(value);
+#ifdef SIMD_SSE2_ENABLE
+    if (Sse2::Enable)
+        Sse2::SetFastMode(value);
 #endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable)
@@ -145,9 +150,9 @@ SIMD_API void SimdSetFastMode(SimdBool value)
 
 SIMD_API void SimdBgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
 {
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && width >= Sse41::A)
+        Sse41::BgraToBgr(bgra, width, height, bgraStride, bgr, bgrStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
@@ -178,84 +183,69 @@ SIMD_API void SimdBgraToGray(const uint8_t *bgra, size_t width, size_t height, s
         Base::BgraToGray(bgra, width, height, bgraStride, gray, grayStride);
 }
 
-SIMD_API void SimdRgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride)
+SIMD_API void SimdBgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
 {
 #ifdef SIMD_AVX2_ENABLE
-    if(Avx2::Enable && width >= Avx2::A)
-        Avx2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+    if (Avx2::Enable && width >= Avx2::A)
+        Avx2::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride);
     else
 #endif
-#ifdef SIMD_SSE2_ENABLE
-    if(Sse2::Enable && width >= Sse2::A)
-        Sse2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
-    if (Neon::Enable && width >= Neon::HA)
-        Neon::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+    if (Neon::Enable && width >= Neon::A)
+        Neon::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride);
     else
 #endif
-        Base::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+        Base::BgraToRgb(bgra, width, height, bgraStride, rgb, rgbStride);
 }
 
-SIMD_API void SimdBgrToBgra(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha)
+SIMD_API void SimdBgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
 {
-#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR)
-    if(Avx2::Enable && width >= Avx2::A)
-        Avx2::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
+#ifdef SIMD_AVX2_ENABLE
+    if (Avx2::Enable && width >= Avx2::A)
+        Avx2::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable && width >= Neon::A)
-        Neon::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
+        Neon::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
     else
 #endif
-        Base::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
+        Base::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
 }
 
-SIMD_API void SimdBgrToRgba(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *rgba, size_t rgbaStride, uint8_t alpha)
+SIMD_API void SimdBgrToBgra(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha)
 {
 #if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR)
     if(Avx2::Enable && width >= Avx2::A)
-        Avx2::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-    else
-#endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-    else
-#endif
-#ifdef SIMD_NEON_ENABLE
-    if (Neon::Enable && width >= Neon::A)
-        Neon::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
+        Avx2::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
     else
 #endif
-        Base::BgrToRgba(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-}
-
-SIMD_API void SimdBgraToRgba(const uint8_t *bgra, size_t width, size_t height, size_t bgraStride, uint8_t *rgba, size_t rgbaStride)
-{
-#if defined(SIMD_AVX2_ENABLE)
-    if(Avx2::Enable && width >= Avx2::A)
-        Avx2::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && width >= Sse41::A)
+        Sse41::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
+#ifdef SIMD_VMX_ENABLE
+    if(Vmx::Enable && width >= Vmx::A)
+        Vmx::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable && width >= Neon::A)
-        Neon::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
+        Neon::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
     else
 #endif
-        Base::BgraToRgba(bgra, width, height, bgraStride, rgba, rgbaStride);
+        Base::BgrToBgra(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
 }
 
 SIMD_API void SimdBgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
@@ -286,9 +276,9 @@ SIMD_API void SimdBgrToGray(const uint8_t *bgr, size_t width, size_t height, siz
         Avx2::BgrToGray(bgr, width, height, bgrStride, gray, grayStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::BgrToGray(bgr, width, height, bgrStride, gray, grayStride);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && width >= Sse41::A)
+        Sse41::BgrToGray(bgr, width, height, bgrStride, gray, grayStride);
     else
 #endif
 #ifdef SIMD_SSE2_ENABLE
@@ -304,49 +294,29 @@ SIMD_API void SimdBgrToGray(const uint8_t *bgr, size_t width, size_t height, siz
         Base::BgrToGray(bgr, width, height, bgrStride, gray, grayStride);
 }
 
-SIMD_API void SimdRgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride)
+SIMD_API void SimdBgrToRgb(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride)
 {
-#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR)
-    if (Avx2::Enable && width >= Avx2::A)
-        Avx2::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
-    else
-#endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
-    else
-#endif
-#ifdef SIMD_SSE2_ENABLE
-    if (Sse2::Enable && width >= Sse2::A)
-        Sse2::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
+#ifdef SIMD_AVX512BW_ENABLE
+    if (Avx512bw::Enable)
+        Avx512bw::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride);
     else
 #endif
-#ifdef SIMD_NEON_ENABLE
-    if (Neon::Enable && width >= Neon::A)
-      Neon::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
-    else
-#endif
-    Base::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
-}
-
-SIMD_API void SimdBgrToRgb(const uint8_t *bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
-{
 #ifdef SIMD_AVX2_ENABLE
     if (Avx2::Enable && width >= Avx2::A)
-        Avx2::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride);
+        Avx2::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable && width >= Neon::A)
-        Neon::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride);
+        Neon::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride);
     else
 #endif
-        Base::BgrToRgb(bgr, bgrStride, width, height, rgb, rgbStride);
+        Base::BgrToRgb(bgr, width, height, bgrStride, rgb, rgbStride);
 }
 
 SIMD_API void SimdCopy(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, uint8_t * dst, size_t dstStride)
@@ -368,9 +338,9 @@ SIMD_API void SimdDeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t
         Avx2::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::DeinterleaveBgr(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
@@ -389,9 +359,9 @@ SIMD_API void SimdDeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size
         Avx2::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::DeinterleaveBgra(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
@@ -410,9 +380,9 @@ SIMD_API void SimdGaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t
         Avx2::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && (width - 1)*channelCount >= Ssse3::A)
-        Ssse3::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && (width - 1)*channelCount >= Sse41::A)
+        Sse41::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride);
     else
 #endif
 #ifdef SIMD_SSE2_ENABLE
@@ -448,9 +418,9 @@ SIMD_API void SimdGrayToBgr(const uint8_t * gray, size_t width, size_t height, s
         Avx2::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && width >= Sse41::A)
+        Sse41::GrayToBgr(gray, width, height, grayStride, bgr, bgrStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
@@ -489,9 +459,9 @@ SIMD_API void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t
         Avx2::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::InterleaveBgr(b, bStride, g, gStride, r, rStride, width, height, bgr, bgrStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
@@ -510,9 +480,9 @@ SIMD_API void SimdInterleaveBgra(const uint8_t * b, size_t bStride, const uint8_
         Avx2::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && width >= Ssse3::A)
-        Ssse3::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::InterleaveBgra(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride);
     else
 #endif
 #ifdef SIMD_NEON_ENABLE
@@ -552,9 +522,9 @@ SIMD_API void SimdReduceColor2x2(const uint8_t *src, size_t srcWidth, size_t src
         Avx2::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && srcWidth >= Ssse3::DA)
-        Ssse3::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && srcWidth >= Sse41::DA)
+        Sse41::ReduceColor2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
     else
 #endif
 #ifdef SIMD_SSE2_ENABLE
@@ -578,9 +548,9 @@ SIMD_API void SimdReduceGray2x2(const uint8_t *src, size_t srcWidth, size_t srcH
         Avx2::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && srcWidth >= Ssse3::DA)
-        Ssse3::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && srcWidth >= Sse41::DA)
+        Sse41::ReduceGray2x2(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
     else
 #endif
 #ifdef SIMD_SSE2_ENABLE
@@ -625,9 +595,9 @@ SIMD_API void SimdReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcH
         Avx2::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && srcWidth > Ssse3::A)
-        Ssse3::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && srcWidth > Sse41::A)
+        Sse41::ReduceGray4x4(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
     else
 #endif
 #ifdef SIMD_SSE2_ENABLE
@@ -672,9 +642,9 @@ SIMD_API void SimdResizeBilinear(const uint8_t *src, size_t srcWidth, size_t src
         Avx2::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if(Ssse3::Enable && dstWidth >= Ssse3::A)
-        Ssse3::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
+#ifdef SIMD_SSE41_ENABLE
+    if(Sse41::Enable && dstWidth >= Sse41::A)
+        Sse41::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
     else
 #endif
 #ifdef SIMD_SSE2_ENABLE
@@ -707,21 +677,11 @@ SIMD_API void * SimdResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t ds
         return Sse41::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
     else
 #endif
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable)
-        return Ssse3::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
-    else
-#endif
 #ifdef SIMD_SSE2_ENABLE
     if (Sse2::Enable)
         return Sse2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
     else
 #endif
-#ifdef SIMD_SSE_ENABLE
-    if (Sse::Enable)
-        return Sse::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
-    else
-#endif
 #ifdef SIMD_NEON_ENABLE
     if (Neon::Enable)
         return Neon::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
@@ -735,6 +695,66 @@ SIMD_API void SimdResizerRun(const void * resizer, const uint8_t * src, size_t s
     ((Resizer*)resizer)->Run(src, srcStride, dst, dstStride);
 }
 
+SIMD_API void SimdRgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+{
+#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR)
+    if (Avx2::Enable && width >= Avx2::A)
+        Avx2::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+    else
+#endif
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+    else
+#endif
+#ifdef SIMD_NEON_ENABLE
+    if (Neon::Enable && width >= Neon::A)
+        Neon::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+    else
+#endif
+        Base::RgbToBgra(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+}
+
+SIMD_API void SimdRgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+{
+#if defined(SIMD_AVX2_ENABLE) && !defined(SIMD_CLANG_AVX2_BGR_TO_BGRA_ERROR)
+    if (Avx2::Enable && width >= Avx2::A)
+        Avx2::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
+    else
+#endif
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && width >= Sse41::A)
+        Sse41::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
+    else
+#endif
+#ifdef SIMD_NEON_ENABLE
+    if (Neon::Enable && width >= Neon::A)
+        Neon::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
+    else
+#endif
+        Base::RgbToGray(rgb, width, height, rgbStride, gray, grayStride);
+}
+
+SIMD_API void SimdRgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+{
+#if defined(SIMD_AVX2_ENABLE)
+    if (Avx2::Enable && width >= Avx2::A)
+        Avx2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+    else
+#endif
+#ifdef SIMD_SSE2_ENABLE
+    if (Sse2::Enable && width >= Sse2::A)
+        Sse2::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+    else
+#endif
+#ifdef SIMD_NEON_ENABLE
+    if (Neon::Enable && width >= Neon::A)
+        Neon::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+    else
+#endif
+        Base::RgbaToGray(rgba, width, height, rgbaStride, gray, grayStride);
+}
+
 SIMD_API void SimdStretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
                     uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
 {
@@ -842,6 +862,7 @@ SIMD_API void SimdMatTranspose(const double * mat, size_t rows, size_t cols, dou
 
 SIMD_API void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff)
 {
+  //TODO:
 #ifdef SIMD_SSSE3_ENABLE
     if (Ssse3::Enable && size >= Ssse3::A)
         Ssse3::SimdImageDifference(img1,img2, size, imgDiff);
diff --git a/3rdparty/simdlib/Simd/SimdLib.h b/3rdparty/simdlib/Simd/SimdLib.h
old mode 100644
new mode 100755
index c3862f19f1..4838b82261
--- a/3rdparty/simdlib/Simd/SimdLib.h
+++ b/3rdparty/simdlib/Simd/SimdLib.h
@@ -1,8 +1,8 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
-*               2014-2016 Antonenka Mikhail,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
+*               2014-2019 Antonenka Mikhail,
 *               2019-2019 Facundo Galan.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -27,8 +27,6 @@
 #ifndef __SimdLib_h__
 #define __SimdLib_h__
 
-#include "Simd/SimdConfig.h"
-
 #include <stddef.h>
 
 #if defined(_MSC_VER) || defined(__CODEGEARC__)
@@ -107,12 +105,8 @@ typedef enum
     SimdCpuInfoCacheL1, /*!< A size of level 1 data cache. */
     SimdCpuInfoCacheL2, /*!< A size of level 2 cache. */
     SimdCpuInfoCacheL3, /*!< A size of level 3 cache. */
-    SimdCpuInfoSse, /*!< Availability of SSE (x86). */
     SimdCpuInfoSse2, /*!< Availability of SSE2 (x86). */
-    SimdCpuInfoSse3, /*!< Availability of SSE3 (x86). */
-    SimdCpuInfoSsse3, /*!< Availability of SSSE3 (x86). */
     SimdCpuInfoSse41, /*!< Availability of SSE4.1 (x86). */
-    SimdCpuInfoSse42, /*!< Availability of SSE4.2 (x86). */
     SimdCpuInfoAvx, /*!< Availability of AVX (x86). */
     SimdCpuInfoAvx2, /*!< Availability of AVX2 (x86). */
     SimdCpuInfoAvx512f, /*!< Availability of AVX-512F (x86). */
@@ -120,7 +114,6 @@ typedef enum
     SimdCpuInfoVmx, /*!< Availability of VMX or Altivec (PowerPC). */
     SimdCpuInfoVsx, /*!< Availability of VSX (PowerPC). */
     SimdCpuInfoNeon, /*!< Availability of NEON (ARM). */
-    SimdCpuInfoMsa, /*!< Availability of MSA (MIPS). */
 } SimdCpuInfoType;
 
 /*! @ingroup c_types
@@ -188,6 +181,8 @@ typedef enum
     SimdPixelFormatHsl24,
     /*! A 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */
     SimdPixelFormatRgb24,
+    /*! A 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. */
+    SimdPixelFormatRgba32,
 } SimdPixelFormatType;
 
 /*! @ingroup c_types
@@ -208,12 +203,14 @@ typedef enum
 {
     /*! 8-bit integer channel type.  */
     SimdResizeChannelByte,
+    /*! 16-bit integer channel type.  */
+    SimdResizeChannelShort,
     /*! 32-bit float channel type.  */
     SimdResizeChannelFloat,
 } SimdResizeChannelType;
 
 /*! @ingroup resizing
-    Describes methods used in oreder to resize image.
+    Describes methods used in order to resize image.
 */
 typedef enum
 {
@@ -223,6 +220,10 @@ typedef enum
     SimdResizeMethodCaffeInterp,
     /*! Area method. */
     SimdResizeMethodArea,
+    /*! InferenceEngine::Extension::Cpu::Interp compatible method. */
+    SimdResizeMethodInferenceEngineInterp,
+    /*! Nearest pixel method. */
+    SimdResizeMethodNearest,
 } SimdResizeMethodType;
 
 // ViSP custom SIMD code
@@ -317,7 +318,7 @@ extern "C"
 
         \fn size_t SimdAlignment();
 
-        \short Gets alignment required for the most productive work of the Simd Library.
+        \short Gets alignment required for the most productive work of Simd Library.
 
         \return a required alignment.
     */
@@ -359,17 +360,18 @@ extern "C"
 
         \fn void SimdBgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride);
 
-        \short Converts 32-bit BGRA image to 24-bit BGR image.
+        \short Converts 32-bit BGRA image to 24-bit BGR image. Also it can be used for 32-bit RGBA to 24-bit RGB conversion.
 
         All images must have the same width and height.
 
-        \note This function has a C++ wrapper Simd::BgraToBgr(const View<A>& bgra, View<A>& bgr).
+        \note This function has C++ wrappers: Simd::BgraToBgr(const View<A>& bgra, View<A>& bgr)
+            and Simd::RgbaToRgb(const View<A>& rgba, View<A>& rgb).
 
-        \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image.
+        \param [in] bgra - a pointer to pixels data of input 32-bit BGRA (or 32-bit RGBA) image.
         \param [in] width - an image width.
         \param [in] height - an image height.
         \param [in] bgraStride - a row size of the bgra image.
-        \param [out] bgr - a pointer to pixels data of output 24-bit BGR image.
+        \param [out] bgr - a pointer to pixels data of output 24-bit BGR (or 24-bit RGB) image.
         \param [in] bgrStride - a row size of the bgr image.
     */
     SIMD_API void SimdBgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride);
@@ -395,76 +397,63 @@ extern "C"
 
     /*! @ingroup bgra_conversion
 
-        \fn void SimdRgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride);
+        \fn void SimdBgraToRgb(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgb, size_t rgbStride);
 
-        \short Converts 32-bit RGBA image to 8-bit gray image.
+        \short Converts 32-bit BGRA image to 24-bit RGB image. Also it can be used for 32-bit RGBA to 24-bit BGR conversion.
 
         All images must have the same width and height.
 
-        \param [in] rgba - a pointer to pixels data of input 32-bit RGBA image.
+        \note This function has C++ wrappers: Simd::BgraToRgb(const View<A>& bgra, View<A>& rgb)
+            and Simd::RgbaToBgr(const View<A>& rgba, View<A>& bgr).
+
+        \param [in] bgra - a pointer to pixels data of input 32-bit BGRA (or 32-bit RGBA) image.
         \param [in] width - an image width.
         \param [in] height - an image height.
-        \param [in] rgbaStride - a row size of the rgba image.
-        \param [out] gray - a pointer to pixels data of output 8-bit gray image.
-        \param [in] grayStride - a row size of the gray image.
+        \param [in] bgraStride - a row size of the bgra image.
+        \param [out] rgb - a pointer to pixels data of output 24-bit RGB (or 24-bit BGR) image.
+        \param [in] rgbStride - a row size of the rgb image.
     */
-    SIMD_API void SimdRgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride);
+    SIMD_API void SimdBgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride);
 
-    /*! @ingroup bgr_conversion
+    /*! @ingroup bgra_conversion
 
-        \fn void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
+        \fn void SimdBgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride);
 
-        \short Converts 24-bit BGR image to 32-bit BGRA image.
+        \short Converts 32-bit BGRA image to 32-bit RGBA image. Also it can be used for 32-bit RGBA to 32-bit BGRA conversion.
 
         All images must have the same width and height.
 
-        \note This function has a C++ wrapper Simd::BgrToBgra(const View<A>& bgr, View<A>& bgra, uint8_t alpha).
+        \note This function has C++ wrappers: Simd::BgraToRgba(const View<A>& bgra, View<A>& rgba)
+            and Simd::RgbaToBgra(const View<A>& rgba, View<A>& bgra).
 
-        \param [in] bgr - a pointer to pixels data of input 24-bit BGR image.
+        \param [in] bgra - a pointer to pixels data of input 32-bit BGRA (or 32-bit RGBA) image.
         \param [in] width - an image width.
         \param [in] height - an image height.
-        \param [in] bgrStride - a row size of the bgr image.
-        \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image.
         \param [in] bgraStride - a row size of the bgra image.
-        \param [in] alpha - a value of alpha channel.
+        \param [out] rgba - a pointer to pixels data of output 32-bit RGBA (or 32-bit BGRA) image.
+        \param [in] rgbaStride - a row size of the rgb image.
     */
-    SIMD_API void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
+    SIMD_API void SimdBgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride);
 
     /*! @ingroup bgr_conversion
 
-        \fn void SimdBgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha);
+        \fn void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
-        \short Converts 24-bit BGR image to 32-bit RGBA image.
+        \short Converts 24-bit BGR image to 32-bit BGRA image.
 
         All images must have the same width and height.
 
+        \note This function has a C++ wrapper Simd::BgrToBgra(const View<A>& bgr, View<A>& bgra, uint8_t alpha).
+
         \param [in] bgr - a pointer to pixels data of input 24-bit BGR image.
         \param [in] width - an image width.
         \param [in] height - an image height.
         \param [in] bgrStride - a row size of the bgr image.
-        \param [out] rgba - a pointer to pixels data of output 32-bit BGRA image.
-        \param [in] rgbaStride - a row size of the bgra image.
-        \param [in] alpha - a value of alpha channel.
-    */
-    SIMD_API void SimdBgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha);
-
-    /*! @ingroup bgr_conversion
-
-        \fn void SimdBgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride);
-
-        \short Converts 32-bit BGRA image to 32-bit RGBA image.
-
-        All images must have the same width and height.
-
-        \param [in] bgra - a pointer to pixels data of input 32-bit BGRA image.
-        \param [in] width - an image width.
-        \param [in] height - an image height.
+        \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image.
         \param [in] bgraStride - a row size of the bgra image.
-        \param [out] rgba - a pointer to pixels data of output 32-bit RGBA image.
-        \param [in] rgbaStride - a row size of the rgba image.
         \param [in] alpha - a value of alpha channel.
     */
-    SIMD_API void SimdBgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride);
+    SIMD_API void SimdBgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
     /*! @ingroup other_conversion
 
@@ -512,39 +501,23 @@ extern "C"
 
     /*! @ingroup bgr_conversion
 
-        \fn void SimdRgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride);
+        \fn void SimdBgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride);
 
-        \short Converts 24-bit RGB image to 8-bit gray image.
+        \short Converts 24-bit BGR image to 24-bit RGB image. Also it can be used for 24-bit RGB to 24-bit BGR conversion.
 
         All images must have the same width and height.
 
-        \param [in] rgb - a pointer to pixels data of input 24-bit BGR image.
-        \param [in] width - an image width.
-        \param [in] height - an image height.
-        \param [in] rgbStride - a row size of the bgr image.
-        \param [out] gray - a pointer to pixels data of output 8-bit gray image.
-        \param [in] grayStride - a row size of the gray image.
-    */
-    SIMD_API void SimdRgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride);
-
-    /*! @ingroup bgr_conversion
-
-        \fn void SimdBgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
-
-        \short Converts 24-bit BGR image to 24-bit RGB image (also it performs backward conversion).
-
-        All images must have the same width and height.
+        \note This function has C++ wrappers: Simd::BgrToRgb(const View<A> & bgr, View<A> & rgb) 
+            and Simd::RgbToBgr(const View<A>& rgb, View<A>& bgr).
 
-        \note This function has a C++ wrapper Simd::BgrToRgb(const View<A> & bgr, View<A> & rgb).
-
-        \param [in] bgr - a pointer to pixels data of input 24-bit BGR image.
-        \param [in] bgrStride - a row size of the bgr image.
+        \param [in] bgr - a pointer to pixels data of input 24-bit BGR image (or 24-bit RGB image).
         \param [in] width - an image width.
         \param [in] height - an image height.
-        \param [out] rgb - a pointer to pixels data of output 24-bit RGB image.
+        \param [in] bgrStride - a row size of the bgr image.
+        \param [out] rgb - a pointer to pixels data of output 24-bit RGB image (or 24-bit BGR image).
         \param [in] rgbStride - a row size of the rgb image.
     */
-    SIMD_API void SimdBgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
+    SIMD_API void SimdBgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride);
 
     /*! @ingroup copying
 
@@ -591,7 +564,7 @@ extern "C"
     SIMD_API void SimdCopyFrame(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize,
         size_t frameLeft, size_t frameTop, size_t frameRight, size_t frameBottom, uint8_t * dst, size_t dstStride);
 
-    /*! @ingroup other_conversion
+    /*! @ingroup deinterleave_conversion
 
         \fn void SimdDeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride);
 
@@ -599,7 +572,9 @@ extern "C"
 
         All images must have the same width and height.
 
-        \note This function has a C++ wrapper Simd::DeinterleaveBgr(const View<A>& bgr, View<A>& b, View<A>& g, View<A>& r).
+        \note This function has C++ wrappers:
+            Simd::DeinterleaveBgr(const View<A>& bgr, View<A>& b, View<A>& g, View<A>& r),
+            Simd::DeinterleaveRgb(const View<A>& rgb, View<A>& r, View<A>& g, View<A>& b).
 
         \param [in] bgr - a pointer to pixels data of input 24-bit BGR interleaved image.
         \param [in] bgrStride - a row size of the bgr image.
@@ -615,7 +590,7 @@ extern "C"
     SIMD_API void SimdDeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height,
         uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride);
 
-    /*! @ingroup other_conversion
+    /*! @ingroup deinterleave_conversion
 
         \fn void SimdDeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride);
 
@@ -623,7 +598,11 @@ extern "C"
 
         All images must have the same width and height.
 
-        \note This function has a C++ wrapper Simd::DeinterleaveBgra(const View<A>& bgra, View<A>& b, View<A>& g, View<A>& r, View<A>& a).
+        \note This function has C++ wrappers:
+            Simd::DeinterleaveBgra(const View<A>& bgra, View<A>& b, View<A>& g, View<A>& r, View<A>& a),
+            Simd::DeinterleaveBgra(const View<A>& bgra, View<A>& b, View<A>& g, View<A>& r),
+            Simd::DeinterleaveRgba(const View<A>& rgba, View<A>& r, View<A>& g, View<A>& b, View<A>& a),
+            Simd::DeinterleaveRgba(const View<A>& rgba, View<A>& r, View<A>& g, View<A>& b).
 
         \param [in] bgra - a pointer to pixels data of input 32-bit BGRA interleaved image.
         \param [in] bgraStride - a row size of the bgra image.
@@ -635,7 +614,7 @@ extern "C"
         \param [in] gStride - a row size of the g image.
         \param [out] r - a pointer to pixels data of 8-bit Red planar image.
         \param [in] rStride - a row size of the r image.
-        \param [out] a - a pointer to pixels data of 8-bit Alpha planar image.
+        \param [out] a - a pointer to pixels data of 8-bit Alpha planar image. It can be NULL.
         \param [in] aStride - a row size of the a image.
     */
     SIMD_API void SimdDeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height,
@@ -670,20 +649,27 @@ extern "C"
         size_t channelCount, uint8_t * dst, size_t dstStride);
 
     /*! @ingroup gaussian_filter
+
         \fn void * SimdGaussianBlurInit(size_t width, size_t height, size_t channels, const float * sigma, const float* epsilon);
+
         \short Creates Gaussian blur filter context.
+
         In particular calculates Gaussian blur coefficients:
         \verbatim
         half = floor(sqrt(log(1/epsilon)) * sigma);
         weight[2*half + 1];
+
         for(x = -half; x <= half; ++x)
             weight[x + half] = exp(-sqr(x / sigma) / 2);
+
         sum = 0;
         for (x = -half; x <= half; ++x)
             sum += weight[x + half];
+
         for (x = -half; x <= half; ++x)
             weight[x + half] /= sum;
         \endverbatim
+
         \param [in] width - a width of input and output image.
         \param [in] height - a height of input and output image.
         \param [in] channels - a channel number of input and output image. Its value must be in range [1..4].
@@ -697,8 +683,11 @@ extern "C"
     SIMD_API void* SimdGaussianBlurInit(size_t width, size_t height, size_t channels, const float * sigma, const float* epsilon);
 
     /*! @ingroup gaussian_filter
+
         \fn void SimdGaussianBlurRun(const void* filter, const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride);
+
         \short Performs image Gaussian bluring.
+
         Bluring algorithm for every point:
         \verbatim
         sum = 0;
@@ -713,6 +702,7 @@ extern "C"
         }
         dst[dx, dy] = sum;
         \endverbatim
+
         \param [in] filter - a filter context. It must be created by function ::SimdGaussianBlurInit and released by function ::SimdRelease.
         \param [in] src - a pointer to pixels data of the original input image.
         \param [in] srcStride - a row size (in bytes) of the input image.
@@ -725,17 +715,18 @@ extern "C"
 
         \fn void SimdGrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride);
 
-        \short Converts 8-bit gray image to 24-bit BGR image.
+        \short Converts 8-bit gray image to 24-bit BGR image. Also it can be used for 8-bit gray to 24-bit RGB conversion.
 
         All images must have the same width and height.
 
-        \note This function has a C++ wrapper Simd::GrayToBgr(const View<A>& gray, View<A>& bgr).
+        \note This function has C++ wrappers: Simd::GrayToBgr(const View<A>& gray, View<A>& bgr) 
+            and Simd::GrayToRgb(const View<A>& gray, View<A>& rgb).
 
         \param [in] gray - a pointer to pixels data of input 8-bit gray image.
         \param [in] width - an image width.
         \param [in] height - an image height.
         \param [in] grayStride - a row size of the gray image.
-        \param [out] bgr - a pointer to pixels data of output 24-bit BGR image.
+        \param [out] bgr - a pointer to pixels data of output 24-bit BGR (or 24-bit RGB) image.
         \param [in] bgrStride - a row size of the bgr image.
     */
     SIMD_API void SimdGrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride);
@@ -744,17 +735,18 @@ extern "C"
 
         \fn void SimdGrayToBgra(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
-        \short Converts 8-bit gray image to 32-bit BGRA image.
+        \short Converts 8-bit gray image to 32-bit BGRA image. Also it can be used for 8-bit gray to 32-bit RGBA conversion.
 
         All images must have the same width and height.
 
-        \note This function has a C++ wrapper Simd::GrayToBgra(const View<A>& gray, View<A>& bgra, uint8_t alpha).
+        \note This function has C++ wrappers: Simd::GrayToBgra(const View<A>& gray, View<A>& bgra, uint8_t alpha) 
+            and Simd::GrayToRgba(const View<A>& gray, View<A>& rgba, uint8_t alpha).
 
         \param [in] gray - a pointer to pixels data of input 8-bit gray image.
         \param [in] width - an image width.
         \param [in] height - an image height.
         \param [in] grayStride - a row size of the gray image.
-        \param [out] bgra - a pointer to pixels data of output 32-bit BGRA image.
+        \param [out] bgra - a pointer to pixels data of output 32-bit BGRA (or 32-bit RGBA) image.
         \param [in] bgraStride - a row size of the bgra image.
         \param [in] alpha - a value of alpha channel.
     */
@@ -785,7 +777,7 @@ extern "C"
     SIMD_API void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride,
         size_t width, size_t height, uint8_t * bgr, size_t bgrStride);
 
-    /*! @ingroup other_conversion
+    /*! @ingroup interleave_conversion
 
         \fn void SimdInterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride);
 
@@ -1125,6 +1117,16 @@ extern "C"
 
         \short Creates resize context.
 
+        An using example (resize of RGBA64 image):
+        \verbatim
+        void * resizer = SimdResizerInit(srcX, srcY, dstX, dstY, 4, SimdResizeChannelShort, SimdResizeMethodBilinear);
+        if (resizer)
+        {
+             SimdResizerRun(resizer, (uint8_t*)src, srcStride, (uint8_t*)dst, dstStride);
+             SimdRelease(resizer);
+        }
+        \endverbatim
+
         \param [in] srcX - a width of the input image.
         \param [in] srcY - a height of the input image.
         \param [in] dstX - a width of the output image.
@@ -1152,6 +1154,65 @@ extern "C"
     */
     SIMD_API void SimdResizerRun(const void * resizer, const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
 
+    /*! @ingroup rgb_conversion
+
+        \fn void SimdRgbToBgra(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
+
+        \short Converts 24-bit RGB image to 32-bit BGRA image. Also it can be used for 24-bit BGR to 32-bit RGBA conversion.
+
+        All images must have the same width and height.
+
+        \note This function has C++ wrappers: Simd::RgbToBgra(const View<A>& rgb, View<A>& bgra, uint8_t alpha)
+            and Simd::BgrToRgba(const View<A>& bgr, View<A>& rgba, uint8_t alpha).
+
+        \param [in] rgb - a pointer to pixels data of input 24-bit RGB (or 24-bit BGR) image.
+        \param [in] width - an image width.
+        \param [in] height - an image height.
+        \param [in] rgbStride - a row size of the rgb image.
+        \param [out] bgra - a pointer to pixels data of output 32-bit BGRA (or 32-bit RGBA) image.
+        \param [in] bgraStride - a row size of the bgra image.
+        \param [in] alpha - a value of alpha channel.
+    */
+    SIMD_API void SimdRgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+
+    /*! @ingroup rgb_conversion
+
+        \fn void SimdRgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride);
+
+        \short Converts 24-bit RGB image to 8-bit gray image.
+
+        All images must have the same width and height.
+
+        \note This function has a C++ wrapper Simd::RgbToGray(const View<A>& rgb, View<A>& gray).
+
+        \param [in] rgb - a pointer to pixels data of input 24-bit RGB image.
+        \param [in] width - an image width.
+        \param [in] height - an image height.
+        \param [in] rgbStride - a row size of the rgb image.
+        \param [out] gray - a pointer to pixels data of output 8-bit gray image.
+        \param [in] grayStride - a row size of the gray image.
+    */
+    SIMD_API void SimdRgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride);
+
+    /*! @ingroup rgba_conversion
+
+        \fn void SimdRgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride);
+
+        \short Converts 32-bit RGBA image to 8-bit gray image.
+
+        All images must have the same width and height.
+
+        \note This function has a C++ wrapper Simd::RgbaToGray(const View<A>& rgba, View<A>& gray).
+
+        \param [in] rgba - a pointer to pixels data of input 32-bit RGBA image.
+        \param [in] width - an image width.
+        \param [in] height - an image height.
+        \param [in] rgbaStride - a row size of the rgba image.
+        \param [out] gray - a pointer to pixels data of output 8-bit gray image.
+        \param [in] grayStride - a row size of the gray image.
+    */
+    SIMD_API void SimdRgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride);
+
     /*! @ingroup resizing
 
         \fn void SimdStretchGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
diff --git a/3rdparty/simdlib/Simd/SimdLib.hpp b/3rdparty/simdlib/Simd/SimdLib.hpp
old mode 100644
new mode 100755
index 7f7e6745d5..aaedc571e2
--- a/3rdparty/simdlib/Simd/SimdLib.hpp
+++ b/3rdparty/simdlib/Simd/SimdLib.hpp
@@ -1,8 +1,8 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
-*               2014-2016 Antonenka Mikhail,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
+*               2014-2019 Antonenka Mikhail,
 *               2019-2019 Facundo Galan.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -31,7 +31,9 @@
 #ifndef __SimdLib_hpp__
 #define __SimdLib_hpp__
 
-/*! \namespace Simd */
+/*! @ingroup functions
+    Simd API C++ wrappers.
+*/
 namespace Simd
 {
     /*! @ingroup bgra_conversion
@@ -74,6 +76,46 @@ namespace Simd
         SimdBgraToGray(bgra.data, bgra.width, bgra.height, bgra.stride, gray.data, gray.stride);
     }
 
+    /*! @ingroup bgra_conversion
+
+        \fn void BgraToRgb(const View<A>& bgra, View<A>& rgb)
+
+        \short Converts 32-bit BGRA image to 24-bit RGB image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdBgraToRgb.
+
+        \param [in] bgra - an input 32-bit BGRA image.
+        \param [out] rgb - an output 24-bit RGB image.
+    */
+    template<template<class> class A> SIMD_INLINE void BgraToRgb(const View<A>& bgra, View<A>& rgb)
+    {
+        assert(EqualSize(bgra, rgb) && bgra.format == View<A>::Bgra32 && rgb.format == View<A>::Rgb24);
+
+        SimdBgraToRgb(bgra.data, bgra.width, bgra.height, bgra.stride, rgb.data, rgb.stride);
+    }
+
+    /*! @ingroup bgra_conversion
+
+        \fn void BgraToRgba(const View<A>& bgra, View<A>& rgba)
+
+        \short Converts 32-bit BGRA image to 32-bit RGBA image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdBgraToRgba.
+
+        \param [in] bgra - an input 32-bit BGRA image.
+        \param [out] rgba - an output 32-bit RGBA image.
+    */
+    template<template<class> class A> SIMD_INLINE void BgraToRgba(const View<A>& bgra, View<A>& rgba)
+    {
+        assert(EqualSize(bgra, rgba) && bgra.format == View<A>::Bgra32 && rgba.format == View<A>::Rgba32);
+
+        SimdBgraToRgba(bgra.data, bgra.width, bgra.height, bgra.stride, rgba.data, rgba.stride);
+    }
+
     /*! @ingroup bgr_conversion
 
         \fn void BgrToBgra(const View<A>& bgr, View<A>& bgra, uint8_t alpha = 0xFF)
@@ -142,7 +184,7 @@ namespace Simd
 
         \fn void BgrToRgb(const View<A> & bgr, View<A> & rgb)
 
-        \short Converts 24-bit BGR image to 24-bit RGB image (also it performs backward conversion).
+        \short Converts 24-bit BGR image to 24-bit RGB image.
 
         All images must have the same width and height.
 
@@ -153,9 +195,30 @@ namespace Simd
     */
     template<template<class> class A> SIMD_INLINE void BgrToRgb(const View<A> & bgr, View<A> & rgb)
     {
-        assert(EqualSize(bgr, rgb) && bgr.PixelSize() == 3 && rgb.PixelSize() == 3);
+        assert(EqualSize(bgr, rgb) && bgr.format == View<A>::Bgr24 && rgb.format == View<A>::Rgb24);
 
-        SimdBgrToRgb(bgr.data, bgr.stride, bgr.width, bgr.height, rgb.data, rgb.stride);
+        SimdBgrToRgb(bgr.data, bgr.width, bgr.height, bgr.stride, rgb.data, rgb.stride);
+    }
+
+    /*! @ingroup bgr_conversion
+
+        \fn void BgrToRgba(const View<A>& bgr, View<A>& rgba, uint8_t alpha = 0xFF)
+
+        \short Converts 24-bit BGR image to 32-bit RGBA image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdRgbToBgra.
+
+        \param [in] bgr - an input 24-bit BGR image.
+        \param [out] rgba - an output 32-bit RGBA image.
+        \param [in] alpha - a value of alpha channel. It is equal to 256 by default.
+    */
+    template<template<class> class A> SIMD_INLINE void BgrToRgba(const View<A>& bgr, View<A>& rgba, uint8_t alpha = 0xFF)
+    {
+        assert(EqualSize(bgr, rgba) && rgba.format == View<A>::Rgba32 && bgr.format == View<A>::Bgr24);
+
+        SimdRgbToBgra(bgr.data, bgr.width, bgr.height, bgr.stride, rgba.data, rgba.stride, alpha);
     }
 
     /*! @ingroup copying
@@ -204,7 +267,7 @@ namespace Simd
             frame.left, frame.top, frame.right, frame.bottom, dst.data, dst.stride);
     }
 
-    /*! @ingroup other_conversion
+    /*! @ingroup deinterleave_conversion
 
         \fn void DeinterleaveBgr(const View<A>& bgr, View<A>& b, View<A>& g, View<A>& r)
 
@@ -226,7 +289,7 @@ namespace Simd
         SimdDeinterleaveBgr(bgr.data, bgr.stride, bgr.width, bgr.height, b.data, b.stride, g.data, g.stride, r.data, r.stride);
     }
 
-    /*! @ingroup other_conversion
+    /*! @ingroup deinterleave_conversion
 
         \fn void DeinterleaveBgra(const View<A>& bgra, View<A>& b, View<A>& g, View<A>& r, View<A>& a)
 
@@ -249,6 +312,95 @@ namespace Simd
         SimdDeinterleaveBgra(bgra.data, bgra.stride, bgra.width, bgra.height, b.data, b.stride, g.data, g.stride, r.data, r.stride, a.data, a.stride);
     }
 
+    /*! @ingroup deinterleave_conversion
+
+        \fn void DeinterleaveBgra(const View<A>& bgra, View<A>& b, View<A>& g, View<A>& r)
+
+        \short Deinterleaves 32-bit BGRA interleaved image into separated 8-bit Blue, Green and Red planar images (Alpha channel is ignored).
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdDeinterleaveBgra.
+
+        \param [in] bgra - an input 32-bit BGRA interleaved image.
+        \param [out] b - an output 8-bit Blue planar image.
+        \param [out] g - an output 8-bit Green planar image.
+        \param [out] r - an output 8-bit Red planar image.
+    */
+    template<template<class> class A> SIMD_INLINE void DeinterleaveBgra(const View<A>& bgra, View<A>& b, View<A>& g, View<A>& r)
+    {
+        assert(EqualSize(bgra, b) && Compatible(b, g, r) && bgra.format == View<A>::Bgra32 && b.format == View<A>::Gray8);
+
+        SimdDeinterleaveBgra(bgra.data, bgra.stride, bgra.width, bgra.height, b.data, b.stride, g.data, g.stride, r.data, r.stride, NULL, 0);
+    }
+
+    /*! @ingroup deinterleave_conversion
+
+        \fn void DeinterleaveRgb(const View<A>& rgb, View<A>& r, View<A>& g, View<A>& b)
+
+        \short Deinterleaves 24-bit RGB interleaved image into separated 8-bit Red, Green and Blue planar images.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdDeinterleaveBgr.
+
+        \param [in] rgb - an input 24-bit RGB interleaved image.
+        \param [out] r - an output 8-bit Red planar image.
+        \param [out] g - an output 8-bit Green planar image.
+        \param [out] b - an output 8-bit Blue planar image.
+        */
+    template<template<class> class A> SIMD_INLINE void DeinterleaveRgb(const View<A>& rgb, View<A>& r, View<A>& g, View<A>& b)
+    {
+        assert(EqualSize(rgb, b) && Compatible(b, g, r) && rgb.format == View<A>::Rgb24 && b.format == View<A>::Gray8);
+
+        SimdDeinterleaveBgr(rgb.data, rgb.stride, rgb.width, rgb.height, r.data, r.stride, g.data, g.stride, b.data, b.stride);
+    }
+
+    /*! @ingroup deinterleave_conversion
+
+        \fn void DeinterleaveRgba(const View<A>& rgba, View<A>& r, View<A>& g, View<A>& b, View<A>& a)
+
+        \short Deinterleaves 32-bit RGBA interleaved image into separated 8-bit Red, Green, Blue and Alpha planar images.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdDeinterleaveBgra.
+
+        \param [in] rgba - an input 32-bit RGBA interleaved image.
+        \param [out] r - an output 8-bit Red planar image.
+        \param [out] g - an output 8-bit Green planar image.
+        \param [out] b - an output 8-bit Blue planar image.
+        \param [out] a - an output 8-bit Alpha planar image.
+    */
+    template<template<class> class A> SIMD_INLINE void DeinterleaveRgba(const View<A>& rgba, View<A>& r, View<A>& g, View<A>& b, View<A>& a)
+    {
+        assert(EqualSize(rgba, b) && Compatible(b, g, r, a) && rgba.format == View<A>::Rgba32 && b.format == View<A>::Gray8);
+
+        SimdDeinterleaveBgra(rgba.data, rgba.stride, rgba.width, rgba.height, r.data, r.stride, g.data, g.stride, b.data, b.stride, a.data, a.stride);
+    }
+
+    /*! @ingroup deinterleave_conversion
+
+        \fn void DeinterleaveRgba(const View<A>& rgba, View<A>& r, View<A>& g, View<A>& b)
+
+        \short Deinterleaves 32-bit RGBA interleaved image into separated 8-bit Red, Green and Blue planar images (Alpha channel is ignored).
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdDeinterleaveBgra.
+
+        \param [in] rgba - an input 32-bit RGBA interleaved image.
+        \param [out] r - an output 8-bit Red planar image.
+        \param [out] g - an output 8-bit Green planar image.
+        \param [out] b - an output 8-bit Blue planar image.
+    */
+    template<template<class> class A> SIMD_INLINE void DeinterleaveRgba(const View<A>& rgba, View<A>& r, View<A>& g, View<A>& b)
+    {
+        assert(EqualSize(rgba, b) && Compatible(b, g, r) && rgba.format == View<A>::Rgba32 && b.format == View<A>::Gray8);
+
+        SimdDeinterleaveBgra(rgba.data, rgba.stride, rgba.width, rgba.height, r.data, r.stride, g.data, g.stride, b.data, b.stride, NULL, 0);
+    }
+
     /*! @ingroup other_filter
 
         \fn void GaussianBlur3x3(const View<A>& src, View<A>& dst)
@@ -295,6 +447,26 @@ namespace Simd
         SimdGrayToBgr(gray.data, gray.width, gray.height, gray.stride, bgr.data, bgr.stride);
     }
 
+    /*! @ingroup gray_conversion
+
+        \fn void GrayToRgb(const View<A>& gray, View<A>& rgb)
+
+        \short Converts 8-bit gray image to 24-bit RGB image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdGrayToBgr.
+
+        \param [in] gray - an input 8-bit gray image.
+        \param [out] rgb - an output 24-bit RGB image.
+    */
+    template<template<class> class A> SIMD_INLINE void GrayToRgb(const View<A>& gray, View<A>& rgb)
+    {
+        assert(EqualSize(gray, rgb) && rgb.format == View<A>::Rgb24 && gray.format == View<A>::Gray8);
+
+        SimdGrayToBgr(gray.data, gray.width, gray.height, gray.stride, rgb.data, rgb.stride);
+    }
+
     /*! @ingroup gray_conversion
 
         \fn void GrayToBgra(const View<A>& gray, View<A>& bgra, uint8_t alpha = 0xFF)
@@ -316,6 +488,27 @@ namespace Simd
         SimdGrayToBgra(gray.data, gray.width, gray.height, gray.stride, bgra.data, bgra.stride, alpha);
     }
 
+    /*! @ingroup gray_conversion
+
+        \fn void GrayToRgba(const View<A>& gray, View<A>& rgba, uint8_t alpha = 0xFF)
+
+        \short Converts 8-bit gray image to 32-bit RGBA image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdGrayToBgra.
+
+        \param [in] gray - an input 8-bit gray image.
+        \param [out] rgba - an output 32-bit RGBA image.
+        \param [in] alpha - a value of alpha channel. It is equal to 255 by default.
+    */
+    template<template<class> class A> SIMD_INLINE void GrayToRgba(const View<A>& gray, View<A>& rgba, uint8_t alpha = 0xFF)
+    {
+        assert(EqualSize(gray, rgba) && rgba.format == View<A>::Rgba32 && gray.format == View<A>::Gray8);
+
+        SimdGrayToBgra(gray.data, gray.width, gray.height, gray.stride, rgba.data, rgba.stride, alpha);
+    }
+
     /*! @ingroup other_conversion
 
         \fn void InterleaveBgr(const View<A> & b, const View<A> & g, const View<A> & r, View<A> & bgr)
@@ -338,7 +531,7 @@ namespace Simd
         SimdInterleaveBgr(b.data, b.stride, g.data, g.stride, r.data, r.stride, bgr.width, bgr.height, bgr.data, bgr.stride);
     }
 
-    /*! @ingroup other_conversion
+    /*! @ingroup interleave_conversion
 
         \fn void InterleaveBgra(const View<A>& b, const View<A>& g, const View<A>& r, const View<A>& a, View<A>& bgra)
 
@@ -798,6 +991,200 @@ namespace Simd
         }
     }
 
+    /*! @ingroup resizing
+
+        \fn void Resize(const View<A> & src, View<A> & dst, const Point<ptrdiff_t> & size, ::SimdResizeMethodType method = ::SimdResizeMethodBilinear)
+
+        \short Performs resizing of image.
+
+        \param [in] src - an original input image.
+        \param [out] dst - a resized output image. The input image can be the output.
+        \param [in] size - a size of output image.
+        \param [in] method - a resizing method. By default it is equal to ::SimdResizeMethodBilinear.
+    */
+    template<template<class> class A> SIMD_INLINE void Resize(const View<A>& src, View<A>& dst, const Point<ptrdiff_t> & size, ::SimdResizeMethodType method = ::SimdResizeMethodBilinear)
+    {
+        assert(src.format == View<A>::Float || src.ChannelSize() == 1);
+
+        if (&src == &dst)
+        {
+            if (src.Size() != size)
+            {
+                View<A> tmp(size, src.format);
+                Resize(src, tmp, method);
+                dst.Swap(tmp);
+            }
+        }
+        else
+        {
+            if (dst.Size() != size)
+                dst.Recreate(size, src.format);
+            Resize(src, dst, method);
+        }
+    }
+
+    /*! @ingroup rgb_conversion
+
+        \fn void RgbToBgr(const View<A> & rgb, View<A> & bgr)
+
+        \short Converts 24-bit RGB image to 24-bit BGR image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdBgrToRgb.
+
+        \param [in] rgb - an input 24-bit RGB image.
+        \param [out] bgr - an output 24-bit BGR image.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbToBgr(const View<A>& rgb, View<A>& bgr)
+    {
+        assert(EqualSize(bgr, rgb) && rgb.format == View<A>::Rgb24 || bgr.format == View<A>::Bgr24);
+
+        SimdBgrToRgb(rgb.data, rgb.width, rgb.height, rgb.stride, bgr.data, bgr.stride);
+    }
+
+    /*! @ingroup rgb_conversion
+
+        \fn void RgbToBgra(const View<A>& rgb, View<A>& bgra, uint8_t alpha = 0xFF)
+
+        \short Converts 24-bit RGB image to 32-bit BGRA image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdRgbToBgra.
+
+        \param [in] rgb - an input 24-bit RGB image.
+        \param [out] bgra - an output 32-bit BGRA image.
+        \param [in] alpha - a value of alpha channel. It is equal to 256 by default.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbToBgra(const View<A>& rgb, View<A>& bgra, uint8_t alpha = 0xFF)
+    {
+        assert(EqualSize(rgb, bgra) && bgra.format == View<A>::Bgra32 && rgb.format == View<A>::Rgb24);
+
+        SimdRgbToBgra(rgb.data, rgb.width, rgb.height, rgb.stride, bgra.data, bgra.stride, alpha);
+    }
+
+    /*! @ingroup rgb_conversion
+
+        \fn void RgbToGray(const View<A>& rgb, View<A>& gray)
+
+        \short Converts 24-bit RGB image to 8-bit gray image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdRgbToGray.
+
+        \param [in] rgb - an input 24-bit RGB image.
+        \param [out] gray - an output 8-bit gray image.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbToGray(const View<A>& rgb, View<A>& gray)
+    {
+        assert(EqualSize(rgb, gray) && rgb.format == View<A>::Rgb24 && gray.format == View<A>::Gray8);
+
+        SimdRgbToGray(rgb.data, rgb.width, rgb.height, rgb.stride, gray.data, gray.stride);
+    }
+
+    /*! @ingroup rgb_conversion
+
+        \fn void RgbToRgba(const View<A>& rgb, View<A>& rgba, uint8_t alpha = 0xFF)
+
+        \short Converts 24-bit RGB image to 32-bit RGBA image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdBgrToBgra.
+
+        \param [in] rgb - an input 24-bit RGB image.
+        \param [out] rgba - an output 32-bit RGBA image.
+        \param [in] alpha - a value of alpha channel. It is equal to 256 by default.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbToRgba(const View<A>& rgb, View<A>& rgba, uint8_t alpha = 0xFF)
+    {
+        assert(EqualSize(rgb, rgba) && rgba.format == View<A>::Rgba32 && rgb.format == View<A>::Rgb24);
+
+        SimdBgrToBgra(rgb.data, rgb.width, rgb.height, rgb.stride, rgba.data, rgba.stride, alpha);
+    }
+
+    /*! @ingroup rgba_conversion
+
+        \fn void RgbaToBgr(const View<A>& rgba, View<A>& bgr)
+
+        \short Converts 32-bit RGBA image to 24-bit BGR image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdBgraToRgb.
+
+        \param [in] rgba - an input 32-bit RGBA image.
+        \param [out] bgr - an output 24-bit RGB image.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbaToBgr(const View<A>& rgba, View<A>& bgr)
+    {
+        assert(EqualSize(rgba, bgr) && rgba.format == View<A>::Rgba32 && bgr.format == View<A>::Bgr24);
+
+        SimdBgraToRgb(rgba.data, rgba.width, rgba.height, rgba.stride, bgr.data, bgr.stride);
+    }
+
+    /*! @ingroup rgba_conversion
+
+        \fn void RgbaToBgra(const View<A>& rgba, View<A>& bgra)
+
+        \short Converts 32-bit RGBA image to 32-bit BGRA image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdBgraToRgba.
+
+        \param [in] rgba - an input 32-bit RGBA image.
+        \param [out] bgra - an output 32-bit BGRA image.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbaToBgra(const View<A>& rgba, View<A>& bgra)
+    {
+        assert(EqualSize(bgra, rgba) && bgra.format == View<A>::Bgra32 && rgba.format == View<A>::Rgba32);
+
+        SimdBgraToRgba(rgba.data, rgba.width, rgba.height, rgba.stride, bgra.data, bgra.stride);
+    }
+
+    /*! @ingroup rgba_conversion
+
+        \fn void RgbaToGray(const View<A>& rgba, View<A>& gray)
+
+        \short Converts 32-bit RGBA image to 8-bit gray image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdRgbaToGray.
+
+        \param [in] rgba - an input 32-bit RGBA image.
+        \param [out] gray - an output 8-bit gray image.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbaToGray(const View<A>& rgba, View<A>& gray)
+    {
+        assert(EqualSize(rgba, gray) && rgba.format == View<A>::Rgba32 && gray.format == View<A>::Gray8);
+
+        SimdRgbaToGray(rgba.data, rgba.width, rgba.height, rgba.stride, gray.data, gray.stride);
+    }
+
+    /*! @ingroup rgba_conversion
+
+        \fn void RgbaToRgb(const View<A>& rgba, View<A>& rgb)
+
+        \short Converts 32-bit RGBA image to 24-bit RGB image.
+
+        All images must have the same width and height.
+
+        \note This function is a C++ wrapper for function ::SimdBgraToBgr.
+
+        \param [in] rgba - an input 32-bit RGBA image.
+        \param [out] rgb - an output 24-bit RGB image.
+    */
+    template<template<class> class A> SIMD_INLINE void RgbaToRgb(const View<A>& rgba, View<A>& rgb)
+    {
+        assert(EqualSize(rgba, rgb) && rgba.format == View<A>::Rgba32 && rgb.format == View<A>::Rgb24);
+
+        SimdBgraToBgr(rgba.data, rgba.width, rgba.height, rgba.stride, rgb.data, rgb.stride);
+    }
+
     /*! @ingroup resizing
 
         \fn void StretchGray2x2(const View<A>& src, View<A>& dst)
@@ -825,7 +1212,7 @@ namespace Simd
 
         The input and output images must have the same width and height.
 
-        \note This function supports conversion between Gray8, Bgr24 and Bgra32 image formats.
+        \note This function supports conversion between View::Gray8, View::Bgr24, View::Bgra32, View::Rgb24 and View::Rgba32 image formats.
 
         \param [in] src - an input image.
         \param [out] dst - an output image.
@@ -848,9 +1235,15 @@ namespace Simd
             case View<A>::Bgra32:
                 GrayToBgra(src, dst);
                 break;
+            case View<A>::Rgba32:
+                GrayToRgba(src, dst);
+                break;
             case View<A>::Bgr24:
                 GrayToBgr(src, dst);
                 break;
+            case View<A>::Rgb24:
+                GrayToRgb(src, dst);
+                break;
             default:
                 assert(0);
             }
@@ -865,6 +1258,32 @@ namespace Simd
             case View<A>::Gray8:
                 BgrToGray(src, dst);
                 break;
+            case View<A>::Rgb24:
+                BgrToRgb(src, dst);
+                break;
+            case View<A>::Rgba32:
+                BgrToRgba(src, dst);
+                break;
+            default:
+                assert(0);
+            }
+            break;
+
+        case View<A>::Rgb24:
+            switch (dst.format)
+            {
+            case View<A>::Bgra32:
+                RgbToBgra(src, dst);
+                break;
+            case View<A>::Bgr24:
+                RgbToBgr(src, dst);
+                break;
+            case View<A>::Gray8:
+                RgbToGray(src, dst);
+                break;
+            case View<A>::Rgba32:
+                RgbToRgba(src, dst);
+                break;
             default:
                 assert(0);
             }
@@ -879,6 +1298,32 @@ namespace Simd
             case View<A>::Gray8:
                 BgraToGray(src, dst);
                 break;
+            case View<A>::Rgb24:
+                BgraToRgb(src, dst);
+                break;
+            case View<A>::Rgba32:
+                BgraToRgba(src, dst);
+                break;
+            default:
+                assert(0);
+            }
+            break;
+
+        case View<A>::Rgba32:
+            switch (dst.format)
+            {
+            case View<A>::Bgra32:
+                RgbaToBgra(src, dst);
+                break;
+            case View<A>::Bgr24:
+                RgbaToBgr(src, dst);
+                break;
+            case View<A>::Gray8:
+                RgbaToGray(src, dst);
+                break;
+            case View<A>::Rgb24:
+                RgbaToRgb(src, dst);
+                break;
             default:
                 assert(0);
             }
diff --git a/3rdparty/simdlib/Simd/SimdLoad.h b/3rdparty/simdlib/Simd/SimdLoad.h
old mode 100644
new mode 100755
index 97d7af7098..243858ca1b
--- a/3rdparty/simdlib/Simd/SimdLoad.h
+++ b/3rdparty/simdlib/Simd/SimdLoad.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -28,16 +28,8 @@
 
 namespace Simd
 {
-    enum PadType
-    {
-        PadNose1,
-        PadNone,
-        PadTail1,
-        PadTail2,
-    };
-
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         template <bool align> SIMD_INLINE __m128 Load(const float * p);
 
@@ -56,7 +48,7 @@ namespace Simd
             return _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)p0), (__m64*)p1);
         }
 
-        SIMD_INLINE __m128 LoadPadZeroNose1(const float * p)
+        SIMD_INLINE __m128 LoadPadZeroNose1(const float* p)
         {
             SIMD_ALIGNED(16) const int32_t m[F] = { 0, -1, -1, -1 };
             __m128 a = _mm_loadu_ps(p + 1);
@@ -64,7 +56,7 @@ namespace Simd
             return _mm_and_ps(b, _mm_load_ps((float*)m));
         }
 
-        SIMD_INLINE __m128 LoadPadZeroTail1(const float * p)
+        SIMD_INLINE __m128 LoadPadZeroTail1(const float* p)
         {
             SIMD_ALIGNED(16) const int32_t m[F] = { -1, -1, -1, 0 };
             __m128 a = _mm_loadu_ps(p - 1);
@@ -72,20 +64,15 @@ namespace Simd
             return _mm_and_ps(b, _mm_load_ps((float*)m));
         }
 
-        SIMD_INLINE __m128 LoadPadZeroTail2(const float * p)
+        SIMD_INLINE __m128 LoadPadZeroTail2(const float* p)
         {
             SIMD_ALIGNED(16) const int32_t m[F] = { -1, -1, 0, 0 };
             __m128 a = _mm_loadu_ps(p - 2);
             __m128 b = _mm_shuffle_ps(a, a, 0xFE);
             return _mm_and_ps(b, _mm_load_ps((float*)m));
         }
-    }
-#endif//SIMD_SSE_ENABLE
 
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
-        using namespace Sse;
+        //---------------------------------------------------------------------
 
         template <bool align> SIMD_INLINE __m128i Load(const __m128i * p);
 
@@ -99,6 +86,11 @@ namespace Simd
             return _mm_load_si128(p);
         }
 
+        SIMD_INLINE __m128i Load(const __m128i* p0, const __m128i* p1)
+        {
+            return _mm_castps_si128(_mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)p0), (__m64*)p1));
+        }
+
         template <bool align> SIMD_INLINE __m128i LoadMaskI8(const __m128i * p, __m128i index)
         {
             return _mm_cmpeq_epi8(Load<align>(p), index);
@@ -113,90 +105,13 @@ namespace Simd
         {
             return _mm_or_si128(_mm_srli_si128(last, count), _mm_and_si128(last, _mm_slli_si128(K_INV_ZERO, A - count)));
         }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadNose3(const uint8_t * p, __m128i a[3])
-        {
-            a[1] = Load<align>((__m128i*)p);
-            a[0] = LoadBeforeFirst<step>(a[1]);
-            a[2] = _mm_loadu_si128((__m128i*)(p + step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadBody3(const uint8_t * p, __m128i a[3])
-        {
-            a[0] = _mm_loadu_si128((__m128i*)(p - step));
-            a[1] = Load<align>((__m128i*)p);
-            a[2] = _mm_loadu_si128((__m128i*)(p + step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadTail3(const uint8_t * p, __m128i a[3])
-        {
-            a[0] = _mm_loadu_si128((__m128i*)(p - step));
-            a[1] = Load<align>((__m128i*)p);
-            a[2] = LoadAfterLast<step>(a[1]);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadNose5(const uint8_t * p, __m128i a[5])
-        {
-            a[2] = Load<align>((__m128i*)p);
-            a[1] = LoadBeforeFirst<step>(a[2]);
-            a[0] = LoadBeforeFirst<step>(a[1]);
-            a[3] = _mm_loadu_si128((__m128i*)(p + step));
-            a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadBody5(const uint8_t * p, __m128i a[5])
-        {
-            a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step));
-            a[1] = _mm_loadu_si128((__m128i*)(p - step));
-            a[2] = Load<align>((__m128i*)p);
-            a[3] = _mm_loadu_si128((__m128i*)(p + step));
-            a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadTail5(const uint8_t * p, __m128i a[5])
-        {
-            a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step));
-            a[1] = _mm_loadu_si128((__m128i*)(p - step));
-            a[2] = Load<align>((__m128i*)p);
-            a[3] = LoadAfterLast<step>(a[2]);
-            a[4] = LoadAfterLast<step>(a[3]);
-        }
-
-        SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m128i a[3])
-        {
-            a[0] = LoadBeforeFirst<1>(_mm_loadu_si128((__m128i*)p));
-            a[2] = _mm_loadu_si128((__m128i*)(p + 1));
-        }
-
-        SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m128i a[3])
-        {
-            a[0] = _mm_loadu_si128((__m128i*)(p - 1));
-            a[2] = _mm_loadu_si128((__m128i*)(p + 1));
-        }
-
-        SIMD_INLINE void LoadTailDx(const uint8_t * p, __m128i a[3])
-        {
-            a[0] = _mm_loadu_si128((__m128i*)(p - 1));
-            a[2] = LoadAfterLast<1>(_mm_loadu_si128((__m128i*)p));
-        }
     }
 #endif//SIMD_SSE2_ENABLE
 
-#ifdef SIMD_SSE3_ENABLE
-    namespace Sse3
-    {
-#if defined(_MSC_VER) && _MSC_VER >= 1700  && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug      
-        using Sse::Load;
-        using Sse2::Load;
-#endif
-    }
-#endif
-
 #ifdef SIMD_SSE41_ENABLE
     namespace Sse41
     {
 #if defined(_MSC_VER) && _MSC_VER >= 1700  && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug      
-        using Sse::Load;
         using Sse2::Load;
 #endif
     }
@@ -219,12 +134,17 @@ namespace Simd
 
         template<bool align> SIMD_INLINE __m256 Load(const float * p0, const float * p1)
         {
-            return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse::Load<align>(p0)), Sse::Load<align>(p1), 1);
+            return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse2::Load<align>(p0)), Sse2::Load<align>(p1), 1);
         }
 
         SIMD_INLINE __m256 Load(const float * p0, const float * p1, const float * p2, const float * p3)
         {
-            return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse::Load(p0, p1)), Sse::Load(p2, p3), 1);
+            return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse2::Load(p0, p1)), Sse2::Load(p2, p3), 1);
+        }
+
+        SIMD_INLINE __m256 Load(const float * ptr, __m256i mask)
+        {
+            return _mm256_maskload_ps(ptr, mask);
         }
     }
 #endif//SIMD_AVX_ENABLE
@@ -333,86 +253,6 @@ namespace Simd
             __m128i secondHi = LoadHalfAfterLast<step>(firstHi);
             second = _mm256_inserti128_si256(_mm256_castsi128_si256(secondLo), secondHi, 0x1);
         }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadNose3(const uint8_t * p, __m256i a[3])
-        {
-            a[0] = LoadBeforeFirst<align, step>(p);
-            a[1] = Load<align>((__m256i*)p);
-            a[2] = _mm256_loadu_si256((__m256i*)(p + step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadBody3(const uint8_t * p, __m256i a[3])
-        {
-            a[0] = _mm256_loadu_si256((__m256i*)(p - step));
-            a[1] = Load<align>((__m256i*)p);
-            a[2] = _mm256_loadu_si256((__m256i*)(p + step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadTail3(const uint8_t * p, __m256i a[3])
-        {
-            a[0] = _mm256_loadu_si256((__m256i*)(p - step));
-            a[1] = Load<align>((__m256i*)p);
-            a[2] = LoadAfterLast<align, step>(p);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadNose5(const uint8_t * p, __m256i a[5])
-        {
-            LoadBeforeFirst<align, step>(p, a[1], a[0]);
-            a[2] = Load<align>((__m256i*)p);
-            a[3] = _mm256_loadu_si256((__m256i*)(p + step));
-            a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadBody5(const uint8_t * p, __m256i a[5])
-        {
-            a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step));
-            a[1] = _mm256_loadu_si256((__m256i*)(p - step));
-            a[2] = Load<align>((__m256i*)p);
-            a[3] = _mm256_loadu_si256((__m256i*)(p + step));
-            a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step));
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadTail5(const uint8_t * p, __m256i a[5])
-        {
-            a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step));
-            a[1] = _mm256_loadu_si256((__m256i*)(p - step));
-            a[2] = Load<align>((__m256i*)p);
-            LoadAfterLast<align, step>(p, a[3], a[4]);
-        }
-
-        SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m256i a[3])
-        {
-            a[0] = LoadBeforeFirst<false, 1>(p);
-            a[2] = _mm256_loadu_si256((__m256i*)(p + 1));
-        }
-
-        SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m256i a[3])
-        {
-            a[0] = _mm256_loadu_si256((__m256i*)(p - 1));
-            a[2] = _mm256_loadu_si256((__m256i*)(p + 1));
-        }
-
-        SIMD_INLINE void LoadTailDx(const uint8_t * p, __m256i a[3])
-        {
-            a[0] = _mm256_loadu_si256((__m256i*)(p - 1));
-            a[2] = LoadAfterLast<false, 1>(p);
-        }
-
-        template <bool align> SIMD_INLINE __m256 Load(const float * p);
-
-        template <> SIMD_INLINE __m256 Load<false>(const float * p)
-        {
-            return _mm256_loadu_ps(p);
-        }
-
-        template <> SIMD_INLINE __m256 Load<true>(const float * p)
-        {
-#ifdef _MSC_VER
-            return _mm256_castsi256_ps(_mm256_load_si256((__m256i*)p));
-#else
-            return _mm256_load_ps(p);
-#endif
-        }
     }
 #endif//SIMD_AVX2_ENABLE
 
@@ -456,12 +296,12 @@ namespace Simd
 
         template <bool align> SIMD_INLINE int32x4_t Load(const int32_t * p)
         {
-            return (int32x4_t)Load<align>((const uint8_t*)p);
+            return vreinterpretq_s32_u8(Load<align>((const uint8_t*)p));
         }
 
         template <bool align> SIMD_INLINE uint32x4_t Load(const uint32_t * p)
         {
-            return (uint32x4_t)Load<align>((const uint8_t*)p);
+            return vreinterpretq_u32_u8(Load<align>((const uint8_t*)p));
         }
 
         template <bool align> SIMD_INLINE float32x4_t Load(const float * p);
@@ -829,81 +669,6 @@ namespace Simd
             return vextq_u8(last, vextq_u8(last, last, 16 - count), count);
         }
 
-        template <bool align, size_t step> SIMD_INLINE void LoadNose3(const uint8_t * p, uint8x16_t a[3])
-        {
-            a[1] = Load<align>(p);
-            a[0] = LoadBeforeFirst<step>(a[1]);
-            a[2] = vld1q_u8(p + step);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadBody3(const uint8_t * p, uint8x16_t a[3])
-        {
-#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            a[0] = vld1q_u8(p - step);
-            a[1] = Load<align>(p);
-            a[2] = vld1q_u8(p + step);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadTail3(const uint8_t * p, uint8x16_t a[3])
-        {
-            a[0] = vld1q_u8(p - step);
-            a[1] = Load<align>(p);
-            a[2] = LoadAfterLast<step>(a[1]);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadNose5(const uint8_t * p, uint8x16_t a[5])
-        {
-            a[2] = Load<align>(p);
-            a[1] = LoadBeforeFirst<step>(a[2]);
-            a[0] = LoadBeforeFirst<step>(a[1]);
-            a[3] = vld1q_u8(p + step);
-            a[4] = vld1q_u8(p + 2 * step);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadBody5(const uint8_t * p, uint8x16_t a[5])
-        {
-#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            a[0] = vld1q_u8(p - 2 * step);
-            a[1] = vld1q_u8(p - step);
-            a[2] = Load<align>(p);
-            a[3] = vld1q_u8(p + step);
-            a[4] = vld1q_u8(p + 2 * step);
-        }
-
-        template <bool align, size_t step> SIMD_INLINE void LoadTail5(const uint8_t * p, uint8x16_t a[5])
-        {
-            a[0] = vld1q_u8(p - 2 * step);
-            a[1] = vld1q_u8(p - step);
-            a[2] = Load<align>(p);
-            a[3] = LoadAfterLast<step>(a[2]);
-            a[4] = LoadAfterLast<step>(a[3]);
-        }
-
-        SIMD_INLINE void LoadNoseDx(const uint8_t * p, uint8x16_t a[3])
-        {
-            a[0] = LoadBeforeFirst<1>(vld1q_u8(p));
-            a[2] = vld1q_u8(p + 1);
-        }
-
-        SIMD_INLINE void LoadBodyDx(const uint8_t * p, uint8x16_t a[3])
-        {
-#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
-            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
-#endif
-            a[0] = vld1q_u8(p - 1);
-            a[2] = vld1q_u8(p + 1);
-        }
-
-        SIMD_INLINE void LoadTailDx(const uint8_t * p, uint8x16_t a[3])
-        {
-            a[0] = vld1q_u8(p - 1);
-            a[2] = LoadAfterLast<1>(vld1q_u8(p));
-        }
-
         template <size_t count> SIMD_INLINE uint8x8_t LoadBeforeFirst(uint8x8_t first)
         {
             return vext_u8(vext_u8(first, first, count), first, 8 - count);
diff --git a/3rdparty/simdlib/Simd/SimdLoadBlock.h b/3rdparty/simdlib/Simd/SimdLoadBlock.h
new file mode 100755
index 0000000000..8a46e07687
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdLoadBlock.h
@@ -0,0 +1,251 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdLoadBlock_h__
+#define __SimdLoadBlock_h__
+
+#include "Simd/SimdLoad.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
+    {
+        template <bool align, size_t step> SIMD_INLINE void LoadNose3(const uint8_t * p, __m128i a[3])
+        {
+            a[1] = Load<align>((__m128i*)p);
+            a[0] = LoadBeforeFirst<step>(a[1]);
+            a[2] = _mm_loadu_si128((__m128i*)(p + step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadBody3(const uint8_t * p, __m128i a[3])
+        {
+            a[0] = _mm_loadu_si128((__m128i*)(p - step));
+            a[1] = Load<align>((__m128i*)p);
+            a[2] = _mm_loadu_si128((__m128i*)(p + step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadTail3(const uint8_t * p, __m128i a[3])
+        {
+            a[0] = _mm_loadu_si128((__m128i*)(p - step));
+            a[1] = Load<align>((__m128i*)p);
+            a[2] = LoadAfterLast<step>(a[1]);
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadNose5(const uint8_t * p, __m128i a[5])
+        {
+            a[2] = Load<align>((__m128i*)p);
+            a[1] = LoadBeforeFirst<step>(a[2]);
+            a[0] = LoadBeforeFirst<step>(a[1]);
+            a[3] = _mm_loadu_si128((__m128i*)(p + step));
+            a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadBody5(const uint8_t * p, __m128i a[5])
+        {
+            a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step));
+            a[1] = _mm_loadu_si128((__m128i*)(p - step));
+            a[2] = Load<align>((__m128i*)p);
+            a[3] = _mm_loadu_si128((__m128i*)(p + step));
+            a[4] = _mm_loadu_si128((__m128i*)(p + 2 * step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadTail5(const uint8_t * p, __m128i a[5])
+        {
+            a[0] = _mm_loadu_si128((__m128i*)(p - 2 * step));
+            a[1] = _mm_loadu_si128((__m128i*)(p - step));
+            a[2] = Load<align>((__m128i*)p);
+            a[3] = LoadAfterLast<step>(a[2]);
+            a[4] = LoadAfterLast<step>(a[3]);
+        }
+
+        SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m128i a[3])
+        {
+            a[0] = LoadBeforeFirst<1>(_mm_loadu_si128((__m128i*)p));
+            a[2] = _mm_loadu_si128((__m128i*)(p + 1));
+        }
+
+        SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m128i a[3])
+        {
+            a[0] = _mm_loadu_si128((__m128i*)(p - 1));
+            a[2] = _mm_loadu_si128((__m128i*)(p + 1));
+        }
+
+        SIMD_INLINE void LoadTailDx(const uint8_t * p, __m128i a[3])
+        {
+            a[0] = _mm_loadu_si128((__m128i*)(p - 1));
+            a[2] = LoadAfterLast<1>(_mm_loadu_si128((__m128i*)p));
+        }
+    }
+#endif//SIMD_SSE2_ENABLE
+
+#ifdef SIMD_AVX2_ENABLE
+    namespace Avx2
+    {
+        template <bool align, size_t step> SIMD_INLINE void LoadNose3(const uint8_t * p, __m256i a[3])
+        {
+            a[0] = LoadBeforeFirst<align, step>(p);
+            a[1] = Load<align>((__m256i*)p);
+            a[2] = _mm256_loadu_si256((__m256i*)(p + step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadBody3(const uint8_t * p, __m256i a[3])
+        {
+            a[0] = _mm256_loadu_si256((__m256i*)(p - step));
+            a[1] = Load<align>((__m256i*)p);
+            a[2] = _mm256_loadu_si256((__m256i*)(p + step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadTail3(const uint8_t * p, __m256i a[3])
+        {
+            a[0] = _mm256_loadu_si256((__m256i*)(p - step));
+            a[1] = Load<align>((__m256i*)p);
+            a[2] = LoadAfterLast<align, step>(p);
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadNose5(const uint8_t * p, __m256i a[5])
+        {
+            LoadBeforeFirst<align, step>(p, a[1], a[0]);
+            a[2] = Load<align>((__m256i*)p);
+            a[3] = _mm256_loadu_si256((__m256i*)(p + step));
+            a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadBody5(const uint8_t * p, __m256i a[5])
+        {
+            a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step));
+            a[1] = _mm256_loadu_si256((__m256i*)(p - step));
+            a[2] = Load<align>((__m256i*)p);
+            a[3] = _mm256_loadu_si256((__m256i*)(p + step));
+            a[4] = _mm256_loadu_si256((__m256i*)(p + 2 * step));
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadTail5(const uint8_t * p, __m256i a[5])
+        {
+            a[0] = _mm256_loadu_si256((__m256i*)(p - 2 * step));
+            a[1] = _mm256_loadu_si256((__m256i*)(p - step));
+            a[2] = Load<align>((__m256i*)p);
+            LoadAfterLast<align, step>(p, a[3], a[4]);
+        }
+
+        SIMD_INLINE void LoadNoseDx(const uint8_t * p, __m256i a[3])
+        {
+            a[0] = LoadBeforeFirst<false, 1>(p);
+            a[2] = _mm256_loadu_si256((__m256i*)(p + 1));
+        }
+
+        SIMD_INLINE void LoadBodyDx(const uint8_t * p, __m256i a[3])
+        {
+            a[0] = _mm256_loadu_si256((__m256i*)(p - 1));
+            a[2] = _mm256_loadu_si256((__m256i*)(p + 1));
+        }
+
+        SIMD_INLINE void LoadTailDx(const uint8_t * p, __m256i a[3])
+        {
+            a[0] = _mm256_loadu_si256((__m256i*)(p - 1));
+            a[2] = LoadAfterLast<false, 1>(p);
+        }
+    }
+#endif//SIMD_AVX2_ENABLE
+
+#ifdef SIMD_NEON_ENABLE
+    namespace Neon
+    {
+        template <bool align, size_t step> SIMD_INLINE void LoadNose3(const uint8_t * p, uint8x16_t a[3])
+        {
+            a[1] = Load<align>(p);
+            a[0] = LoadBeforeFirst<step>(a[1]);
+            a[2] = vld1q_u8(p + step);
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadBody3(const uint8_t * p, uint8x16_t a[3])
+        {
+#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
+            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
+#endif
+            a[0] = vld1q_u8(p - step);
+            a[1] = Load<align>(p);
+            a[2] = vld1q_u8(p + step);
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadTail3(const uint8_t * p, uint8x16_t a[3])
+        {
+            a[0] = vld1q_u8(p - step);
+            a[1] = Load<align>(p);
+            a[2] = LoadAfterLast<step>(a[1]);
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadNose5(const uint8_t * p, uint8x16_t a[5])
+        {
+            a[2] = Load<align>(p);
+            a[1] = LoadBeforeFirst<step>(a[2]);
+            a[0] = LoadBeforeFirst<step>(a[1]);
+            a[3] = vld1q_u8(p + step);
+            a[4] = vld1q_u8(p + 2 * step);
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadBody5(const uint8_t * p, uint8x16_t a[5])
+        {
+#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
+            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
+#endif
+            a[0] = vld1q_u8(p - 2 * step);
+            a[1] = vld1q_u8(p - step);
+            a[2] = Load<align>(p);
+            a[3] = vld1q_u8(p + step);
+            a[4] = vld1q_u8(p + 2 * step);
+        }
+
+        template <bool align, size_t step> SIMD_INLINE void LoadTail5(const uint8_t * p, uint8x16_t a[5])
+        {
+            a[0] = vld1q_u8(p - 2 * step);
+            a[1] = vld1q_u8(p - step);
+            a[2] = Load<align>(p);
+            a[3] = LoadAfterLast<step>(a[2]);
+            a[4] = LoadAfterLast<step>(a[3]);
+        }
+
+        SIMD_INLINE void LoadNoseDx(const uint8_t * p, uint8x16_t a[3])
+        {
+            a[0] = LoadBeforeFirst<1>(vld1q_u8(p));
+            a[2] = vld1q_u8(p + 1);
+        }
+
+        SIMD_INLINE void LoadBodyDx(const uint8_t * p, uint8x16_t a[3])
+        {
+#if defined(__GNUC__) && SIMD_NEON_PREFECH_SIZE
+            __builtin_prefetch(p + SIMD_NEON_PREFECH_SIZE);
+#endif
+            a[0] = vld1q_u8(p - 1);
+            a[2] = vld1q_u8(p + 1);
+        }
+
+        SIMD_INLINE void LoadTailDx(const uint8_t * p, uint8x16_t a[3])
+        {
+            a[0] = vld1q_u8(p - 1);
+            a[2] = LoadAfterLast<1>(vld1q_u8(p));
+        }
+    }
+#endif//SIMD_NEON_ENABLE
+}
+#endif//__SimdLoadBlock_h__
diff --git a/3rdparty/simdlib/Simd/SimdLog.h b/3rdparty/simdlib/Simd/SimdLog.h
old mode 100644
new mode 100755
index 45ba3f3be5..923a16dc70
--- a/3rdparty/simdlib/Simd/SimdLog.h
+++ b/3rdparty/simdlib/Simd/SimdLog.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -58,8 +58,8 @@ namespace Simd
         Log<T>(array.data, array.size, name);
     }
 
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         SIMD_INLINE void Log(const __m128 & value, const std::string & name)
         {
@@ -67,12 +67,7 @@ namespace Simd
             _mm_storeu_ps(buffer, value);
             Simd::Log<float>(buffer, F, name);
         }
-    }
-#endif //SIMD_SSE_ENABLE
 
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
         template<class T> SIMD_INLINE void Log(const __m128i & value, const std::string & name)
         {
             const size_t n = sizeof(__m128i) / sizeof(T);
@@ -86,7 +81,7 @@ namespace Simd
 #ifdef SIMD_SSE41_ENABLE
     namespace Sse41
     {
-        using namespace Sse;
+        using namespace Sse2;
     }
 #endif //SIMD_SSE41_ENABLE
 
@@ -173,14 +168,15 @@ namespace Simd
 #define SIMD_LOG2(value) Log<int16_t>(value, #value)
 #define SIMD_LOG4(value) Log<int32_t>(value, #value)
 
-#define SIMD_LOG_SS(message) \
+#define SIMD_LOG_ERROR(message) \
 { \
-    std::cout << __FUNCTION__  << " : " << message << std::endl; \
-    std::cout.flush(); \
+    std::stringstream ss; \
+    ss << std::endl << " In function " << SIMD_FUNCTION << ":" << std::endl; \
+    ss << " In file " << __FILE__ << ":" << __LINE__ << ":" << std::endl; \
+    ss << " Error: " << message << std::endl << std::endl; \
+    std::cerr << ss.str() << std::flush; \
 }
 
-#define SIMD_LOG_LINE() std::cout << __FUNCTION__  << " : " << __LINE__ << std::endl << std::flush; 
-
 #else//SIMD_LOG_ENABLE
 
 #define SIMD_LOG(value)
@@ -188,9 +184,7 @@ namespace Simd
 #define SIMD_LOG2(value)
 #define SIMD_LOG4(value)
 
-#define SIMD_LOG_SS(message)
-
-#define SIMD_LOG_LINE()
+#define SIMD_LOG_ERROR(message)
 
 #endif//SIMD_LOG_ENABLE 
 
diff --git a/3rdparty/simdlib/Simd/SimdMath.h b/3rdparty/simdlib/Simd/SimdMath.h
old mode 100644
new mode 100755
index 4b674ea512..0f7425f76e
--- a/3rdparty/simdlib/Simd/SimdMath.h
+++ b/3rdparty/simdlib/Simd/SimdMath.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
 *               2018-2019 Radchenko Andrey.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -66,11 +66,21 @@ namespace Simd
 #define SIMD_ROUND
     SIMD_INLINE int Round(double value)
     {
-#if defined(SIMD_SSE2_ENABLE) && ((defined(_MSC_VER) && defined(_M_X64)) || (defined(__GNUC__) && defined(__x86_64__)))
-        __m128d t = _mm_set_sd(value);
-        return _mm_cvtsd_si32(t);
+#if defined(SIMD_X64_ENABLE) && !defined(SIMD_SSE2_DISABLE)
+        __m128d _value = _mm_set_sd(value);
+        return _mm_cvtsd_si32(_value);
 #else
-        return (int)(value + (value >= 0 ? 0.5 : -0.5));
+        return (int)(value + (value >= 0.0 ? 0.5 : -0.5));
+#endif
+    }
+
+    SIMD_INLINE int Round(float value)
+    {
+#if defined(SIMD_X64_ENABLE) && !defined(SIMD_SSE2_DISABLE)
+        __m128 _value = _mm_set_ss(value);
+        return _mm_cvtss_si32(_value);
+#else
+        return (int)(value + (value >= 0.0f ? 0.5f : -0.5f));
 #endif
     }
 #endif
@@ -263,8 +273,8 @@ namespace Simd
         }
     }
 
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         SIMD_INLINE __m128 Square(__m128 value)
         {
@@ -330,12 +340,7 @@ namespace Simd
             __m128 m = _mm_max_ps(s0, s1);
             return _mm_store_ss(dst, _mm_max_ss(m, _mm_shuffle_ps(m, m, 1)));
         }
-    }
-#endif//SIMD_SSE_ENABLE
 
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
         SIMD_INLINE __m128i SaturateI16ToU8(__m128i value)
         {
             return _mm_min_epi16(K16_00FF, _mm_max_epi16(value, K_ZERO));
@@ -508,17 +513,8 @@ namespace Simd
     }
 #endif// SIMD_SSE2_ENABLE
 
-#ifdef SIMD_SSE3_ENABLE
-    namespace Sse3
-    {
-#if defined(_MSC_VER) && _MSC_VER >= 1700  && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug      
-        using Sse::RightNotZero;
-#endif
-    }
-#endif//SIMD_SSE3_ENABLE
-
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
     {
         using namespace Sse2;
 
@@ -538,12 +534,7 @@ namespace Simd
         {
             return _mm_maddubs_epi16(UnpackU8<part>(a, b), K8_01_FF);
         }
-    }
-#endif// SIMD_SSSE3_ENABLE
 
-#ifdef SIMD_SSE41_ENABLE
-    namespace Sse41
-    {
 #if defined(_MSC_VER) && _MSC_VER >= 1700  && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug     
         using Sse::RightNotZero;
 #endif
diff --git a/3rdparty/simdlib/Simd/SimdMemory.h b/3rdparty/simdlib/Simd/SimdMemory.h
old mode 100644
new mode 100755
index de45abb291..d7772ffa3c
--- a/3rdparty/simdlib/Simd/SimdMemory.h
+++ b/3rdparty/simdlib/Simd/SimdMemory.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2018 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *               2016-2016 Sintegrial Technologies.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -28,9 +28,10 @@
 #include "Simd/SimdDefs.h"
 #include "Simd/SimdMath.h"
 
-#if defined(__GNUC__) && defined(SIMD_ALLOCATE_ERROR_MESSAGE)
+#if defined(SIMD_ALLOCATE_ERROR_MESSAGE)
 #include <iostream>
 #endif
+#include <memory>
 
 namespace Simd
 {
@@ -88,17 +89,18 @@ namespace Simd
         align = AlignHi(align, sizeof(void *));
         size = AlignHi(size, align);
         int result = ::posix_memalign(&ptr, align, size);
-#ifdef SIMD_ALLOCATE_ERROR_MESSAGE
         if (result != 0)
+            ptr = NULL;
+#else
+        ptr = malloc(size);
+#endif
+#ifdef SIMD_ALLOCATE_ERROR_MESSAGE
+        if (ptr == NULL)
             std::cout << "The function posix_memalign can't allocate " << size << " bytes with align " << align << " !" << std::endl << std::flush;
 #endif
 #ifdef SIMD_ALLOCATE_ASSERT
-        assert(result == 0);
-#endif
-#else
-        ptr = malloc(size);
+        assert(ptr);
 #endif
-
 #ifdef SIMD_NO_MANS_LAND
         if (ptr)
             ptr = (char*)ptr + SIMD_NO_MANS_LAND;
@@ -121,60 +123,86 @@ namespace Simd
 #endif
     }
 
+    //---------------------------------------------------------------------------------------------
+
     struct Deletable
     {
         virtual ~Deletable() {}
     };
 
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+    //---------------------------------------------------------------------------------------------
+
+#if defined(SIMD_CPP_2011_ENABLE)
+    template<class T> using Holder = std::unique_ptr<T>;
+#else
+    template <class T> class Holder
     {
-        SIMD_INLINE bool Aligned(size_t size, size_t align = sizeof(__m128))
+        T* _ptr;
+
+    public:
+        Holder(T* ptr)
+            : _ptr(ptr)
         {
-            return Simd::Aligned(size, align);
         }
 
-        SIMD_INLINE bool Aligned(const void * ptr, size_t align = sizeof(__m128))
+        ~Holder()
         {
-            return Simd::Aligned(ptr, align);
+            if (_ptr)
+                delete _ptr;
+        }
+
+        T& operator * ()
+        {
+            return *_ptr;
+        }
+
+        const T& operator * () const
+        {
+            return *_ptr;
+        }
+
+        T* operator -> ()
+        {
+            return _ptr;
         }
-    }
-#endif// SIMD_SSE_ENABLE
+
+        const T* operator -> () const
+        {
+            return _ptr;
+        }
+
+        operator bool() const 
+        {
+            return _ptr != NULL;
+        }
+    };
+#endif
+
+    //---------------------------------------------------------------------------------------------
+
 
 #ifdef SIMD_SSE2_ENABLE
     namespace Sse2
     {
-        using Sse::Aligned;
-    }
-#endif// SIMD_SSE2_ENABLE
-
-#ifdef SIMD_SSE3_ENABLE
-    namespace Sse3
-    {
-        using Sse::Aligned;
-    }
-#endif// SIMD_SSE3_ENABLE
+        SIMD_INLINE bool Aligned(size_t size, size_t align = sizeof(__m128))
+        {
+            return Simd::Aligned(size, align);
+        }
 
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        using Sse::Aligned;
+        SIMD_INLINE bool Aligned(const void * ptr, size_t align = sizeof(__m128))
+        {
+            return Simd::Aligned(ptr, align);
+        }        
     }
-#endif// SIMD_SSSE3_ENABLE
+#endif// SIMD_SSE2_ENABLE
 
 #ifdef SIMD_SSE41_ENABLE
     namespace Sse41
     {
-        using Sse::Aligned;
+        using Sse2::Aligned;
     }
 #endif// SIMD_SSE41_ENABLE
 
-#ifdef SIMD_SSE42_ENABLE
-    namespace Sse42
-    {
-    }
-#endif// SIMD_SSE42_ENABLE
-
 #ifdef SIMD_AVX_ENABLE
     namespace Avx
     {
diff --git a/3rdparty/simdlib/Simd/SimdNeon.h b/3rdparty/simdlib/Simd/SimdNeon.h
old mode 100644
new mode 100755
index 54373b506e..bf2b98be69
--- a/3rdparty/simdlib/Simd/SimdNeon.h
+++ b/3rdparty/simdlib/Simd/SimdNeon.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
 *               2018-2018 Radchenko Andrey.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -36,22 +36,18 @@ namespace Simd
 
         void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride);
 
-        void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride);
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride);
 
-        void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
-
-        void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha);
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride);
 
-        void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride);
+        void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
         void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
             const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
         void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride);
 
-        void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride);
-
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
+        void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride);
 
         void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height,
             uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride);
@@ -93,6 +89,12 @@ namespace Simd
         void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
 
+        void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride);
+
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride);
+
         void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
     }
diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp
old mode 100644
new mode 100755
index bb25c0c6e8..98a360b0e6
--- a/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonBgrToBgra.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -74,6 +74,8 @@ namespace Simd
                 BgrToBgra<false>(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
         }
 
+        //---------------------------------------------------------------------
+
         template <bool align> SIMD_INLINE void Bgr48pToBgra32(uint8_t * bgra,
             const uint8_t * blue, const uint8_t * green, const uint8_t * red, size_t offset, const uint8x16_t & alpha)
         {
@@ -128,6 +130,47 @@ namespace Simd
             else
                 Bgr48pToBgra32<false>(blue, blueStride, width, height, green, greenStride, red, redStride, bgra, bgraStride, alpha);
         }
+
+        //---------------------------------------------------------------------
+
+        template <bool align> SIMD_INLINE void RgbToBgra(const uint8_t* rgb, uint8_t* bgra, uint8x16_t alpha)
+        {
+            uint8x16x3_t _rgb = Load3<align>(rgb);
+            uint8x16x4_t _bgra;
+            _bgra.val[0] = _rgb.val[2];
+            _bgra.val[1] = _rgb.val[1];
+            _bgra.val[2] = _rgb.val[0];
+            _bgra.val[3] = alpha;
+            Store4<align>(bgra, _bgra);
+        }
+
+        template <bool align> void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            uint8x16_t _alpha = vdupq_n_u8(alpha);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0, colBgra = 0, colRgb = 0; col < alignedWidth; col += A, colBgra += A4, colRgb += A3)
+                    RgbToBgra<align>(rgb + colRgb, bgra + colBgra, _alpha);
+                if (width != alignedWidth)
+                    RgbToBgra<false>(rgb + 3 * (width - A), bgra + 4 * (width - A), _alpha);
+                rgb += rgbStride;
+                bgra += bgraStride;
+            }
+        }
+
+        void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride))
+                RgbToBgra<true>(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+            else
+                RgbToBgra<false>(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+        }
     }
 #endif// SIMD_NEON_ENABLE
 }
diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp
old mode 100644
new mode 100755
index 0b9fdeaedf..57cf19f18d
--- a/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonBgrToGray.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -30,29 +30,31 @@ namespace Simd
 #ifdef SIMD_NEON_ENABLE    
     namespace Neon
     {
-        SIMD_INLINE uint8x8_t BgrToGray(uint8x8x3_t bgr)
+        SIMD_INLINE uint8x16_t BgrToGray(uint8x16x3_t bgr)
         {
-            return vmovn_u16(BgrToGray(vmovl_u8(bgr.val[0]), vmovl_u8(bgr.val[1]), vmovl_u8(bgr.val[2])));
+            uint8x8_t lo = vmovn_u16(BgrToGray(UnpackU8<0>(bgr.val[0]), UnpackU8<0>(bgr.val[1]), UnpackU8<0>(bgr.val[2])));
+            uint8x8_t hi = vmovn_u16(BgrToGray(UnpackU8<1>(bgr.val[0]), UnpackU8<1>(bgr.val[1]), UnpackU8<1>(bgr.val[2])));
+            return vcombine_u8(lo, hi);
         }
 
-        template <bool align> void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride)
+        template <bool align> void BgrToGray(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* gray, size_t grayStride)
         {
-            assert(width >= HA);
+            assert(width >= A);
             if (align)
                 assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride));
 
-            size_t alignedWidth = AlignLo(width, HA);
+            size_t alignedWidth = AlignLo(width, A);
             for (size_t row = 0; row < height; ++row)
             {
-                for (size_t col = 0; col < alignedWidth; col += HA)
+                for (size_t col = 0; col < alignedWidth; col += A)
                 {
-                    uint8x8x3_t _bgr = LoadHalf3<align>(bgr + 3 * col);
+                    uint8x16x3_t _bgr = Load3<align>(bgr + 3 * col);
                     Store<align>(gray + col, BgrToGray(_bgr));
                 }
                 if (alignedWidth != width)
                 {
-                    uint8x8x3_t _bgr = LoadHalf3<false>(bgr + 3 * (width - HA));
-                    Store<false>(gray + width - HA, BgrToGray(_bgr));
+                    uint8x16x3_t _bgr = Load3<false>(bgr + 3 * (width - A));
+                    Store<false>(gray + width - A, BgrToGray(_bgr));
                 }
                 bgr += bgrStride;
                 gray += grayStride;
@@ -66,6 +68,47 @@ namespace Simd
             else
                 BgrToGray<false>(bgr, width, height, bgrStride, gray, grayStride);
         }
+
+        //---------------------------------------------------------------------
+
+        SIMD_INLINE uint8x16_t RgbToGray(uint8x16x3_t rgb)
+        {
+            uint8x8_t lo = vmovn_u16(BgrToGray(UnpackU8<0>(rgb.val[2]), UnpackU8<0>(rgb.val[1]), UnpackU8<0>(rgb.val[0])));
+            uint8x8_t hi = vmovn_u16(BgrToGray(UnpackU8<1>(rgb.val[2]), UnpackU8<1>(rgb.val[1]), UnpackU8<1>(rgb.val[0])));
+            return vcombine_u8(lo, hi);
+        }
+
+        template <bool align> void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(rgb) && Aligned(rgbStride) && Aligned(gray) && Aligned(grayStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                {
+                    uint8x16x3_t _rgb = Load3<align>(rgb + 3 * col);
+                    Store<align>(gray + col, RgbToGray(_rgb));
+                }
+                if (alignedWidth != width)
+                {
+                    uint8x16x3_t _rgb = Load3<false>(rgb + 3 * (width - A));
+                    Store<false>(gray + width - A, RgbToGray(_rgb));
+                }
+                rgb += rgbStride;
+                gray += grayStride;
+            }
+        }
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+        {
+            if (Aligned(rgb) && Aligned(gray) && Aligned(rgbStride) && Aligned(grayStride))
+                RgbToGray<true>(rgb, width, height, rgbStride, gray, grayStride);
+            else
+                RgbToGray<false>(rgb, width, height, rgbStride, gray, grayStride);
+        }
     }
 #endif// SIMD_NEON_ENABLE
 }
diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp
old mode 100644
new mode 100755
index fb69a04b5f..b1e69cc3aa
--- a/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonBgrToRgb.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -38,7 +38,7 @@ namespace Simd
             Store3<align>(rgb, _bgr);
         }
 
-        template <bool align> void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
+        template <bool align> void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride)
         {
             assert(width >= A);
             if (align)
@@ -59,12 +59,12 @@ namespace Simd
             }
         }
 
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
+        void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride)
         {
             if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride))
-                BgrToRgb<true>(bgr, bgrStride, width, height, rgb, rgbStride);
+                BgrToRgb<true>(bgr, width, height, bgrStride, rgb, rgbStride);
             else
-                BgrToRgb<false>(bgr, bgrStride, width, height, rgb, rgbStride);
+                BgrToRgb<false>(bgr, width, height, bgrStride, rgb, rgbStride);
         }
     }
 #endif//SIMD_NEON_ENABLE
diff --git a/3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp b/3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp
deleted file mode 100644
index b2950c7da1..0000000000
--- a/3rdparty/simdlib/Simd/SimdNeonBgrToRgba.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE  
-    namespace Neon
-    {
-        const size_t A3 = A * 3;
-        const size_t A4 = A * 4;
-
-        union Bgra
-        {
-            uint8x16x4_t bgra;
-            uint8x16x3_t bgr;
-        };
-
-        template <bool align> SIMD_INLINE void BgrToRgba(const uint8_t * bgr, uint8_t * rgba, Bgra & _bgra)
-        {
-            _bgra.bgr = Load3<align>(bgr);
-            uint8x16_t tmp = _bgra.bgr.val[0];
-            _bgra.bgr.val[0] = _bgra.bgr.val[2];
-            _bgra.bgr.val[2] = tmp;
-            Store4<align>(rgba, _bgra.bgra);
-        }
-
-        template <bool align> void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            Bgra _bgra;
-            _bgra.bgra.val[3] = vdupq_n_u8(alpha);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0, colRgba = 0, colBgr = 0; col < alignedWidth; col += A, colRgba += A4, colBgr += A3)
-                    BgrToRgba<align>(bgr + colBgr, rgba + colRgba, _bgra);
-                if (width != alignedWidth)
-                    BgrToRgba<false>(bgr + 3 * (width - A), rgba + 4 * (width - A), _bgra);
-                bgr += bgrStride;
-                rgba += rgbaStride;
-            }
-        }
-
-        void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha)
-        {
-            if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgrToRgba<true>(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-            else
-                BgrToRgba<false>(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp
old mode 100644
new mode 100755
index f95e1a9118..944fe5b45e
--- a/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonBgraToBgr.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -66,6 +66,87 @@ namespace Simd
             else
                 BgraToBgr<false>(bgra, width, height, bgraStride, bgr, bgrStride);
         }
+
+        //---------------------------------------------------------------------
+
+        template <bool align> SIMD_INLINE void BgraToRgb(const uint8_t* bgra, uint8_t* rgb)
+        {
+            uint8x16x4_t _bgra = Load4<align>(bgra);
+            uint8x16x3_t _rgb;
+            _rgb.val[0] = _bgra.val[2];
+            _rgb.val[1] = _bgra.val[1];
+            _rgb.val[2] = _bgra.val[0];
+            Store3<align>(rgb, _rgb);
+        }
+
+        template <bool align> void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            if (width == alignedWidth)
+                alignedWidth -= A;
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0, colBgra = 0, colRgb = 0; col < alignedWidth; col += A, colBgra += A4, colRgb += A3)
+                    BgraToRgb<align>(bgra + colBgra, rgb + colRgb);
+                if (width != alignedWidth)
+                    BgraToRgb<false>(bgra + 4 * (width - A), rgb + 3 * (width - A));
+                bgra += bgraStride;
+                rgb += rgbStride;
+            }
+        }
+
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride))
+                BgraToRgb<true>(bgra, width, height, bgraStride, rgb, rgbStride);
+            else
+                BgraToRgb<false>(bgra, width, height, bgraStride, rgb, rgbStride);
+        }
+
+        //---------------------------------------------------------------------
+
+        template <bool align> SIMD_INLINE void BgraToRgba(const uint8_t* bgra, uint8_t* rgba)
+        {
+            uint8x16x4_t _bgra = Load4<align>(bgra);
+            uint8x16_t tmp = _bgra.val[0];
+            _bgra.val[0] = _bgra.val[2];
+            _bgra.val[2] = tmp;
+            Store4<align>(rgba, _bgra);
+        }
+
+        template <bool align> void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            if (width == alignedWidth)
+                alignedWidth -= A;
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0, colBgra = 0, colRgba = 0; col < alignedWidth; col += A, colBgra += A4, colRgba += A4)
+                    BgraToRgba<align>(bgra + colBgra, rgba + colRgba);
+                if (width != alignedWidth)
+                    BgraToRgba<false>(bgra + 4 * (width - A), rgba + 4 * (width - A));
+                bgra += bgraStride;
+                rgba += rgbaStride;
+            }
+        }
+
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride))
+                BgraToRgba<true>(bgra, width, height, bgraStride, rgba, rgbaStride);
+            else
+                BgraToRgba<false>(bgra, width, height, bgraStride, rgba, rgbaStride);
+        }
     }
 #endif// SIMD_NEON_ENABLE
 }
diff --git a/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp
old mode 100644
new mode 100755
index 24fc228560..6b2eb4de48
--- a/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonBgraToGray.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -66,6 +66,45 @@ namespace Simd
             else
                 BgraToGray<false>(bgra, width, height, bgraStride, gray, grayStride);
         }
+
+        //---------------------------------------------------------------------
+
+        SIMD_INLINE uint8x8_t RgbaToGray(uint8x8x4_t rgba)
+        {
+            return vmovn_u16(BgrToGray(vmovl_u8(rgba.val[2]), vmovl_u8(rgba.val[1]), vmovl_u8(rgba.val[0])));
+        }
+
+        template <bool align> void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+        {
+            assert(width >= HA);
+            if (align)
+                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride));
+
+            size_t alignedWidth = AlignLo(width, HA);
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += HA)
+                {
+                    uint8x8x4_t _rgba = LoadHalf4<align>(rgba + 4 * col);
+                    Store<align>(gray + col, RgbaToGray(_rgba));
+                }
+                if (alignedWidth != width)
+                {
+                    uint8x8x4_t _rgba = LoadHalf4<false>(rgba + 4 * (width - HA));
+                    Store<false>(gray + width - HA, RgbaToGray(_rgba));
+                }
+                rgba += rgbaStride;
+                gray += grayStride;
+            }
+        }
+
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+        {
+            if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride))
+                RgbaToGray<true>(rgba, width, height, rgbaStride, gray, grayStride);
+            else
+                RgbaToGray<false>(rgba, width, height, rgbaStride, gray, grayStride);
+        }
     }
 #endif// SIMD_NEON_ENABLE
 }
diff --git a/3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp b/3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp
deleted file mode 100644
index d1873eddcb..0000000000
--- a/3rdparty/simdlib/Simd/SimdNeonBgraToRgba.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE  
-    namespace Neon
-    {
-        const size_t A4 = A * 4;
-
-        union Bgra
-        {
-            uint8x16x4_t bgra;
-        };
-
-        template <bool align> SIMD_INLINE void BgraToRgba(const uint8_t * bgra, uint8_t * rgba, Bgra & _bgra)
-        {
-            _bgra.bgra = Load4<align>(bgra);
-            uint8x16_t tmp = _bgra.bgra.val[0];
-            _bgra.bgra.val[0] = _bgra.bgra.val[2];
-            _bgra.bgra.val[2] = tmp;
-            Store4<align>(rgba, _bgra.bgra);
-        }
-
-        template <bool align> void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            Bgra _bgra;
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0, colRgba = 0; col < alignedWidth; col += A, colRgba += A4)
-                    BgraToRgba<align>(bgra + colRgba, rgba + colRgba, _bgra);
-                if (width != alignedWidth)
-                    BgraToRgba<false>(bgra + 4 * (width - A), rgba + 4 * (width - A), _bgra);
-                bgra += bgraStride;
-                rgba += rgbaStride;
-            }
-        }
-
-        void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride)
-        {
-            if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride))
-                BgraToRgba<true>(bgra, width, height, bgraStride, rgba, rgbaStride);
-            else
-                BgraToRgba<false>(bgra, width, height, bgraStride, rgba, rgbaStride);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp b/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp
old mode 100644
new mode 100755
index 53530a788d..36a623efb5
--- a/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonDeinterleave.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -71,6 +71,8 @@ namespace Simd
                 DeinterleaveUv<false>(uv, uvStride, width, height, u, uStride, v, vStride);
         }
 
+        //---------------------------------------------------------------------
+
         template <bool align> void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height,
             uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride)
         {
@@ -118,6 +120,8 @@ namespace Simd
                 DeinterleaveBgr<false>(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride);
         }
 
+        //---------------------------------------------------------------------
+
         template <bool align> void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height,
             uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride)
         {
@@ -125,36 +129,65 @@ namespace Simd
             if (align)
             {
                 assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride));
-                assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride));
+                assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL));
             }
 
             size_t bodyWidth = AlignLo(width, A);
             size_t tail = width - bodyWidth;
-            for (size_t row = 0; row < height; ++row)
+            if (a)
             {
-                for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += QA)
+                for (size_t row = 0; row < height; ++row)
                 {
-                    uint8x16x4_t _bgra = Load4<align>(bgra + offset);
-                    Store<align>(b + col, _bgra.val[0]);
-                    Store<align>(g + col, _bgra.val[1]);
-                    Store<align>(r + col, _bgra.val[2]);
-                    Store<align>(a + col, _bgra.val[3]);
+                    for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += QA)
+                    {
+                        uint8x16x4_t _bgra = Load4<align>(bgra + offset);
+                        Store<align>(b + col, _bgra.val[0]);
+                        Store<align>(g + col, _bgra.val[1]);
+                        Store<align>(r + col, _bgra.val[2]);
+                        Store<align>(a + col, _bgra.val[3]);
+                    }
+                    if (tail)
+                    {
+                        size_t col = width - A;
+                        size_t offset = 4 * col;
+                        uint8x16x4_t _bgra = Load4<false>(bgra + offset);
+                        Store<false>(b + col, _bgra.val[0]);
+                        Store<false>(g + col, _bgra.val[1]);
+                        Store<false>(r + col, _bgra.val[2]);
+                        Store<false>(a + col, _bgra.val[3]);
+                    }
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
+                    a += aStride;
                 }
-                if (tail)
+            }
+            else
+            {
+                for (size_t row = 0; row < height; ++row)
                 {
-                    size_t col = width - A;
-                    size_t offset = 4 * col;
-                    uint8x16x4_t _bgra = Load4<false>(bgra + offset);
-                    Store<false>(b + col, _bgra.val[0]);
-                    Store<false>(g + col, _bgra.val[1]);
-                    Store<false>(r + col, _bgra.val[2]);
-                    Store<false>(a + col, _bgra.val[3]);
+                    for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += QA)
+                    {
+                        uint8x16x4_t _bgra = Load4<align>(bgra + offset);
+                        Store<align>(b + col, _bgra.val[0]);
+                        Store<align>(g + col, _bgra.val[1]);
+                        Store<align>(r + col, _bgra.val[2]);
+                    }
+                    if (tail)
+                    {
+                        size_t col = width - A;
+                        size_t offset = 4 * col;
+                        uint8x16x4_t _bgra = Load4<false>(bgra + offset);
+                        Store<false>(b + col, _bgra.val[0]);
+                        Store<false>(g + col, _bgra.val[1]);
+                        Store<false>(r + col, _bgra.val[2]);
+                    }
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
                 }
-                bgra += bgraStride;
-                b += bStride;
-                g += gStride;
-                r += rStride;
-                a += aStride;
             }
         }
 
@@ -162,7 +195,7 @@ namespace Simd
             uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride)
         {
             if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) &&
-                Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride))
+                Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL))
                 DeinterleaveBgra<true>(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
             else
                 DeinterleaveBgra<false>(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
diff --git a/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp b/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp
old mode 100644
new mode 100755
index 752778be2a..1d63a6510b
--- a/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonGaussianBlur.cpp
@@ -22,6 +22,7 @@
 * SOFTWARE.
 */
 #include "Simd/SimdMemory.h"
+#include "Simd/SimdLoadBlock.h"
 #include "Simd/SimdStore.h"
 #include "Simd/SimdGaussianBlur.h"
 #include "Simd/SimdLog.h"
diff --git a/3rdparty/simdlib/Simd/SimdNeonResizer.cpp b/3rdparty/simdlib/Simd/SimdNeonResizer.cpp
old mode 100644
new mode 100755
index b2e965200e..d11a0e29a8
--- a/3rdparty/simdlib/Simd/SimdNeonResizer.cpp
+++ b/3rdparty/simdlib/Simd/SimdNeonResizer.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -578,11 +578,11 @@ namespace Simd
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
         {
             ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(float32x4_t));
-            if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && dstX >= A)
+            if (param.IsByteBilinear() && dstX >= A)
                 return new ResizerByteBilinear(param);
-            else if (type == SimdResizeChannelByte && method == SimdResizeMethodArea)
+            else if (param.IsByteArea())
                 return new ResizerByteArea(param);
-            else if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp))
+            else if (param.IsFloatBilinear())
                 return new ResizerFloatBilinear(param);
             else
                 return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
diff --git a/3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp
deleted file mode 100644
index 37b288b277..0000000000
--- a/3rdparty/simdlib/Simd/SimdNeonRgbToGray.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdConversion.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE    
-    namespace Neon
-    {
-        SIMD_INLINE uint8x8_t RgbToGray(uint8x8x3_t rgb)
-        {
-            return vmovn_u16(BgrToGray(vmovl_u8(rgb.val[2]), vmovl_u8(rgb.val[1]), vmovl_u8(rgb.val[0])));
-        }
-
-        template <bool align> void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride)
-        {
-            assert(width >= HA);
-            if (align)
-                assert(Aligned(rgb) && Aligned(rgbStride) && Aligned(gray) && Aligned(grayStride));
-
-            size_t alignedWidth = AlignLo(width, HA);
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += HA)
-                {
-                    uint8x8x3_t _rgb = LoadHalf3<align>(rgb + 3 * col);
-                    Store<align>(gray + col, RgbToGray(_rgb));
-                }
-                if (alignedWidth != width)
-                {
-                    uint8x8x3_t _rgb = LoadHalf3<false>(rgb + 3 * (width - HA));
-                    Store<false>(gray + width - HA, RgbToGray(_rgb));
-                }
-                rgb += rgbStride;
-                gray += grayStride;
-            }
-        }
-
-        void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride)
-        {
-            if (Aligned(rgb) && Aligned(gray) && Aligned(rgbStride) && Aligned(grayStride))
-                RgbToGray<true>(rgb, width, height, rgbStride, gray, grayStride);
-            else
-                RgbToGray<false>(rgb, width, height, rgbStride, gray, grayStride);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp
deleted file mode 100644
index 377d6fcb42..0000000000
--- a/3rdparty/simdlib/Simd/SimdNeonRgbaToGray.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdConversion.h"
-
-namespace Simd
-{
-#ifdef SIMD_NEON_ENABLE
-    namespace Neon
-    {
-        SIMD_INLINE uint8x8_t RgbaToGray(uint8x8x4_t rgba)
-        {
-            return vmovn_u16(BgrToGray(vmovl_u8(rgba.val[2]), vmovl_u8(rgba.val[1]), vmovl_u8(rgba.val[0])));
-        }
-
-        template <bool align> void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride)
-        {
-            assert(width >= HA);
-            if (align)
-                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride));
-
-            size_t alignedWidth = AlignLo(width, HA);
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += HA)
-                {
-                    uint8x8x4_t _rgba = LoadHalf4<align>(rgba + 4 * col);
-                    Store<align>(gray + col, RgbaToGray(_rgba));
-                }
-                if (alignedWidth != width)
-                {
-                    uint8x8x4_t _rgba = LoadHalf4<false>(rgba + 4 * (width - HA));
-                    Store<false>(gray + width - HA, RgbaToGray(_rgba));
-                }
-                rgba += rgbaStride;
-                gray += grayStride;
-            }
-        }
-
-        void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride)
-        {
-            if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride))
-                RgbaToGray<true>(rgba, width, height, rgbaStride, gray, grayStride);
-            else
-                RgbaToGray<false>(rgba, width, height, rgbaStride, gray, grayStride);
-        }
-    }
-#endif// SIMD_NEON_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdPixel.hpp b/3rdparty/simdlib/Simd/SimdPixel.hpp
old mode 100644
new mode 100755
index 109c18ec1d..f95ce46ee6
--- a/3rdparty/simdlib/Simd/SimdPixel.hpp
+++ b/3rdparty/simdlib/Simd/SimdPixel.hpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -41,6 +41,7 @@ namespace Simd
         struct Hsv24;
         struct Hsl24;
         struct Rgb24;
+        struct Rgba32;
 
         //-------------------------------------------------------------------------
 
@@ -86,6 +87,13 @@ namespace Simd
             */
             Bgr24(const Rgb24 & p);
 
+            /*!
+                Creates a new 24-bit BGR pixel structure on the base of 32-bit RGBA pixel.
+
+                \param [in] p - 32-bit RGBA pixel.
+            */
+            Bgr24(const Rgba32& p);
+
             /*!
                 Creates a copy of 24-bit BGR pixel structure.
 
@@ -165,6 +173,13 @@ namespace Simd
             */
             Bgra32(const Rgb24 & p, const uint8_t & a = uint8_t(255));
 
+            /*!
+                Creates a new 32-bit BGRA pixel structure on the base of 32-bit RGBA pixel.
+
+                \param [in] p - 32-bit RGBA pixel.
+            */
+            Bgra32(const Rgba32& p);
+
             /*!
                 Creates a copy of 32-bit BGRA pixel structure.
 
@@ -360,6 +375,13 @@ namespace Simd
             */
             Rgb24(const Bgr24 & p);
 
+            /*!
+                Creates a new 24-bit RGB pixel structure on the base of 32-bit RGBA pixel.
+
+                \param [in] p - 32-bit RGBA pixel.
+            */
+            Rgb24(const Rgba32& p);
+
             /*!
                 Creates a copy of 24-bit RGB pixel structure.
 
@@ -392,6 +414,92 @@ namespace Simd
             template <template<class> class A> static Rgb24 & At(View<A> & view, ptrdiff_t col, ptrdiff_t row);
         };
 
+        /*! @ingroup cpp_pixels
+
+            \short 32-bit RGBA pixel.
+
+            Provides manipulation of 32-bit RGBA (Red, Blue, Green, Alpha) pixels of the View struct.
+        */
+        struct Rgba32
+        {
+            uint8_t red; /*!< \brief 8-bit red channel 32-bit BGRA pixel. */
+            uint8_t green; /*!< \brief 8-bit green channel 32-bit BGRA pixel. */
+            uint8_t blue; /*!< \brief 8-bit blue channel 32-bit BGRA pixel. */
+            uint8_t alpha; /*!< \brief 8-bit alpha channel 32-bit RGBA pixel. */
+
+            /*!
+                Creates a new 32-bit RGBA pixel structure with specified channel values.
+
+                \param [in] gray - initial value for all channels. It is equal to 0 by default.
+                \param [in] a - initial value for alpha channel. It is equal to 255 by default.
+            */
+            Rgba32(const uint8_t& gray = uint8_t(0), const uint8_t& a = uint8_t(255));
+
+            /*!
+                Creates a new 32-bit RGBA pixel structure with specified channel values.
+
+                \param [in] r - initial value for red channel.
+                \param [in] g - initial value for green channel.
+                \param [in] b - initial value for blue channel.
+                \param [in] a - initial value for alpha channel. It is equal to 255 by default.
+            */
+            Rgba32(const uint8_t& r, const uint8_t& g, const uint8_t& b, const uint8_t& a = uint8_t(255));
+
+            /*!
+                Creates a new 32-bit RGBA pixel structure on the base of 32-bit BGRA pixel.
+
+                \param [in] p - 32-bit BGRA pixel.
+            */
+            Rgba32(const Bgra32& p);
+
+            /*!
+                Creates a new 32-bit RGBA pixel structure on the base of 24-bit BGR pixel.
+
+                \param [in] p - 24-bit BGR pixel.
+                \param [in] a - initial value for alpha channel. It is equal to 255 by default.
+            */
+            Rgba32(const Bgr24& p, const uint8_t& a = uint8_t(255));
+
+            /*!
+                Creates a new of 32-bit RGBA pixel structure on the base of 24-bit RGB pixel.
+
+                \param [in] p - 24-bit RGB pixel.
+                \param [in] a - initial value for alpha channel. It is equal to 255 by default.
+            */
+            Rgba32(const Rgb24& p, const uint8_t& a = uint8_t(255));
+
+            /*!
+                Creates a copy of 32-bit RGBA pixel structure.
+
+                \param [in] p - 32-bit RGBA pixel.
+            */
+            Rgba32(const Rgba32& p);
+
+            /*!
+                \fn template <template<class> class A> static const Rgba32 & At(const View<A> & view, ptrdiff_t col, ptrdiff_t row);
+
+                Gets constant reference to the pixel with specific coordinates at the image view.
+
+                \param [in] view - an image view of 32-bit RGBA pixel format.
+                \param [in] col - x-coordinate of the pixel.
+                \param [in] row - y-coordinate of the pixel.
+                \return a constant reference to the pixel.
+            */
+            template <template<class> class A> static const Rgba32& At(const View<A>& view, ptrdiff_t col, ptrdiff_t row);
+
+            /*!
+                \fn template <template<class> class A> static Rgba32 & At(View<A> & view, ptrdiff_t col, ptrdiff_t row);
+
+                Gets reference to the pixel with specific coordinates at the image view.
+
+                \param [in] view - an image view of 32-bit RGBA pixel format.
+                \param [in] col - x-coordinate of the pixel.
+                \param [in] row - y-coordinate of the pixel.
+                \return a reference to the pixel.
+            */
+            template <template<class> class A> static Rgba32& At(View<A>& view, ptrdiff_t col, ptrdiff_t row);
+        };
+
         //-------------------------------------------------------------------------
 
         // struct Bgr24 implementation:
@@ -417,14 +525,21 @@ namespace Simd
         {
         }
 
-        SIMD_INLINE Bgr24::Bgr24(const Bgr24 & p)
+        SIMD_INLINE Bgr24::Bgr24(const Rgb24 & p)
             : blue(p.blue)
             , green(p.green)
             , red(p.red)
         {
         }
 
-        SIMD_INLINE Bgr24::Bgr24(const Rgb24 & p)
+        SIMD_INLINE Bgr24::Bgr24(const Rgba32& p)
+            : blue(p.blue)
+            , green(p.green)
+            , red(p.red)
+        {
+        }
+
+        SIMD_INLINE Bgr24::Bgr24(const Bgr24 & p)
             : blue(p.blue)
             , green(p.green)
             , red(p.red)
@@ -479,6 +594,14 @@ namespace Simd
         {
         }
 
+        SIMD_INLINE Bgra32::Bgra32(const Rgba32& p)
+            : blue(p.blue)
+            , green(p.green)
+            , red(p.red)
+            , alpha(p.alpha)
+        {
+        }
+
         SIMD_INLINE Bgra32::Bgra32(const Bgra32 & p)
             : blue(p.blue)
             , green(p.green)
@@ -605,6 +728,13 @@ namespace Simd
         {
         }
 
+        SIMD_INLINE Rgb24::Rgb24(const Rgba32& p)
+            : red(p.red)
+            , green(p.green)
+            , blue(p.blue)
+        {
+        }
+
         SIMD_INLINE Rgb24::Rgb24(const Rgb24 & p)
             : red(p.red)
             , green(p.green)
@@ -625,6 +755,70 @@ namespace Simd
 
             return Simd::At<A, Rgb24>(view, col, row);
         }
+
+        // struct Rgba32 implementation:
+
+        SIMD_INLINE Rgba32::Rgba32(const uint8_t& gray, const uint8_t& a)
+            : red(gray)
+            , green(gray)
+            , blue(gray)
+            , alpha(a)
+        {
+        }
+
+        SIMD_INLINE Rgba32::Rgba32(const uint8_t& r, const uint8_t& g, const uint8_t& b, const uint8_t& a)
+            : red(r)
+            , green(g)
+            , blue(b)
+            , alpha(a)
+        {
+        }
+
+        SIMD_INLINE Rgba32::Rgba32(const Bgra32& p)
+            : red(p.red)
+            , green(p.green)
+            , blue(p.blue)
+            , alpha(p.alpha)
+        {
+        }        
+        
+        SIMD_INLINE Rgba32::Rgba32(const Bgr24& p, const uint8_t& a)
+            : red(p.red)
+            , green(p.green)
+            , blue(p.blue)
+            , alpha(a)
+        {
+        }
+
+        SIMD_INLINE Rgba32::Rgba32(const Rgb24& p, const uint8_t& a)
+            : red(p.red)
+            , green(p.green)
+            , blue(p.blue)
+            , alpha(a)
+        {
+        }
+
+        SIMD_INLINE Rgba32::Rgba32(const Rgba32& p)
+            : red(p.red)
+            , green(p.green)
+            , blue(p.blue)
+            , alpha(p.alpha)
+        {
+        }
+
+        template <template<class> class A> SIMD_INLINE const Rgba32& Rgba32::At(const View<A>& view, ptrdiff_t col, ptrdiff_t row)
+        {
+            assert(view.format == View<A>::Rgba32);
+
+            return Simd::At<A, Rgba32>(view, col, row);
+        }
+
+        template <template<class> class A> SIMD_INLINE Rgba32& Rgba32::At(View<A>& view, ptrdiff_t col, ptrdiff_t row)
+        {
+            assert(view.format == View<A>::Rgba32);
+
+            return Simd::At<A, Rgba32>(view, col, row);
+        }
     }
 }
 
diff --git a/3rdparty/simdlib/Simd/SimdPow.h b/3rdparty/simdlib/Simd/SimdPow.h
old mode 100644
new mode 100755
index 309e3104f0..ca0db18eb5
--- a/3rdparty/simdlib/Simd/SimdPow.h
+++ b/3rdparty/simdlib/Simd/SimdPow.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
diff --git a/3rdparty/simdlib/Simd/SimdResizer.h b/3rdparty/simdlib/Simd/SimdResizer.h
old mode 100644
new mode 100755
index 0a70ee0ad6..15dacfcd0c
--- a/3rdparty/simdlib/Simd/SimdResizer.h
+++ b/3rdparty/simdlib/Simd/SimdResizer.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -46,6 +46,43 @@ namespace Simd
             this->channels = channels;
             this->align = align;
         }
+
+        bool IsByteBilinear() const
+        {
+            return type == SimdResizeChannelByte && method == SimdResizeMethodBilinear;
+        }
+
+        bool IsByteArea() const
+        {
+            return type == SimdResizeChannelByte && method == SimdResizeMethodArea;
+        }
+
+        bool IsShortBilinear() const
+        {
+            return type == SimdResizeChannelShort && method == SimdResizeMethodBilinear;
+        }
+
+        bool IsFloatBilinear() const
+        {
+            return type == SimdResizeChannelFloat && 
+                (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp || method == SimdResizeMethodInferenceEngineInterp);
+        }
+
+        bool IsNearest() const
+        {
+            return method == SimdResizeMethodNearest;
+        }
+
+        size_t ChannelSize() const
+        {
+            static const size_t sizes[3] = { 1, 2, 4 };
+            return sizes[(int)type];
+        }
+
+        size_t PixelSize() const
+        {
+            return ChannelSize() * channels;
+        }
     };
 
     class Resizer : Deletable
@@ -94,13 +131,32 @@ namespace Simd
             virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
         };
 
+        class ResizerShortBilinear : public Resizer
+        {
+        protected:
+            Array32i _ix, _iy;
+            Array32f _ax, _ay, _bx[2];
+
+            void EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t* indices, float* alphas);
+
+            template<size_t N> void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+            template<size_t N> void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+
+            virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+
+        public:
+            ResizerShortBilinear(const ResParam& param);
+
+            virtual void Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride);
+        };
+
         class ResizerFloatBilinear : public Resizer
         {
         protected:
             Array32i _ix, _iy;
             Array32f _ax, _ay, _bx[2];
 
-            void EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, bool caffeInterp, int32_t * indices, float * alphas);
+            void EstimateIndexAlpha(size_t srcSize, size_t dstSize, size_t channels, int32_t * indices, float * alphas);
 
             virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride);
 
@@ -110,22 +166,23 @@ namespace Simd
             virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
         };
 
-        void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method);
-    }
-
-#ifdef SIMD_SSE_ENABLE    
-    namespace Sse
-    {
-        class ResizerFloatBilinear : public Base::ResizerFloatBilinear
+        class ResizerNearest : public Resizer
         {
-            virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride);
+            void Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride);
+            template<size_t N> void Resize(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride);
+        protected:
+            size_t _pixelSize;
+            Array32i _ix, _iy;
+
+            void EstimateIndex(size_t srcSize, size_t dstSize, size_t pixelSize, int32_t* indices);
         public:
-            ResizerFloatBilinear(const ResParam & param);
-        };
+            ResizerNearest(const ResParam& param);
 
+            virtual void Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride);
+        };        
+        
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method);
     }
-#endif //SIMD_SSE_ENABLE 
 
 #ifdef SIMD_SSE2_ENABLE    
     namespace Sse2
@@ -156,12 +213,19 @@ namespace Simd
             virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
         };
 
+        class ResizerFloatBilinear : public Base::ResizerFloatBilinear
+        {
+            virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride);
+        public:
+            ResizerFloatBilinear(const ResParam & param);
+        };
+
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method);
     }
 #endif //SIMD_SSE2_ENABLE 
 
-#ifdef SIMD_SSSE3_ENABLE    
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
     {
         class ResizerByteBilinear : public Sse2::ResizerByteBilinear
         {
@@ -183,15 +247,8 @@ namespace Simd
             ResizerByteBilinear(const ResParam & param);
 
             virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
-        };
-
-        void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method);
-    }
-#endif //SIMD_SSSE3_ENABLE 
-
-#ifdef SIMD_SSE41_ENABLE    
-    namespace Sse41
-    {
+        };        
+        
         class ResizerByteArea : public Sse2::ResizerByteArea
         {
         protected:
@@ -202,6 +259,17 @@ namespace Simd
             virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
         };
 
+        class ResizerShortBilinear : public Base::ResizerShortBilinear
+        {
+        protected:
+            template<size_t N> void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+            template<size_t N> void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+
+            virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+        public:
+            ResizerShortBilinear(const ResParam& param);
+        };
+
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method);
     }
 #endif //SIMD_SSE41_ENABLE
@@ -223,15 +291,7 @@ namespace Simd
 #ifdef SIMD_AVX2_ENABLE    
     namespace Avx2
     {
-        template <class Idx> SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t * src, const Idx & index, const uint8_t * alpha, uint8_t * dst)
-        {
-            __m256i _src = _mm256_loadu_si256((__m256i*)(src + index.src));
-            __m256i _shuffle = _mm256_loadu_si256((__m256i*)&index.shuffle);
-            __m256i _alpha = _mm256_loadu_si256((__m256i*)(alpha + index.dst));
-            _mm256_storeu_si256((__m256i*)(dst + index.dst), _mm256_maddubs_epi16(Avx2::Shuffle(_src, _shuffle), _alpha));
-        }
-
-        class ResizerByteBilinear : public Ssse3::ResizerByteBilinear
+        class ResizerByteBilinear : public Sse41::ResizerByteBilinear
         {
         protected:
             struct Idx
@@ -260,6 +320,17 @@ namespace Simd
             virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
         };
 
+        class ResizerShortBilinear : public Sse41::ResizerShortBilinear
+        {
+        protected:
+            template<size_t N> void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+            template<size_t N> void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+
+            virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+        public:
+            ResizerShortBilinear(const ResParam& param);
+        };
+
         class ResizerFloatBilinear : public Base::ResizerFloatBilinear
         {
             virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride);
@@ -308,6 +379,17 @@ namespace Simd
             virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride);
         };
 
+        class ResizerShortBilinear : public Base::ResizerShortBilinear
+        {
+        protected:
+            template<size_t N> void RunB(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+            template<size_t N> void RunS(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+
+            virtual void Run(const uint16_t* src, size_t srcStride, uint16_t* dst, size_t dstStride);
+        public:
+            ResizerShortBilinear(const ResParam& param);
+        };
+
         class ResizerFloatBilinear : public Base::ResizerFloatBilinear
         {
             virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride);
diff --git a/3rdparty/simdlib/Simd/SimdResizerCommon.h b/3rdparty/simdlib/Simd/SimdResizerCommon.h
new file mode 100755
index 0000000000..3e6ab00ffa
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdResizerCommon.h
@@ -0,0 +1,97 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdResizerCommon_h__
+#define __SimdResizerCommon_h__
+
+#include "Simd/SimdLoad.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        const __m128i RSB_1_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x4, 0x5, -1, -1, 0x8, 0x9, -1, -1, 0xC, 0xD, -1, -1);
+        const __m128i RSB_1_1 = SIMD_MM_SETR_EPI8(0x2, 0x3, -1, -1, 0x6, 0x7, -1, -1, 0xA, 0xB, -1, -1, 0xE, 0xF, -1, -1);
+
+        SIMD_INLINE __m128 BilColS1(const uint16_t* src, const int32_t* idx, __m128 fx0, __m128 fx1)
+        {
+            __m128i s = _mm_setr_epi32(
+                *(uint32_t*)(src + idx[0]), *(uint32_t*)(src + idx[1]),
+                *(uint32_t*)(src + idx[2]), *(uint32_t*)(src + idx[3]));
+            __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_1_0)));
+            __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_1_1)));
+            return _mm_add_ps(m0, m1);
+        }
+
+        const __m128i RSB_2_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x2, 0x3, -1, -1, 0x8, 0x9, -1, -1, 0xA, 0xB, -1, -1);
+        const __m128i RSB_2_1 = SIMD_MM_SETR_EPI8(0x4, 0x5, -1, -1, 0x6, 0x7, -1, -1, 0xC, 0xD, -1, -1, 0xE, 0xF, -1, -1);
+
+        SIMD_INLINE __m128 BilColS2(const uint16_t* src, const int32_t* idx, __m128 fx0, __m128 fx1)
+        {
+            __m128i s = Sse2::Load((__m128i*)(src + idx[0]), (__m128i*)(src + idx[2]));
+            __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_2_0)));
+            __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_2_1)));
+            return _mm_add_ps(m0, m1);
+        }
+
+        const __m128i RSB_3_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x2, 0x3, -1, -1, 0x4, 0x5, -1, -1, -1, -1, -1, -1);
+        const __m128i RSB_3_1 = SIMD_MM_SETR_EPI8(0x6, 0x7, -1, -1, 0x8, 0x9, -1, -1, 0xA, 0xB, -1, -1, -1, -1, -1, -1);
+
+        SIMD_INLINE __m128 BilColS3(const uint16_t* src, __m128 fx0, __m128 fx1)
+        {
+            __m128i s = _mm_loadu_si128((__m128i*)src);
+            __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_3_0)));
+            __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_3_1)));
+            return _mm_add_ps(m0, m1);
+        }
+
+        const __m128i RSB_4_0 = SIMD_MM_SETR_EPI8(0x0, 0x1, -1, -1, 0x2, 0x3, -1, -1, 0x4, 0x5, -1, -1, 0x6, 0x7, -1, -1);
+        const __m128i RSB_4_1 = SIMD_MM_SETR_EPI8(0x8, 0x9, -1, -1, 0xA, 0xB, -1, -1, 0xC, 0xD, -1, -1, 0xE, 0xF, -1, -1);
+
+        SIMD_INLINE __m128 BilColS4(const uint16_t* src, __m128 fx0, __m128 fx1)
+        {
+            __m128i s = _mm_loadu_si128((__m128i*)src);
+            __m128 m0 = _mm_mul_ps(fx0, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_4_0)));
+            __m128 m1 = _mm_mul_ps(fx1, _mm_cvtepi32_ps(_mm_shuffle_epi8(s, RSB_4_1)));
+            return _mm_add_ps(m0, m1);
+        }
+
+        const __m128i RSB_3_P = SIMD_MM_SETR_EPI8(0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, -1, -1, -1, -1);
+    }
+#endif //SIMD_SSE41_ENABLE
+
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        template <class Idx> SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t * src, const Idx & index, const uint8_t * alpha, uint8_t * dst)
+        {
+            __m256i _src = _mm256_loadu_si256((__m256i*)(src + index.src));
+            __m256i _shuffle = _mm256_loadu_si256((__m256i*)&index.shuffle);
+            __m256i _alpha = _mm256_loadu_si256((__m256i*)(alpha + index.dst));
+            _mm256_storeu_si256((__m256i*)(dst + index.dst), _mm256_maddubs_epi16(Avx2::Shuffle(_src, _shuffle), _alpha));
+        }
+    }
+#endif //SIMD_AVX2_ENABLE 
+}
+#endif//__SimdResizerCommon_h__
diff --git a/3rdparty/simdlib/Simd/SimdRuntime.h b/3rdparty/simdlib/Simd/SimdRuntime.h
old mode 100644
new mode 100755
index 5fb82ebd00..de098cdb94
--- a/3rdparty/simdlib/Simd/SimdRuntime.h
+++ b/3rdparty/simdlib/Simd/SimdRuntime.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -54,10 +54,13 @@ namespace Simd
             if (!_info.empty())
             {
                 std::sort(_candidates.begin(), _candidates.end(), [](const Candidate & a, const Candidate & b) { return a.Mean() < b.Mean(); });
-                std::cout << std::setprecision(3) << std::fixed;
                 std::cout << "Simd::Runtime " << _info << " : ";
+                int64_t f = TimeFrequency();
                 for (size_t i = 0; i < _candidates.size(); ++i)
-                    std::cout << _candidates[i].func.Name() << ": " << _candidates[i].Mean()*1000.0 << "  ";
+                {
+                    int64_t t = _candidates[i].Mean();
+                    std::cout << _candidates[i].func.Name() << ": " << t * 1000 / f << "." << (t * 1000000 / f) % 1000 << "  ";
+                }
                 std::cout << std::endl;
             }
 #endif
@@ -104,18 +107,18 @@ namespace Simd
         {
             Func func;
             size_t count;
-            double sum, min, max;
+            int64_t sum, min, max;
 
             SIMD_INLINE Candidate(const Func & f)
                 : func(f)
                 , count(0)
                 , sum(0)
-                , min(std::numeric_limits<double>::max())
-                , max(std::numeric_limits<double>::min())
+                , min(std::numeric_limits<int64_t>::max())
+                , max(0)
             {
             }
 
-            SIMD_INLINE void Update(const double & value)
+            SIMD_INLINE void Update(int64_t value)
             {
                 count += 1;
                 sum += value;
@@ -123,9 +126,14 @@ namespace Simd
                 max = std::max(max, value);
             }
 
-            SIMD_INLINE double Mean() const
+            SIMD_INLINE int64_t Mean() const
             {
-                return (sum - min - max) / (count - 2);
+                if( count > 2)
+                    return (sum - min - max) / (count - 2);
+                else if (count > 0)
+                    return sum / count;
+                else
+                    return sum;
             }
         };
         typedef std::vector<Candidate> Candidates;
@@ -144,9 +152,9 @@ namespace Simd
                 if (_info.empty())
                     _info = current->func.Info(args);
 #endif
-                double start = Simd::Time();
+                int64_t start = Simd::TimeCounter();
                 current->func.Run(args);
-                current->Update(Simd::Time() - start);
+                current->Update(Simd::TimeCounter() - start);
             }
             else
             {
@@ -173,10 +181,10 @@ namespace Simd
         SIMD_INLINE Candidate * Best()
         {
             Candidate * best = &_candidates[0];
-            double min = best->Mean();
+            int64_t min = best->Mean();
             for (size_t i = 1; i < _candidates.size(); ++i)
             {
-                double mean = _candidates[i].Mean();
+                int64_t mean = _candidates[i].Mean();
                 if (mean < min)
                 {
                     min = mean;
diff --git a/3rdparty/simdlib/Simd/SimdSet.h b/3rdparty/simdlib/Simd/SimdSet.h
old mode 100644
new mode 100755
index ae1bb6066a..22b5622e73
--- a/3rdparty/simdlib/Simd/SimdSet.h
+++ b/3rdparty/simdlib/Simd/SimdSet.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2018 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -109,6 +109,12 @@ namespace Simd
             const float a[4] = { a0, a1, a2, a3 };
             return vld1q_f32(a);
         }
+
+        SIMD_INLINE int32x4_t SetI32(int32_t a0, int32_t a1, int32_t a2, int32_t a3)
+        {
+            const int32_t a[4] = { a0, a1, a2, a3 };
+            return vld1q_s32(a);
+        }
     }
 #endif// SIMD_NEON_ENABLE
 }
diff --git a/3rdparty/simdlib/Simd/SimdSse1Resizer.cpp b/3rdparty/simdlib/Simd/SimdSse1Resizer.cpp
deleted file mode 100644
index 405ee03f4f..0000000000
--- a/3rdparty/simdlib/Simd/SimdSse1Resizer.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdResizer.h"
-#include "Simd/SimdStore.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSE_ENABLE 
-    namespace Sse
-    {
-        ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param)
-            : Base::ResizerFloatBilinear(param)
-        {
-        }
-
-        void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride)
-        {
-            size_t cn = _param.channels;
-            size_t rs = _param.dstW * cn;
-            float * pbx[2] = { _bx[0].data, _bx[1].data };
-            int32_t prev = -2;
-            size_t rsa = AlignLo(rs, Sse::F);
-            for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
-            {
-                float fy1 = _ay[dy];
-                float fy0 = 1.0f - fy1;
-                int32_t sy = _iy[dy];
-                int32_t k = 0;
-
-                if (sy == prev)
-                    k = 2;
-                else if (sy == prev + 1)
-                {
-                    Swap(pbx[0], pbx[1]);
-                    k = 1;
-                }
-
-                prev = sy;
-
-                for (; k < 2; k++)
-                {
-                    float * pb = pbx[k];
-                    const float * ps = src + (sy + k)*srcStride;
-                    size_t dx = 0;
-                    if (cn == 1)
-                    {
-                        __m128 _1 = _mm_set1_ps(1.0f);
-                        for (; dx < rsa; dx += Sse::F)
-                        {
-                            __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]);
-                            __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); 
-                            __m128 fx1 = _mm_load_ps(_ax.data + dx);
-                            __m128 fx0 = _mm_sub_ps(_1, fx1);
-                            __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88));
-                            __m128 m1 = _mm_mul_ps(fx1, _mm_shuffle_ps(s01, s23, 0xDD));
-                            _mm_store_ps(pb + dx, _mm_add_ps(m0, m1));
-                        }
-                    }
-                    if (cn == 3 && rs > 3)
-                    {
-                        __m128 _1 = _mm_set1_ps(1.0f);
-                        size_t rs3 = rs - 3;
-                        for (; dx < rs3; dx += 3)
-                        {
-                            __m128 s0 = _mm_loadu_ps(ps + _ix[dx] + 0);
-                            __m128 s1 = _mm_loadu_ps(ps + _ix[dx] + 3);
-                            __m128 fx1 = _mm_set1_ps(_ax.data[dx]);
-                            __m128 fx0 = _mm_sub_ps(_1, fx1);
-                            _mm_storeu_ps(pb + dx, _mm_add_ps(_mm_mul_ps(fx0, s0), _mm_mul_ps(fx1, s1)));
-                        }
-                    }
-                    for (; dx < rs; dx++)
-                    {
-                        int32_t sx = _ix[dx];
-                        float fx = _ax[dx];
-                        pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + cn] * fx;
-                    }
-                }  
-
-                size_t dx = 0;
-                __m128 _fy0 = _mm_set1_ps(fy0);
-                __m128 _fy1 = _mm_set1_ps(fy1);
-                for (; dx < rsa; dx += Sse::F)
-                {
-                    __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _fy0);
-                    __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _fy1);
-                    _mm_storeu_ps(dst + dx, _mm_add_ps(m0, m1));
-                }
-                for (; dx < rs; dx++)
-                    dst[dx] = pbx[0][dx] * fy0 + pbx[1][dx] * fy1;
-            }
-        }
-
-        //---------------------------------------------------------------------
-
-        void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
-        {
-            ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m128));
-            if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp))
-                return new ResizerFloatBilinear(param);
-            else
-                return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
-        }
-    }
-#endif //SIMD_SSE_ENABLE 
-}
-
diff --git a/3rdparty/simdlib/Simd/SimdSse2.h b/3rdparty/simdlib/Simd/SimdSse2.h
old mode 100644
new mode 100755
index ce304774f5..66a0d22700
--- a/3rdparty/simdlib/Simd/SimdSse2.h
+++ b/3rdparty/simdlib/Simd/SimdSse2.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -33,15 +33,11 @@ namespace Simd
     {
         void BgraToGray(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * gray, size_t grayStride);
 
-        void RgbaToGray(const uint8_t * rgba, size_t width, size_t height, size_t rgbaStride, uint8_t * gray, size_t grayStride);
-
         void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
             const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
 
         void BgrToGray(const uint8_t *bgr, size_t width, size_t height, size_t bgrStride, uint8_t *gray, size_t grayStride);
 
-        void RgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride);
-
         void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height,
             size_t channelCount, uint8_t * dst, size_t dstStride);
 
@@ -68,6 +64,8 @@ namespace Simd
         void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
 
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride);
+
         void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
             uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
 
diff --git a/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp b/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp
old mode 100644
new mode 100755
index c150220b82..b818225858
--- a/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse2BgraToGray.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -88,6 +88,58 @@ namespace Simd
             else
                 BgraToGray<false>(bgra, width, height, bgraStride, gray, grayStride);
         }
+
+        //---------------------------------------------------------------------
+
+        const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
+
+        SIMD_INLINE __m128i RgbaToGray32(__m128i rgba)
+        {
+            const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF);
+            const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF);
+            const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_0000), _mm_madd_epi16(r0b0, K16_RED_BLUE));
+            return _mm_srli_epi32(_mm_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT);
+        }
+
+        SIMD_INLINE __m128i RgbaToGray(__m128i rgba[4])
+        {
+            const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
+            const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
+            return _mm_packus_epi16(lo, hi);
+        }
+
+        template <bool align> void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            __m128i a[4];
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                {
+                    Load<align>(rgba + 4 * col, a);
+                    Store<align>((__m128i*)(gray + col), RgbaToGray(a));
+                }
+                if (alignedWidth != width)
+                {
+                    Load<false>(rgba + 4 * (width - A), a);
+                    Store<false>((__m128i*)(gray + width - A), RgbaToGray(a));
+                }
+                rgba += rgbaStride;
+                gray += grayStride;
+            }
+        }
+
+        void RgbaToGray(const uint8_t* rgba, size_t width, size_t height, size_t rgbaStride, uint8_t* gray, size_t grayStride)
+        {
+            if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride))
+                RgbaToGray<true>(rgba, width, height, rgbaStride, gray, grayStride);
+            else
+                RgbaToGray<false>(rgba, width, height, rgbaStride, gray, grayStride);
+        }
     }
 #else
     // Work arround to avoid warning: libvisp_simdlib.a(SimdSse2BgraToGray.cpp.o) has no symbols
diff --git a/3rdparty/simdlib/Simd/SimdBaseBgraToRgba.cpp b/3rdparty/simdlib/Simd/SimdSse2Cpu.cpp
similarity index 62%
rename from 3rdparty/simdlib/Simd/SimdBaseBgraToRgba.cpp
rename to 3rdparty/simdlib/Simd/SimdSse2Cpu.cpp
index 8ada2f6a2c..3d1dfe00fb 100644
--- a/3rdparty/simdlib/Simd/SimdBaseBgraToRgba.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse2Cpu.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,30 +21,44 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "Simd/SimdDefs.h"
-#include <algorithm>
+#include "Simd/SimdEnable.h"
+#include "Simd/SimdCpu.h"
+
+#if defined(_MSC_VER)
+#include <windows.h>
+#endif
 
 namespace Simd
 {
-    namespace Base
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
-        void BgraToRgba(const uint8_t *bgra, size_t size, uint8_t *rgba)
+        SIMD_INLINE bool SupportedByCPU()
         {
-            for (size_t i = 0; i < size; ++i, bgra += 4, rgba += 4)
-            {
-                *(int32_t*)rgba = (*(int32_t*)bgra);
-                std::swap(rgba[0], rgba[2]);
-            }
+            return Base::CheckBit(Cpuid::Ordinary, Cpuid::Edx, Cpuid::SSE2);
         }
 
-        void BgraToRgba(const uint8_t *bgra, size_t width, size_t height, size_t bgraStride, uint8_t *rgba, size_t rgbaStride)
+        SIMD_INLINE bool SupportedByOS()
         {
-            for (size_t row = 0; row < height; ++row)
+#if defined(_MSC_VER)
+            __try
+            {
+                __m128d value = _mm_set1_pd(1.0);// try to execute of SSE2 instructions;
+                return true;
+            }
+            __except (EXCEPTION_EXECUTE_HANDLER)
             {
-                BgraToRgba(bgra, width, rgba);
-                bgra += bgraStride;
-                rgba += rgbaStride;
+                return false;
             }
+#else
+            return true;
+#endif
+        }
+
+        bool GetEnable()
+        {
+            return SupportedByCPU() && SupportedByOS();
         }
     }
+#endif
 }
diff --git a/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp b/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp
old mode 100644
new mode 100755
index 394488a804..70e4f139ea
--- a/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse2GaussianBlur3x3.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -22,6 +22,7 @@
 * SOFTWARE.
 */
 #include "Simd/SimdMemory.h"
+#include "Simd/SimdLoadBlock.h"
 #include "Simd/SimdStore.h"
 
 namespace Simd
diff --git a/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp b/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp
old mode 100644
new mode 100755
index f29d96eeb1..c289ab7f75
--- a/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse2Resizer.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -297,12 +297,12 @@ namespace Simd
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
         {
             ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m128i));
-            if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && (channels == 1 || channels == 2) && dstX >= A)
+            if (param.IsByteBilinear() && (channels == 1 || channels == 2) && dstX >= A)
                 return new ResizerByteBilinear(param);
-            else if (type == SimdResizeChannelByte && method == SimdResizeMethodArea)
+            else if (param.IsByteArea())
                 return new ResizerByteArea(param);
             else
-                return Sse::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
+                return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
         }
     }
 #else
diff --git a/3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp b/3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp
deleted file mode 100644
index 927dde0dae..0000000000
--- a/3rdparty/simdlib/Simd/SimdSse2RgbToGray.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdBase.h"
-#include "Simd/SimdSse2.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
-        namespace
-        {
-            struct Buffer
-            {
-                Buffer(size_t width)
-                {
-                    _p = Allocate(sizeof(uint8_t) * 4 * width);
-                    rgba = (uint8_t*)_p;
-                }
-
-                ~Buffer()
-                {
-                    Free(_p);
-                }
-
-                uint8_t * rgba;
-            private:
-                void *_p;
-            };
-        }
-
-        void RgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride)
-        {
-            assert(width >= A);
-
-            Buffer buffer(width);
-
-            for (size_t row = 1; row < height; ++row)
-            {
-                Base::BgrToBgra(rgb, width, buffer.rgba, false, false, 0xFF);
-                Sse2::RgbaToGray(buffer.rgba, width, 1, 4 * width, gray, width);
-                rgb += rgbStride;
-                gray += grayStride;
-            }
-            Base::BgrToBgra(rgb, width, buffer.rgba, false, true, 0xFF);
-            Sse2::RgbaToGray(buffer.rgba, width, 1, 4 * width, gray, width);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSse2RgbToGray.cpp.o) has no symbols
-    void dummy_SimdSse2RgbToGray(){};
-#endif//SIMD_SSE2_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp b/3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp
deleted file mode 100644
index 884f09924b..0000000000
--- a/3rdparty/simdlib/Simd/SimdSse2RgbaToGray.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
-        const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
-        const __m128i K16_GREEN_0000 = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, 0x0000);
-        const __m128i K32_ROUND_TERM = SIMD_MM_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM);
-
-        SIMD_INLINE __m128i RgbaToGray32(__m128i rgba)
-        {
-            const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF);
-            const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF);
-            const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_0000), _mm_madd_epi16(r0b0, K16_RED_BLUE));
-            return _mm_srli_epi32(_mm_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT);
-        }
-
-        SIMD_INLINE __m128i RgbaToGray(__m128i rgba[4])
-        {
-            const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
-            const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
-            return _mm_packus_epi16(lo, hi);
-        }
-
-        template <bool align> SIMD_INLINE void Load(const uint8_t* p, __m128i a[4])
-        {
-            a[0] = Load<align>((__m128i*)p + 0);
-            a[1] = Load<align>((__m128i*)p + 1);
-            a[2] = Load<align>((__m128i*)p + 2);
-            a[3] = Load<align>((__m128i*)p + 3);
-        }
-
-        template <bool align> void RgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(gray) && Aligned(grayStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-            __m128i a[4];
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                {
-                    Load<align>(rgba + 4 * col, a);
-                    Store<align>((__m128i*)(gray + col), RgbaToGray(a));
-                }
-                if (alignedWidth != width)
-                {
-                    Load<false>(rgba + 4 * (width - A), a);
-                    Store<false>((__m128i*)(gray + width - A), RgbaToGray(a));
-                }
-                rgba += rgbaStride;
-                gray += grayStride;
-            }
-        }
-
-        void RgbaToGray(const uint8_t *rgba, size_t width, size_t height, size_t rgbaStride, uint8_t *gray, size_t grayStride)
-        {
-            if (Aligned(rgba) && Aligned(gray) && Aligned(rgbaStride) && Aligned(grayStride))
-                RgbaToGray<true>(rgba, width, height, rgbaStride, gray, grayStride);
-            else
-                RgbaToGray<false>(rgba, width, height, rgbaStride, gray, grayStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSse2RgbaToGray.cpp.o) has no symbols
-    void dummy_SimdSse2RgbaToGray(){};
-#endif// SIMD_SSE2_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdSse41.h b/3rdparty/simdlib/Simd/SimdSse41.h
new file mode 100755
index 0000000000..958fc11bc5
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41.h
@@ -0,0 +1,76 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdSse41_h__
+#define __SimdSse41_h__
+
+#include "Simd/SimdDefs.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
+    {
+        void BgraToBgr(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* bgr, size_t bgrStride);
+
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride);
+
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride);
+
+        void BgrToBgra(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+
+        void BgrToGray(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* gray, size_t grayStride);
+
+        void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride);
+
+        void DeinterleaveBgr(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height, uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride);
+
+        void DeinterleaveBgra(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height, uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride);
+
+        void GaussianBlur3x3(const uint8_t* src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t* dst, size_t dstStride);
+
+        void GrayToBgr(const uint8_t* gray, size_t width, size_t height, size_t grayStride, uint8_t* bgr, size_t bgrStride);
+
+        void InterleaveBgr(const uint8_t* b, size_t bStride, const uint8_t* g, size_t gStride, const uint8_t* r, size_t rStride, size_t width, size_t height, uint8_t* bgr, size_t bgrStride);
+
+        void InterleaveBgra(const uint8_t* b, size_t bStride, const uint8_t* g, size_t gStride, const uint8_t* r, size_t rStride, const uint8_t* a, size_t aStride, size_t width, size_t height, uint8_t* bgra, size_t bgraStride);
+
+        void ReduceColor2x2(const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
+
+        void ReduceGray2x2(const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
+
+        void ReduceGray4x4(const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
+
+        void ResizeBilinear(const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
+
+        void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride);
+    }
+#endif// SIMD_SSE41_ENABLE
+}
+#endif//__SimdSse41_h__
diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToBgra.cpp b/3rdparty/simdlib/Simd/SimdSse41BgrToBgra.cpp
old mode 100644
new mode 100755
similarity index 57%
rename from 3rdparty/simdlib/Simd/SimdSsse3BgrToBgra.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41BgrToBgra.cpp
index 2c7f277758..65787e1a45
--- a/3rdparty/simdlib/Simd/SimdSsse3BgrToBgra.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41BgrToBgra.cpp
@@ -1,74 +1,111 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        template <bool align> SIMD_INLINE void BgrToBgra(const uint8_t * bgr, uint8_t * bgra, __m128i alpha, __m128i shuffle)
-        {
-            Store<align>((__m128i*)bgra + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<align>((__m128i*)(bgr + 0)), shuffle)));
-            Store<align>((__m128i*)bgra + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 12)), shuffle)));
-            Store<align>((__m128i*)bgra + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 24)), shuffle)));
-            Store<align>((__m128i*)bgra + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(bgr + 32)), 4), shuffle)));
-        }
-
-        template <bool align> void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3);
-            __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    BgrToBgra<align>(bgr + 3 * col, bgra + 4 * col, _alpha, _shuffle);
-                if (width != alignedWidth)
-                    BgrToBgra<false>(bgr + 3 * (width - A), bgra + 4 * (width - A), _alpha, _shuffle);
-                bgr += bgrStride;
-                bgra += bgraStride;
-            }
-        }
-
-        void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
-        {
-            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgrToBgra<true>(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
-            else
-                BgrToBgra<false>(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToBgra.cpp.o) has no symbols
-    void dummy_SimdSsse3BgrToBgra(){};
-#endif// SIMD_SSSE3_ENABLE
-}
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdStore.h"
+#include "Simd/SimdMemory.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE  
+    namespace Sse41
+    {
+        template <bool align> SIMD_INLINE void BgrToBgra(const uint8_t * bgr, uint8_t * bgra, __m128i alpha, __m128i shuffle)
+        {
+            Store<align>((__m128i*)bgra + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<align>((__m128i*)(bgr + 0)), shuffle)));
+            Store<align>((__m128i*)bgra + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 12)), shuffle)));
+            Store<align>((__m128i*)bgra + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 24)), shuffle)));
+            Store<align>((__m128i*)bgra + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(bgr + 32)), 4), shuffle)));
+        }
+
+        template <bool align> void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+
+            __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3);
+            __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    BgrToBgra<align>(bgr + 3 * col, bgra + 4 * col, _alpha, _shuffle);
+                if (width != alignedWidth)
+                    BgrToBgra<false>(bgr + 3 * (width - A), bgra + 4 * (width - A), _alpha, _shuffle);
+                bgr += bgrStride;
+                bgra += bgraStride;
+            }
+        }
+
+        void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride))
+                BgrToBgra<true>(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
+            else
+                BgrToBgra<false>(bgr, width, height, bgrStride, bgra, bgraStride, alpha);
+        }
+
+        //---------------------------------------------------------------------
+
+        template <bool align> SIMD_INLINE void RgbToBgra(const uint8_t* rgb, uint8_t* bgra, __m128i alpha, __m128i shuffle)
+        {
+            Store<align>((__m128i*)bgra + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<align>((__m128i*)(rgb + 0)), shuffle)));
+            Store<align>((__m128i*)bgra + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(rgb + 12)), shuffle)));
+            Store<align>((__m128i*)bgra + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(rgb + 24)), shuffle)));
+            Store<align>((__m128i*)bgra + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(rgb + 32)), 4), shuffle)));
+        }
+
+        template <bool align> void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+
+            __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3);
+            __m128i _shuffle = _mm_setr_epi8(0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    RgbToBgra<align>(rgb + 3 * col, bgra + 4 * col, _alpha, _shuffle);
+                if (width != alignedWidth)
+                    RgbToBgra<false>(rgb + 3 * (width - A), bgra + 4 * (width - A), _alpha, _shuffle);
+                 rgb += rgbStride;
+                bgra += bgraStride;
+            }
+        }
+
+        void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride))
+                RgbToBgra<true>(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+            else
+                RgbToBgra<false>(rgb, width, height, rgbStride, bgra, bgraStride, alpha);
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToGray.cpp b/3rdparty/simdlib/Simd/SimdSse41BgrToGray.cpp
old mode 100644
new mode 100755
similarity index 56%
rename from 3rdparty/simdlib/Simd/SimdSsse3BgrToGray.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41BgrToGray.cpp
index 224a87bbce..b089e35631
--- a/3rdparty/simdlib/Simd/SimdSsse3BgrToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41BgrToGray.cpp
@@ -1,93 +1,148 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        const __m128i K16_BLUE_RED = SIMD_MM_SET2_EPI16(Base::BLUE_TO_GRAY_WEIGHT, Base::RED_TO_GRAY_WEIGHT);
-        const __m128i K16_GREEN_ROUND = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM);
-
-        SIMD_INLINE __m128i BgraToGray32(__m128i bgra)
-        {
-            const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(bgra, 1), K16_00FF);
-            const __m128i b0r0 = _mm_and_si128(bgra, K16_00FF);
-            const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(b0r0, K16_BLUE_RED));
-            return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT);
-        }
-
-        SIMD_INLINE __m128i BgraToGray(__m128i bgra[4])
-        {
-            const __m128i lo = _mm_packs_epi32(BgraToGray32(bgra[0]), BgraToGray32(bgra[1]));
-            const __m128i hi = _mm_packs_epi32(BgraToGray32(bgra[2]), BgraToGray32(bgra[3]));
-            return _mm_packus_epi16(lo, hi);
-        }
-
-        template <bool align> SIMD_INLINE __m128i BgrToGray(const uint8_t * bgr, __m128i shuffle)
-        {
-            __m128i bgra[4];
-            bgra[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<align>((__m128i*)(bgr + 0)), shuffle));
-            bgra[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 12)), shuffle));
-            bgra[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 24)), shuffle));
-            bgra[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(bgr + 32)), 4), shuffle));
-            return BgraToGray(bgra);
-        }
-
-        template <bool align> void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    Store<align>((__m128i*)(gray + col), BgrToGray<align>(bgr + 3 * col, _shuffle));
-                if (width != alignedWidth)
-                    Store<false>((__m128i*)(gray + width - A), BgrToGray<false>(bgr + 3 * (width - A), _shuffle));
-                bgr += bgrStride;
-                gray += grayStride;
-            }
-        }
-
-        void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride)
-        {
-            if (Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgrToGray<true>(bgr, width, height, bgrStride, gray, grayStride);
-            else
-                BgrToGray<false>(bgr, width, height, bgrStride, gray, grayStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToGray.cpp.o) has no symbols
-    void dummy_SimdSsse3BgrToGray(){};
-#endif// SIMD_SSSE3_ENABLE
-}
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdStore.h"
+#include "Simd/SimdMemory.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE  
+    namespace Sse41
+    {
+        const __m128i K16_BLUE_RED = SIMD_MM_SET2_EPI16(Base::BLUE_TO_GRAY_WEIGHT, Base::RED_TO_GRAY_WEIGHT);
+        const __m128i K16_GREEN_ROUND = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM);
+
+        SIMD_INLINE __m128i BgraToGray32(__m128i bgra)
+        {
+            const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(bgra, 1), K16_00FF);
+            const __m128i b0r0 = _mm_and_si128(bgra, K16_00FF);
+            const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(b0r0, K16_BLUE_RED));
+            return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT);
+        }
+
+        SIMD_INLINE __m128i BgraToGray(__m128i bgra[4])
+        {
+            const __m128i lo = _mm_packs_epi32(BgraToGray32(bgra[0]), BgraToGray32(bgra[1]));
+            const __m128i hi = _mm_packs_epi32(BgraToGray32(bgra[2]), BgraToGray32(bgra[3]));
+            return _mm_packus_epi16(lo, hi);
+        }
+
+        template <bool align> SIMD_INLINE __m128i BgrToGray(const uint8_t * bgr, __m128i shuffle)
+        {
+            __m128i bgra[4];
+            bgra[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<align>((__m128i*)(bgr + 0)), shuffle));
+            bgra[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 12)), shuffle));
+            bgra[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 24)), shuffle));
+            bgra[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(bgr + 32)), 4), shuffle));
+            return BgraToGray(bgra);
+        }
+
+        template <bool align> void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+
+            __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    Store<align>((__m128i*)(gray + col), BgrToGray<align>(bgr + 3 * col, _shuffle));
+                if (width != alignedWidth)
+                    Store<false>((__m128i*)(gray + width - A), BgrToGray<false>(bgr + 3 * (width - A), _shuffle));
+                bgr += bgrStride;
+                gray += grayStride;
+            }
+        }
+
+        void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride)
+        {
+            if (Aligned(gray) && Aligned(grayStride) && Aligned(bgr) && Aligned(bgrStride))
+                BgrToGray<true>(bgr, width, height, bgrStride, gray, grayStride);
+            else
+                BgrToGray<false>(bgr, width, height, bgrStride, gray, grayStride);
+        }
+
+        //---------------------------------------------------------------------
+
+        const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
+
+        SIMD_INLINE __m128i RgbaToGray32(__m128i rgba)
+        {
+            const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF);
+            const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF);
+            const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(r0b0, K16_RED_BLUE));
+            return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT);
+        }
+
+        SIMD_INLINE __m128i RgbaToGray(__m128i rgba[4])
+        {
+            const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
+            const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
+            return _mm_packus_epi16(lo, hi);
+        }
+
+        template <bool align> SIMD_INLINE __m128i RgbToGray(const uint8_t* rgb, __m128i shuffle)
+        {
+            __m128i rgba[4];
+            rgba[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<align>((__m128i*)(rgb + 0)), shuffle));
+            rgba[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(rgb + 12)), shuffle));
+            rgba[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(rgb + 24)), shuffle));
+            rgba[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(rgb + 32)), 4), shuffle));
+            return RgbaToGray(rgba);
+        }
+
+        template <bool align> void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+
+            __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    Store<align>((__m128i*)(gray + col), RgbToGray<align>(rgb + 3 * col, _shuffle));
+                if (width != alignedWidth)
+                    Store<false>((__m128i*)(gray + width - A), RgbToGray<false>(rgb + 3 * (width - A), _shuffle));
+                rgb += rgbStride;
+                gray += grayStride;
+            }
+        }
+
+        void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride)
+        {
+            if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride))
+                RgbToGray<true>(rgb, width, height, rgbStride, gray, grayStride);
+            else
+                RgbToGray<false>(rgb, width, height, rgbStride, gray, grayStride);
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToRgb.cpp b/3rdparty/simdlib/Simd/SimdSse41BgrToRgb.cpp
old mode 100644
new mode 100755
similarity index 84%
rename from 3rdparty/simdlib/Simd/SimdSsse3BgrToRgb.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41BgrToRgb.cpp
index 0f74b41b91..14a351a5c9
--- a/3rdparty/simdlib/Simd/SimdSsse3BgrToRgb.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41BgrToRgb.cpp
@@ -1,83 +1,80 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        const __m128i K8_CVT_00 = SIMD_MM_SETR_EPI8(0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1);
-        const __m128i K8_CVT_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1);
-        const __m128i K8_CVT_10 = SIMD_MM_SETR_EPI8(-1, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-        const __m128i K8_CVT_11 = SIMD_MM_SETR_EPI8(0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF);
-        const __m128i K8_CVT_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, -1);
-        const __m128i K8_CVT_21 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-        const __m128i K8_CVT_22 = SIMD_MM_SETR_EPI8(-1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD);
-
-        template <bool align> SIMD_INLINE void BgrToRgb(const uint8_t * src, uint8_t * dst)
-        {
-            __m128i s0 = Load<align>((__m128i*)src + 0);
-            __m128i s1 = Load<align>((__m128i*)src + 1);
-            __m128i s2 = Load<align>((__m128i*)src + 2);
-            Store<align>((__m128i*)dst + 0, _mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_00), _mm_shuffle_epi8(s1, K8_CVT_01)));
-            Store<align>((__m128i*)dst + 1, _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_10), _mm_shuffle_epi8(s1, K8_CVT_11)), _mm_shuffle_epi8(s2, K8_CVT_12)));
-            Store<align>((__m128i*)dst + 2, _mm_or_si128(_mm_shuffle_epi8(s1, K8_CVT_21), _mm_shuffle_epi8(s2, K8_CVT_22)));
-        }
-
-        template <bool align> void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride));
-
-            const size_t A3 = A * 3;
-            size_t size = width * 3;
-            size_t aligned = AlignLo(width, A) * 3;
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t i = 0; i < aligned; i += A3)
-                    BgrToRgb<align>(bgr + i, rgb + i);
-                if (aligned < size)
-                    BgrToRgb<false>(bgr + size - A3, rgb + size - A3);
-                bgr += bgrStride;
-                rgb += rgbStride;
-            }
-        }
-
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride)
-        {
-            if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride))
-                BgrToRgb<true>(bgr, bgrStride, width, height, rgb, rgbStride);
-            else
-                BgrToRgb<false>(bgr, bgrStride, width, height, rgb, rgbStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToRgb.cpp.o) has no symbols
-    void dummy_SimdSsse3BgrToRgb(){};
-#endif//SIMD_SSSE3_ENABLE
-}
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdStore.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        const __m128i K8_CVT_00 = SIMD_MM_SETR_EPI8(0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1);
+        const __m128i K8_CVT_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1);
+        const __m128i K8_CVT_10 = SIMD_MM_SETR_EPI8(-1, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        const __m128i K8_CVT_11 = SIMD_MM_SETR_EPI8(0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF);
+        const __m128i K8_CVT_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, -1);
+        const __m128i K8_CVT_21 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        const __m128i K8_CVT_22 = SIMD_MM_SETR_EPI8(-1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD);
+
+        template <bool align> SIMD_INLINE void BgrToRgb(const uint8_t * src, uint8_t * dst)
+        {
+            __m128i s0 = Load<align>((__m128i*)src + 0);
+            __m128i s1 = Load<align>((__m128i*)src + 1);
+            __m128i s2 = Load<align>((__m128i*)src + 2);
+            Store<align>((__m128i*)dst + 0, _mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_00), _mm_shuffle_epi8(s1, K8_CVT_01)));
+            Store<align>((__m128i*)dst + 1, _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s0, K8_CVT_10), _mm_shuffle_epi8(s1, K8_CVT_11)), _mm_shuffle_epi8(s2, K8_CVT_12)));
+            Store<align>((__m128i*)dst + 2, _mm_or_si128(_mm_shuffle_epi8(s1, K8_CVT_21), _mm_shuffle_epi8(s2, K8_CVT_22)));
+        }
+
+        template <bool align> void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            const size_t A3 = A * 3;
+            size_t size = width * 3;
+            size_t aligned = AlignLo(width, A) * 3;
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t i = 0; i < aligned; i += A3)
+                    BgrToRgb<align>(bgr + i, rgb + i);
+                if (aligned < size)
+                    BgrToRgb<false>(bgr + size - A3, rgb + size - A3);
+                bgr += bgrStride;
+                rgb += rgbStride;
+            }
+        }
+
+        void BgrToRgb(const uint8_t* bgr, size_t width, size_t height, size_t bgrStride, uint8_t* rgb, size_t rgbStride)
+        {
+            if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride))
+                BgrToRgb<true>(bgr, width, height, bgrStride, rgb, rgbStride);
+            else
+                BgrToRgb<false>(bgr, width, height, bgrStride, rgb, rgbStride);
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgraToBgr.cpp b/3rdparty/simdlib/Simd/SimdSse41BgraToBgr.cpp
old mode 100644
new mode 100755
similarity index 53%
rename from 3rdparty/simdlib/Simd/SimdSsse3BgraToBgr.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41BgraToBgr.cpp
index ccf4c51c97..a3000972e6
--- a/3rdparty/simdlib/Simd/SimdSsse3BgraToBgr.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41BgraToBgr.cpp
@@ -1,92 +1,165 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        template <bool align> SIMD_INLINE void BgraToBgrBody(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2])
-        {
-            Store<align>((__m128i*)(bgr + 0), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 0), k[0][0]));
-            Store<false>((__m128i*)(bgr + 12), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 1), k[0][0]));
-            Store<false>((__m128i*)(bgr + 24), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 2), k[0][0]));
-            Store<false>((__m128i*)(bgr + 36), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 3), k[0][0]));
-        }
-
-        template <bool align> SIMD_INLINE void BgraToBgr(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2])
-        {
-            __m128i bgra0 = Load<align>((__m128i*)bgra + 0);
-            __m128i bgra1 = Load<align>((__m128i*)bgra + 1);
-            __m128i bgra2 = Load<align>((__m128i*)bgra + 2);
-            __m128i bgra3 = Load<align>((__m128i*)bgra + 3);
-            Store<align>((__m128i*)bgr + 0, _mm_or_si128(_mm_shuffle_epi8(bgra0, k[0][0]), _mm_shuffle_epi8(bgra1, k[0][1])));
-            Store<align>((__m128i*)bgr + 1, _mm_or_si128(_mm_shuffle_epi8(bgra1, k[1][0]), _mm_shuffle_epi8(bgra2, k[1][1])));
-            Store<align>((__m128i*)bgr + 2, _mm_or_si128(_mm_shuffle_epi8(bgra2, k[2][0]), _mm_shuffle_epi8(bgra3, k[2][1])));
-        }
-
-        template <bool align> void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-            if (width == alignedWidth)
-                alignedWidth -= A;
-
-            __m128i k[3][2];
-            k[0][0] = _mm_setr_epi8(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1);
-            k[0][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4);
-            k[1][0] = _mm_setr_epi8(0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1);
-            k[1][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9);
-            k[2][0] = _mm_setr_epi8(0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-            k[2][1] = _mm_setr_epi8(-1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    BgraToBgrBody<align>(bgra + 4 * col, bgr + 3 * col, k);
-                if (width != alignedWidth)
-                    BgraToBgr<false>(bgra + 4 * (width - A), bgr + 3 * (width - A), k);
-                bgra += bgraStride;
-                bgr += bgrStride;
-            }
-        }
-
-        void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
-        {
-            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgraToBgr<true>(bgra, width, height, bgraStride, bgr, bgrStride);
-            else
-                BgraToBgr<false>(bgra, width, height, bgraStride, bgr, bgrStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgraToBgr.cpp.o) has no symbols
-    void dummy_SimdSsse3BgraToBgr(){};
-#endif// SIMD_SSSE3_ENABLE
-}
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdStore.h"
+#include "Simd/SimdMemory.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE  
+    namespace Sse41
+    {
+        template <bool align> SIMD_INLINE void BgraToBgrBody(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2])
+        {
+            Store<align>((__m128i*)(bgr + 0), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 0), k[0][0]));
+            Store<false>((__m128i*)(bgr + 12), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 1), k[0][0]));
+            Store<false>((__m128i*)(bgr + 24), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 2), k[0][0]));
+            Store<false>((__m128i*)(bgr + 36), _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 3), k[0][0]));
+        }
+
+        template <bool align> SIMD_INLINE void BgraToBgr(const uint8_t * bgra, uint8_t * bgr, __m128i k[3][2])
+        {
+            __m128i bgra0 = Load<align>((__m128i*)bgra + 0);
+            __m128i bgra1 = Load<align>((__m128i*)bgra + 1);
+            __m128i bgra2 = Load<align>((__m128i*)bgra + 2);
+            __m128i bgra3 = Load<align>((__m128i*)bgra + 3);
+            Store<align>((__m128i*)bgr + 0, _mm_or_si128(_mm_shuffle_epi8(bgra0, k[0][0]), _mm_shuffle_epi8(bgra1, k[0][1])));
+            Store<align>((__m128i*)bgr + 1, _mm_or_si128(_mm_shuffle_epi8(bgra1, k[1][0]), _mm_shuffle_epi8(bgra2, k[1][1])));
+            Store<align>((__m128i*)bgr + 2, _mm_or_si128(_mm_shuffle_epi8(bgra2, k[2][0]), _mm_shuffle_epi8(bgra3, k[2][1])));
+        }
+
+        template <bool align> void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            if (width == alignedWidth)
+                alignedWidth -= A;
+
+            __m128i k[3][2];
+            k[0][0] = _mm_setr_epi8(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1);
+            k[0][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4);
+            k[1][0] = _mm_setr_epi8(0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1);
+            k[1][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9);
+            k[2][0] = _mm_setr_epi8(0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+            k[2][1] = _mm_setr_epi8(-1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    BgraToBgrBody<align>(bgra + 4 * col, bgr + 3 * col, k);
+                if (width != alignedWidth)
+                    BgraToBgr<false>(bgra + 4 * (width - A), bgr + 3 * (width - A), k);
+                bgra += bgraStride;
+                bgr += bgrStride;
+            }
+        }
+
+        void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride))
+                BgraToBgr<true>(bgra, width, height, bgraStride, bgr, bgrStride);
+            else
+                BgraToBgr<false>(bgra, width, height, bgraStride, bgr, bgrStride);
+        }
+
+        //---------------------------------------------------------------------
+
+        template <bool align> void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            if (width == alignedWidth)
+                alignedWidth -= A;
+
+            __m128i k[3][2];
+            k[0][0] = _mm_setr_epi8(0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1);
+            k[0][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x1, 0x0, 0x6);
+            k[1][0] = _mm_setr_epi8(0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1, -1, -1, -1, -1);
+            k[1][1] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9);
+            k[2][0] = _mm_setr_epi8(0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+            k[2][1] = _mm_setr_epi8(-1, -1, -1, -1, 0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                    BgraToBgrBody<align>(bgra + 4 * col, rgb + 3 * col, k);
+                if (width != alignedWidth)
+                    BgraToBgr<false>(bgra + 4 * (width - A), rgb + 3 * (width - A), k);
+                bgra += bgraStride;
+                rgb += rgbStride;
+            }
+        }
+
+        void BgraToRgb(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgb, size_t rgbStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgb) && Aligned(rgbStride))
+                BgraToRgb<true>(bgra, width, height, bgraStride, rgb, rgbStride);
+            else
+                BgraToRgb<false>(bgra, width, height, bgraStride, rgb, rgbStride);
+        }
+
+        //---------------------------------------------------------------------
+
+        const __m128i K8_BGRA_TO_RGBA = SIMD_MM_SETR_EPI8(0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF);
+
+        template <bool align> SIMD_INLINE void BgraToRgba(const uint8_t* bgra, uint8_t* rgba)
+        {
+            Store<align>((__m128i*)rgba, _mm_shuffle_epi8(Load<align>((__m128i*)bgra), K8_BGRA_TO_RGBA));
+        }
+
+        template <bool align> void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride));
+
+            size_t size = width * 4;
+            size_t sizeA = AlignLo(size, A);
+
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t i = 0; i < size; i += A)
+                    BgraToRgba<align>(bgra + i, rgba + i);
+                if (size != sizeA)
+                    BgraToRgba<false>(bgra + size - sizeA, rgba + size - sizeA);
+                bgra += bgraStride;
+                rgba += rgbaStride;
+            }
+        }
+
+        void BgraToRgba(const uint8_t* bgra, size_t width, size_t height, size_t bgraStride, uint8_t* rgba, size_t rgbaStride)
+        {
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(rgba) && Aligned(rgbaStride))
+                BgraToRgba<true>(bgra, width, height, bgraStride, rgba, rgbaStride);
+            else
+                BgraToRgba<false>(bgra, width, height, bgraStride, rgba, rgbaStride);
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdBaseRgbToGray.cpp b/3rdparty/simdlib/Simd/SimdSse41Cpu.cpp
similarity index 54%
rename from 3rdparty/simdlib/Simd/SimdBaseRgbToGray.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41Cpu.cpp
index 6ac7f88791..9b5719ce97 100644
--- a/3rdparty/simdlib/Simd/SimdBaseRgbToGray.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41Cpu.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -21,23 +21,47 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "Simd/SimdConversion.h"
+#include "Simd/SimdEnable.h"
+#include "Simd/SimdCpu.h"
+
+#if defined(_MSC_VER)
+#include <windows.h>
+#endif
 
 namespace Simd
 {
-    namespace Base
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
     {
-        void RgbToGray(const uint8_t *rgb, size_t width, size_t height, size_t rgbStride, uint8_t *gray, size_t grayStride)
+        SIMD_INLINE bool SupportedByCPU()
         {
-            for (size_t row = 0; row < height; ++row)
+            return 
+                Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE41) &&
+                Base::CheckBit(Cpuid::Ordinary, Cpuid::Ecx, Cpuid::SSE42);
+        }
+
+        SIMD_INLINE bool SupportedByOS()
+        {
+#if defined(_MSC_VER)
+            __try
             {
-                const uint8_t * pRgb = rgb + row*rgbStride;
-                uint8_t * pGray = gray + row*grayStride;
-                for (const uint8_t *pGrayEnd = pGray + width; pGray < pGrayEnd; pGray += 1, pRgb += 3)
-                {
-                    *pGray = RgbToGray(pRgb[0], pRgb[1], pRgb[2]);
-                }
+                int value = _mm_testz_si128(_mm_set1_epi8(0), _mm_set1_epi8(-1)); // try to execute of SSE41 instructions;
+                uint32_t crc = _mm_crc32_u8(0, 1); // try to execute of SSE42 instructions;
+                return true;
             }
+            __except (EXCEPTION_EXECUTE_HANDLER)
+            {
+                return false;
+            }
+#else
+            return true;
+#endif
+        }
+
+        bool GetEnable()
+        {
+            return SupportedByCPU() && SupportedByOS();
         }
     }
+#endif
 }
diff --git a/3rdparty/simdlib/Simd/SimdSsse3Deinterleave.cpp b/3rdparty/simdlib/Simd/SimdSse41Deinterleave.cpp
similarity index 74%
rename from 3rdparty/simdlib/Simd/SimdSsse3Deinterleave.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41Deinterleave.cpp
index 45ff364d03..68ae14efc5 100644
--- a/3rdparty/simdlib/Simd/SimdSsse3Deinterleave.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41Deinterleave.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -27,8 +27,8 @@
 
 namespace Simd
 {
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
     {
         template <bool align> SIMD_INLINE void DeinterleaveBgr(const uint8_t * bgr, uint8_t * b, uint8_t * g, uint8_t * r, size_t offset)
         {
@@ -69,9 +69,11 @@ namespace Simd
                 DeinterleaveBgr<false>(bgr, bgrStride, width, height, b, bStride, g, gStride, r, rStride);
         }
 
+        //---------------------------------------------------------------------
+
         const __m128i K8_SHUFFLE_BGRA = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF);
 
-        template <bool align> SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset)
+        template <bool align, bool alpha> SIMD_INLINE void DeinterleaveBgra(const uint8_t * bgra, uint8_t * b, uint8_t * g, uint8_t * r, uint8_t *a, size_t offset)
         {
             __m128i _bgra[4];
             _bgra[0] = _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 0), K8_SHUFFLE_BGRA);
@@ -89,7 +91,8 @@ namespace Simd
             __m128i rraa1 = _mm_unpackhi_epi32(_bgra[2], _bgra[3]);
 
             Store<align>((__m128i*)(r + offset), _mm_unpacklo_epi64(rraa0, rraa1));
-            Store<align>((__m128i*)(a + offset), _mm_unpackhi_epi64(rraa0, rraa1));
+            if(alpha)
+                Store<align>((__m128i*)(a + offset), _mm_unpackhi_epi64(rraa0, rraa1));
         }
 
         template <bool align> void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height,
@@ -99,36 +102,51 @@ namespace Simd
             if (align)
             {
                 assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride));
-                assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride));
+                assert(Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL));
             }
 
             size_t alignedWidth = AlignLo(width, A);
 
-            for (size_t row = 0; row < height; ++row)
+            if (a)
             {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    DeinterleaveBgra<align>(bgra + col * 4, b, g, r, a, col);
-                if (width != alignedWidth)
-                    DeinterleaveBgra<false>(bgra + 4 * (width - A), b, g, r, a, width - A);
-                bgra += bgraStride;
-                b += bStride;
-                g += gStride;
-                r += rStride;
-                a += aStride;
+                for (size_t row = 0; row < height; ++row)
+                {
+                    for (size_t col = 0; col < alignedWidth; col += A)
+                        DeinterleaveBgra<align, true>(bgra + col * 4, b, g, r, a, col);
+                    if (width != alignedWidth)
+                        DeinterleaveBgra<false, true>(bgra + 4 * (width - A), b, g, r, a, width - A);
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
+                    a += aStride;
+                }
+            }
+            else
+            {
+                for (size_t row = 0; row < height; ++row)
+                {
+                    for (size_t col = 0; col < alignedWidth; col += A)
+                        DeinterleaveBgra<align, false>(bgra + col * 4, b, g, r, NULL, col);
+                    if (width != alignedWidth)
+                        DeinterleaveBgra<false, false>(bgra + 4 * (width - A), b, g, r, NULL, width - A);
+                    bgra += bgraStride;
+                    b += bStride;
+                    g += gStride;
+                    r += rStride;
+                }
             }
         }
 
         void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height,
             uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride)
         {
-            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && Aligned(aStride))
+            if (Aligned(bgra) && Aligned(bgraStride) && Aligned(b) && Aligned(bStride) && 
+                Aligned(g) && Aligned(gStride) && Aligned(r) && Aligned(rStride) && Aligned(a) && (Aligned(aStride) || a == NULL))
                 DeinterleaveBgra<true>(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
             else
                 DeinterleaveBgra<false>(bgra, bgraStride, width, height, b, bStride, g, gStride, r, rStride, a, aStride);
         }
     }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Deinterleave.cpp.o) has no symbols
-    void dummy_SimdSsse3Deinterleave(){};
-#endif// SIMD_SSSE3_ENABLE
+#endif
 }
diff --git a/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp
old mode 100644
new mode 100755
index bacd2f7d91..73334c635d
--- a/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2020 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -22,6 +22,7 @@
 * SOFTWARE.
 */
 #include "Simd/SimdMemory.h"
+#include "Simd/SimdLoadBlock.h"
 #include "Simd/SimdStore.h"
 #include "Simd/SimdGaussianBlur.h"
 
diff --git a/3rdparty/simdlib/Simd/SimdSsse3GaussianBlur3x3.cpp b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur3x3.cpp
similarity index 95%
rename from 3rdparty/simdlib/Simd/SimdSsse3GaussianBlur3x3.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41GaussianBlur3x3.cpp
index 74ff76aa8a..11573a696b 100644
--- a/3rdparty/simdlib/Simd/SimdSsse3GaussianBlur3x3.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41GaussianBlur3x3.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -22,12 +22,13 @@
 * SOFTWARE.
 */
 #include "Simd/SimdMemory.h"
+#include "Simd/SimdLoadBlock.h"
 #include "Simd/SimdStore.h"
 
 namespace Simd
 {
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
     {
         namespace
         {
@@ -154,8 +155,5 @@ namespace Simd
                 GaussianBlur3x3<false>(src, srcStride, width, height, channelCount, dst, dstStride);
         }
     }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3GaussianBlur3x3.cpp.o) has no symbols
-    void dummy_SimdSsse3GaussianBlur3x3(){};
-#endif// SIMD_SSSE3_ENABLE
+#endif
 }
diff --git a/3rdparty/simdlib/Simd/SimdSsse3GrayToBgr.cpp b/3rdparty/simdlib/Simd/SimdSse41GrayToBgr.cpp
old mode 100644
new mode 100755
similarity index 92%
rename from 3rdparty/simdlib/Simd/SimdSsse3GrayToBgr.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41GrayToBgr.cpp
index 8106f6451a..db79b3e4f0
--- a/3rdparty/simdlib/Simd/SimdSsse3GrayToBgr.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41GrayToBgr.cpp
@@ -1,75 +1,72 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        template <bool align> SIMD_INLINE void GrayToBgr(uint8_t * bgr, __m128i gray)
-        {
-            Store<align>((__m128i*)bgr + 0, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR0));
-            Store<align>((__m128i*)bgr + 1, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR1));
-            Store<align>((__m128i*)bgr + 2, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR2));
-        }
-
-        template <bool align> void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                {
-                    __m128i _gray = Load<align>((__m128i*)(gray + col));
-                    GrayToBgr<align>(bgr + 3 * col, _gray);
-                }
-                if (alignedWidth != width)
-                {
-                    __m128i _gray = Load<false>((__m128i*)(gray + width - A));
-                    GrayToBgr<false>(bgr + 3 * (width - A), _gray);
-                }
-                gray += grayStride;
-                bgr += bgrStride;
-            }
-        }
-
-        void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride)
-        {
-            if (Aligned(bgr) && Aligned(gray) && Aligned(bgrStride) && Aligned(grayStride))
-                GrayToBgr<true>(gray, width, height, grayStride, bgr, bgrStride);
-            else
-                GrayToBgr<false>(gray, width, height, grayStride, bgr, bgrStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3GrayToBgr.cpp.o) has no symbols
-    void dummy_SimdSsse3GrayToBgr(){};
-#endif// SIMD_SSSE3_ENABLE
-}
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdStore.h"
+#include "Simd/SimdMemory.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        template <bool align> SIMD_INLINE void GrayToBgr(uint8_t * bgr, __m128i gray)
+        {
+            Store<align>((__m128i*)bgr + 0, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR0));
+            Store<align>((__m128i*)bgr + 1, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR1));
+            Store<align>((__m128i*)bgr + 2, _mm_shuffle_epi8(gray, K8_SHUFFLE_GRAY_TO_BGR2));
+        }
+
+        template <bool align> void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride)
+        {
+            assert(width >= A);
+            if (align)
+                assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride));
+
+            size_t alignedWidth = AlignLo(width, A);
+            for (size_t row = 0; row < height; ++row)
+            {
+                for (size_t col = 0; col < alignedWidth; col += A)
+                {
+                    __m128i _gray = Load<align>((__m128i*)(gray + col));
+                    GrayToBgr<align>(bgr + 3 * col, _gray);
+                }
+                if (alignedWidth != width)
+                {
+                    __m128i _gray = Load<false>((__m128i*)(gray + width - A));
+                    GrayToBgr<false>(bgr + 3 * (width - A), _gray);
+                }
+                gray += grayStride;
+                bgr += bgrStride;
+            }
+        }
+
+        void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride)
+        {
+            if (Aligned(bgr) && Aligned(gray) && Aligned(bgrStride) && Aligned(grayStride))
+                GrayToBgr<true>(gray, width, height, grayStride, bgr, bgrStride);
+            else
+                GrayToBgr<false>(gray, width, height, grayStride, bgr, bgrStride);
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3Interleave.cpp b/3rdparty/simdlib/Simd/SimdSse41Interleave.cpp
similarity index 96%
rename from 3rdparty/simdlib/Simd/SimdSsse3Interleave.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41Interleave.cpp
index c7213577fd..bb6354405e 100644
--- a/3rdparty/simdlib/Simd/SimdSsse3Interleave.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41Interleave.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -27,8 +27,8 @@
 
 namespace Simd
 {
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
     {
         template <bool align> SIMD_INLINE void InterleaveBgr(const uint8_t * b, const uint8_t * g, const uint8_t * r, size_t offset, uint8_t * bgr)
         {
@@ -124,8 +124,5 @@ namespace Simd
                 InterleaveBgra<false>(b, bStride, g, gStride, r, rStride, a, aStride, width, height, bgra, bgraStride);
         }
     }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Interleave.cpp.o) has no symbols
-    void dummy_SimdSsse3Interleave(){};
-#endif// SIMD_SSSE3_ENABLE
+#endif
 }
diff --git a/3rdparty/simdlib/Simd/SimdSsse3Reduce.cpp b/3rdparty/simdlib/Simd/SimdSse41Reduce.cpp
old mode 100644
new mode 100755
similarity index 96%
rename from 3rdparty/simdlib/Simd/SimdSsse3Reduce.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41Reduce.cpp
index faded50ec7..9905a6f171
--- a/3rdparty/simdlib/Simd/SimdSsse3Reduce.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41Reduce.cpp
@@ -1,202 +1,199 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2018 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1)
-        {
-            return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2);
-        }
-
-        SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
-        {
-            return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11));
-        }
-
-        template <size_t channelCount> __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11);
-
-        template<> SIMD_INLINE __m128i Average8<1>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
-        {
-            return Average8(s00, s01, s10, s11);
-        }
-
-        const __m128i K8_RC2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF);
-
-        template<> SIMD_INLINE __m128i Average8<2>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
-        {
-            return Average8(_mm_shuffle_epi8(s00, K8_RC2), _mm_shuffle_epi8(s01, K8_RC2), _mm_shuffle_epi8(s10, K8_RC2), _mm_shuffle_epi8(s11, K8_RC2));
-        }
-
-        const __m128i K8_RC4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF);
-
-        template<> SIMD_INLINE __m128i Average8<4>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
-        {
-            return Average8(_mm_shuffle_epi8(s00, K8_RC4), _mm_shuffle_epi8(s01, K8_RC4), _mm_shuffle_epi8(s10, K8_RC4), _mm_shuffle_epi8(s11, K8_RC4));
-        }
-
-        template <size_t channelCount, bool align> SIMD_INLINE void ReduceColor2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst)
-        {
-            __m128i s00 = Load<align>((__m128i*)src0 + 0);
-            __m128i s01 = Load<align>((__m128i*)src0 + 1);
-            __m128i s10 = Load<align>((__m128i*)src1 + 0);
-            __m128i s11 = Load<align>((__m128i*)src1 + 1);
-            Store<align>((__m128i*)dst, Average8<channelCount>(s00, s01, s10, s11));
-        }
-
-        template <size_t channelCount, bool align> void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride)
-        {
-            size_t evenWidth = AlignLo(srcWidth, 2);
-            size_t evenSize = evenWidth * channelCount;
-            size_t alignedSize = AlignLo(evenSize, DA);
-            for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2)
-            {
-                const uint8_t *src0 = src;
-                const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride);
-                size_t srcOffset = 0, dstOffset = 0;
-                for (; srcOffset < alignedSize; srcOffset += DA, dstOffset += A)
-                    ReduceColor2x2<channelCount, align>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
-                if (alignedSize != evenSize)
-                {
-                    srcOffset = evenSize - DA;
-                    dstOffset = srcOffset / 2;
-                    ReduceColor2x2<channelCount, false>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
-                }
-                if (evenWidth != srcWidth)
-                {
-                    for (size_t c = 0; c < channelCount; ++c)
-                        dst[evenSize/2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]);
-                }
-                src += 2 * srcStride;
-                dst += dstStride;
-            }
-        }
-
-        const __m128i K8_BGR0 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1);
-        const __m128i K8_BGR1 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0);
-        const __m128i K8_BGR2 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-        const __m128i K8_BGR3 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1);
-        const __m128i K8_BGR4 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1);
-        const __m128i K8_BGR5 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-        const __m128i K8_BGR6 = SIMD_MM_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF);
-
-        template <bool align> SIMD_INLINE void ReduceBgr2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst)
-        {
-            __m128i s00 = Load<align>((__m128i*)src0 + 0);
-            __m128i s01 = Load<align>((__m128i*)src0 + 1);
-            __m128i s02 = Load<align>((__m128i*)src0 + 2);
-            __m128i s10 = Load<align>((__m128i*)src1 + 0);
-            __m128i s11 = Load<align>((__m128i*)src1 + 1);
-            __m128i s12 = Load<align>((__m128i*)src1 + 2);
-            __m128i m00 = _mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR0), _mm_shuffle_epi8(s01, K8_BGR1));
-            __m128i m01 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR2), _mm_shuffle_epi8(s01, K8_BGR3)), _mm_shuffle_epi8(s02, K8_BGR4));
-            __m128i m10 = _mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR0), _mm_shuffle_epi8(s11, K8_BGR1));
-            __m128i m11 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR2), _mm_shuffle_epi8(s11, K8_BGR3)), _mm_shuffle_epi8(s12, K8_BGR4));
-            Store<align>((__m128i*)dst + 0, Average8(m00, m01, m10, m11));
-            __m128i s03 = Load<align>((__m128i*)src0 + 3);
-            __m128i s04 = Load<align>((__m128i*)src0 + 4);
-            __m128i s13 = Load<align>((__m128i*)src1 + 3);
-            __m128i s14 = Load<align>((__m128i*)src1 + 4);
-            __m128i m02 = _mm_or_si128(_mm_shuffle_epi8(s01, K8_BGR5), _mm_shuffle_epi8(s02, K8_BGR6));
-            __m128i m03 = _mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR0), _mm_shuffle_epi8(s04, K8_BGR1));
-            __m128i m12 = _mm_or_si128(_mm_shuffle_epi8(s11, K8_BGR5), _mm_shuffle_epi8(s12, K8_BGR6));
-            __m128i m13 = _mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR0), _mm_shuffle_epi8(s14, K8_BGR1));
-            Store<align>((__m128i*)dst + 1, Average8(m02, m03, m12, m13));
-            __m128i s05 = Load<align>((__m128i*)src0 + 5);
-            __m128i s15 = Load<align>((__m128i*)src1 + 5);
-            __m128i m04 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR2), _mm_shuffle_epi8(s04, K8_BGR3)), _mm_shuffle_epi8(s05, K8_BGR4));
-            __m128i m05 = _mm_or_si128(_mm_shuffle_epi8(s04, K8_BGR5), _mm_shuffle_epi8(s05, K8_BGR6));
-            __m128i m14 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR2), _mm_shuffle_epi8(s14, K8_BGR3)), _mm_shuffle_epi8(s15, K8_BGR4));
-            __m128i m15 = _mm_or_si128(_mm_shuffle_epi8(s14, K8_BGR5), _mm_shuffle_epi8(s15, K8_BGR6));
-            Store<align>((__m128i*)dst + 2, Average8(m04, m05, m14, m15));
-        }
-
-        template <bool align> void ReduceBgr2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride)
-        {
-            size_t evenWidth = AlignLo(srcWidth, 2);
-            size_t alignedWidth = AlignLo(srcWidth, DA);
-            size_t evenSize = evenWidth * 3;
-            size_t alignedSize = alignedWidth*3;
-            size_t srcStep = DA * 3, dstStep = A*3;
-            for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2)
-            {
-                const uint8_t *src0 = src;
-                const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride);
-                size_t srcOffset = 0, dstOffset = 0;
-                for (; srcOffset < alignedSize; srcOffset += srcStep, dstOffset += dstStep)
-                    ReduceBgr2x2<align>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
-                if (alignedSize != evenSize)
-                {
-                    srcOffset = evenSize - srcStep;
-                    dstOffset = srcOffset / 2;
-                    ReduceBgr2x2<false>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
-                }
-                if (evenWidth != srcWidth)
-                {
-                    for (size_t c = 0; c < 3; ++c)
-                        dst[evenSize / 2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]);
-                }
-                src += 2 * srcStride;
-                dst += dstStride;
-            }
-        }
-
-        template <bool align> void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount)
-        {
-            assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA);
-            if (align)
-            {
-                assert(Aligned(src) && Aligned(srcStride));
-                assert(Aligned(dst) && Aligned(dstStride));
-            }
-
-            switch (channelCount)
-            {
-            case 1: ReduceColor2x2<1, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
-            case 2: ReduceColor2x2<2, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
-            case 3: ReduceBgr2x2<align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
-            case 4: ReduceColor2x2<4, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
-            default: assert(0);
-            }
-        }
-
-        void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount)
-        {
-            if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride))
-                ReduceColor2x2<true>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
-            else
-                ReduceColor2x2<false>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Reduce.cpp.o) has no symbols
-    void dummy_SimdSsse3Reduce(){};
-#endif// SIMD_SSSE3_ENABLE
-}
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdStore.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1)
+        {
+            return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2);
+        }
+
+        SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
+        {
+            return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11));
+        }
+
+        template <size_t channelCount> __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11);
+
+        template<> SIMD_INLINE __m128i Average8<1>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
+        {
+            return Average8(s00, s01, s10, s11);
+        }
+
+        const __m128i K8_RC2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF);
+
+        template<> SIMD_INLINE __m128i Average8<2>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
+        {
+            return Average8(_mm_shuffle_epi8(s00, K8_RC2), _mm_shuffle_epi8(s01, K8_RC2), _mm_shuffle_epi8(s10, K8_RC2), _mm_shuffle_epi8(s11, K8_RC2));
+        }
+
+        const __m128i K8_RC4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF);
+
+        template<> SIMD_INLINE __m128i Average8<4>(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
+        {
+            return Average8(_mm_shuffle_epi8(s00, K8_RC4), _mm_shuffle_epi8(s01, K8_RC4), _mm_shuffle_epi8(s10, K8_RC4), _mm_shuffle_epi8(s11, K8_RC4));
+        }
+
+        template <size_t channelCount, bool align> SIMD_INLINE void ReduceColor2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst)
+        {
+            __m128i s00 = Load<align>((__m128i*)src0 + 0);
+            __m128i s01 = Load<align>((__m128i*)src0 + 1);
+            __m128i s10 = Load<align>((__m128i*)src1 + 0);
+            __m128i s11 = Load<align>((__m128i*)src1 + 1);
+            Store<align>((__m128i*)dst, Average8<channelCount>(s00, s01, s10, s11));
+        }
+
+        template <size_t channelCount, bool align> void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride)
+        {
+            size_t evenWidth = AlignLo(srcWidth, 2);
+            size_t evenSize = evenWidth * channelCount;
+            size_t alignedSize = AlignLo(evenSize, DA);
+            for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2)
+            {
+                const uint8_t *src0 = src;
+                const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride);
+                size_t srcOffset = 0, dstOffset = 0;
+                for (; srcOffset < alignedSize; srcOffset += DA, dstOffset += A)
+                    ReduceColor2x2<channelCount, align>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
+                if (alignedSize != evenSize)
+                {
+                    srcOffset = evenSize - DA;
+                    dstOffset = srcOffset / 2;
+                    ReduceColor2x2<channelCount, false>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
+                }
+                if (evenWidth != srcWidth)
+                {
+                    for (size_t c = 0; c < channelCount; ++c)
+                        dst[evenSize/2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]);
+                }                
+                src += 2 * srcStride;
+                dst += dstStride;
+            }
+        }
+
+        const __m128i K8_BGR0 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1);
+        const __m128i K8_BGR1 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0);
+        const __m128i K8_BGR2 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        const __m128i K8_BGR3 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1);
+        const __m128i K8_BGR4 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1);
+        const __m128i K8_BGR5 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        const __m128i K8_BGR6 = SIMD_MM_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF);
+
+        template <bool align> SIMD_INLINE void ReduceBgr2x2(const uint8_t * src0, const uint8_t * src1, uint8_t * dst)
+        {
+            __m128i s00 = Load<align>((__m128i*)src0 + 0);
+            __m128i s01 = Load<align>((__m128i*)src0 + 1);
+            __m128i s02 = Load<align>((__m128i*)src0 + 2);
+            __m128i s10 = Load<align>((__m128i*)src1 + 0);
+            __m128i s11 = Load<align>((__m128i*)src1 + 1);
+            __m128i s12 = Load<align>((__m128i*)src1 + 2);
+            __m128i m00 = _mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR0), _mm_shuffle_epi8(s01, K8_BGR1));
+            __m128i m01 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s00, K8_BGR2), _mm_shuffle_epi8(s01, K8_BGR3)), _mm_shuffle_epi8(s02, K8_BGR4));
+            __m128i m10 = _mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR0), _mm_shuffle_epi8(s11, K8_BGR1));
+            __m128i m11 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s10, K8_BGR2), _mm_shuffle_epi8(s11, K8_BGR3)), _mm_shuffle_epi8(s12, K8_BGR4));
+            Store<align>((__m128i*)dst + 0, Average8(m00, m01, m10, m11));
+            __m128i s03 = Load<align>((__m128i*)src0 + 3);
+            __m128i s04 = Load<align>((__m128i*)src0 + 4); 
+            __m128i s13 = Load<align>((__m128i*)src1 + 3);
+            __m128i s14 = Load<align>((__m128i*)src1 + 4);
+            __m128i m02 = _mm_or_si128(_mm_shuffle_epi8(s01, K8_BGR5), _mm_shuffle_epi8(s02, K8_BGR6));
+            __m128i m03 = _mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR0), _mm_shuffle_epi8(s04, K8_BGR1));
+            __m128i m12 = _mm_or_si128(_mm_shuffle_epi8(s11, K8_BGR5), _mm_shuffle_epi8(s12, K8_BGR6));
+            __m128i m13 = _mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR0), _mm_shuffle_epi8(s14, K8_BGR1));
+            Store<align>((__m128i*)dst + 1, Average8(m02, m03, m12, m13));
+            __m128i s05 = Load<align>((__m128i*)src0 + 5);
+            __m128i s15 = Load<align>((__m128i*)src1 + 5);
+            __m128i m04 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s03, K8_BGR2), _mm_shuffle_epi8(s04, K8_BGR3)), _mm_shuffle_epi8(s05, K8_BGR4));
+            __m128i m05 = _mm_or_si128(_mm_shuffle_epi8(s04, K8_BGR5), _mm_shuffle_epi8(s05, K8_BGR6));
+            __m128i m14 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(s13, K8_BGR2), _mm_shuffle_epi8(s14, K8_BGR3)), _mm_shuffle_epi8(s15, K8_BGR4));
+            __m128i m15 = _mm_or_si128(_mm_shuffle_epi8(s14, K8_BGR5), _mm_shuffle_epi8(s15, K8_BGR6));
+            Store<align>((__m128i*)dst + 2, Average8(m04, m05, m14, m15));
+        }
+
+        template <bool align> void ReduceBgr2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t * dst, size_t dstStride)
+        {
+            size_t evenWidth = AlignLo(srcWidth, 2);
+            size_t alignedWidth = AlignLo(srcWidth, DA);
+            size_t evenSize = evenWidth * 3;
+            size_t alignedSize = alignedWidth*3;
+            size_t srcStep = DA * 3, dstStep = A*3;
+            for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2)
+            {
+                const uint8_t *src0 = src;
+                const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride);
+                size_t srcOffset = 0, dstOffset = 0;
+                for (; srcOffset < alignedSize; srcOffset += srcStep, dstOffset += dstStep)
+                    ReduceBgr2x2<align>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
+                if (alignedSize != evenSize)
+                {
+                    srcOffset = evenSize - srcStep;
+                    dstOffset = srcOffset / 2;
+                    ReduceBgr2x2<false>(src0 + srcOffset, src1 + srcOffset, dst + dstOffset);
+                }
+                if (evenWidth != srcWidth)
+                {
+                    for (size_t c = 0; c < 3; ++c)
+                        dst[evenSize / 2 + c] = Base::Average(src0[evenSize + c], src1[evenSize + c]);
+                }
+                src += 2 * srcStride;
+                dst += dstStride;
+            }
+        }
+
+        template <bool align> void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount)
+        {
+            assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA);
+            if (align)
+            {
+                assert(Aligned(src) && Aligned(srcStride));
+                assert(Aligned(dst) && Aligned(dstStride));
+            }
+
+            switch (channelCount)
+            {
+            case 1: ReduceColor2x2<1, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
+            case 2: ReduceColor2x2<2, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
+            case 3: ReduceBgr2x2<align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
+            case 4: ReduceColor2x2<4, align>(src, srcWidth, srcHeight, srcStride, dst, dstStride); break;
+            default: assert(0);
+            }
+        }
+
+        void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount)
+        {
+            if (Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride))
+                ReduceColor2x2<true>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
+            else
+                ReduceColor2x2<false>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray2x2.cpp b/3rdparty/simdlib/Simd/SimdSse41ReduceGray2x2.cpp
old mode 100644
new mode 100755
similarity index 94%
rename from 3rdparty/simdlib/Simd/SimdSsse3ReduceGray2x2.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41ReduceGray2x2.cpp
index 24d071182d..dd8bd5b0e3
--- a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray2x2.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41ReduceGray2x2.cpp
@@ -1,96 +1,93 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1)
-        {
-            return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2);
-        }
-
-        SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
-        {
-            return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11));
-        }
-
-        template <bool align> void ReduceGray2x2(
-            const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
-        {
-            assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA);
-            if (align)
-            {
-                assert(Aligned(src) && Aligned(srcStride));
-                assert(Aligned(dst) && Aligned(dstStride) && Aligned(dstWidth));
-            }
-
-            size_t alignedWidth = AlignLo(srcWidth, DA);
-            size_t evenWidth = AlignLo(srcWidth, 2);
-            for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2)
-            {
-                const uint8_t *src0 = src;
-                const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride);
-                size_t srcOffset = 0, dstOffset = 0;
-                for (; srcOffset < alignedWidth; srcOffset += DA, dstOffset += A)
-                {
-                    Store<align>((__m128i*)(dst + dstOffset), Average8(
-                        Load<align>((__m128i*)(src0 + srcOffset)), Load<align>((__m128i*)(src0 + srcOffset + A)),
-                        Load<align>((__m128i*)(src1 + srcOffset)), Load<align>((__m128i*)(src1 + srcOffset + A))));
-                }
-                if (alignedWidth != srcWidth)
-                {
-                    dstOffset = dstWidth - A - (evenWidth != srcWidth ? 1 : 0);
-                    srcOffset = evenWidth - DA;
-                    Store<align>((__m128i*)(dst + dstOffset), Average8(
-                        Load<align>((__m128i*)(src0 + srcOffset)), Load<align>((__m128i*)(src0 + srcOffset + A)),
-                        Load<align>((__m128i*)(src1 + srcOffset)), Load<align>((__m128i*)(src1 + srcOffset + A))));
-                    if (evenWidth != srcWidth)
-                    {
-                        dst[dstWidth - 1] = Base::Average(src0[evenWidth], src1[evenWidth]);
-                    }
-                }
-                src += 2 * srcStride;
-                dst += dstStride;
-            }
-        }
-
-        void ReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
-        {
-            if (Aligned(src) && Aligned(srcWidth) && Aligned(srcStride) && Aligned(dst) && Aligned(dstWidth) && Aligned(dstStride))
-                ReduceGray2x2<true>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
-            else
-                ReduceGray2x2<false>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Reduce2x2.cpp.o) has no symbols
-    void dummy_SimdSsse3Reduce2x2(){};
-#endif// SIMD_SSSE3_ENABLE
-}
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdStore.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1)
+        {
+            return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2);
+        }
+
+        SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
+        {
+            return _mm_packus_epi16(Average16(s00, s10), Average16(s01, s11));
+        }
+
+        template <bool align> void ReduceGray2x2(
+            const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
+        {
+            assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= DA);
+            if (align)
+            {
+                assert(Aligned(src) && Aligned(srcStride));
+                assert(Aligned(dst) && Aligned(dstStride) && Aligned(dstWidth));
+            }
+
+            size_t alignedWidth = AlignLo(srcWidth, DA);
+            size_t evenWidth = AlignLo(srcWidth, 2);
+            for (size_t srcRow = 0; srcRow < srcHeight; srcRow += 2)
+            {
+                const uint8_t *src0 = src;
+                const uint8_t *src1 = (srcRow == srcHeight - 1 ? src : src + srcStride);
+                size_t srcOffset = 0, dstOffset = 0;
+                for (; srcOffset < alignedWidth; srcOffset += DA, dstOffset += A)
+                {
+                    Store<align>((__m128i*)(dst + dstOffset), Average8(
+                        Load<align>((__m128i*)(src0 + srcOffset)), Load<align>((__m128i*)(src0 + srcOffset + A)),
+                        Load<align>((__m128i*)(src1 + srcOffset)), Load<align>((__m128i*)(src1 + srcOffset + A))));
+                }
+                if (alignedWidth != srcWidth)
+                {
+                    dstOffset = dstWidth - A - (evenWidth != srcWidth ? 1 : 0);
+                    srcOffset = evenWidth - DA;
+                    Store<align>((__m128i*)(dst + dstOffset), Average8(
+                        Load<align>((__m128i*)(src0 + srcOffset)), Load<align>((__m128i*)(src0 + srcOffset + A)),
+                        Load<align>((__m128i*)(src1 + srcOffset)), Load<align>((__m128i*)(src1 + srcOffset + A))));
+                    if (evenWidth != srcWidth)
+                    {
+                        dst[dstWidth - 1] = Base::Average(src0[evenWidth], src1[evenWidth]);
+                    }
+                }
+                src += 2 * srcStride;
+                dst += dstStride;
+            }
+        }
+
+        void ReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
+            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
+        {
+            if (Aligned(src) && Aligned(srcWidth) && Aligned(srcStride) && Aligned(dst) && Aligned(dstWidth) && Aligned(dstStride))
+                ReduceGray2x2<true>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
+            else
+                ReduceGray2x2<false>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray4x4.cpp b/3rdparty/simdlib/Simd/SimdSse41ReduceGray4x4.cpp
old mode 100644
new mode 100755
similarity index 96%
rename from 3rdparty/simdlib/Simd/SimdSsse3ReduceGray4x4.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41ReduceGray4x4.cpp
index 261e84c918..7754b290ba
--- a/3rdparty/simdlib/Simd/SimdSsse3ReduceGray4x4.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41ReduceGray4x4.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -26,8 +26,8 @@
 
 namespace Simd
 {
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
     {
         namespace
         {
@@ -170,8 +170,5 @@ namespace Simd
                 ReduceGray4x4<false>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride);
         }
     }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Reduce4x4.cpp.o) has no symbols
-    void dummy_SimdSsse3Reduce4x4(){};
-#endif// SIMD_SSSE3_ENABLE
+#endif
 }
diff --git a/3rdparty/simdlib/Simd/SimdSsse3ResizeBilinear.cpp b/3rdparty/simdlib/Simd/SimdSse41ResizeBilinear.cpp
old mode 100644
new mode 100755
similarity index 98%
rename from 3rdparty/simdlib/Simd/SimdSsse3ResizeBilinear.cpp
rename to 3rdparty/simdlib/Simd/SimdSse41ResizeBilinear.cpp
index b39f619005..50a708aa20
--- a/3rdparty/simdlib/Simd/SimdSsse3ResizeBilinear.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41ResizeBilinear.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -27,8 +27,8 @@
 
 namespace Simd
 {
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
     {
         namespace
         {
@@ -401,9 +401,6 @@ namespace Simd
             }
         }
     }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3ResizeBilinear.cpp.o) has no symbols
-    void dummy_SimdSsse3ResizeBilinear(){};
 #endif
 }
 
diff --git a/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp b/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp
old mode 100644
new mode 100755
index b766a8a209..e3e8e7b360
--- a/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp
+++ b/3rdparty/simdlib/Simd/SimdSse41Resizer.cpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -32,6 +32,309 @@ namespace Simd
 #ifdef SIMD_SSE41_ENABLE
     namespace Sse41
     {
+        ResizerByteBilinear::ResizerByteBilinear(const ResParam& param)
+            : Sse2::ResizerByteBilinear(param)
+            , _blocks(0)
+        {
+        }
+
+        size_t ResizerByteBilinear::BlockCountMax(size_t align)
+        {
+            return (size_t)Simd::Max(::ceil(float(_param.srcW) / (align - 1)), ::ceil(float(_param.dstW) * 2.0f / align));
+        }
+
+        void ResizerByteBilinear::EstimateParams()
+        {
+            if (_ax.data)
+                return;
+            if (_param.channels == 1 && _param.srcW < 4 * _param.dstW)
+                _blocks = BlockCountMax(A);
+            float scale = (float)_param.srcW / _param.dstW;
+            _ax.Resize(AlignHi(_param.dstW, A) * _param.channels * 2, false, _param.align);
+            uint8_t* alphas = _ax.data;
+            if (_blocks)
+            {
+                _ixg.Resize(_blocks);
+                int block = 0;
+                _ixg[0].src = 0;
+                _ixg[0].dst = 0;
+                for (int dstIndex = 0; dstIndex < (int)_param.dstW; ++dstIndex)
+                {
+                    float alpha = (float)((dstIndex + 0.5) * scale - 0.5);
+                    int srcIndex = (int)::floor(alpha);
+                    alpha -= srcIndex;
+
+                    if (srcIndex < 0)
+                    {
+                        srcIndex = 0;
+                        alpha = 0;
+                    }
+
+                    if (srcIndex > (int)_param.srcW - 2)
+                    {
+                        srcIndex = (int)_param.srcW - 2;
+                        alpha = 1;
+                    }
+
+                    int dst = 2 * dstIndex - _ixg[block].dst;
+                    int src = srcIndex - _ixg[block].src;
+                    if (src >= A - 1 || dst >= A)
+                    {
+                        block++;
+                        _ixg[block].src = Simd::Min(srcIndex, int(_param.srcW - A));
+                        _ixg[block].dst = 2 * dstIndex;
+                        dst = 0;
+                        src = srcIndex - _ixg[block].src;
+                    }
+                    _ixg[block].shuffle[dst] = src;
+                    _ixg[block].shuffle[dst + 1] = src + 1;
+
+                    alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5);
+                    alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]);
+                    alphas += 2;
+                }
+                _blocks = block + 1;
+            }
+            else
+            {
+                _ix.Resize(_param.dstW);
+                for (size_t i = 0; i < _param.dstW; ++i)
+                {
+                    float alpha = (float)((i + 0.5) * scale - 0.5);
+                    ptrdiff_t index = (ptrdiff_t)::floor(alpha);
+                    alpha -= index;
+
+                    if (index < 0)
+                    {
+                        index = 0;
+                        alpha = 0;
+                    }
+
+                    if (index > (ptrdiff_t)_param.srcW - 2)
+                    {
+                        index = _param.srcW - 2;
+                        alpha = 1;
+                    }
+
+                    _ix[i] = (int)index;
+                    alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5);
+                    alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]);
+                    for (size_t channel = 1; channel < _param.channels; channel++)
+                        ((uint16_t*)alphas)[channel] = *(uint16_t*)alphas;
+                    alphas += 2 * _param.channels;
+                }
+            }
+            size_t size = AlignHi(_param.dstW, _param.align) * _param.channels * 2;
+            _bx[0].Resize(size, false, _param.align);
+            _bx[1].Resize(size, false, _param.align);
+        }
+
+        template <size_t N> void ResizerByteBilinearInterpolateX(const __m128i* alpha, __m128i* buffer);
+
+        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<1>(const __m128i* alpha, __m128i* buffer)
+        {
+            _mm_store_si128(buffer, _mm_maddubs_epi16(_mm_load_si128(buffer), _mm_load_si128(alpha)));
+        }
+
+        const __m128i K8_SHUFFLE_X2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF);
+
+        SIMD_INLINE void ResizerByteBilinearInterpolateX2(const __m128i* alpha, __m128i* buffer)
+        {
+            __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X2);
+            _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha)));
+        }
+
+        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<2>(const __m128i* alpha, __m128i* buffer)
+        {
+            ResizerByteBilinearInterpolateX2(alpha + 0, buffer + 0);
+            ResizerByteBilinearInterpolateX2(alpha + 1, buffer + 1);
+        }
+
+        const __m128i K8_SHUFFLE_X3_00 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1);
+        const __m128i K8_SHUFFLE_X3_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0);
+        const __m128i K8_SHUFFLE_X3_10 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        const __m128i K8_SHUFFLE_X3_11 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1);
+        const __m128i K8_SHUFFLE_X3_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1);
+        const __m128i K8_SHUFFLE_X3_21 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        const __m128i K8_SHUFFLE_X3_22 = SIMD_MM_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF);
+
+        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<3>(const __m128i* alpha, __m128i* buffer)
+        {
+            __m128i src[3], shuffled[3];
+            src[0] = _mm_load_si128(buffer + 0);
+            src[1] = _mm_load_si128(buffer + 1);
+            src[2] = _mm_load_si128(buffer + 2);
+            shuffled[0] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_00);
+            shuffled[0] = _mm_or_si128(shuffled[0], _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_01));
+            _mm_store_si128(buffer + 0, _mm_maddubs_epi16(shuffled[0], _mm_load_si128(alpha + 0)));
+            shuffled[1] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_10);
+            shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_11));
+            shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_12));
+            _mm_store_si128(buffer + 1, _mm_maddubs_epi16(shuffled[1], _mm_load_si128(alpha + 1)));
+            shuffled[2] = _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_21);
+            shuffled[2] = _mm_or_si128(shuffled[2], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_22));
+            _mm_store_si128(buffer + 2, _mm_maddubs_epi16(shuffled[2], _mm_load_si128(alpha + 2)));
+        }
+
+        const __m128i K8_SHUFFLE_X4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF);
+
+        SIMD_INLINE void ResizerByteBilinearInterpolateX4(const __m128i* alpha, __m128i* buffer)
+        {
+            __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X4);
+            _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha)));
+        }
+
+        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<4>(const __m128i* alpha, __m128i* buffer)
+        {
+            ResizerByteBilinearInterpolateX4(alpha + 0, buffer + 0);
+            ResizerByteBilinearInterpolateX4(alpha + 1, buffer + 1);
+            ResizerByteBilinearInterpolateX4(alpha + 2, buffer + 2);
+            ResizerByteBilinearInterpolateX4(alpha + 3, buffer + 3);
+        }
+
+        const __m128i K16_FRACTION_ROUND_TERM = SIMD_MM_SET1_EPI16(Base::BILINEAR_ROUND_TERM);
+
+        template<bool align> SIMD_INLINE __m128i ResizerByteBilinearInterpolateY(const __m128i* pbx0, const __m128i* pbx1, __m128i alpha[2])
+        {
+            __m128i sum = _mm_add_epi16(_mm_mullo_epi16(Load<align>(pbx0), alpha[0]), _mm_mullo_epi16(Load<align>(pbx1), alpha[1]));
+            return _mm_srli_epi16(_mm_add_epi16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT);
+        }
+
+        template<bool align> SIMD_INLINE void ResizerByteBilinearInterpolateY(const uint8_t* bx0, const uint8_t* bx1, __m128i alpha[2], uint8_t* dst)
+        {
+            __m128i lo = ResizerByteBilinearInterpolateY<align>((__m128i*)bx0 + 0, (__m128i*)bx1 + 0, alpha);
+            __m128i hi = ResizerByteBilinearInterpolateY<align>((__m128i*)bx0 + 1, (__m128i*)bx1 + 1, alpha);
+            Store<false>((__m128i*)dst, _mm_packus_epi16(lo, hi));
+        }
+
+        template<size_t N> void ResizerByteBilinear::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
+        {
+            struct One { uint8_t val[N * 1]; };
+            struct Two { uint8_t val[N * 2]; };
+
+            size_t size = 2 * _param.dstW * N;
+            size_t aligned = AlignHi(size, DA) - DA;
+            const size_t step = A * N;
+            ptrdiff_t previous = -2;
+            __m128i a[2];
+            uint8_t* bx[2] = { _bx[0].data, _bx[1].data };
+            const uint8_t* ax = _ax.data;
+            const int32_t* ix = _ix.data;
+            size_t dstW = _param.dstW;
+
+            for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride)
+            {
+                a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst]));
+                a[1] = _mm_set1_epi16(int16_t(_ay[yDst]));
+
+                ptrdiff_t sy = _iy[yDst];
+                int k = 0;
+
+                if (sy == previous)
+                    k = 2;
+                else if (sy == previous + 1)
+                {
+                    Swap(bx[0], bx[1]);
+                    k = 1;
+                }
+
+                previous = sy;
+
+                for (; k < 2; k++)
+                {
+                    Two* pb = (Two*)bx[k];
+                    const One* psrc = (const One*)(src + (sy + k) * srcStride);
+                    for (size_t x = 0; x < dstW; x++)
+                        pb[x] = *(Two*)(psrc + ix[x]);
+
+                    uint8_t* pbx = bx[k];
+                    for (size_t i = 0; i < size; i += step)
+                        ResizerByteBilinearInterpolateX<N>((__m128i*)(ax + i), (__m128i*)(pbx + i));
+                }
+
+                for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A)
+                    ResizerByteBilinearInterpolateY<true>(bx[0] + ib, bx[1] + ib, a, dst + id);
+                size_t i = size - DA;
+                ResizerByteBilinearInterpolateY<false>(bx[0] + i, bx[1] + i, a, dst + i / 2);
+            }
+        }
+
+        template <class Idx> SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t* src, const Idx& index, const uint8_t* alpha, uint8_t* dst)
+        {
+            __m128i _src = _mm_loadu_si128((__m128i*)(src + index.src));
+            __m128i _shuffle = _mm_loadu_si128((__m128i*) & index.shuffle);
+            __m128i _alpha = _mm_loadu_si128((__m128i*)(alpha + index.dst));
+            _mm_storeu_si128((__m128i*)(dst + index.dst), _mm_maddubs_epi16(_mm_shuffle_epi8(_src, _shuffle), _alpha));
+        }
+
+        void ResizerByteBilinear::RunG(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
+        {
+            size_t bufW = AlignHi(_param.dstW, A) * 2;
+            size_t size = 2 * _param.dstW;
+            size_t aligned = AlignHi(size, DA) - DA;
+            size_t blocks = _blocks;
+            ptrdiff_t previous = -2;
+            __m128i a[2];
+            uint8_t* bx[2] = { _bx[0].data, _bx[1].data };
+            const uint8_t* ax = _ax.data;
+            const Idx* ixg = _ixg.data;
+
+            for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride)
+            {
+                a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst]));
+                a[1] = _mm_set1_epi16(int16_t(_ay[yDst]));
+
+                ptrdiff_t sy = _iy[yDst];
+                int k = 0;
+
+                if (sy == previous)
+                    k = 2;
+                else if (sy == previous + 1)
+                {
+                    Swap(bx[0], bx[1]);
+                    k = 1;
+                }
+
+                previous = sy;
+
+                for (; k < 2; k++)
+                {
+                    const uint8_t* psrc = src + (sy + k) * srcStride;
+                    uint8_t* pdst = bx[k];
+                    for (size_t i = 0; i < blocks; ++i)
+                        ResizerByteBilinearLoadGrayInterpolated(psrc, ixg[i], ax, pdst);
+                }
+
+                for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A)
+                    ResizerByteBilinearInterpolateY<true>(bx[0] + ib, bx[1] + ib, a, dst + id);
+                size_t i = size - DA;
+                ResizerByteBilinearInterpolateY<false>(bx[0] + i, bx[1] + i, a, dst + i / 2);
+            }
+        }
+
+        void ResizerByteBilinear::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
+        {
+            assert(_param.dstW >= A);
+
+            EstimateParams();
+            switch (_param.channels)
+            {
+            case 1:
+                if (_blocks)
+                    RunG(src, srcStride, dst, dstStride);
+                else
+                    Run<1>(src, srcStride, dst, dstStride);
+                break;
+            case 2: Run<2>(src, srcStride, dst, dstStride); break;
+            case 3: Run<3>(src, srcStride, dst, dstStride); break;
+            case 4: Run<4>(src, srcStride, dst, dstStride); break;
+            default:
+                assert(0);
+            }
+        }
+
+        //---------------------------------------------------------------------
+
         ResizerByteArea::ResizerByteArea(const ResParam & param)
             : Sse2::ResizerByteArea(param)
         {
@@ -200,10 +503,12 @@ namespace Simd
         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
         {
             ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m128i));
-            if (type == SimdResizeChannelByte && method == SimdResizeMethodArea)
+            if (param.IsByteBilinear() && dstX >= A)
+                return new ResizerByteBilinear(param);
+            else if (param.IsByteArea())
                 return new ResizerByteArea(param);
             else
-                return Ssse3::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
+                return Sse2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
         }
     }
 #else
diff --git a/3rdparty/simdlib/Simd/SimdSsse3.h b/3rdparty/simdlib/Simd/SimdSsse3.h
deleted file mode 100644
index ed7849f39d..0000000000
--- a/3rdparty/simdlib/Simd/SimdSsse3.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#ifndef __SimdSsse3_h__
-#define __SimdSsse3_h__
-
-#include "Simd/SimdDefs.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride);
-
-        void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha);
-
-        void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha);
-
-        void BgraToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride);
-
-        void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride);
-
-        void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride);
-
-        void BgrToRgb(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * rgb, size_t rgbStride);
-
-        void DeinterleaveBgr(const uint8_t * bgr, size_t bgrStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride);
-
-        void DeinterleaveBgra(const uint8_t * bgra, size_t bgraStride, size_t width, size_t height, uint8_t * b, size_t bStride, uint8_t * g, size_t gStride, uint8_t * r, size_t rStride, uint8_t * a, size_t aStride);
-
-        void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride);
-
-        void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride);
-
-        void InterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride);
-
-        void InterleaveBgra(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, const uint8_t * a, size_t aStride, size_t width, size_t height, uint8_t * bgra, size_t bgraStride);
-
-        void ReduceColor2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
-
-        void ReduceGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
-
-        void ReduceGray4x4(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride);
-
-        void ResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
-            uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount);
-
-        // ViSP custom SIMD code
-        void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff);
-    }
-#endif// SIMD_SSSE3_ENABLE
-}
-#endif//__SimdSsse3_h__
diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp b/3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp
deleted file mode 100644
index bb01107812..0000000000
--- a/3rdparty/simdlib/Simd/SimdSsse3BgrToRGBa.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        template <bool align> SIMD_INLINE void BgrToRgba(const uint8_t * bgr, uint8_t * rgba, __m128i alpha, __m128i shuffle)
-        {
-            Store<align>((__m128i*)rgba + 0, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<align>((__m128i*)(bgr + 0)), shuffle)));
-            Store<align>((__m128i*)rgba + 1, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 12)), shuffle)));
-            Store<align>((__m128i*)rgba + 2, _mm_or_si128(alpha, _mm_shuffle_epi8(Load<false>((__m128i*)(bgr + 24)), shuffle)));
-            Store<align>((__m128i*)rgba + 3, _mm_or_si128(alpha, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(bgr + 32)), 4), shuffle)));
-        }
-
-        template <bool align> void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            __m128i _alpha = _mm_slli_si128(_mm_set1_epi32(alpha), 3);
-            __m128i _shuffle = _mm_setr_epi8(0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    BgrToRgba<align>(bgr + 3 * col, rgba + 4 * col, _alpha, _shuffle);
-                if (width != alignedWidth)
-                    BgrToRgba<false>(bgr + 3 * (width - A), rgba + 4 * (width - A), _alpha, _shuffle);
-                bgr += bgrStride;
-                rgba += rgbaStride;
-            }
-        }
-
-        void BgrToRgba(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgba, size_t rgbaStride, uint8_t alpha)
-        {
-            if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgr) && Aligned(bgrStride))
-                BgrToRgba<true>(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-            else
-                BgrToRgba<false>(bgr, width, height, bgrStride, rgba, rgbaStride, alpha);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToRGBa.cpp.o) has no symbols
-    void dummy_SimdSsse3BgrToRGBa(){};
-#endif// SIMD_SSSE3_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp b/3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp
deleted file mode 100644
index d455781ed3..0000000000
--- a/3rdparty/simdlib/Simd/SimdSsse3BgraToRGBa.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        template <bool align> SIMD_INLINE void BgraToRgba(const uint8_t * bgra, uint8_t * rgba, __m128i shuffle)
-        {
-            Store<align>((__m128i*)rgba + 0, _mm_shuffle_epi8(Load<align>((__m128i*)(bgra + 0)), shuffle));
-            Store<align>((__m128i*)rgba + 1, _mm_shuffle_epi8(Load<align>((__m128i*)(bgra + 16)), shuffle));
-            Store<align>((__m128i*)rgba + 2, _mm_shuffle_epi8(Load<align>((__m128i*)(bgra + 32)), shuffle));
-            Store<align>((__m128i*)rgba + 3, _mm_shuffle_epi8(Load<align>((__m128i*)(bgra + 48)), shuffle));
-        }
-
-        template <bool align> void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            __m128i _shuffle = _mm_setr_epi8(0x2, 0x1, 0x0, 0x3, 0x6, 0x5, 0x4, 0x7, 0xA, 0x9, 0x8, 0xB, 0xE, 0xD, 0xC, 0xF);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    BgraToRgba<align>(bgra + 4 * col, rgba + 4 * col, _shuffle);
-                if (width != alignedWidth)
-                    BgraToRgba<false>(bgra + 4 * (width - A), rgba + 4 * (width - A), _shuffle);
-                bgra += bgraStride;
-                rgba += rgbaStride;
-            }
-        }
-
-        void BgraToRgba(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * rgba, size_t rgbaStride)
-        {
-            if (Aligned(rgba) && Aligned(rgbaStride) && Aligned(bgra) && Aligned(bgraStride))
-                BgraToRgba<true>(bgra, width, height, bgraStride, rgba, rgbaStride);
-            else
-                BgraToRgba<false>(bgra, width, height, bgraStride, rgba, rgbaStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3BgrToRGBa.cpp.o) has no symbols
-    void dummy_SimdSsse3BgraToRGBa(){};
-#endif// SIMD_SSSE3_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp b/3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp
deleted file mode 100644
index 985a772d47..0000000000
--- a/3rdparty/simdlib/Simd/SimdSsse3CustomFunctions.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdBase.h"
-#include "Simd/SimdStore.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff)
-        {
-            const __m128i mask1 = _mm_set_epi8(-1, 14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0);
-            const __m128i mask2 = _mm_set_epi8(-1, 15, -1, 13, -1, 11, -1, 9, -1, 7, -1, 5, -1, 3, -1, 1);
-            const __m128i mask_out2 = _mm_set_epi8(14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0, -1);
-
-            size_t i = 0;
-            for (; i <= size-16; i+= 16) {
-                const __m128i vdata1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(img1 + i));
-                const __m128i vdata2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(img2 + i));
-
-                __m128i vdata1_reorg = _mm_shuffle_epi8(vdata1, mask1);
-                __m128i vdata2_reorg = _mm_shuffle_epi8(vdata2, mask1);
-
-                const __m128i vshift = _mm_set1_epi16(128);
-                __m128i vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift);
-
-                const __m128i v255 = _mm_set1_epi16(255);
-                const __m128i vzero = _mm_setzero_si128();
-                const __m128i vdata_diff_min_max1 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero);
-
-                vdata1_reorg = _mm_shuffle_epi8(vdata1, mask2);
-                vdata2_reorg = _mm_shuffle_epi8(vdata2, mask2);
-
-                vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift);
-                const __m128i vdata_diff_min_max2 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero);
-
-                _mm_storeu_si128(reinterpret_cast<__m128i *>(imgDiff + i), _mm_or_si128(_mm_shuffle_epi8(vdata_diff_min_max1, mask1),
-                                                                                        _mm_shuffle_epi8(vdata_diff_min_max2, mask_out2)));
-            }
-
-            if (i < size) {
-                Base::SimdImageDifference(img1 + i, img2 + i, size - i, imgDiff + i);
-            }
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3CustomFunctions.cpp.o) has no symbols
-    void dummy_SimdSsse3CustomFunctions(){};
-#endif// SIMD_SSSE3_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp b/3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp
deleted file mode 100644
index 37f2eca6c1..0000000000
--- a/3rdparty/simdlib/Simd/SimdSsse3Resizer.cpp
+++ /dev/null
@@ -1,350 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdMemory.h"
-#include "Simd/SimdStore.h"
-#include "Simd/SimdResizer.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        ResizerByteBilinear::ResizerByteBilinear(const ResParam & param)
-            : Sse2::ResizerByteBilinear(param)
-            , _blocks(0)
-        {
-        }
-
-        size_t ResizerByteBilinear::BlockCountMax(size_t align)
-        {
-            return (size_t)Simd::Max(::ceil(float(_param.srcW) / (align - 1)), ::ceil(float(_param.dstW) * 2.0f / align ));
-        }
-
-        void ResizerByteBilinear::EstimateParams()
-        {
-            if (_ax.data)
-                return;
-            if (_param.channels == 1 && _param.srcW < 4 * _param.dstW)
-                _blocks = BlockCountMax(A);
-            float scale = (float)_param.srcW / _param.dstW;
-            _ax.Resize(AlignHi(_param.dstW, A) * _param.channels * 2, false, _param.align);
-            uint8_t * alphas = _ax.data;
-            if (_blocks)
-            {
-                _ixg.Resize(_blocks);
-                int block = 0;
-                _ixg[0].src = 0;
-                _ixg[0].dst = 0;
-                for (int dstIndex = 0; dstIndex < (int)_param.dstW; ++dstIndex)
-                {
-                    float alpha = (float)((dstIndex + 0.5)*scale - 0.5);
-                    int srcIndex = (int)::floor(alpha);
-                    alpha -= srcIndex;
-
-                    if (srcIndex < 0)
-                    {
-                        srcIndex = 0;
-                        alpha = 0;
-                    }
-
-                    if (srcIndex > (int)_param.srcW - 2)
-                    {
-                        srcIndex = (int)_param.srcW - 2;
-                        alpha = 1;
-                    }
-
-                    int dst = 2 * dstIndex - _ixg[block].dst;
-                    int src = srcIndex - _ixg[block].src;
-                    if (src >= A - 1 || dst >= A)
-                    {
-                        block++;
-                        _ixg[block].src = Simd::Min(srcIndex, int(_param.srcW - A));
-                        _ixg[block].dst = 2 * dstIndex;
-                        dst = 0;
-                        src = srcIndex - _ixg[block].src;
-                    }
-                    _ixg[block].shuffle[dst] = src;
-                    _ixg[block].shuffle[dst + 1] = src + 1;
-
-                    alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5);
-                    alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]);
-                    alphas += 2;
-                }
-                _blocks = block + 1;
-            }
-            else
-            {
-                _ix.Resize(_param.dstW);
-                for (size_t i = 0; i < _param.dstW; ++i)
-                {
-                    float alpha = (float)((i + 0.5)*scale - 0.5);
-                    ptrdiff_t index = (ptrdiff_t)::floor(alpha);
-                    alpha -= index;
-
-                    if (index < 0)
-                    {
-                        index = 0;
-                        alpha = 0;
-                    }
-
-                    if (index >(ptrdiff_t)_param.srcW - 2)
-                    {
-                        index = _param.srcW - 2;
-                        alpha = 1;
-                    }
-
-                    _ix[i] = (int)index;
-                    alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5);
-                    alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]);
-                    for (size_t channel = 1; channel < _param.channels; channel++)
-                        ((uint16_t*)alphas)[channel] = *(uint16_t*)alphas;
-                    alphas += 2 * _param.channels;
-                }
-            }
-            size_t size = AlignHi(_param.dstW, _param.align)*_param.channels * 2;
-            _bx[0].Resize(size, false, _param.align);
-            _bx[1].Resize(size, false, _param.align);
-        }
-
-        template <size_t N> void ResizerByteBilinearInterpolateX(const __m128i * alpha, __m128i * buffer);
-
-        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<1>(const __m128i * alpha, __m128i * buffer)
-        {
-            _mm_store_si128(buffer, _mm_maddubs_epi16(_mm_load_si128(buffer), _mm_load_si128(alpha)));
-        }
-
-        const __m128i K8_SHUFFLE_X2 = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF);
-
-        SIMD_INLINE void ResizerByteBilinearInterpolateX2(const __m128i * alpha, __m128i * buffer)
-        {
-            __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X2);
-            _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha)));
-        }
-
-        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<2>(const __m128i * alpha, __m128i * buffer)
-        {
-            ResizerByteBilinearInterpolateX2(alpha + 0, buffer + 0);
-            ResizerByteBilinearInterpolateX2(alpha + 1, buffer + 1);
-        }
-
-        const __m128i K8_SHUFFLE_X3_00 = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1);
-        const __m128i K8_SHUFFLE_X3_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0);
-        const __m128i K8_SHUFFLE_X3_10 = SIMD_MM_SETR_EPI8(0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-        const __m128i K8_SHUFFLE_X3_11 = SIMD_MM_SETR_EPI8(-1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1);
-        const __m128i K8_SHUFFLE_X3_12 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1);
-        const __m128i K8_SHUFFLE_X3_21 = SIMD_MM_SETR_EPI8(0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-        const __m128i K8_SHUFFLE_X3_22 = SIMD_MM_SETR_EPI8(-1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF);
-
-        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<3>(const __m128i * alpha, __m128i * buffer)
-        {
-            __m128i src[3], shuffled[3];
-            src[0] = _mm_load_si128(buffer + 0);
-            src[1] = _mm_load_si128(buffer + 1);
-            src[2] = _mm_load_si128(buffer + 2);
-            shuffled[0] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_00);
-            shuffled[0] = _mm_or_si128(shuffled[0], _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_01));
-            _mm_store_si128(buffer + 0, _mm_maddubs_epi16(shuffled[0], _mm_load_si128(alpha + 0)));
-            shuffled[1] = _mm_shuffle_epi8(src[0], K8_SHUFFLE_X3_10);
-            shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_11));
-            shuffled[1] = _mm_or_si128(shuffled[1], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_12));
-            _mm_store_si128(buffer + 1, _mm_maddubs_epi16(shuffled[1], _mm_load_si128(alpha + 1)));
-            shuffled[2] = _mm_shuffle_epi8(src[1], K8_SHUFFLE_X3_21);
-            shuffled[2] = _mm_or_si128(shuffled[2], _mm_shuffle_epi8(src[2], K8_SHUFFLE_X3_22));
-            _mm_store_si128(buffer + 2, _mm_maddubs_epi16(shuffled[2], _mm_load_si128(alpha + 2)));
-        }
-
-        const __m128i K8_SHUFFLE_X4 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF);
-
-        SIMD_INLINE void ResizerByteBilinearInterpolateX4(const __m128i * alpha, __m128i * buffer)
-        {
-            __m128i src = _mm_shuffle_epi8(_mm_load_si128(buffer), K8_SHUFFLE_X4);
-            _mm_store_si128(buffer, _mm_maddubs_epi16(src, _mm_load_si128(alpha)));
-        }
-
-        template <> SIMD_INLINE void ResizerByteBilinearInterpolateX<4>(const __m128i * alpha, __m128i * buffer)
-        {
-            ResizerByteBilinearInterpolateX4(alpha + 0, buffer + 0);
-            ResizerByteBilinearInterpolateX4(alpha + 1, buffer + 1);
-            ResizerByteBilinearInterpolateX4(alpha + 2, buffer + 2);
-            ResizerByteBilinearInterpolateX4(alpha + 3, buffer + 3);
-        }
-
-        const __m128i K16_FRACTION_ROUND_TERM = SIMD_MM_SET1_EPI16(Base::BILINEAR_ROUND_TERM);
-
-        template<bool align> SIMD_INLINE __m128i ResizerByteBilinearInterpolateY(const __m128i * pbx0, const __m128i * pbx1, __m128i alpha[2])
-        {
-            __m128i sum = _mm_add_epi16(_mm_mullo_epi16(Load<align>(pbx0), alpha[0]), _mm_mullo_epi16(Load<align>(pbx1), alpha[1]));
-            return _mm_srli_epi16(_mm_add_epi16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT);
-        }
-
-        template<bool align> SIMD_INLINE void ResizerByteBilinearInterpolateY(const uint8_t * bx0, const uint8_t * bx1, __m128i alpha[2], uint8_t * dst)
-        {
-            __m128i lo = ResizerByteBilinearInterpolateY<align>((__m128i*)bx0 + 0, (__m128i*)bx1 + 0, alpha);
-            __m128i hi = ResizerByteBilinearInterpolateY<align>((__m128i*)bx0 + 1, (__m128i*)bx1 + 1, alpha);
-            Store<false>((__m128i*)dst, _mm_packus_epi16(lo, hi));
-        }
-
-        template<size_t N> void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride)
-        {
-            struct One { uint8_t val[N * 1]; };
-            struct Two { uint8_t val[N * 2]; };
-
-            size_t size = 2 * _param.dstW*N;
-            size_t aligned = AlignHi(size, DA) - DA;
-            const size_t step = A * N;
-            ptrdiff_t previous = -2;
-            __m128i a[2];
-            uint8_t * bx[2] = { _bx[0].data, _bx[1].data };
-            const uint8_t * ax = _ax.data;
-            const int32_t * ix = _ix.data;
-            size_t dstW = _param.dstW;
-
-            for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride)
-            {
-                a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst]));
-                a[1] = _mm_set1_epi16(int16_t(_ay[yDst]));
-
-                ptrdiff_t sy = _iy[yDst];
-                int k = 0;
-
-                if (sy == previous)
-                    k = 2;
-                else if (sy == previous + 1)
-                {
-                    Swap(bx[0], bx[1]);
-                    k = 1;
-                }
-
-                previous = sy;
-
-                for (; k < 2; k++)
-                {
-                    Two * pb = (Two *)bx[k];
-                    const One * psrc = (const One *)(src + (sy + k)*srcStride);
-                    for (size_t x = 0; x < dstW; x++)
-                        pb[x] = *(Two *)(psrc + ix[x]);
-
-                    uint8_t * pbx = bx[k];
-                    for (size_t i = 0; i < size; i += step)
-                        ResizerByteBilinearInterpolateX<N>((__m128i*)(ax + i), (__m128i*)(pbx + i));
-                }
-
-                for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A)
-                    ResizerByteBilinearInterpolateY<true>(bx[0] + ib, bx[1] + ib, a, dst + id);
-                size_t i = size - DA;
-                ResizerByteBilinearInterpolateY<false>(bx[0] + i, bx[1] + i, a, dst + i / 2);
-            }
-        }
-
-        template <class Idx> SIMD_INLINE void ResizerByteBilinearLoadGrayInterpolated(const uint8_t * src, const Idx & index, const uint8_t * alpha, uint8_t * dst)
-        {
-            __m128i _src = _mm_loadu_si128((__m128i*)(src + index.src));
-            __m128i _shuffle = _mm_loadu_si128((__m128i*)&index.shuffle);
-            __m128i _alpha = _mm_loadu_si128((__m128i*)(alpha + index.dst));
-            _mm_storeu_si128((__m128i*)(dst + index.dst), _mm_maddubs_epi16(_mm_shuffle_epi8(_src, _shuffle), _alpha));
-        }
-
-        void ResizerByteBilinear::RunG(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride)
-        {
-            size_t bufW = AlignHi(_param.dstW, A) * 2;
-            size_t size = 2 * _param.dstW;
-            size_t aligned = AlignHi(size, DA) - DA;
-            size_t blocks = _blocks;
-            ptrdiff_t previous = -2;
-            __m128i a[2];
-            uint8_t * bx[2] = { _bx[0].data, _bx[1].data };
-            const uint8_t * ax = _ax.data;
-            const Idx * ixg = _ixg.data;
-
-            for (size_t yDst = 0; yDst < _param.dstH; yDst++, dst += dstStride)
-            {
-                a[0] = _mm_set1_epi16(int16_t(Base::FRACTION_RANGE - _ay[yDst]));
-                a[1] = _mm_set1_epi16(int16_t(_ay[yDst]));
-
-                ptrdiff_t sy = _iy[yDst];
-                int k = 0;
-
-                if (sy == previous)
-                    k = 2;
-                else if (sy == previous + 1)
-                {
-                    Swap(bx[0], bx[1]);
-                    k = 1;
-                }
-
-                previous = sy;
-
-                for (; k < 2; k++)
-                {
-                    const uint8_t * psrc = src + (sy + k)*srcStride;
-                    uint8_t * pdst = bx[k];
-                    for (size_t i = 0; i < blocks; ++i)
-                        ResizerByteBilinearLoadGrayInterpolated(psrc, ixg[i], ax, pdst);
-                }
-
-                for (size_t ib = 0, id = 0; ib < aligned; ib += DA, id += A)
-                    ResizerByteBilinearInterpolateY<true>(bx[0] + ib, bx[1] + ib, a, dst + id);
-                size_t i = size - DA;
-                ResizerByteBilinearInterpolateY<false>(bx[0] + i, bx[1] + i, a, dst + i / 2);
-            }
-        }
-
-        void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride)
-        {
-            assert(_param.dstW >= A);
-
-            EstimateParams();
-            switch (_param.channels)
-            {
-            case 1:
-                if(_blocks)
-                    RunG(src, srcStride, dst, dstStride);
-                else
-                    Run<1>(src, srcStride, dst, dstStride);
-                break;
-            case 2: Run<2>(src, srcStride, dst, dstStride); break;
-            case 3: Run<3>(src, srcStride, dst, dstStride); break;
-            case 4: Run<4>(src, srcStride, dst, dstStride); break;
-            default:
-                assert(0);
-            }
-        }
-
-        //---------------------------------------------------------------------
-
-        void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
-        {
-            ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m128i));
-            if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear && dstX >= A)
-                return new ResizerByteBilinear(param);
-            else
-                return Sse2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3Resizer.cpp.o) has no symbols
-    void dummy_SimdSsse3Resizer(){};
-#endif//SIMD_SSSE3_ENABLE
-}
-
diff --git a/3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp b/3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp
deleted file mode 100644
index cf79dd55bd..0000000000
--- a/3rdparty/simdlib/Simd/SimdSsse3RgbToGray.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
-* Simd Library (http://ermig1979.github.io/Simd).
-*
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-#include "Simd/SimdStore.h"
-#include "Simd/SimdMemory.h"
-
-namespace Simd
-{
-#ifdef SIMD_SSSE3_ENABLE
-    namespace Ssse3
-    {
-        const __m128i K16_RED_BLUE = SIMD_MM_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
-        const __m128i K16_GREEN_ROUND = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, Base::BGR_TO_GRAY_ROUND_TERM);
-
-        SIMD_INLINE __m128i RgbaToGray32(__m128i rgba)
-        {
-            const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF);
-            const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF);
-            const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(r0b0, K16_RED_BLUE));
-            return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT);
-        }
-
-        SIMD_INLINE __m128i RgbToGray(__m128i rgba[4])
-        {
-            const __m128i lo = _mm_packs_epi32(RgbaToGray32(rgba[0]), RgbaToGray32(rgba[1]));
-            const __m128i hi = _mm_packs_epi32(RgbaToGray32(rgba[2]), RgbaToGray32(rgba[3]));
-            return _mm_packus_epi16(lo, hi);
-        }
-
-        template <bool align> SIMD_INLINE __m128i RgbToGray(const uint8_t * rgb, __m128i shuffle)
-        {
-            __m128i rgba[4];
-            rgba[0] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<align>((__m128i*)(rgb + 0)), shuffle));
-            rgba[1] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(rgb + 12)), shuffle));
-            rgba[2] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load<false>((__m128i*)(rgb + 24)), shuffle));
-            rgba[3] = _mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load<align>((__m128i*)(rgb + 32)), 4), shuffle));
-            return RgbToGray(rgba);
-        }
-
-        template <bool align> void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride)
-        {
-            assert(width >= A);
-            if (align)
-                assert(Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride));
-
-            size_t alignedWidth = AlignLo(width, A);
-
-            __m128i _shuffle = _mm_setr_epi8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
-
-            for (size_t row = 0; row < height; ++row)
-            {
-                for (size_t col = 0; col < alignedWidth; col += A)
-                    Store<align>((__m128i*)(gray + col), RgbToGray<align>(rgb + 3 * col, _shuffle));
-                if (width != alignedWidth)
-                    Store<false>((__m128i*)(gray + width - A), RgbToGray<false>(rgb + 3 * (width - A), _shuffle));
-                rgb += rgbStride;
-                gray += grayStride;
-            }
-        }
-
-        void RgbToGray(const uint8_t * rgb, size_t width, size_t height, size_t rgbStride, uint8_t * gray, size_t grayStride)
-        {
-            if (Aligned(gray) && Aligned(grayStride) && Aligned(rgb) && Aligned(rgbStride))
-                RgbToGray<true>(rgb, width, height, rgbStride, gray, grayStride);
-            else
-                RgbToGray<false>(rgb, width, height, rgbStride, gray, grayStride);
-        }
-    }
-#else
-    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3RgbToGray.cpp.o) has no symbols
-    void dummy_SimdSsse3RgbToGray(){};
-#endif// SIMD_SSSE3_ENABLE
-}
diff --git a/3rdparty/simdlib/Simd/SimdStore.h b/3rdparty/simdlib/Simd/SimdStore.h
old mode 100644
new mode 100755
index 11ae3f7815..2b22a9616d
--- a/3rdparty/simdlib/Simd/SimdStore.h
+++ b/3rdparty/simdlib/Simd/SimdStore.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -31,8 +31,8 @@
 
 namespace Simd
 {
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         template <bool align> SIMD_INLINE void Store(float  * p, __m128 a);
 
@@ -63,13 +63,6 @@ namespace Simd
             __m128 old = Load<align>(p);
             Store<align>(p, Combine(mask, value, old));
         }
-    }
-#endif//SIMD_SSE_ENABLE
-
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
-        using namespace Sse;
 
         template <bool align> SIMD_INLINE void Store(__m128i * p, __m128i a);
 
@@ -83,6 +76,11 @@ namespace Simd
             _mm_store_si128(p, a);
         }
 
+        template <int part> SIMD_INLINE void StoreHalf(__m128i* p, __m128i a)
+        {
+            StoreHalf<part>((float*)p, _mm_castsi128_ps(a));
+        }
+
         template <bool align> SIMD_INLINE void StoreMasked(__m128i * p, __m128i value, __m128i mask)
         {
             __m128i old = Load<align>(p);
@@ -95,7 +93,6 @@ namespace Simd
     namespace Sse41
     {
 #if defined(_MSC_VER) && _MSC_VER >= 1800  && _MSC_VER < 1900 // Visual Studio 2013 compiler bug       
-        using Sse::Store;
         using Sse2::Store;
 #endif
     }
@@ -118,8 +115,8 @@ namespace Simd
 
         template <bool align> SIMD_INLINE void Store(float * p0, float * p1, __m256 a)
         {
-            Sse::Store<align>(p0, _mm256_extractf128_ps(a, 0));
-            Sse::Store<align>(p1, _mm256_extractf128_ps(a, 1));
+            Sse2::Store<align>(p0, _mm256_extractf128_ps(a, 0));
+            Sse2::Store<align>(p1, _mm256_extractf128_ps(a, 1));
         }
 
         template <bool align> SIMD_INLINE void StoreMasked(float * p, __m256 value, __m256 mask)
@@ -163,11 +160,6 @@ namespace Simd
             return _mm256_permute4x64_epi64(_mm256_packus_epi16(lo, hi), 0xD8);
         }
 
-        SIMD_INLINE __m256i PackU16ToU8(__m256i lo, __m256i hi)
-        {
-            return _mm256_permute4x64_epi64(_mm256_packus_epi16(lo, hi), 0xD8);
-        }
-
         SIMD_INLINE __m256i PackI32ToI16(__m256i lo, __m256i hi)
         {
             return _mm256_permute4x64_epi64(_mm256_packs_epi32(lo, hi), 0xD8);
@@ -184,6 +176,12 @@ namespace Simd
             lo = _mm256_permute2x128_si256(lo, hi, 0x20);
             hi = _mm256_permute2x128_si256(_lo, hi, 0x31);
         }
+
+        template <bool align> SIMD_INLINE void Store24(uint8_t * p, __m256i a)
+        {
+            Sse2::Store<align>((__m128i*)p, _mm256_extractf128_si256(a, 0));
+            Sse2::StoreHalf<0>((__m128i*)p + 1, _mm256_extractf128_si256(a, 1));
+        }
     }
 #endif//SIMD_SAVX2_ENABLE
 
@@ -230,27 +228,27 @@ namespace Simd
 
         template <bool align> SIMD_INLINE void Store(uint16_t * p, uint16x8_t a)
         {
-            Store<align>((uint8_t*)p, (uint8x16_t)a);
+            Store<align>((uint8_t*)p, vreinterpretq_u8_u16(a));
         }
 
         template <bool align> SIMD_INLINE void Store(uint16_t * p, uint16x4_t a)
         {
-            Store<align>((uint8_t*)p, (uint8x8_t)a);
+            Store<align>((uint8_t*)p, vreinterpret_u8_u16(a));
         }
 
         template <bool align> SIMD_INLINE void Store(int16_t * p, int16x8_t a)
         {
-            Store<align>((uint8_t*)p, (uint8x16_t)a);
+            Store<align>((uint8_t*)p, vreinterpretq_u8_s16(a));
         }
 
         template <bool align> SIMD_INLINE void Store(uint32_t * p, uint32x4_t a)
         {
-            Store<align>((uint8_t*)p, (uint8x16_t)a);
+            Store<align>((uint8_t*)p, vreinterpretq_u8_u32(a));
         }
 
         template <bool align> SIMD_INLINE void Store(int32_t * p, int32x4_t a)
         {
-            Store<align>((uint8_t*)p, (uint8x16_t)a);
+            Store<align>((uint8_t*)p, vreinterpretq_u8_s32(a));
         }
 
         template <bool align> SIMD_INLINE void Store2(uint8_t * p, uint8x16x2_t a);
@@ -310,7 +308,6 @@ namespace Simd
 #endif
         }
 
-
         template <bool align> SIMD_INLINE void Store3(uint8_t * p, uint8x16x3_t a);
 
         template <> SIMD_INLINE void Store3<false>(uint8_t * p, uint8x16x3_t a)
diff --git a/3rdparty/simdlib/Simd/SimdStream.h b/3rdparty/simdlib/Simd/SimdStream.h
old mode 100644
new mode 100755
index b6399bd1f1..6abf65cf68
--- a/3rdparty/simdlib/Simd/SimdStream.h
+++ b/3rdparty/simdlib/Simd/SimdStream.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2017 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -30,36 +30,31 @@ namespace Simd
 {
     const size_t STREAM_SIZE_MIN = 0x00100000;
 
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
-        template <bool align, bool stream> SIMD_INLINE void Stream(float  * p, __m128 a);
+        template <bool align, bool stream> SIMD_INLINE void Stream(float* p, __m128 a);
 
-        template <> SIMD_INLINE void Stream<false, false>(float  * p, __m128 a)
+        template <> SIMD_INLINE void Stream<false, false>(float* p, __m128 a)
         {
             _mm_storeu_ps(p, a);
         }
 
-        template <> SIMD_INLINE void Stream<false, true>(float  * p, __m128 a)
+        template <> SIMD_INLINE void Stream<false, true>(float* p, __m128 a)
         {
             _mm_storeu_ps(p, a);
         }
 
-        template <> SIMD_INLINE void Stream<true, false>(float  * p, __m128 a)
+        template <> SIMD_INLINE void Stream<true, false>(float* p, __m128 a)
         {
             _mm_store_ps(p, a);
         }
 
-        template <> SIMD_INLINE void Stream<true, true>(float  * p, __m128 a)
+        template <> SIMD_INLINE void Stream<true, true>(float* p, __m128 a)
         {
             _mm_stream_ps(p, a);
         }
-    }
-#endif//SIMD_SSE_ENABLE
 
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
         template <bool align, bool stream> SIMD_INLINE void Stream(__m128i  * p, __m128i a);
 
         template <> SIMD_INLINE void Stream<false, false>(__m128i   * p, __m128i a)
diff --git a/3rdparty/simdlib/Simd/SimdUpdate.h b/3rdparty/simdlib/Simd/SimdUpdate.h
old mode 100644
new mode 100755
index 47e9b22dc2..4c4d64b1c0
--- a/3rdparty/simdlib/Simd/SimdUpdate.h
+++ b/3rdparty/simdlib/Simd/SimdUpdate.h
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar.
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -47,8 +47,8 @@ namespace Simd
         }
     }
 
-#ifdef SIMD_SSE_ENABLE
-    namespace Sse
+#ifdef SIMD_SSE2_ENABLE
+    namespace Sse2
     {
         template <UpdateType update, bool align> SIMD_INLINE void Update(float  * p, __m128 a)
         {
@@ -63,13 +63,10 @@ namespace Simd
         template <> SIMD_INLINE void Update<UpdateAdd, true>(float  * p, __m128 a)
         {
             Store<true>(p, _mm_add_ps(Load<true>(p), a));
-        }
-    }
-#endif//SIMD_SSE_ENABLE
+        }   
 
-#ifdef SIMD_SSE2_ENABLE
-    namespace Sse2
-    {
+        //-----------------------------------------------------------------------------------------
+        
         template <UpdateType update, bool align> SIMD_INLINE void Update(int32_t  * p, __m128i a)
         {
             Store<align>((__m128i*)p, a);
@@ -160,6 +157,6 @@ namespace Simd
             Store<true>(p, vaddq_f32(Load<true>(p), a));
         }
     }
-#endif//SIMD_SSE_ENABLE
+#endif//SIMD_NEON_ENABLE
 }
 #endif//__SimdUpdate_h__
diff --git a/3rdparty/simdlib/Simd/SimdVersion.h b/3rdparty/simdlib/Simd/SimdVersion.h
index 72ae751ade..09efd5de91 100644
--- a/3rdparty/simdlib/Simd/SimdVersion.h
+++ b/3rdparty/simdlib/Simd/SimdVersion.h
@@ -34,7 +34,7 @@
 #ifndef __SimdVersion_h__
 #define __SimdVersion_h__
 
-#define SIMD_VERSION "4.4.82"
+#define SIMD_VERSION "4.9.107"
 
 #endif//__SimdVersion_h__
 
diff --git a/3rdparty/simdlib/Simd/SimdView.hpp b/3rdparty/simdlib/Simd/SimdView.hpp
old mode 100644
new mode 100755
index c9a51c5f61..0c61a0e6e8
--- a/3rdparty/simdlib/Simd/SimdView.hpp
+++ b/3rdparty/simdlib/Simd/SimdView.hpp
@@ -1,7 +1,7 @@
 /*
 * Simd Library (http://ermig1979.github.io/Simd).
 *
-* Copyright (c) 2011-2019 Yermalayeu Ihar,
+* Copyright (c) 2011-2021 Yermalayeu Ihar,
 *               2014-2019 Antonenka Mikhail,
 *               2018-2019 Dmitry Fedorov,
 *               2019-2019 Artur Voronkov.
@@ -95,7 +95,9 @@ namespace Simd
             /*! A single channel 64-bit float point pixel format. */
             Double,
             /*! A 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */
-            Rgb24
+            Rgb24,
+            /*! A 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. */
+            Rgba32,
         };
 
         /*!
diff --git a/modules/core/src/image/vpImageConvert.cpp b/modules/core/src/image/vpImageConvert.cpp
index cc9ecd4853..62c2b751cf 100644
--- a/modules/core/src/image/vpImageConvert.cpp
+++ b/modules/core/src/image/vpImageConvert.cpp
@@ -744,7 +744,7 @@ vpImageConvert::convert( const cv::Mat &src, vpImage< vpRGBa > &dest, bool flip
   {
     if ( src.isContinuous() && !flip )
     {
-      SimdBgrToRgba( src.data, src.cols, src.rows, src.step[0], reinterpret_cast< uint8_t * >( dest.bitmap ),
+      SimdRgbToBgra( src.data, src.cols, src.rows, src.step[0], reinterpret_cast< uint8_t * >( dest.bitmap ),
                      dest.getWidth() * sizeof( vpRGBa ), vpRGBa::alpha_default );
     }
     else
@@ -3864,7 +3864,7 @@ vpImageConvert::BGRToRGBa( unsigned char *bgr, unsigned char *rgba, unsigned int
 {
   if ( !flip )
   {
-    SimdBgrToRgba( bgr, width, height, width * 3, rgba, width * sizeof( vpRGBa ), vpRGBa::alpha_default );
+    SimdRgbToBgra( bgr, width, height, width * 3, rgba, width * sizeof( vpRGBa ), vpRGBa::alpha_default );
   }
   else
   {

From 94fee711854e17aee14fef33324b7e9e349c3473 Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Wed, 3 Nov 2021 10:17:07 +0100
Subject: [PATCH 08/18] Add missing file.

---
 3rdparty/simdlib/Simd/SimdNeonCpu.cpp | 59 +++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 3rdparty/simdlib/Simd/SimdNeonCpu.cpp

diff --git a/3rdparty/simdlib/Simd/SimdNeonCpu.cpp b/3rdparty/simdlib/Simd/SimdNeonCpu.cpp
new file mode 100644
index 0000000000..8b644c04f6
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdNeonCpu.cpp
@@ -0,0 +1,59 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2020 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdEnable.h"
+#include "Simd/SimdCpu.h"
+
+#if defined(__GNUC__) && (defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE))
+#include <fcntl.h>
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+#endif
+
+namespace Simd
+{
+#ifdef SIMD_NEON_ENABLE
+    namespace Neon
+    {
+        SIMD_INLINE bool SupportedByCPU()
+        {
+#if defined(_MSC_VER)
+            return true;
+#elif defined(__GNUC__)
+#if defined(SIMD_ARM64_ENABLE)
+            return true;
+#else
+            return Base::CheckBit(AT_HWCAP, HWCAP_NEON);
+#endif
+#else
+#error Do not know how to detect NEON support!
+#endif
+        }
+
+        bool GetEnable()
+        {
+            return SupportedByCPU();
+        }
+    }
+#endif
+}

From bd6dd785ad64d8405b45408bdb4da0ae339ad8b3 Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Wed, 3 Nov 2021 11:18:25 +0100
Subject: [PATCH 09/18] Remove not used SSE flags. Add missing SSE 4.1
 implementation.

---
 3rdparty/simdlib/CMakeLists.txt               | 64 ++---------------
 3rdparty/simdlib/Simd/SimdLib.cpp             |  7 +-
 3rdparty/simdlib/Simd/SimdSse41.h             |  3 +
 .../simdlib/Simd/SimdSse41CustomFunctions.cpp | 69 +++++++++++++++++++
 modules/io/src/image/vpImageIo.cpp            |  2 +-
 5 files changed, 83 insertions(+), 62 deletions(-)
 create mode 100644 3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp

diff --git a/3rdparty/simdlib/CMakeLists.txt b/3rdparty/simdlib/CMakeLists.txt
index dc6d111aae..1acb1341be 100644
--- a/3rdparty/simdlib/CMakeLists.txt
+++ b/3rdparty/simdlib/CMakeLists.txt
@@ -20,46 +20,31 @@ file(GLOB_RECURSE SIMD_BASE_HDR ${CMAKE_CURRENT_SOURCE_DIR}/Simd/*.h ${CMAKE_CUR
 if(X86 OR X86_64)
 
     # Flags check
-    set(SSE_FLAG    "")
     set(SSE2_FLAG   "")
-    set(SSE3_FLAG   "")
-    set(SSSE3_FLAG  "")
-    set(SSE4_1_FLAG "")
     set(SSE4_2_FLAG "")
     set(AVX_FLAG    "")
     set(AVX2_FLAG   "")
 
     if(MSVC)
         if(NOT MSVC64)
-            vp_check_compiler_flag(CXX "/arch:SSE"    HAVE_SSE_FLAG     "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse.cpp")
             vp_check_compiler_flag(CXX "/arch:SSE2"   HAVE_SSE2_FLAG    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp")
         endif()
 
         vp_check_compiler_flag(CXX "/arch:AVX"    HAVE_AVX_FLAG     "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx.cpp")
         vp_check_compiler_flag(CXX "/arch:AVX2"   HAVE_AVX2_FLAG    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp")
 
-        if(HAVE_SSE_FLAG)
-            set(SSE_FLAG "/arch:SSE")
-        endif()
         if(HAVE_SSE2_FLAG)
             set(SSE2_FLAG "/arch:SSE2")
         endif()
         if(HAVE_AVX_FLAG)
             set(AVX_FLAG    "/arch:AVX")
             set(SSE4_2_FLAG "/arch:AVX")
-            set(SSE4_1_FLAG "/arch:AVX")
-            set(SSSE3_FLAG  "/arch:AVX")
-            set(SSE3_FLAG   "/arch:AVX")
         endif()
         if(HAVE_AVX2_FLAG)
             set(AVX2_FLAG "/arch:AVX2")
         endif()
     else()
-        vp_check_compiler_flag(CXX "-msse"    HAVE_SSE_FLAG     "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse.cpp")
         vp_check_compiler_flag(CXX "-msse2"   HAVE_SSE2_FLAG    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp")
-        vp_check_compiler_flag(CXX "-msse3"   HAVE_SSE3_FLAG    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse3.cpp")
-        vp_check_compiler_flag(CXX "-mssse3"  HAVE_SSSE3_FLAG   "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_ssse3.cpp")
-        vp_check_compiler_flag(CXX "-msse4.1" HAVE_SSE4_1_FLAG  "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse41.cpp")
         vp_check_compiler_flag(CXX "-msse4.2" HAVE_SSE4_2_FLAG  "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse42.cpp")
         vp_check_compiler_flag(CXX "-mavx"    HAVE_AVX_FLAG     "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx.cpp")
         vp_check_compiler_flag(CXX "-mavx2"   HAVE_AVX2_FLAG    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp")
@@ -68,23 +53,11 @@ if(X86 OR X86_64)
         vp_check_compiler_flag(CXX "-Wno-sign-compare"       HAVE_NO_SIGN_COMPARE_FLAG     "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_warning.cpp")
         vp_check_compiler_flag(CXX "-Wno-ignored-qualifiers" HAVE_NO_IGNORED_QUALIFIERS    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_warning.cpp")
 
-        if(HAVE_SSE_FLAG)
-            set(SSE_FLAG "-msse")
-        endif()
         if(HAVE_SSE2_FLAG)
-            set(SSE2_FLAG "-msse2")
-        endif()
-        if(HAVE_SSE3_FLAG)
-            set(SSE3_FLAG "-msse3")
-        endif()
-        if(HAVE_SSSE3_FLAG)
-            set(SSSE3_FLAG "-mssse3")
-        endif()
-        if(HAVE_SSE4_1_FLAG)
-            set(SSE4_1_FLAG "-msse4.1")
+            set(SSE2_FLAG "-msse -msse2")
         endif()
         if(HAVE_SSE4_2_FLAG)
-            set(SSE4_2_FLAG "-msse4.2")
+            set(SSE4_2_FLAG "-msse3 -mssse3 -msse4.1 -msse4.2")
         endif()
         if(HAVE_AVX_FLAG)
             set(AVX_FLAG "-mavx")
@@ -110,10 +83,10 @@ if(X86 OR X86_64)
     set_source_files_properties(${SIMD_BASE_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS}")
 
     file(GLOB_RECURSE SIMD_SSE2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse2*.cpp)
-    set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG} ${SSE2_FLAG}")
+    set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE2_FLAG}")
 
     file(GLOB_RECURSE SIMD_SSE41_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse41*.cpp)
-    set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG} ${SSSE3_FLAG} ${SSE4_1_FLAG} ${SSE4_2_FLAG}")
+    set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}")
 
     file(GLOB_RECURSE SIMD_AVX1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx1*.cpp)
     set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}")
@@ -126,7 +99,7 @@ if(X86 OR X86_64)
     endif()
 
     set(SIMD_LIB_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}")
-    set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE1_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE3_SRC} ${SIMD_SSSE3_SRC} ${SIMD_SSE41_SRC} ${SIMD_SSE42_SRC} ${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC})
+    set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE41_SRC} ${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC})
 
     file(GLOB_RECURSE SIMD_LIB_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdLib.cpp)
     set_source_files_properties(${SIMD_LIB_SRC} PROPERTIES COMPILE_FLAGS "${SIMD_LIB_FLAGS}")
@@ -171,32 +144,21 @@ elseif(WINRT)
        add_library(${SIMD_LIBRARY} STATIC ${SIMD_LIB_SRC} ${SIMD_BASE_SRC} ${SIMD_NEON_SRC} ${SIMD_BASE_HDR})
     else()
         # Flags check
-        set(SSE_FLAG    "")
         set(SSE2_FLAG   "")
-        set(SSE3_FLAG   "")
-        set(SSSE3_FLAG  "")
-        set(SSE4_1_FLAG "")
         set(SSE4_2_FLAG "")
         set(AVX_FLAG    "")
         set(AVX2_FLAG   "")
 
-        vp_check_compiler_flag(CXX "/arch:SSE"    HAVE_SSE_FLAG     "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse.cpp")
         vp_check_compiler_flag(CXX "/arch:SSE2"   HAVE_SSE2_FLAG    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_sse2.cpp")
         vp_check_compiler_flag(CXX "/arch:AVX"    HAVE_AVX_FLAG     "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx.cpp")
         vp_check_compiler_flag(CXX "/arch:AVX2"   HAVE_AVX2_FLAG    "${PROJECT_SOURCE_DIR}/cmake/checks/cpu_avx2.cpp")
 
-        if(HAVE_SSE_FLAG)
-            set(SSE_FLAG "/arch:SSE")
-        endif()
         if(HAVE_SSE2_FLAG)
             set(SSE2_FLAG "/arch:SSE2")
         endif()
         if(HAVE_AVX_FLAG)
             set(AVX_FLAG    "/arch:AVX")
             set(SSE4_2_FLAG "/arch:AVX")
-            set(SSE4_1_FLAG "/arch:AVX")
-            set(SSSE3_FLAG  "/arch:AVX")
-            set(SSE3_FLAG   "/arch:AVX")
         endif()
         if(HAVE_AVX2_FLAG)
             set(AVX2_FLAG "/arch:AVX2")
@@ -205,23 +167,11 @@ elseif(WINRT)
         file(GLOB_RECURSE SIMD_BASE_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdBase*.cpp)
         set_source_files_properties(${SIMD_BASE_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS}")
 
-        file(GLOB_RECURSE SIMD_SSE1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse1*.cpp)
-        set_source_files_properties(${SIMD_SSE1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE_FLAG}")
-
         file(GLOB_RECURSE SIMD_SSE2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse2*.cpp)
         set_source_files_properties(${SIMD_SSE2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE2_FLAG}")
 
-        file(GLOB_RECURSE SIMD_SSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse3*.cpp)
-        set_source_files_properties(${SIMD_SSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE3_FLAG}")
-
-        file(GLOB_RECURSE SIMD_SSSE3_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSsse3*.cpp)
-        set_source_files_properties(${SIMD_SSSE3_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSSE3_FLAG}")
-
         file(GLOB_RECURSE SIMD_SSE41_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse41*.cpp)
-        set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_1_FLAG}")
-
-        file(GLOB_RECURSE SIMD_SSE42_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdSse42*.cpp)
-        set_source_files_properties(${SIMD_SSE42_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}")
+        set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}")
 
         file(GLOB_RECURSE SIMD_AVX1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx1*.cpp)
         set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}")
@@ -230,7 +180,7 @@ elseif(WINRT)
         set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}")
 
         set(SIMD_LIB_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}")
-        set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE1_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE3_SRC} ${SIMD_SSSE3_SRC} ${SIMD_SSE41_SRC} ${SIMD_SSE42_SRC} ${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC})
+        set(SIMD_ALG_SRC ${SIMD_BASE_SRC} ${SIMD_SSE2_SRC} ${SIMD_SSE41_SRC} ${SIMD_AVX1_SRC} ${SIMD_AVX2_SRC})
 
         file(GLOB_RECURSE SIMD_LIB_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdLib.cpp)
         set_source_files_properties(${SIMD_LIB_SRC} PROPERTIES COMPILE_FLAGS "${SIMD_LIB_FLAGS}")
diff --git a/3rdparty/simdlib/Simd/SimdLib.cpp b/3rdparty/simdlib/Simd/SimdLib.cpp
index b1cac8b1ba..89718bb80e 100755
--- a/3rdparty/simdlib/Simd/SimdLib.cpp
+++ b/3rdparty/simdlib/Simd/SimdLib.cpp
@@ -862,10 +862,9 @@ SIMD_API void SimdMatTranspose(const double * mat, size_t rows, size_t cols, dou
 
 SIMD_API void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff)
 {
-  //TODO:
-#ifdef SIMD_SSSE3_ENABLE
-    if (Ssse3::Enable && size >= Ssse3::A)
-        Ssse3::SimdImageDifference(img1,img2, size, imgDiff);
+#ifdef SIMD_SSE41_ENABLE
+    if (Sse41::Enable && size >= Sse41::A)
+        Sse41::SimdImageDifference(img1,img2, size, imgDiff);
     else
 #endif
         Base::SimdImageDifference(img1, img2, size, imgDiff);
diff --git a/3rdparty/simdlib/Simd/SimdSse41.h b/3rdparty/simdlib/Simd/SimdSse41.h
index 958fc11bc5..7a4bb04ad8 100755
--- a/3rdparty/simdlib/Simd/SimdSse41.h
+++ b/3rdparty/simdlib/Simd/SimdSse41.h
@@ -70,6 +70,9 @@ namespace Simd
         void RgbToBgra(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
 
         void RgbToGray(const uint8_t* rgb, size_t width, size_t height, size_t rgbStride, uint8_t* gray, size_t grayStride);
+
+        // ViSP custom SIMD code
+        void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff);
     }
 #endif// SIMD_SSE41_ENABLE
 }
diff --git a/3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp b/3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp
new file mode 100644
index 0000000000..f34a29329d
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41CustomFunctions.cpp
@@ -0,0 +1,69 @@
+/*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdBase.h"
+#include "Simd/SimdStore.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE
+    namespace Sse41
+    {
+        void SimdImageDifference(const unsigned char * img1, const unsigned char * img2, size_t size, unsigned char * imgDiff)
+        {
+            const __m128i mask1 = _mm_set_epi8(-1, 14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0);
+            const __m128i mask2 = _mm_set_epi8(-1, 15, -1, 13, -1, 11, -1, 9, -1, 7, -1, 5, -1, 3, -1, 1);
+            const __m128i mask_out2 = _mm_set_epi8(14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0, -1);
+
+            size_t i = 0;
+            for (; i <= size-16; i+= 16) {
+                const __m128i vdata1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(img1 + i));
+                const __m128i vdata2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(img2 + i));
+
+                __m128i vdata1_reorg = _mm_shuffle_epi8(vdata1, mask1);
+                __m128i vdata2_reorg = _mm_shuffle_epi8(vdata2, mask1);
+
+                const __m128i vshift = _mm_set1_epi16(128);
+                __m128i vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift);
+
+                const __m128i v255 = _mm_set1_epi16(255);
+                const __m128i vzero = _mm_setzero_si128();
+                const __m128i vdata_diff_min_max1 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero);
+
+                vdata1_reorg = _mm_shuffle_epi8(vdata1, mask2);
+                vdata2_reorg = _mm_shuffle_epi8(vdata2, mask2);
+
+                vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift);
+                const __m128i vdata_diff_min_max2 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero);
+
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(imgDiff + i), _mm_or_si128(_mm_shuffle_epi8(vdata_diff_min_max1, mask1),
+                                                                                        _mm_shuffle_epi8(vdata_diff_min_max2, mask_out2)));
+            }
+
+            if (i < size) {
+                Base::SimdImageDifference(img1 + i, img2 + i, size - i, imgDiff + i);
+            }
+        }
+    }
+#else
+    // Work arround to avoid warning: libvisp_simdlib.a(SimdSsse3CustomFunctions.cpp.o) has no symbols
+    void dummy_SimdSse41CustomFunctions(){};
+#endif// SIMD_SSE41_ENABLE
+}
diff --git a/modules/io/src/image/vpImageIo.cpp b/modules/io/src/image/vpImageIo.cpp
index 633503389c..ab290fa5f7 100644
--- a/modules/io/src/image/vpImageIo.cpp
+++ b/modules/io/src/image/vpImageIo.cpp
@@ -102,7 +102,7 @@ void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const st
   while (cpt_elt != nb_elt) {
     // Skip empty lines or lines starting with # (comment)
     while (std::getline(fd, line) && (line.compare(0, 1, "#") == 0 || line.size() == 0)) {
-    };
+    }
 
     if (fd.eof()) {
       fd.close();

From b75d20064519c27e8e70336ebf53ce4fbad026fd Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Wed, 3 Nov 2021 18:07:41 +0100
Subject: [PATCH 10/18] WIP code to add and test image loading/saving using
 Simd and for JPEG and PNG image format.

---
 3rdparty/simdlib/CMakeLists.txt               |    4 +-
 3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp   |  158 ++
 3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp   |  138 +
 .../simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp    |  351 +++
 .../simdlib/Simd/SimdAvx2ImageSavePng.cpp     |  369 +++
 3rdparty/simdlib/Simd/SimdBase.h              |    4 +
 3rdparty/simdlib/Simd/SimdBaseCrc32.cpp       |  978 +++++++
 3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp   |  371 +++
 .../simdlib/Simd/SimdBaseImageLoadJpeg.cpp    | 2456 +++++++++++++++++
 .../simdlib/Simd/SimdBaseImageLoadPng.cpp     | 1317 +++++++++
 3rdparty/simdlib/Simd/SimdBaseImageSave.cpp   |  340 +++
 .../simdlib/Simd/SimdBaseImageSaveJpeg.cpp    |  451 +++
 .../simdlib/Simd/SimdBaseImageSavePng.cpp     |  379 +++
 3rdparty/simdlib/Simd/SimdImageLoad.h         |  396 +++
 3rdparty/simdlib/Simd/SimdImageSave.h         |  386 +++
 3rdparty/simdlib/Simd/SimdImageSaveJpeg.h     |  649 +++++
 3rdparty/simdlib/Simd/SimdImageSavePng.h      |  235 ++
 3rdparty/simdlib/Simd/SimdLib.cpp             |   32 +-
 3rdparty/simdlib/Simd/SimdLib.h               |  109 +-
 3rdparty/simdlib/Simd/SimdMath.h              |    5 +
 3rdparty/simdlib/Simd/SimdMemory.h            |   19 +
 3rdparty/simdlib/Simd/SimdMemoryStream.h      |  510 ++++
 3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp   |  154 ++
 3rdparty/simdlib/Simd/SimdNeonImageSave.cpp   |  134 +
 3rdparty/simdlib/Simd/SimdPerformance.h       |  197 ++
 3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp  |  159 ++
 .../simdlib/Simd/SimdSse41ImageLoadPng.cpp    | 1805 ++++++++++++
 3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp  |  139 +
 .../simdlib/Simd/SimdSse41ImageSaveJpeg.cpp   |  431 +++
 .../simdlib/Simd/SimdSse41ImageSavePng.cpp    |  370 +++
 3rdparty/simdlib/Simd/SimdView.hpp            |  209 +-
 CMakeLists.txt                                |    2 +
 modules/io/CMakeLists.txt                     |   14 +-
 modules/io/include/visp3/io/vpImageIo.h       |    8 +
 modules/io/src/image/vpImageIo.cpp            |   63 +
 modules/io/test/perfImageLoadSave.cpp         |  461 ++++
 36 files changed, 13646 insertions(+), 157 deletions(-)
 create mode 100644 3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseCrc32.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageSave.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdImageLoad.h
 create mode 100644 3rdparty/simdlib/Simd/SimdImageSave.h
 create mode 100644 3rdparty/simdlib/Simd/SimdImageSaveJpeg.h
 create mode 100644 3rdparty/simdlib/Simd/SimdImageSavePng.h
 create mode 100644 3rdparty/simdlib/Simd/SimdMemoryStream.h
 create mode 100644 3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdNeonImageSave.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdPerformance.h
 create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp
 create mode 100644 3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp
 create mode 100644 modules/io/test/perfImageLoadSave.cpp

diff --git a/3rdparty/simdlib/CMakeLists.txt b/3rdparty/simdlib/CMakeLists.txt
index 1acb1341be..95b3358ad2 100644
--- a/3rdparty/simdlib/CMakeLists.txt
+++ b/3rdparty/simdlib/CMakeLists.txt
@@ -93,9 +93,9 @@ if(X86 OR X86_64)
 
     file(GLOB_RECURSE SIMD_AVX2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx2*.cpp)
     if(MSVC)
-        set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}")
+        set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store")
     else()
-        set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mfma")
+        set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mfma -mbmi -mbmi2 -mlzcnt -fabi-version=4 -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store")
     endif()
 
     set(SIMD_LIB_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG}")
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp
new file mode 100644
index 0000000000..aad4785761
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdAvx2ImageLoad.cpp
@@ -0,0 +1,158 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdAvx2.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param)
+            : Sse41::ImagePgmTxtLoader(param)
+        {
+        }
+
+        void ImagePgmTxtLoader::SetConverters()
+        {
+            Sse41::ImagePgmTxtLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Avx2::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Avx2::GrayToBgra; break;
+                case SimdPixelFormatRgb24: _toAny = Avx2::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Avx2::GrayToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param)
+            : Sse41::ImagePgmBinLoader(param)
+        {
+        }
+
+        void ImagePgmBinLoader::SetConverters()
+        {
+            Sse41::ImagePgmBinLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Avx2::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Avx2::GrayToBgra; break;
+                case SimdPixelFormatRgb24: _toAny = Avx2::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Avx2::GrayToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param)
+            : Sse41::ImagePpmTxtLoader(param)
+        {
+        }
+
+        void ImagePpmTxtLoader::SetConverters()
+        {
+            Sse41::ImagePpmTxtLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Avx2::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Avx2::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Avx2::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Avx2::BgrToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param)
+            : Sse41::ImagePpmBinLoader(param)
+        {
+        }
+
+        void ImagePpmBinLoader::SetConverters()
+        {
+            Sse41::ImagePpmBinLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Avx2::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Avx2::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Avx2::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Avx2::BgrToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageLoader* CreateImageLoader(const ImageLoaderParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtLoader(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinLoader(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtLoader(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinLoader(param);
+            case SimdImageFilePng: return new Sse41::ImagePngLoader(param);
+            case SimdImageFileJpeg: return new Base::ImageJpegLoader(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+        {
+            ImageLoaderParam param(data, size, *format);
+            if (param.Validate())
+            {
+                Holder<ImageLoader> loader(CreateImageLoader(param));
+                if (loader)
+                {
+                    if (loader->FromStream())
+                        return loader->Release(stride, width, height, format);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp
new file mode 100644
index 0000000000..bd7e057092
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdAvx2ImageSave.cpp
@@ -0,0 +1,138 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdAvx2.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param)
+            : Sse41::ImagePgmTxtSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Avx2::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Avx2::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Avx2::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Avx2::RgbaToGray; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param)
+            : Sse41::ImagePgmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Avx2::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Avx2::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Avx2::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Avx2::RgbaToGray; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param)
+            : Sse41::ImagePpmTxtSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Avx2::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Avx2::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Avx2::BgraToRgb; break;
+                case SimdPixelFormatRgba32: _convert = Avx2::BgraToBgr; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param)
+            : Sse41::ImagePpmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Avx2::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Avx2::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Avx2::BgraToRgb; break;
+                case SimdPixelFormatRgba32: _convert = Avx2::BgraToBgr; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageSaver* CreateImageSaver(const ImageSaverParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinSaver(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinSaver(param);
+            case SimdImageFilePng: return new ImagePngSaver(param);
+            case SimdImageFileJpeg: return new ImageJpegSaver(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size)
+        {
+            ImageSaverParam param(width, height, format, file, quality);
+            if (param.Validate())
+            {
+                Holder<ImageSaver> saver(CreateImageSaver(param));
+                if (saver)
+                {
+                    if (saver->ToStream(src, stride))
+                        return saver->Release(size);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp
new file mode 100644
index 0000000000..2ff51e4dc1
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdAvx2ImageSaveJpeg.cpp
@@ -0,0 +1,351 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSaveJpeg.h"
+#include "Simd/SimdLoad.h"
+#include "Simd/SimdAvx2.h"
+
+namespace Simd
+{
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        const uint32_t JpegZigZagTi32[64] = {
+            0, 8, 1, 2, 9, 16, 24, 17,
+            10, 3, 4, 11, 18, 25, 32, 40,
+            33, 26, 19, 12, 5, 6, 13, 20,
+            27, 34, 41, 48, 56, 49, 42, 35,
+            28, 21, 14, 7, 15, 22, 29, 36,
+            43, 50, 57, 58, 51, 44, 37, 30,
+            23, 31, 38, 45, 52, 59, 60, 53,
+            46, 39, 47, 54, 61, 62, 55, 63 };
+
+        //---------------------------------------------------------------------
+
+        static int JpegProcessDu(Base::BitBuf& bitBuf, float* CDU, int stride, const float* fdtbl, int DC, const uint16_t HTDC[256][2], const uint16_t HTAC[256][2])
+        {
+            SIMD_ALIGNED(32) int DUO[64], DU[64];
+            JpegDct(CDU, stride, fdtbl, DUO);
+            union
+            {
+                uint64_t u64[1];
+                uint32_t u32[2];
+                uint8_t u8[8];
+            } dum;
+            for (int i = 0, j = 0; i < 64; i += 8, j++)
+            {
+                __m256i du = _mm256_i32gather_epi32(DUO, _mm256_loadu_si256((__m256i*)(JpegZigZagTi32 + i)), 4);
+                dum.u8[j] = ~_mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpeq_epi32(du, Avx2::K_ZERO)));
+                _mm256_storeu_si256((__m256i*)(DU + i), du);
+            }
+            int diff = DU[0] - DC;
+            if (diff == 0)
+                bitBuf.Push(HTDC[0]);
+            else
+            {
+                uint16_t bits[2];
+                Base::JpegCalcBits(diff, bits);
+                bitBuf.Push(HTDC[bits[1]]);
+                bitBuf.Push(bits);
+            }
+#if defined(SIMD_X64_ENABLE)
+            if (dum.u64[0] == 0)
+            {
+                bitBuf.Push(HTAC[0x00]);
+                return DU[0];
+            }
+            dum.u64[0] >>= 1;
+            int i = 1;
+            for (; dum.u64[0]; ++i, dum.u64[0] >>= 1)
+            {
+                int nrzeroes = (int)_tzcnt_u64(dum.u64[0]);
+                i += nrzeroes;
+                dum.u64[0] >>= nrzeroes;
+                if (nrzeroes >= 16)
+                {
+                    for (int nrmarker = 16; nrmarker <= nrzeroes; nrmarker += 16)
+                        bitBuf.Push(HTAC[0xF0]);
+                    nrzeroes &= 15;
+                }
+                uint16_t bits[2];
+                Base::JpegCalcBits(DU[i], bits);
+                bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]);
+                bitBuf.Push(bits);
+            }
+            if (i < 64)
+                bitBuf.Push(HTAC[0x00]);
+#else
+            int end0pos = 64;
+            do
+            {
+                end0pos -= 8;
+                int mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_loadu_si256((__m256i*)(DU + end0pos)), Avx2::K_ZERO));
+                if (mask)
+                {
+                    end0pos += 7 - _lzcnt_u32(mask) / 4;
+                    break;
+                }
+            } 
+            while (end0pos > 0);
+            if (end0pos == 0)
+            {
+                bitBuf.Push(HTAC[0x00]);
+                return DU[0];
+            }
+            for (int i = 1; i <= end0pos; ++i)
+            {
+                int startpos = i;
+                for (; DU[i] == 0 && i <= end0pos; ++i);
+                int nrzeroes = i - startpos;
+                if (nrzeroes >= 16)
+                {
+                    int lng = nrzeroes >> 4;
+                    int nrmarker;
+                    for (nrmarker = 1; nrmarker <= lng; ++nrmarker)
+                        bitBuf.Push(HTAC[0xF0]);
+                    nrzeroes &= 15;
+                }
+                uint16_t bits[2];
+                Base::JpegCalcBits(DU[i], bits);
+                bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]);
+                bitBuf.Push(bits);
+            }
+            if (end0pos != 63)
+                bitBuf.Push(HTAC[0x00]);
+#endif
+            return DU[0];
+        }
+
+        SIMD_INLINE void RgbToYuvInit(__m256 k[10])
+        {
+            k[0] = _mm256_set1_ps(+0.29900f);
+            k[1] = _mm256_set1_ps(+0.58700f);
+            k[2] = _mm256_set1_ps(+0.11400f);
+            k[3] = _mm256_set1_ps(-128.000f);
+            k[4] = _mm256_set1_ps(-0.16874f);
+            k[5] = _mm256_set1_ps(-0.33126f);
+            k[6] = _mm256_set1_ps(+0.50000f);
+            k[7] = _mm256_set1_ps(+0.50000f);
+            k[8] = _mm256_set1_ps(-0.41869f);
+            k[9] = _mm256_set1_ps(-0.08131f);
+        }
+
+        SIMD_INLINE void RgbToYuv(const uint8_t* r, const uint8_t* g, const uint8_t* b, int stride, int height, 
+            const __m256 k[10], float* y, float* u, float* v, int size)
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 8)
+                {
+                    __m256 _r = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(r + col))));
+                    __m256 _g = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(g + col))));
+                    __m256 _b = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(b + col))));
+                    _mm256_storeu_ps(y + col, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, k[0]), _mm256_mul_ps(_g, k[1])), _mm256_mul_ps(_b, k[2])), k[3]));
+                    //_mm256_storeu_ps(y + col, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, _yr), _mm256_mul_ps(_g, _yg)), _mm256_add_ps(_mm256_mul_ps(_b, _yb), _yt)));
+                    _mm256_storeu_ps(u + col, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, k[4]), _mm256_mul_ps(_g, k[5])), _mm256_mul_ps(_b, k[6])));
+                    _mm256_storeu_ps(v + col, _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(_r, k[7]), _mm256_mul_ps(_g, k[8])), _mm256_mul_ps(_b, k[9])));
+                }
+                if(++row < height)
+                    r += stride, g += stride, b += stride;
+                y += size, u += size, v += size;
+            }
+        }
+
+        SIMD_INLINE void GrayToY(const uint8_t* g, int stride, int height, const __m256 k[10], float* y, int size)
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 8)
+                {
+                    __m256 _g = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(g + col))));
+                    _mm256_storeu_ps(y + col, _mm256_add_ps(_g, k[3]));
+                }
+                if (++row < height)
+                    g += stride;
+                y += size;
+            }
+        }
+
+        SIMD_INLINE void SubUv(const float * src, float * dst)
+        {
+            __m256 _0_25 = _mm256_set1_ps(0.25f), s0, s1;
+            for (int yy = 0; yy < 8; yy += 1)
+            {
+                s0 = _mm256_add_ps(_mm256_loadu_ps(src + 0), _mm256_loadu_ps(src + 16));
+                s1 = _mm256_add_ps(_mm256_loadu_ps(src + 8), _mm256_loadu_ps(src + 24));
+                _mm256_storeu_ps(dst + 0, _mm256_mul_ps(PermutedHorizontalAdd(s0, s1), _0_25));
+                src += 32;
+                dst += 8;
+            }
+        }
+
+        void JpegWriteBlockSubs(OutputMemoryStream& stream, int width, int height, const uint8_t* red,
+            const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3])
+        {
+            __m256 k[10];
+            RgbToYuvInit(k);
+            int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            int width16 = width & (~15);
+            bool gray = red == green && red == blue;
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 16)
+            {
+                int x = 0;
+                SIMD_ALIGNED(16) float Y[256], U[256], V[256];
+                SIMD_ALIGNED(16) float subU[64], subV[64];
+                for (; x < width16; x += 16)
+                {
+                    if (gray)
+                        GrayToY(red + x, stride, height - y, k, Y, 16);
+                    else
+                        RgbToYuv(red + x, green + x, blue + x, stride, height - y, k, Y, U, V, 16);
+                    DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        SubUv(U, subU);
+                        SubUv(V, subV);
+                        DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+                for (; x < width; x += 16)
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 16);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 16);
+                    DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        SubUv(U, subU);
+                        SubUv(V, subV);
+                        DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                }
+            }
+            Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+            bitBuf.Clear();
+        }
+
+        void JpegWriteBlockFull(OutputMemoryStream& stream, int width, int height, const uint8_t* red,
+            const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3])
+        {
+            __m256 k[10];
+            RgbToYuvInit(k);
+            int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            int width8 = width & (~7);
+            bool gray = red == green && red == blue;
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 8)
+            {
+                int x = 0;
+                SIMD_ALIGNED(16) float Y[64], U[64], V[64];
+                for (; x < width8; x += 8)
+                {
+                    if (gray)
+                        GrayToY(red + x, stride, height - y, k, Y, 8);
+                    else
+                        RgbToYuv(red + x, green + x, blue + x, stride, height - y, k, Y, U, V, 8);
+                    DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+                for (; x < width; x += 8)
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 8);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 8);
+                    DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                }
+                Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                bitBuf.Clear();
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageJpegSaver::ImageJpegSaver(const ImageSaverParam& param)
+            : Sse41::ImageJpegSaver(param)
+        {
+        }
+
+        void ImageJpegSaver::Init()
+        {
+            Sse41::ImageJpegSaver::Init();
+            if (_param.width >= 32)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24:
+                case SimdPixelFormatRgb24:
+                    _deintBgr = Avx2::DeinterleaveBgr;
+                    break;
+                case SimdPixelFormatBgra32:
+                case SimdPixelFormatRgba32:
+                    _deintBgra = Avx2::DeinterleaveBgra;
+                    break;
+                default: 
+                    break;
+                }
+            }
+            _writeBlock = _subSample ? JpegWriteBlockSubs : JpegWriteBlockFull;
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp b/3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp
new file mode 100644
index 0000000000..3cfa79fc62
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdAvx2ImageSavePng.cpp
@@ -0,0 +1,369 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSavePng.h"
+#include "Simd/SimdAvx2.h"
+#include "Simd/SimdExtract.h"
+
+namespace Simd
+{        
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        static uint32_t ZlibAdler32(uint8_t* data, int size)
+        {
+            __m256i _i0 = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7), _8 = _mm256_set1_epi32(8);
+            uint32_t lo = 1, hi = 0;
+            for (int b = 0, n = (int)(size % 5552); b < size;)
+            {
+                int n8 = n & (~7), i = 0;
+                __m256i _i = _mm256_add_epi32(_i0, _mm256_set1_epi32(n));
+                __m256i _l = _mm256_setzero_si256(), _h = _mm256_setzero_si256();
+                for (; i < n8; i += 8)
+                {
+                    __m256i d = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)(data + b + i)));
+                    _l = _mm256_add_epi32(_l, d);
+                    _h = _mm256_add_epi32(_h, _mm256_mullo_epi32(d, _i));
+                    _i = _mm256_sub_epi32(_i, _8);
+                }
+                int l = Avx2::ExtractSum<uint32_t>(_l), h = Avx2::ExtractSum<uint32_t>(_h);
+                for (; i < n; ++i)
+                {
+                    l += data[b + i];
+                    h += data[b + i]*(n - i);
+                }
+                hi = (hi + h + lo*n) % 65521;
+                lo = (lo + l) % 65521;
+                b += n;
+                n = 5552;
+            }
+            return (hi << 16) | lo;
+        }
+
+        void ZlibCompress(uint8_t* data, int size, int quality, OutputMemoryStream& stream)
+        {
+            const int ZHASH = 16384;
+            if (quality < 5)
+                quality = 5;
+            const int basket = quality * 2;
+            Array32i hashTable(ZHASH * basket);
+            memset(hashTable.data, -1, hashTable.RawSize());
+
+            stream.Write(uint8_t(0x78));
+            stream.Write(uint8_t(0x5e));
+            stream.WriteBits(1, 1);
+            stream.WriteBits(1, 2);
+
+            int i = 0, j;
+            while (i < size - 3)
+            {
+                int h = Base::ZlibHash(data + i) & (ZHASH - 1), best = 3;
+                uint8_t* bestLoc = 0;
+                int* hList = hashTable.data + h * basket;
+                for (j = 0; hList[j] != -1 && j < basket; ++j)
+                {
+                    if (hList[j] > i - 32768)
+                    {
+                        int d = Avx2::ZlibCount(data + hList[j], data + i, size - i);
+                        if (d >= best)
+                        {
+                            best = d;
+                            bestLoc = data + hList[j];
+                        }
+                    }
+                }
+                if (j == basket)
+                {
+                    memcpy(hList, hList + quality, quality * sizeof(int));
+                    memset(hList + quality, -1, quality * sizeof(int));
+                    j = quality;
+                }
+                hList[j] = i;
+
+                if (bestLoc)
+                {
+                    h = Base::ZlibHash(data + i + 1) & (ZHASH - 1);
+                    int* hList = hashTable.data + h * basket;
+                    for (j = 0; hList[j] != -1 && j < basket; ++j)
+                    {
+                        if (hList[j] > i - 32767)
+                        {
+                            int e = Avx2::ZlibCount(data + hList[j], data + i + 1, size - i - 1);
+                            if (e > best)
+                            {
+                                bestLoc = NULL;
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                if (bestLoc)
+                {
+                    int d = (int)(data + i - bestLoc);
+                    assert(d <= 32767 && best <= 258);
+                    for (j = 0; best > Base::ZlibLenC[j + 1] - 1; ++j);
+                    Base::ZlibHuff(j + 257, stream);
+                    if (Base::ZlibLenEb[j])
+                        stream.WriteBits(best - Base::ZlibLenC[j], Base::ZlibLenEb[j]);
+                    for (j = 0; d > Base::ZlibDistC[j + 1] - 1; ++j);
+                    stream.WriteBits(Base::ZlibBitRev(j, 5), 5);
+                    if (Base::ZlibDistEb[j])
+                        stream.WriteBits(d - Base::ZlibDistC[j], Base::ZlibDistEb[j]);
+                    i += best;
+                }
+                else
+                {
+                    Base::ZlibHuffB(data[i], stream);
+                    ++i;
+                }
+            }
+            for (; i < size; ++i)
+                Base::ZlibHuffB(data[i], stream);
+            Base::ZlibHuff(256, stream);
+            stream.FlushBits();
+            stream.WriteBe32u(ZlibAdler32(data, size));
+        }
+
+        uint32_t EncodeLine0(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size, A);
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src = _mm256_loadu_si256((__m256i*)(src + i));
+                _mm256_storeu_si256((__m256i*)(dst + i), _src);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_src)));
+            }
+            uint32_t sum = Avx2::ExtractSum<uint32_t>(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine1(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n));
+                __m256i _dst = _mm256_sub_epi8(_src0, _src1);
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum<uint32_t>(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine2(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - stride));
+                __m256i _dst = _mm256_sub_epi8(_src0, _src1);
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum<uint32_t>(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine3(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i] - (src[i - stride] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n));
+                __m256i _src2 = _mm256_loadu_si256((__m256i*)(src + i - stride));
+                __m256i lo = _mm256_srli_epi16(_mm256_add_epi16(UnpackU8<0>(_src1), UnpackU8<0>(_src2)), 1);
+                __m256i hi = _mm256_srli_epi16(_mm256_add_epi16(UnpackU8<1>(_src1), UnpackU8<1>(_src2)), 1);
+                __m256i _dst = _mm256_sub_epi8(_src0, _mm256_packus_epi16(lo, hi));
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum<uint32_t>(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - ((src[i - n] + src[i - stride]) >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        SIMD_INLINE __m256i Paeth(__m256i a, __m256i b, __m256i c)
+        {
+            __m256i p = _mm256_sub_epi16(_mm256_add_epi16(a, b), c);
+            __m256i pa = _mm256_abs_epi16(_mm256_sub_epi16(p, a));
+            __m256i pb = _mm256_abs_epi16(_mm256_sub_epi16(p, b));
+            __m256i pc = _mm256_abs_epi16(_mm256_sub_epi16(p, c));
+            __m256i mbc = _mm256_or_si256(_mm256_cmpgt_epi16(pa, pb), _mm256_cmpgt_epi16(pa, pc));
+            __m256i mc = _mm256_cmpgt_epi16(pb, pc);
+            return _mm256_blendv_epi8(a, _mm256_blendv_epi8(b, c, mc), mbc);
+        }
+
+        uint32_t EncodeLine4(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = (int8_t)(src[i] - src[i - stride]);
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n));
+                __m256i _src2 = _mm256_loadu_si256((__m256i*)(src + i - stride));
+                __m256i _src3 = _mm256_loadu_si256((__m256i*)(src + i - stride - n));
+                __m256i lo = Paeth(UnpackU8<0>(_src1), UnpackU8<0>(_src2), UnpackU8<0>(_src3));
+                __m256i hi = Paeth(UnpackU8<1>(_src1), UnpackU8<1>(_src2), UnpackU8<1>(_src3));
+                __m256i _dst = _mm256_sub_epi8(_src0, _mm256_packus_epi16(lo, hi));
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum<uint32_t>(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - Base::Paeth(src[i - n], src[i - stride], src[i - stride - n]);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine5(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n));
+                __m256i lo = _mm256_srli_epi16(UnpackU8<0>(_src1), 1);
+                __m256i hi = _mm256_srli_epi16(UnpackU8<1>(_src1), 1);
+                __m256i _dst = _mm256_sub_epi8(_src0, _mm256_packus_epi16(lo, hi));
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum<uint32_t>(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - (src[i - n] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine6(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m256i _sum = _mm256_setzero_si256();
+            for (; i < sizeA; i += A)
+            {
+                __m256i _src0 = _mm256_loadu_si256((__m256i*)(src + i));
+                __m256i _src1 = _mm256_loadu_si256((__m256i*)(src + i - n));
+                __m256i _dst = _mm256_sub_epi8(_src0, _src1);
+                _mm256_storeu_si256((__m256i*)(dst + i), _dst);
+                _sum = _mm256_add_epi32(_sum, _mm256_sad_epu8(_mm256_setzero_si256(), _mm256_abs_epi8(_dst)));
+            }
+            sum += Avx2::ExtractSum<uint32_t>(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        ImagePngSaver::ImagePngSaver(const ImageSaverParam& param)
+            : Sse41::ImagePngSaver(param)
+        {
+            if (_param.format == SimdPixelFormatBgr24)
+                _convert = Avx2::BgrToRgb;
+            else if (_param.format == SimdPixelFormatBgra32)
+                _convert = Avx2::BgraToRgba;
+            _encode[0] = Avx2::EncodeLine0;
+            _encode[1] = Avx2::EncodeLine1;
+            _encode[2] = Avx2::EncodeLine2;
+            _encode[3] = Avx2::EncodeLine3;
+            _encode[4] = Avx2::EncodeLine4;
+            _encode[5] = Avx2::EncodeLine5;
+            _encode[6] = Avx2::EncodeLine6;
+            _compress = Avx2::ZlibCompress;
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdBase.h b/3rdparty/simdlib/Simd/SimdBase.h
index 998a7b7cbe..3ad6e60d96 100755
--- a/3rdparty/simdlib/Simd/SimdBase.h
+++ b/3rdparty/simdlib/Simd/SimdBase.h
@@ -32,6 +32,10 @@ namespace Simd
 {
     namespace Base
     {
+        uint32_t Crc32(const void* src, size_t size);
+
+        uint32_t Crc32c(const void * src, size_t size);
+
         void BgraToBgr(const uint8_t * bgra, size_t size, uint8_t * bgr, bool lastRow);
 
         void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride);
diff --git a/3rdparty/simdlib/Simd/SimdBaseCrc32.cpp b/3rdparty/simdlib/Simd/SimdBaseCrc32.cpp
new file mode 100644
index 0000000000..4008b0f0d8
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseCrc32.cpp
@@ -0,0 +1,978 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdDefs.h"
+
+namespace Simd
+{
+    namespace Base
+    {
+        static SIMD_INLINE uint32_t Reorder32(uint32_t x)
+        {
+#if defined(__GNUC__) || defined(__clang__)
+            return __builtin_bswap32(x);
+#else
+            return (x >> 24) |
+                ((x >> 8) & 0x0000FF00) |
+                ((x << 8) & 0x00FF0000) |
+                (x << 24);
+#endif
+        }
+
+        // Precalculated CRC32c lookup table for polynomial 0xEDB88320.
+        static const uint32_t Crc32Table[16][256] =
+        {
+            {
+                0x00000000,0x77073096,0xEE0E612C,0x990951BA,0x076DC419,0x706AF48F,0xE963A535,0x9E6495A3,
+                0x0EDB8832,0x79DCB8A4,0xE0D5E91E,0x97D2D988,0x09B64C2B,0x7EB17CBD,0xE7B82D07,0x90BF1D91,
+                0x1DB71064,0x6AB020F2,0xF3B97148,0x84BE41DE,0x1ADAD47D,0x6DDDE4EB,0xF4D4B551,0x83D385C7,
+                0x136C9856,0x646BA8C0,0xFD62F97A,0x8A65C9EC,0x14015C4F,0x63066CD9,0xFA0F3D63,0x8D080DF5,
+                0x3B6E20C8,0x4C69105E,0xD56041E4,0xA2677172,0x3C03E4D1,0x4B04D447,0xD20D85FD,0xA50AB56B,
+                0x35B5A8FA,0x42B2986C,0xDBBBC9D6,0xACBCF940,0x32D86CE3,0x45DF5C75,0xDCD60DCF,0xABD13D59,
+                0x26D930AC,0x51DE003A,0xC8D75180,0xBFD06116,0x21B4F4B5,0x56B3C423,0xCFBA9599,0xB8BDA50F,
+                0x2802B89E,0x5F058808,0xC60CD9B2,0xB10BE924,0x2F6F7C87,0x58684C11,0xC1611DAB,0xB6662D3D,
+                0x76DC4190,0x01DB7106,0x98D220BC,0xEFD5102A,0x71B18589,0x06B6B51F,0x9FBFE4A5,0xE8B8D433,
+                0x7807C9A2,0x0F00F934,0x9609A88E,0xE10E9818,0x7F6A0DBB,0x086D3D2D,0x91646C97,0xE6635C01,
+                0x6B6B51F4,0x1C6C6162,0x856530D8,0xF262004E,0x6C0695ED,0x1B01A57B,0x8208F4C1,0xF50FC457,
+                0x65B0D9C6,0x12B7E950,0x8BBEB8EA,0xFCB9887C,0x62DD1DDF,0x15DA2D49,0x8CD37CF3,0xFBD44C65,
+                0x4DB26158,0x3AB551CE,0xA3BC0074,0xD4BB30E2,0x4ADFA541,0x3DD895D7,0xA4D1C46D,0xD3D6F4FB,
+                0x4369E96A,0x346ED9FC,0xAD678846,0xDA60B8D0,0x44042D73,0x33031DE5,0xAA0A4C5F,0xDD0D7CC9,
+                0x5005713C,0x270241AA,0xBE0B1010,0xC90C2086,0x5768B525,0x206F85B3,0xB966D409,0xCE61E49F,
+                0x5EDEF90E,0x29D9C998,0xB0D09822,0xC7D7A8B4,0x59B33D17,0x2EB40D81,0xB7BD5C3B,0xC0BA6CAD,
+                0xEDB88320,0x9ABFB3B6,0x03B6E20C,0x74B1D29A,0xEAD54739,0x9DD277AF,0x04DB2615,0x73DC1683,
+                0xE3630B12,0x94643B84,0x0D6D6A3E,0x7A6A5AA8,0xE40ECF0B,0x9309FF9D,0x0A00AE27,0x7D079EB1,
+                0xF00F9344,0x8708A3D2,0x1E01F268,0x6906C2FE,0xF762575D,0x806567CB,0x196C3671,0x6E6B06E7,
+                0xFED41B76,0x89D32BE0,0x10DA7A5A,0x67DD4ACC,0xF9B9DF6F,0x8EBEEFF9,0x17B7BE43,0x60B08ED5,
+                0xD6D6A3E8,0xA1D1937E,0x38D8C2C4,0x4FDFF252,0xD1BB67F1,0xA6BC5767,0x3FB506DD,0x48B2364B,
+                0xD80D2BDA,0xAF0A1B4C,0x36034AF6,0x41047A60,0xDF60EFC3,0xA867DF55,0x316E8EEF,0x4669BE79,
+                0xCB61B38C,0xBC66831A,0x256FD2A0,0x5268E236,0xCC0C7795,0xBB0B4703,0x220216B9,0x5505262F,
+                0xC5BA3BBE,0xB2BD0B28,0x2BB45A92,0x5CB36A04,0xC2D7FFA7,0xB5D0CF31,0x2CD99E8B,0x5BDEAE1D,
+                0x9B64C2B0,0xEC63F226,0x756AA39C,0x026D930A,0x9C0906A9,0xEB0E363F,0x72076785,0x05005713,
+                0x95BF4A82,0xE2B87A14,0x7BB12BAE,0x0CB61B38,0x92D28E9B,0xE5D5BE0D,0x7CDCEFB7,0x0BDBDF21,
+                0x86D3D2D4,0xF1D4E242,0x68DDB3F8,0x1FDA836E,0x81BE16CD,0xF6B9265B,0x6FB077E1,0x18B74777,
+                0x88085AE6,0xFF0F6A70,0x66063BCA,0x11010B5C,0x8F659EFF,0xF862AE69,0x616BFFD3,0x166CCF45,
+                0xA00AE278,0xD70DD2EE,0x4E048354,0x3903B3C2,0xA7672661,0xD06016F7,0x4969474D,0x3E6E77DB,
+                0xAED16A4A,0xD9D65ADC,0x40DF0B66,0x37D83BF0,0xA9BCAE53,0xDEBB9EC5,0x47B2CF7F,0x30B5FFE9,
+                0xBDBDF21C,0xCABAC28A,0x53B39330,0x24B4A3A6,0xBAD03605,0xCDD70693,0x54DE5729,0x23D967BF,
+                0xB3667A2E,0xC4614AB8,0x5D681B02,0x2A6F2B94,0xB40BBE37,0xC30C8EA1,0x5A05DF1B,0x2D02EF8D,
+            },
+            {
+                0x00000000,0x191B3141,0x32366282,0x2B2D53C3,0x646CC504,0x7D77F445,0x565AA786,0x4F4196C7,
+                0xC8D98A08,0xD1C2BB49,0xFAEFE88A,0xE3F4D9CB,0xACB54F0C,0xB5AE7E4D,0x9E832D8E,0x87981CCF,
+                0x4AC21251,0x53D92310,0x78F470D3,0x61EF4192,0x2EAED755,0x37B5E614,0x1C98B5D7,0x05838496,
+                0x821B9859,0x9B00A918,0xB02DFADB,0xA936CB9A,0xE6775D5D,0xFF6C6C1C,0xD4413FDF,0xCD5A0E9E,
+                0x958424A2,0x8C9F15E3,0xA7B24620,0xBEA97761,0xF1E8E1A6,0xE8F3D0E7,0xC3DE8324,0xDAC5B265,
+                0x5D5DAEAA,0x44469FEB,0x6F6BCC28,0x7670FD69,0x39316BAE,0x202A5AEF,0x0B07092C,0x121C386D,
+                0xDF4636F3,0xC65D07B2,0xED705471,0xF46B6530,0xBB2AF3F7,0xA231C2B6,0x891C9175,0x9007A034,
+                0x179FBCFB,0x0E848DBA,0x25A9DE79,0x3CB2EF38,0x73F379FF,0x6AE848BE,0x41C51B7D,0x58DE2A3C,
+                0xF0794F05,0xE9627E44,0xC24F2D87,0xDB541CC6,0x94158A01,0x8D0EBB40,0xA623E883,0xBF38D9C2,
+                0x38A0C50D,0x21BBF44C,0x0A96A78F,0x138D96CE,0x5CCC0009,0x45D73148,0x6EFA628B,0x77E153CA,
+                0xBABB5D54,0xA3A06C15,0x888D3FD6,0x91960E97,0xDED79850,0xC7CCA911,0xECE1FAD2,0xF5FACB93,
+                0x7262D75C,0x6B79E61D,0x4054B5DE,0x594F849F,0x160E1258,0x0F152319,0x243870DA,0x3D23419B,
+                0x65FD6BA7,0x7CE65AE6,0x57CB0925,0x4ED03864,0x0191AEA3,0x188A9FE2,0x33A7CC21,0x2ABCFD60,
+                0xAD24E1AF,0xB43FD0EE,0x9F12832D,0x8609B26C,0xC94824AB,0xD05315EA,0xFB7E4629,0xE2657768,
+                0x2F3F79F6,0x362448B7,0x1D091B74,0x04122A35,0x4B53BCF2,0x52488DB3,0x7965DE70,0x607EEF31,
+                0xE7E6F3FE,0xFEFDC2BF,0xD5D0917C,0xCCCBA03D,0x838A36FA,0x9A9107BB,0xB1BC5478,0xA8A76539,
+                0x3B83984B,0x2298A90A,0x09B5FAC9,0x10AECB88,0x5FEF5D4F,0x46F46C0E,0x6DD93FCD,0x74C20E8C,
+                0xF35A1243,0xEA412302,0xC16C70C1,0xD8774180,0x9736D747,0x8E2DE606,0xA500B5C5,0xBC1B8484,
+                0x71418A1A,0x685ABB5B,0x4377E898,0x5A6CD9D9,0x152D4F1E,0x0C367E5F,0x271B2D9C,0x3E001CDD,
+                0xB9980012,0xA0833153,0x8BAE6290,0x92B553D1,0xDDF4C516,0xC4EFF457,0xEFC2A794,0xF6D996D5,
+                0xAE07BCE9,0xB71C8DA8,0x9C31DE6B,0x852AEF2A,0xCA6B79ED,0xD37048AC,0xF85D1B6F,0xE1462A2E,
+                0x66DE36E1,0x7FC507A0,0x54E85463,0x4DF36522,0x02B2F3E5,0x1BA9C2A4,0x30849167,0x299FA026,
+                0xE4C5AEB8,0xFDDE9FF9,0xD6F3CC3A,0xCFE8FD7B,0x80A96BBC,0x99B25AFD,0xB29F093E,0xAB84387F,
+                0x2C1C24B0,0x350715F1,0x1E2A4632,0x07317773,0x4870E1B4,0x516BD0F5,0x7A468336,0x635DB277,
+                0xCBFAD74E,0xD2E1E60F,0xF9CCB5CC,0xE0D7848D,0xAF96124A,0xB68D230B,0x9DA070C8,0x84BB4189,
+                0x03235D46,0x1A386C07,0x31153FC4,0x280E0E85,0x674F9842,0x7E54A903,0x5579FAC0,0x4C62CB81,
+                0x8138C51F,0x9823F45E,0xB30EA79D,0xAA1596DC,0xE554001B,0xFC4F315A,0xD7626299,0xCE7953D8,
+                0x49E14F17,0x50FA7E56,0x7BD72D95,0x62CC1CD4,0x2D8D8A13,0x3496BB52,0x1FBBE891,0x06A0D9D0,
+                0x5E7EF3EC,0x4765C2AD,0x6C48916E,0x7553A02F,0x3A1236E8,0x230907A9,0x0824546A,0x113F652B,
+                0x96A779E4,0x8FBC48A5,0xA4911B66,0xBD8A2A27,0xF2CBBCE0,0xEBD08DA1,0xC0FDDE62,0xD9E6EF23,
+                0x14BCE1BD,0x0DA7D0FC,0x268A833F,0x3F91B27E,0x70D024B9,0x69CB15F8,0x42E6463B,0x5BFD777A,
+                0xDC656BB5,0xC57E5AF4,0xEE530937,0xF7483876,0xB809AEB1,0xA1129FF0,0x8A3FCC33,0x9324FD72,
+            },
+            {
+                0x00000000,0x01C26A37,0x0384D46E,0x0246BE59,0x0709A8DC,0x06CBC2EB,0x048D7CB2,0x054F1685,
+                0x0E1351B8,0x0FD13B8F,0x0D9785D6,0x0C55EFE1,0x091AF964,0x08D89353,0x0A9E2D0A,0x0B5C473D,
+                0x1C26A370,0x1DE4C947,0x1FA2771E,0x1E601D29,0x1B2F0BAC,0x1AED619B,0x18ABDFC2,0x1969B5F5,
+                0x1235F2C8,0x13F798FF,0x11B126A6,0x10734C91,0x153C5A14,0x14FE3023,0x16B88E7A,0x177AE44D,
+                0x384D46E0,0x398F2CD7,0x3BC9928E,0x3A0BF8B9,0x3F44EE3C,0x3E86840B,0x3CC03A52,0x3D025065,
+                0x365E1758,0x379C7D6F,0x35DAC336,0x3418A901,0x3157BF84,0x3095D5B3,0x32D36BEA,0x331101DD,
+                0x246BE590,0x25A98FA7,0x27EF31FE,0x262D5BC9,0x23624D4C,0x22A0277B,0x20E69922,0x2124F315,
+                0x2A78B428,0x2BBADE1F,0x29FC6046,0x283E0A71,0x2D711CF4,0x2CB376C3,0x2EF5C89A,0x2F37A2AD,
+                0x709A8DC0,0x7158E7F7,0x731E59AE,0x72DC3399,0x7793251C,0x76514F2B,0x7417F172,0x75D59B45,
+                0x7E89DC78,0x7F4BB64F,0x7D0D0816,0x7CCF6221,0x798074A4,0x78421E93,0x7A04A0CA,0x7BC6CAFD,
+                0x6CBC2EB0,0x6D7E4487,0x6F38FADE,0x6EFA90E9,0x6BB5866C,0x6A77EC5B,0x68315202,0x69F33835,
+                0x62AF7F08,0x636D153F,0x612BAB66,0x60E9C151,0x65A6D7D4,0x6464BDE3,0x662203BA,0x67E0698D,
+                0x48D7CB20,0x4915A117,0x4B531F4E,0x4A917579,0x4FDE63FC,0x4E1C09CB,0x4C5AB792,0x4D98DDA5,
+                0x46C49A98,0x4706F0AF,0x45404EF6,0x448224C1,0x41CD3244,0x400F5873,0x4249E62A,0x438B8C1D,
+                0x54F16850,0x55330267,0x5775BC3E,0x56B7D609,0x53F8C08C,0x523AAABB,0x507C14E2,0x51BE7ED5,
+                0x5AE239E8,0x5B2053DF,0x5966ED86,0x58A487B1,0x5DEB9134,0x5C29FB03,0x5E6F455A,0x5FAD2F6D,
+                0xE1351B80,0xE0F771B7,0xE2B1CFEE,0xE373A5D9,0xE63CB35C,0xE7FED96B,0xE5B86732,0xE47A0D05,
+                0xEF264A38,0xEEE4200F,0xECA29E56,0xED60F461,0xE82FE2E4,0xE9ED88D3,0xEBAB368A,0xEA695CBD,
+                0xFD13B8F0,0xFCD1D2C7,0xFE976C9E,0xFF5506A9,0xFA1A102C,0xFBD87A1B,0xF99EC442,0xF85CAE75,
+                0xF300E948,0xF2C2837F,0xF0843D26,0xF1465711,0xF4094194,0xF5CB2BA3,0xF78D95FA,0xF64FFFCD,
+                0xD9785D60,0xD8BA3757,0xDAFC890E,0xDB3EE339,0xDE71F5BC,0xDFB39F8B,0xDDF521D2,0xDC374BE5,
+                0xD76B0CD8,0xD6A966EF,0xD4EFD8B6,0xD52DB281,0xD062A404,0xD1A0CE33,0xD3E6706A,0xD2241A5D,
+                0xC55EFE10,0xC49C9427,0xC6DA2A7E,0xC7184049,0xC25756CC,0xC3953CFB,0xC1D382A2,0xC011E895,
+                0xCB4DAFA8,0xCA8FC59F,0xC8C97BC6,0xC90B11F1,0xCC440774,0xCD866D43,0xCFC0D31A,0xCE02B92D,
+                0x91AF9640,0x906DFC77,0x922B422E,0x93E92819,0x96A63E9C,0x976454AB,0x9522EAF2,0x94E080C5,
+                0x9FBCC7F8,0x9E7EADCF,0x9C381396,0x9DFA79A1,0x98B56F24,0x99770513,0x9B31BB4A,0x9AF3D17D,
+                0x8D893530,0x8C4B5F07,0x8E0DE15E,0x8FCF8B69,0x8A809DEC,0x8B42F7DB,0x89044982,0x88C623B5,
+                0x839A6488,0x82580EBF,0x801EB0E6,0x81DCDAD1,0x8493CC54,0x8551A663,0x8717183A,0x86D5720D,
+                0xA9E2D0A0,0xA820BA97,0xAA6604CE,0xABA46EF9,0xAEEB787C,0xAF29124B,0xAD6FAC12,0xACADC625,
+                0xA7F18118,0xA633EB2F,0xA4755576,0xA5B73F41,0xA0F829C4,0xA13A43F3,0xA37CFDAA,0xA2BE979D,
+                0xB5C473D0,0xB40619E7,0xB640A7BE,0xB782CD89,0xB2CDDB0C,0xB30FB13B,0xB1490F62,0xB08B6555,
+                0xBBD72268,0xBA15485F,0xB853F606,0xB9919C31,0xBCDE8AB4,0xBD1CE083,0xBF5A5EDA,0xBE9834ED,
+            },
+            {
+                0x00000000,0xB8BC6765,0xAA09C88B,0x12B5AFEE,0x8F629757,0x37DEF032,0x256B5FDC,0x9DD738B9,
+                0xC5B428EF,0x7D084F8A,0x6FBDE064,0xD7018701,0x4AD6BFB8,0xF26AD8DD,0xE0DF7733,0x58631056,
+                0x5019579F,0xE8A530FA,0xFA109F14,0x42ACF871,0xDF7BC0C8,0x67C7A7AD,0x75720843,0xCDCE6F26,
+                0x95AD7F70,0x2D111815,0x3FA4B7FB,0x8718D09E,0x1ACFE827,0xA2738F42,0xB0C620AC,0x087A47C9,
+                0xA032AF3E,0x188EC85B,0x0A3B67B5,0xB28700D0,0x2F503869,0x97EC5F0C,0x8559F0E2,0x3DE59787,
+                0x658687D1,0xDD3AE0B4,0xCF8F4F5A,0x7733283F,0xEAE41086,0x525877E3,0x40EDD80D,0xF851BF68,
+                0xF02BF8A1,0x48979FC4,0x5A22302A,0xE29E574F,0x7F496FF6,0xC7F50893,0xD540A77D,0x6DFCC018,
+                0x359FD04E,0x8D23B72B,0x9F9618C5,0x272A7FA0,0xBAFD4719,0x0241207C,0x10F48F92,0xA848E8F7,
+                0x9B14583D,0x23A83F58,0x311D90B6,0x89A1F7D3,0x1476CF6A,0xACCAA80F,0xBE7F07E1,0x06C36084,
+                0x5EA070D2,0xE61C17B7,0xF4A9B859,0x4C15DF3C,0xD1C2E785,0x697E80E0,0x7BCB2F0E,0xC377486B,
+                0xCB0D0FA2,0x73B168C7,0x6104C729,0xD9B8A04C,0x446F98F5,0xFCD3FF90,0xEE66507E,0x56DA371B,
+                0x0EB9274D,0xB6054028,0xA4B0EFC6,0x1C0C88A3,0x81DBB01A,0x3967D77F,0x2BD27891,0x936E1FF4,
+                0x3B26F703,0x839A9066,0x912F3F88,0x299358ED,0xB4446054,0x0CF80731,0x1E4DA8DF,0xA6F1CFBA,
+                0xFE92DFEC,0x462EB889,0x549B1767,0xEC277002,0x71F048BB,0xC94C2FDE,0xDBF98030,0x6345E755,
+                0x6B3FA09C,0xD383C7F9,0xC1366817,0x798A0F72,0xE45D37CB,0x5CE150AE,0x4E54FF40,0xF6E89825,
+                0xAE8B8873,0x1637EF16,0x048240F8,0xBC3E279D,0x21E91F24,0x99557841,0x8BE0D7AF,0x335CB0CA,
+                0xED59B63B,0x55E5D15E,0x47507EB0,0xFFEC19D5,0x623B216C,0xDA874609,0xC832E9E7,0x708E8E82,
+                0x28ED9ED4,0x9051F9B1,0x82E4565F,0x3A58313A,0xA78F0983,0x1F336EE6,0x0D86C108,0xB53AA66D,
+                0xBD40E1A4,0x05FC86C1,0x1749292F,0xAFF54E4A,0x322276F3,0x8A9E1196,0x982BBE78,0x2097D91D,
+                0x78F4C94B,0xC048AE2E,0xD2FD01C0,0x6A4166A5,0xF7965E1C,0x4F2A3979,0x5D9F9697,0xE523F1F2,
+                0x4D6B1905,0xF5D77E60,0xE762D18E,0x5FDEB6EB,0xC2098E52,0x7AB5E937,0x680046D9,0xD0BC21BC,
+                0x88DF31EA,0x3063568F,0x22D6F961,0x9A6A9E04,0x07BDA6BD,0xBF01C1D8,0xADB46E36,0x15080953,
+                0x1D724E9A,0xA5CE29FF,0xB77B8611,0x0FC7E174,0x9210D9CD,0x2AACBEA8,0x38191146,0x80A57623,
+                0xD8C66675,0x607A0110,0x72CFAEFE,0xCA73C99B,0x57A4F122,0xEF189647,0xFDAD39A9,0x45115ECC,
+                0x764DEE06,0xCEF18963,0xDC44268D,0x64F841E8,0xF92F7951,0x41931E34,0x5326B1DA,0xEB9AD6BF,
+                0xB3F9C6E9,0x0B45A18C,0x19F00E62,0xA14C6907,0x3C9B51BE,0x842736DB,0x96929935,0x2E2EFE50,
+                0x2654B999,0x9EE8DEFC,0x8C5D7112,0x34E11677,0xA9362ECE,0x118A49AB,0x033FE645,0xBB838120,
+                0xE3E09176,0x5B5CF613,0x49E959FD,0xF1553E98,0x6C820621,0xD43E6144,0xC68BCEAA,0x7E37A9CF,
+                0xD67F4138,0x6EC3265D,0x7C7689B3,0xC4CAEED6,0x591DD66F,0xE1A1B10A,0xF3141EE4,0x4BA87981,
+                0x13CB69D7,0xAB770EB2,0xB9C2A15C,0x017EC639,0x9CA9FE80,0x241599E5,0x36A0360B,0x8E1C516E,
+                0x866616A7,0x3EDA71C2,0x2C6FDE2C,0x94D3B949,0x090481F0,0xB1B8E695,0xA30D497B,0x1BB12E1E,
+                0x43D23E48,0xFB6E592D,0xE9DBF6C3,0x516791A6,0xCCB0A91F,0x740CCE7A,0x66B96194,0xDE0506F1,
+            },
+            {
+                0x00000000,0x3D6029B0,0x7AC05360,0x47A07AD0,0xF580A6C0,0xC8E08F70,0x8F40F5A0,0xB220DC10,
+                0x30704BC1,0x0D106271,0x4AB018A1,0x77D03111,0xC5F0ED01,0xF890C4B1,0xBF30BE61,0x825097D1,
+                0x60E09782,0x5D80BE32,0x1A20C4E2,0x2740ED52,0x95603142,0xA80018F2,0xEFA06222,0xD2C04B92,
+                0x5090DC43,0x6DF0F5F3,0x2A508F23,0x1730A693,0xA5107A83,0x98705333,0xDFD029E3,0xE2B00053,
+                0xC1C12F04,0xFCA106B4,0xBB017C64,0x866155D4,0x344189C4,0x0921A074,0x4E81DAA4,0x73E1F314,
+                0xF1B164C5,0xCCD14D75,0x8B7137A5,0xB6111E15,0x0431C205,0x3951EBB5,0x7EF19165,0x4391B8D5,
+                0xA121B886,0x9C419136,0xDBE1EBE6,0xE681C256,0x54A11E46,0x69C137F6,0x2E614D26,0x13016496,
+                0x9151F347,0xAC31DAF7,0xEB91A027,0xD6F18997,0x64D15587,0x59B17C37,0x1E1106E7,0x23712F57,
+                0x58F35849,0x659371F9,0x22330B29,0x1F532299,0xAD73FE89,0x9013D739,0xD7B3ADE9,0xEAD38459,
+                0x68831388,0x55E33A38,0x124340E8,0x2F236958,0x9D03B548,0xA0639CF8,0xE7C3E628,0xDAA3CF98,
+                0x3813CFCB,0x0573E67B,0x42D39CAB,0x7FB3B51B,0xCD93690B,0xF0F340BB,0xB7533A6B,0x8A3313DB,
+                0x0863840A,0x3503ADBA,0x72A3D76A,0x4FC3FEDA,0xFDE322CA,0xC0830B7A,0x872371AA,0xBA43581A,
+                0x9932774D,0xA4525EFD,0xE3F2242D,0xDE920D9D,0x6CB2D18D,0x51D2F83D,0x167282ED,0x2B12AB5D,
+                0xA9423C8C,0x9422153C,0xD3826FEC,0xEEE2465C,0x5CC29A4C,0x61A2B3FC,0x2602C92C,0x1B62E09C,
+                0xF9D2E0CF,0xC4B2C97F,0x8312B3AF,0xBE729A1F,0x0C52460F,0x31326FBF,0x7692156F,0x4BF23CDF,
+                0xC9A2AB0E,0xF4C282BE,0xB362F86E,0x8E02D1DE,0x3C220DCE,0x0142247E,0x46E25EAE,0x7B82771E,
+                0xB1E6B092,0x8C869922,0xCB26E3F2,0xF646CA42,0x44661652,0x79063FE2,0x3EA64532,0x03C66C82,
+                0x8196FB53,0xBCF6D2E3,0xFB56A833,0xC6368183,0x74165D93,0x49767423,0x0ED60EF3,0x33B62743,
+                0xD1062710,0xEC660EA0,0xABC67470,0x96A65DC0,0x248681D0,0x19E6A860,0x5E46D2B0,0x6326FB00,
+                0xE1766CD1,0xDC164561,0x9BB63FB1,0xA6D61601,0x14F6CA11,0x2996E3A1,0x6E369971,0x5356B0C1,
+                0x70279F96,0x4D47B626,0x0AE7CCF6,0x3787E546,0x85A73956,0xB8C710E6,0xFF676A36,0xC2074386,
+                0x4057D457,0x7D37FDE7,0x3A978737,0x07F7AE87,0xB5D77297,0x88B75B27,0xCF1721F7,0xF2770847,
+                0x10C70814,0x2DA721A4,0x6A075B74,0x576772C4,0xE547AED4,0xD8278764,0x9F87FDB4,0xA2E7D404,
+                0x20B743D5,0x1DD76A65,0x5A7710B5,0x67173905,0xD537E515,0xE857CCA5,0xAFF7B675,0x92979FC5,
+                0xE915E8DB,0xD475C16B,0x93D5BBBB,0xAEB5920B,0x1C954E1B,0x21F567AB,0x66551D7B,0x5B3534CB,
+                0xD965A31A,0xE4058AAA,0xA3A5F07A,0x9EC5D9CA,0x2CE505DA,0x11852C6A,0x562556BA,0x6B457F0A,
+                0x89F57F59,0xB49556E9,0xF3352C39,0xCE550589,0x7C75D999,0x4115F029,0x06B58AF9,0x3BD5A349,
+                0xB9853498,0x84E51D28,0xC34567F8,0xFE254E48,0x4C059258,0x7165BBE8,0x36C5C138,0x0BA5E888,
+                0x28D4C7DF,0x15B4EE6F,0x521494BF,0x6F74BD0F,0xDD54611F,0xE03448AF,0xA794327F,0x9AF41BCF,
+                0x18A48C1E,0x25C4A5AE,0x6264DF7E,0x5F04F6CE,0xED242ADE,0xD044036E,0x97E479BE,0xAA84500E,
+                0x4834505D,0x755479ED,0x32F4033D,0x0F942A8D,0xBDB4F69D,0x80D4DF2D,0xC774A5FD,0xFA148C4D,
+                0x78441B9C,0x4524322C,0x028448FC,0x3FE4614C,0x8DC4BD5C,0xB0A494EC,0xF704EE3C,0xCA64C78C,
+            },
+            {
+                0x00000000,0xCB5CD3A5,0x4DC8A10B,0x869472AE,0x9B914216,0x50CD91B3,0xD659E31D,0x1D0530B8,
+                0xEC53826D,0x270F51C8,0xA19B2366,0x6AC7F0C3,0x77C2C07B,0xBC9E13DE,0x3A0A6170,0xF156B2D5,
+                0x03D6029B,0xC88AD13E,0x4E1EA390,0x85427035,0x9847408D,0x531B9328,0xD58FE186,0x1ED33223,
+                0xEF8580F6,0x24D95353,0xA24D21FD,0x6911F258,0x7414C2E0,0xBF481145,0x39DC63EB,0xF280B04E,
+                0x07AC0536,0xCCF0D693,0x4A64A43D,0x81387798,0x9C3D4720,0x57619485,0xD1F5E62B,0x1AA9358E,
+                0xEBFF875B,0x20A354FE,0xA6372650,0x6D6BF5F5,0x706EC54D,0xBB3216E8,0x3DA66446,0xF6FAB7E3,
+                0x047A07AD,0xCF26D408,0x49B2A6A6,0x82EE7503,0x9FEB45BB,0x54B7961E,0xD223E4B0,0x197F3715,
+                0xE82985C0,0x23755665,0xA5E124CB,0x6EBDF76E,0x73B8C7D6,0xB8E41473,0x3E7066DD,0xF52CB578,
+                0x0F580A6C,0xC404D9C9,0x4290AB67,0x89CC78C2,0x94C9487A,0x5F959BDF,0xD901E971,0x125D3AD4,
+                0xE30B8801,0x28575BA4,0xAEC3290A,0x659FFAAF,0x789ACA17,0xB3C619B2,0x35526B1C,0xFE0EB8B9,
+                0x0C8E08F7,0xC7D2DB52,0x4146A9FC,0x8A1A7A59,0x971F4AE1,0x5C439944,0xDAD7EBEA,0x118B384F,
+                0xE0DD8A9A,0x2B81593F,0xAD152B91,0x6649F834,0x7B4CC88C,0xB0101B29,0x36846987,0xFDD8BA22,
+                0x08F40F5A,0xC3A8DCFF,0x453CAE51,0x8E607DF4,0x93654D4C,0x58399EE9,0xDEADEC47,0x15F13FE2,
+                0xE4A78D37,0x2FFB5E92,0xA96F2C3C,0x6233FF99,0x7F36CF21,0xB46A1C84,0x32FE6E2A,0xF9A2BD8F,
+                0x0B220DC1,0xC07EDE64,0x46EAACCA,0x8DB67F6F,0x90B34FD7,0x5BEF9C72,0xDD7BEEDC,0x16273D79,
+                0xE7718FAC,0x2C2D5C09,0xAAB92EA7,0x61E5FD02,0x7CE0CDBA,0xB7BC1E1F,0x31286CB1,0xFA74BF14,
+                0x1EB014D8,0xD5ECC77D,0x5378B5D3,0x98246676,0x852156CE,0x4E7D856B,0xC8E9F7C5,0x03B52460,
+                0xF2E396B5,0x39BF4510,0xBF2B37BE,0x7477E41B,0x6972D4A3,0xA22E0706,0x24BA75A8,0xEFE6A60D,
+                0x1D661643,0xD63AC5E6,0x50AEB748,0x9BF264ED,0x86F75455,0x4DAB87F0,0xCB3FF55E,0x006326FB,
+                0xF135942E,0x3A69478B,0xBCFD3525,0x77A1E680,0x6AA4D638,0xA1F8059D,0x276C7733,0xEC30A496,
+                0x191C11EE,0xD240C24B,0x54D4B0E5,0x9F886340,0x828D53F8,0x49D1805D,0xCF45F2F3,0x04192156,
+                0xF54F9383,0x3E134026,0xB8873288,0x73DBE12D,0x6EDED195,0xA5820230,0x2316709E,0xE84AA33B,
+                0x1ACA1375,0xD196C0D0,0x5702B27E,0x9C5E61DB,0x815B5163,0x4A0782C6,0xCC93F068,0x07CF23CD,
+                0xF6999118,0x3DC542BD,0xBB513013,0x700DE3B6,0x6D08D30E,0xA65400AB,0x20C07205,0xEB9CA1A0,
+                0x11E81EB4,0xDAB4CD11,0x5C20BFBF,0x977C6C1A,0x8A795CA2,0x41258F07,0xC7B1FDA9,0x0CED2E0C,
+                0xFDBB9CD9,0x36E74F7C,0xB0733DD2,0x7B2FEE77,0x662ADECF,0xAD760D6A,0x2BE27FC4,0xE0BEAC61,
+                0x123E1C2F,0xD962CF8A,0x5FF6BD24,0x94AA6E81,0x89AF5E39,0x42F38D9C,0xC467FF32,0x0F3B2C97,
+                0xFE6D9E42,0x35314DE7,0xB3A53F49,0x78F9ECEC,0x65FCDC54,0xAEA00FF1,0x28347D5F,0xE368AEFA,
+                0x16441B82,0xDD18C827,0x5B8CBA89,0x90D0692C,0x8DD55994,0x46898A31,0xC01DF89F,0x0B412B3A,
+                0xFA1799EF,0x314B4A4A,0xB7DF38E4,0x7C83EB41,0x6186DBF9,0xAADA085C,0x2C4E7AF2,0xE712A957,
+                0x15921919,0xDECECABC,0x585AB812,0x93066BB7,0x8E035B0F,0x455F88AA,0xC3CBFA04,0x089729A1,
+                0xF9C19B74,0x329D48D1,0xB4093A7F,0x7F55E9DA,0x6250D962,0xA90C0AC7,0x2F987869,0xE4C4ABCC,
+            },
+            {
+                0x00000000,0xA6770BB4,0x979F1129,0x31E81A9D,0xF44F2413,0x52382FA7,0x63D0353A,0xC5A73E8E,
+                0x33EF4E67,0x959845D3,0xA4705F4E,0x020754FA,0xC7A06A74,0x61D761C0,0x503F7B5D,0xF64870E9,
+                0x67DE9CCE,0xC1A9977A,0xF0418DE7,0x56368653,0x9391B8DD,0x35E6B369,0x040EA9F4,0xA279A240,
+                0x5431D2A9,0xF246D91D,0xC3AEC380,0x65D9C834,0xA07EF6BA,0x0609FD0E,0x37E1E793,0x9196EC27,
+                0xCFBD399C,0x69CA3228,0x582228B5,0xFE552301,0x3BF21D8F,0x9D85163B,0xAC6D0CA6,0x0A1A0712,
+                0xFC5277FB,0x5A257C4F,0x6BCD66D2,0xCDBA6D66,0x081D53E8,0xAE6A585C,0x9F8242C1,0x39F54975,
+                0xA863A552,0x0E14AEE6,0x3FFCB47B,0x998BBFCF,0x5C2C8141,0xFA5B8AF5,0xCBB39068,0x6DC49BDC,
+                0x9B8CEB35,0x3DFBE081,0x0C13FA1C,0xAA64F1A8,0x6FC3CF26,0xC9B4C492,0xF85CDE0F,0x5E2BD5BB,
+                0x440B7579,0xE27C7ECD,0xD3946450,0x75E36FE4,0xB044516A,0x16335ADE,0x27DB4043,0x81AC4BF7,
+                0x77E43B1E,0xD19330AA,0xE07B2A37,0x460C2183,0x83AB1F0D,0x25DC14B9,0x14340E24,0xB2430590,
+                0x23D5E9B7,0x85A2E203,0xB44AF89E,0x123DF32A,0xD79ACDA4,0x71EDC610,0x4005DC8D,0xE672D739,
+                0x103AA7D0,0xB64DAC64,0x87A5B6F9,0x21D2BD4D,0xE47583C3,0x42028877,0x73EA92EA,0xD59D995E,
+                0x8BB64CE5,0x2DC14751,0x1C295DCC,0xBA5E5678,0x7FF968F6,0xD98E6342,0xE86679DF,0x4E11726B,
+                0xB8590282,0x1E2E0936,0x2FC613AB,0x89B1181F,0x4C162691,0xEA612D25,0xDB8937B8,0x7DFE3C0C,
+                0xEC68D02B,0x4A1FDB9F,0x7BF7C102,0xDD80CAB6,0x1827F438,0xBE50FF8C,0x8FB8E511,0x29CFEEA5,
+                0xDF879E4C,0x79F095F8,0x48188F65,0xEE6F84D1,0x2BC8BA5F,0x8DBFB1EB,0xBC57AB76,0x1A20A0C2,
+                0x8816EAF2,0x2E61E146,0x1F89FBDB,0xB9FEF06F,0x7C59CEE1,0xDA2EC555,0xEBC6DFC8,0x4DB1D47C,
+                0xBBF9A495,0x1D8EAF21,0x2C66B5BC,0x8A11BE08,0x4FB68086,0xE9C18B32,0xD82991AF,0x7E5E9A1B,
+                0xEFC8763C,0x49BF7D88,0x78576715,0xDE206CA1,0x1B87522F,0xBDF0599B,0x8C184306,0x2A6F48B2,
+                0xDC27385B,0x7A5033EF,0x4BB82972,0xEDCF22C6,0x28681C48,0x8E1F17FC,0xBFF70D61,0x198006D5,
+                0x47ABD36E,0xE1DCD8DA,0xD034C247,0x7643C9F3,0xB3E4F77D,0x1593FCC9,0x247BE654,0x820CEDE0,
+                0x74449D09,0xD23396BD,0xE3DB8C20,0x45AC8794,0x800BB91A,0x267CB2AE,0x1794A833,0xB1E3A387,
+                0x20754FA0,0x86024414,0xB7EA5E89,0x119D553D,0xD43A6BB3,0x724D6007,0x43A57A9A,0xE5D2712E,
+                0x139A01C7,0xB5ED0A73,0x840510EE,0x22721B5A,0xE7D525D4,0x41A22E60,0x704A34FD,0xD63D3F49,
+                0xCC1D9F8B,0x6A6A943F,0x5B828EA2,0xFDF58516,0x3852BB98,0x9E25B02C,0xAFCDAAB1,0x09BAA105,
+                0xFFF2D1EC,0x5985DA58,0x686DC0C5,0xCE1ACB71,0x0BBDF5FF,0xADCAFE4B,0x9C22E4D6,0x3A55EF62,
+                0xABC30345,0x0DB408F1,0x3C5C126C,0x9A2B19D8,0x5F8C2756,0xF9FB2CE2,0xC813367F,0x6E643DCB,
+                0x982C4D22,0x3E5B4696,0x0FB35C0B,0xA9C457BF,0x6C636931,0xCA146285,0xFBFC7818,0x5D8B73AC,
+                0x03A0A617,0xA5D7ADA3,0x943FB73E,0x3248BC8A,0xF7EF8204,0x519889B0,0x6070932D,0xC6079899,
+                0x304FE870,0x9638E3C4,0xA7D0F959,0x01A7F2ED,0xC400CC63,0x6277C7D7,0x539FDD4A,0xF5E8D6FE,
+                0x647E3AD9,0xC209316D,0xF3E12BF0,0x55962044,0x90311ECA,0x3646157E,0x07AE0FE3,0xA1D90457,
+                0x579174BE,0xF1E67F0A,0xC00E6597,0x66796E23,0xA3DE50AD,0x05A95B19,0x34414184,0x92364A30,
+            },
+            {
+                0x00000000,0xCCAA009E,0x4225077D,0x8E8F07E3,0x844A0EFA,0x48E00E64,0xC66F0987,0x0AC50919,
+                0xD3E51BB5,0x1F4F1B2B,0x91C01CC8,0x5D6A1C56,0x57AF154F,0x9B0515D1,0x158A1232,0xD92012AC,
+                0x7CBB312B,0xB01131B5,0x3E9E3656,0xF23436C8,0xF8F13FD1,0x345B3F4F,0xBAD438AC,0x767E3832,
+                0xAF5E2A9E,0x63F42A00,0xED7B2DE3,0x21D12D7D,0x2B142464,0xE7BE24FA,0x69312319,0xA59B2387,
+                0xF9766256,0x35DC62C8,0xBB53652B,0x77F965B5,0x7D3C6CAC,0xB1966C32,0x3F196BD1,0xF3B36B4F,
+                0x2A9379E3,0xE639797D,0x68B67E9E,0xA41C7E00,0xAED97719,0x62737787,0xECFC7064,0x205670FA,
+                0x85CD537D,0x496753E3,0xC7E85400,0x0B42549E,0x01875D87,0xCD2D5D19,0x43A25AFA,0x8F085A64,
+                0x562848C8,0x9A824856,0x140D4FB5,0xD8A74F2B,0xD2624632,0x1EC846AC,0x9047414F,0x5CED41D1,
+                0x299DC2ED,0xE537C273,0x6BB8C590,0xA712C50E,0xADD7CC17,0x617DCC89,0xEFF2CB6A,0x2358CBF4,
+                0xFA78D958,0x36D2D9C6,0xB85DDE25,0x74F7DEBB,0x7E32D7A2,0xB298D73C,0x3C17D0DF,0xF0BDD041,
+                0x5526F3C6,0x998CF358,0x1703F4BB,0xDBA9F425,0xD16CFD3C,0x1DC6FDA2,0x9349FA41,0x5FE3FADF,
+                0x86C3E873,0x4A69E8ED,0xC4E6EF0E,0x084CEF90,0x0289E689,0xCE23E617,0x40ACE1F4,0x8C06E16A,
+                0xD0EBA0BB,0x1C41A025,0x92CEA7C6,0x5E64A758,0x54A1AE41,0x980BAEDF,0x1684A93C,0xDA2EA9A2,
+                0x030EBB0E,0xCFA4BB90,0x412BBC73,0x8D81BCED,0x8744B5F4,0x4BEEB56A,0xC561B289,0x09CBB217,
+                0xAC509190,0x60FA910E,0xEE7596ED,0x22DF9673,0x281A9F6A,0xE4B09FF4,0x6A3F9817,0xA6959889,
+                0x7FB58A25,0xB31F8ABB,0x3D908D58,0xF13A8DC6,0xFBFF84DF,0x37558441,0xB9DA83A2,0x7570833C,
+                0x533B85DA,0x9F918544,0x111E82A7,0xDDB48239,0xD7718B20,0x1BDB8BBE,0x95548C5D,0x59FE8CC3,
+                0x80DE9E6F,0x4C749EF1,0xC2FB9912,0x0E51998C,0x04949095,0xC83E900B,0x46B197E8,0x8A1B9776,
+                0x2F80B4F1,0xE32AB46F,0x6DA5B38C,0xA10FB312,0xABCABA0B,0x6760BA95,0xE9EFBD76,0x2545BDE8,
+                0xFC65AF44,0x30CFAFDA,0xBE40A839,0x72EAA8A7,0x782FA1BE,0xB485A120,0x3A0AA6C3,0xF6A0A65D,
+                0xAA4DE78C,0x66E7E712,0xE868E0F1,0x24C2E06F,0x2E07E976,0xE2ADE9E8,0x6C22EE0B,0xA088EE95,
+                0x79A8FC39,0xB502FCA7,0x3B8DFB44,0xF727FBDA,0xFDE2F2C3,0x3148F25D,0xBFC7F5BE,0x736DF520,
+                0xD6F6D6A7,0x1A5CD639,0x94D3D1DA,0x5879D144,0x52BCD85D,0x9E16D8C3,0x1099DF20,0xDC33DFBE,
+                0x0513CD12,0xC9B9CD8C,0x4736CA6F,0x8B9CCAF1,0x8159C3E8,0x4DF3C376,0xC37CC495,0x0FD6C40B,
+                0x7AA64737,0xB60C47A9,0x3883404A,0xF42940D4,0xFEEC49CD,0x32464953,0xBCC94EB0,0x70634E2E,
+                0xA9435C82,0x65E95C1C,0xEB665BFF,0x27CC5B61,0x2D095278,0xE1A352E6,0x6F2C5505,0xA386559B,
+                0x061D761C,0xCAB77682,0x44387161,0x889271FF,0x825778E6,0x4EFD7878,0xC0727F9B,0x0CD87F05,
+                0xD5F86DA9,0x19526D37,0x97DD6AD4,0x5B776A4A,0x51B26353,0x9D1863CD,0x1397642E,0xDF3D64B0,
+                0x83D02561,0x4F7A25FF,0xC1F5221C,0x0D5F2282,0x079A2B9B,0xCB302B05,0x45BF2CE6,0x89152C78,
+                0x50353ED4,0x9C9F3E4A,0x121039A9,0xDEBA3937,0xD47F302E,0x18D530B0,0x965A3753,0x5AF037CD,
+                0xFF6B144A,0x33C114D4,0xBD4E1337,0x71E413A9,0x7B211AB0,0xB78B1A2E,0x39041DCD,0xF5AE1D53,
+                0x2C8E0FFF,0xE0240F61,0x6EAB0882,0xA201081C,0xA8C40105,0x646E019B,0xEAE10678,0x264B06E6,
+            },
+            {
+                0x00000000,0x177B1443,0x2EF62886,0x398D3CC5,0x5DEC510C,0x4A97454F,0x731A798A,0x64616DC9,
+                0xBBD8A218,0xACA3B65B,0x952E8A9E,0x82559EDD,0xE634F314,0xF14FE757,0xC8C2DB92,0xDFB9CFD1,
+                0xACC04271,0xBBBB5632,0x82366AF7,0x954D7EB4,0xF12C137D,0xE657073E,0xDFDA3BFB,0xC8A12FB8,
+                0x1718E069,0x0063F42A,0x39EEC8EF,0x2E95DCAC,0x4AF4B165,0x5D8FA526,0x640299E3,0x73798DA0,
+                0x82F182A3,0x958A96E0,0xAC07AA25,0xBB7CBE66,0xDF1DD3AF,0xC866C7EC,0xF1EBFB29,0xE690EF6A,
+                0x392920BB,0x2E5234F8,0x17DF083D,0x00A41C7E,0x64C571B7,0x73BE65F4,0x4A335931,0x5D484D72,
+                0x2E31C0D2,0x394AD491,0x00C7E854,0x17BCFC17,0x73DD91DE,0x64A6859D,0x5D2BB958,0x4A50AD1B,
+                0x95E962CA,0x82927689,0xBB1F4A4C,0xAC645E0F,0xC80533C6,0xDF7E2785,0xE6F31B40,0xF1880F03,
+                0xDE920307,0xC9E91744,0xF0642B81,0xE71F3FC2,0x837E520B,0x94054648,0xAD887A8D,0xBAF36ECE,
+                0x654AA11F,0x7231B55C,0x4BBC8999,0x5CC79DDA,0x38A6F013,0x2FDDE450,0x1650D895,0x012BCCD6,
+                0x72524176,0x65295535,0x5CA469F0,0x4BDF7DB3,0x2FBE107A,0x38C50439,0x014838FC,0x16332CBF,
+                0xC98AE36E,0xDEF1F72D,0xE77CCBE8,0xF007DFAB,0x9466B262,0x831DA621,0xBA909AE4,0xADEB8EA7,
+                0x5C6381A4,0x4B1895E7,0x7295A922,0x65EEBD61,0x018FD0A8,0x16F4C4EB,0x2F79F82E,0x3802EC6D,
+                0xE7BB23BC,0xF0C037FF,0xC94D0B3A,0xDE361F79,0xBA5772B0,0xAD2C66F3,0x94A15A36,0x83DA4E75,
+                0xF0A3C3D5,0xE7D8D796,0xDE55EB53,0xC92EFF10,0xAD4F92D9,0xBA34869A,0x83B9BA5F,0x94C2AE1C,
+                0x4B7B61CD,0x5C00758E,0x658D494B,0x72F65D08,0x169730C1,0x01EC2482,0x38611847,0x2F1A0C04,
+                0x6655004F,0x712E140C,0x48A328C9,0x5FD83C8A,0x3BB95143,0x2CC24500,0x154F79C5,0x02346D86,
+                0xDD8DA257,0xCAF6B614,0xF37B8AD1,0xE4009E92,0x8061F35B,0x971AE718,0xAE97DBDD,0xB9ECCF9E,
+                0xCA95423E,0xDDEE567D,0xE4636AB8,0xF3187EFB,0x97791332,0x80020771,0xB98F3BB4,0xAEF42FF7,
+                0x714DE026,0x6636F465,0x5FBBC8A0,0x48C0DCE3,0x2CA1B12A,0x3BDAA569,0x025799AC,0x152C8DEF,
+                0xE4A482EC,0xF3DF96AF,0xCA52AA6A,0xDD29BE29,0xB948D3E0,0xAE33C7A3,0x97BEFB66,0x80C5EF25,
+                0x5F7C20F4,0x480734B7,0x718A0872,0x66F11C31,0x029071F8,0x15EB65BB,0x2C66597E,0x3B1D4D3D,
+                0x4864C09D,0x5F1FD4DE,0x6692E81B,0x71E9FC58,0x15889191,0x02F385D2,0x3B7EB917,0x2C05AD54,
+                0xF3BC6285,0xE4C776C6,0xDD4A4A03,0xCA315E40,0xAE503389,0xB92B27CA,0x80A61B0F,0x97DD0F4C,
+                0xB8C70348,0xAFBC170B,0x96312BCE,0x814A3F8D,0xE52B5244,0xF2504607,0xCBDD7AC2,0xDCA66E81,
+                0x031FA150,0x1464B513,0x2DE989D6,0x3A929D95,0x5EF3F05C,0x4988E41F,0x7005D8DA,0x677ECC99,
+                0x14074139,0x037C557A,0x3AF169BF,0x2D8A7DFC,0x49EB1035,0x5E900476,0x671D38B3,0x70662CF0,
+                0xAFDFE321,0xB8A4F762,0x8129CBA7,0x9652DFE4,0xF233B22D,0xE548A66E,0xDCC59AAB,0xCBBE8EE8,
+                0x3A3681EB,0x2D4D95A8,0x14C0A96D,0x03BBBD2E,0x67DAD0E7,0x70A1C4A4,0x492CF861,0x5E57EC22,
+                0x81EE23F3,0x969537B0,0xAF180B75,0xB8631F36,0xDC0272FF,0xCB7966BC,0xF2F45A79,0xE58F4E3A,
+                0x96F6C39A,0x818DD7D9,0xB800EB1C,0xAF7BFF5F,0xCB1A9296,0xDC6186D5,0xE5ECBA10,0xF297AE53,
+                0x2D2E6182,0x3A5575C1,0x03D84904,0x14A35D47,0x70C2308E,0x67B924CD,0x5E341808,0x494F0C4B,
+            },
+            {
+                0x00000000,0xEFC26B3E,0x04F5D03D,0xEB37BB03,0x09EBA07A,0xE629CB44,0x0D1E7047,0xE2DC1B79,
+                0x13D740F4,0xFC152BCA,0x172290C9,0xF8E0FBF7,0x1A3CE08E,0xF5FE8BB0,0x1EC930B3,0xF10B5B8D,
+                0x27AE81E8,0xC86CEAD6,0x235B51D5,0xCC993AEB,0x2E452192,0xC1874AAC,0x2AB0F1AF,0xC5729A91,
+                0x3479C11C,0xDBBBAA22,0x308C1121,0xDF4E7A1F,0x3D926166,0xD2500A58,0x3967B15B,0xD6A5DA65,
+                0x4F5D03D0,0xA09F68EE,0x4BA8D3ED,0xA46AB8D3,0x46B6A3AA,0xA974C894,0x42437397,0xAD8118A9,
+                0x5C8A4324,0xB348281A,0x587F9319,0xB7BDF827,0x5561E35E,0xBAA38860,0x51943363,0xBE56585D,
+                0x68F38238,0x8731E906,0x6C065205,0x83C4393B,0x61182242,0x8EDA497C,0x65EDF27F,0x8A2F9941,
+                0x7B24C2CC,0x94E6A9F2,0x7FD112F1,0x901379CF,0x72CF62B6,0x9D0D0988,0x763AB28B,0x99F8D9B5,
+                0x9EBA07A0,0x71786C9E,0x9A4FD79D,0x758DBCA3,0x9751A7DA,0x7893CCE4,0x93A477E7,0x7C661CD9,
+                0x8D6D4754,0x62AF2C6A,0x89989769,0x665AFC57,0x8486E72E,0x6B448C10,0x80733713,0x6FB15C2D,
+                0xB9148648,0x56D6ED76,0xBDE15675,0x52233D4B,0xB0FF2632,0x5F3D4D0C,0xB40AF60F,0x5BC89D31,
+                0xAAC3C6BC,0x4501AD82,0xAE361681,0x41F47DBF,0xA32866C6,0x4CEA0DF8,0xA7DDB6FB,0x481FDDC5,
+                0xD1E70470,0x3E256F4E,0xD512D44D,0x3AD0BF73,0xD80CA40A,0x37CECF34,0xDCF97437,0x333B1F09,
+                0xC2304484,0x2DF22FBA,0xC6C594B9,0x2907FF87,0xCBDBE4FE,0x24198FC0,0xCF2E34C3,0x20EC5FFD,
+                0xF6498598,0x198BEEA6,0xF2BC55A5,0x1D7E3E9B,0xFFA225E2,0x10604EDC,0xFB57F5DF,0x14959EE1,
+                0xE59EC56C,0x0A5CAE52,0xE16B1551,0x0EA97E6F,0xEC756516,0x03B70E28,0xE880B52B,0x0742DE15,
+                0xE6050901,0x09C7623F,0xE2F0D93C,0x0D32B202,0xEFEEA97B,0x002CC245,0xEB1B7946,0x04D91278,
+                0xF5D249F5,0x1A1022CB,0xF12799C8,0x1EE5F2F6,0xFC39E98F,0x13FB82B1,0xF8CC39B2,0x170E528C,
+                0xC1AB88E9,0x2E69E3D7,0xC55E58D4,0x2A9C33EA,0xC8402893,0x278243AD,0xCCB5F8AE,0x23779390,
+                0xD27CC81D,0x3DBEA323,0xD6891820,0x394B731E,0xDB976867,0x34550359,0xDF62B85A,0x30A0D364,
+                0xA9580AD1,0x469A61EF,0xADADDAEC,0x426FB1D2,0xA0B3AAAB,0x4F71C195,0xA4467A96,0x4B8411A8,
+                0xBA8F4A25,0x554D211B,0xBE7A9A18,0x51B8F126,0xB364EA5F,0x5CA68161,0xB7913A62,0x5853515C,
+                0x8EF68B39,0x6134E007,0x8A035B04,0x65C1303A,0x871D2B43,0x68DF407D,0x83E8FB7E,0x6C2A9040,
+                0x9D21CBCD,0x72E3A0F3,0x99D41BF0,0x761670CE,0x94CA6BB7,0x7B080089,0x903FBB8A,0x7FFDD0B4,
+                0x78BF0EA1,0x977D659F,0x7C4ADE9C,0x9388B5A2,0x7154AEDB,0x9E96C5E5,0x75A17EE6,0x9A6315D8,
+                0x6B684E55,0x84AA256B,0x6F9D9E68,0x805FF556,0x6283EE2F,0x8D418511,0x66763E12,0x89B4552C,
+                0x5F118F49,0xB0D3E477,0x5BE45F74,0xB426344A,0x56FA2F33,0xB938440D,0x520FFF0E,0xBDCD9430,
+                0x4CC6CFBD,0xA304A483,0x48331F80,0xA7F174BE,0x452D6FC7,0xAAEF04F9,0x41D8BFFA,0xAE1AD4C4,
+                0x37E20D71,0xD820664F,0x3317DD4C,0xDCD5B672,0x3E09AD0B,0xD1CBC635,0x3AFC7D36,0xD53E1608,
+                0x24354D85,0xCBF726BB,0x20C09DB8,0xCF02F686,0x2DDEEDFF,0xC21C86C1,0x292B3DC2,0xC6E956FC,
+                0x104C8C99,0xFF8EE7A7,0x14B95CA4,0xFB7B379A,0x19A72CE3,0xF66547DD,0x1D52FCDE,0xF29097E0,
+                0x039BCC6D,0xEC59A753,0x076E1C50,0xE8AC776E,0x0A706C17,0xE5B20729,0x0E85BC2A,0xE147D714,
+            },
+            {
+                0x00000000,0xC18EDFC0,0x586CB9C1,0x99E26601,0xB0D97382,0x7157AC42,0xE8B5CA43,0x293B1583,
+                0xBAC3E145,0x7B4D3E85,0xE2AF5884,0x23218744,0x0A1A92C7,0xCB944D07,0x52762B06,0x93F8F4C6,
+                0xAEF6C4CB,0x6F781B0B,0xF69A7D0A,0x3714A2CA,0x1E2FB749,0xDFA16889,0x46430E88,0x87CDD148,
+                0x1435258E,0xD5BBFA4E,0x4C599C4F,0x8DD7438F,0xA4EC560C,0x656289CC,0xFC80EFCD,0x3D0E300D,
+                0x869C8FD7,0x47125017,0xDEF03616,0x1F7EE9D6,0x3645FC55,0xF7CB2395,0x6E294594,0xAFA79A54,
+                0x3C5F6E92,0xFDD1B152,0x6433D753,0xA5BD0893,0x8C861D10,0x4D08C2D0,0xD4EAA4D1,0x15647B11,
+                0x286A4B1C,0xE9E494DC,0x7006F2DD,0xB1882D1D,0x98B3389E,0x593DE75E,0xC0DF815F,0x01515E9F,
+                0x92A9AA59,0x53277599,0xCAC51398,0x0B4BCC58,0x2270D9DB,0xE3FE061B,0x7A1C601A,0xBB92BFDA,
+                0xD64819EF,0x17C6C62F,0x8E24A02E,0x4FAA7FEE,0x66916A6D,0xA71FB5AD,0x3EFDD3AC,0xFF730C6C,
+                0x6C8BF8AA,0xAD05276A,0x34E7416B,0xF5699EAB,0xDC528B28,0x1DDC54E8,0x843E32E9,0x45B0ED29,
+                0x78BEDD24,0xB93002E4,0x20D264E5,0xE15CBB25,0xC867AEA6,0x09E97166,0x900B1767,0x5185C8A7,
+                0xC27D3C61,0x03F3E3A1,0x9A1185A0,0x5B9F5A60,0x72A44FE3,0xB32A9023,0x2AC8F622,0xEB4629E2,
+                0x50D49638,0x915A49F8,0x08B82FF9,0xC936F039,0xE00DE5BA,0x21833A7A,0xB8615C7B,0x79EF83BB,
+                0xEA17777D,0x2B99A8BD,0xB27BCEBC,0x73F5117C,0x5ACE04FF,0x9B40DB3F,0x02A2BD3E,0xC32C62FE,
+                0xFE2252F3,0x3FAC8D33,0xA64EEB32,0x67C034F2,0x4EFB2171,0x8F75FEB1,0x169798B0,0xD7194770,
+                0x44E1B3B6,0x856F6C76,0x1C8D0A77,0xDD03D5B7,0xF438C034,0x35B61FF4,0xAC5479F5,0x6DDAA635,
+                0x77E1359F,0xB66FEA5F,0x2F8D8C5E,0xEE03539E,0xC738461D,0x06B699DD,0x9F54FFDC,0x5EDA201C,
+                0xCD22D4DA,0x0CAC0B1A,0x954E6D1B,0x54C0B2DB,0x7DFBA758,0xBC757898,0x25971E99,0xE419C159,
+                0xD917F154,0x18992E94,0x817B4895,0x40F59755,0x69CE82D6,0xA8405D16,0x31A23B17,0xF02CE4D7,
+                0x63D41011,0xA25ACFD1,0x3BB8A9D0,0xFA367610,0xD30D6393,0x1283BC53,0x8B61DA52,0x4AEF0592,
+                0xF17DBA48,0x30F36588,0xA9110389,0x689FDC49,0x41A4C9CA,0x802A160A,0x19C8700B,0xD846AFCB,
+                0x4BBE5B0D,0x8A3084CD,0x13D2E2CC,0xD25C3D0C,0xFB67288F,0x3AE9F74F,0xA30B914E,0x62854E8E,
+                0x5F8B7E83,0x9E05A143,0x07E7C742,0xC6691882,0xEF520D01,0x2EDCD2C1,0xB73EB4C0,0x76B06B00,
+                0xE5489FC6,0x24C64006,0xBD242607,0x7CAAF9C7,0x5591EC44,0x941F3384,0x0DFD5585,0xCC738A45,
+                0xA1A92C70,0x6027F3B0,0xF9C595B1,0x384B4A71,0x11705FF2,0xD0FE8032,0x491CE633,0x889239F3,
+                0x1B6ACD35,0xDAE412F5,0x430674F4,0x8288AB34,0xABB3BEB7,0x6A3D6177,0xF3DF0776,0x3251D8B6,
+                0x0F5FE8BB,0xCED1377B,0x5733517A,0x96BD8EBA,0xBF869B39,0x7E0844F9,0xE7EA22F8,0x2664FD38,
+                0xB59C09FE,0x7412D63E,0xEDF0B03F,0x2C7E6FFF,0x05457A7C,0xC4CBA5BC,0x5D29C3BD,0x9CA71C7D,
+                0x2735A3A7,0xE6BB7C67,0x7F591A66,0xBED7C5A6,0x97ECD025,0x56620FE5,0xCF8069E4,0x0E0EB624,
+                0x9DF642E2,0x5C789D22,0xC59AFB23,0x041424E3,0x2D2F3160,0xECA1EEA0,0x754388A1,0xB4CD5761,
+                0x89C3676C,0x484DB8AC,0xD1AFDEAD,0x1021016D,0x391A14EE,0xF894CB2E,0x6176AD2F,0xA0F872EF,
+                0x33008629,0xF28E59E9,0x6B6C3FE8,0xAAE2E028,0x83D9F5AB,0x42572A6B,0xDBB54C6A,0x1A3B93AA,
+            },
+            {
+                0x00000000,0x9BA54C6F,0xEC3B9E9F,0x779ED2F0,0x03063B7F,0x98A37710,0xEF3DA5E0,0x7498E98F,
+                0x060C76FE,0x9DA93A91,0xEA37E861,0x7192A40E,0x050A4D81,0x9EAF01EE,0xE931D31E,0x72949F71,
+                0x0C18EDFC,0x97BDA193,0xE0237363,0x7B863F0C,0x0F1ED683,0x94BB9AEC,0xE325481C,0x78800473,
+                0x0A149B02,0x91B1D76D,0xE62F059D,0x7D8A49F2,0x0912A07D,0x92B7EC12,0xE5293EE2,0x7E8C728D,
+                0x1831DBF8,0x83949797,0xF40A4567,0x6FAF0908,0x1B37E087,0x8092ACE8,0xF70C7E18,0x6CA93277,
+                0x1E3DAD06,0x8598E169,0xF2063399,0x69A37FF6,0x1D3B9679,0x869EDA16,0xF10008E6,0x6AA54489,
+                0x14293604,0x8F8C7A6B,0xF812A89B,0x63B7E4F4,0x172F0D7B,0x8C8A4114,0xFB1493E4,0x60B1DF8B,
+                0x122540FA,0x89800C95,0xFE1EDE65,0x65BB920A,0x11237B85,0x8A8637EA,0xFD18E51A,0x66BDA975,
+                0x3063B7F0,0xABC6FB9F,0xDC58296F,0x47FD6500,0x33658C8F,0xA8C0C0E0,0xDF5E1210,0x44FB5E7F,
+                0x366FC10E,0xADCA8D61,0xDA545F91,0x41F113FE,0x3569FA71,0xAECCB61E,0xD95264EE,0x42F72881,
+                0x3C7B5A0C,0xA7DE1663,0xD040C493,0x4BE588FC,0x3F7D6173,0xA4D82D1C,0xD346FFEC,0x48E3B383,
+                0x3A772CF2,0xA1D2609D,0xD64CB26D,0x4DE9FE02,0x3971178D,0xA2D45BE2,0xD54A8912,0x4EEFC57D,
+                0x28526C08,0xB3F72067,0xC469F297,0x5FCCBEF8,0x2B545777,0xB0F11B18,0xC76FC9E8,0x5CCA8587,
+                0x2E5E1AF6,0xB5FB5699,0xC2658469,0x59C0C806,0x2D582189,0xB6FD6DE6,0xC163BF16,0x5AC6F379,
+                0x244A81F4,0xBFEFCD9B,0xC8711F6B,0x53D45304,0x274CBA8B,0xBCE9F6E4,0xCB772414,0x50D2687B,
+                0x2246F70A,0xB9E3BB65,0xCE7D6995,0x55D825FA,0x2140CC75,0xBAE5801A,0xCD7B52EA,0x56DE1E85,
+                0x60C76FE0,0xFB62238F,0x8CFCF17F,0x1759BD10,0x63C1549F,0xF86418F0,0x8FFACA00,0x145F866F,
+                0x66CB191E,0xFD6E5571,0x8AF08781,0x1155CBEE,0x65CD2261,0xFE686E0E,0x89F6BCFE,0x1253F091,
+                0x6CDF821C,0xF77ACE73,0x80E41C83,0x1B4150EC,0x6FD9B963,0xF47CF50C,0x83E227FC,0x18476B93,
+                0x6AD3F4E2,0xF176B88D,0x86E86A7D,0x1D4D2612,0x69D5CF9D,0xF27083F2,0x85EE5102,0x1E4B1D6D,
+                0x78F6B418,0xE353F877,0x94CD2A87,0x0F6866E8,0x7BF08F67,0xE055C308,0x97CB11F8,0x0C6E5D97,
+                0x7EFAC2E6,0xE55F8E89,0x92C15C79,0x09641016,0x7DFCF999,0xE659B5F6,0x91C76706,0x0A622B69,
+                0x74EE59E4,0xEF4B158B,0x98D5C77B,0x03708B14,0x77E8629B,0xEC4D2EF4,0x9BD3FC04,0x0076B06B,
+                0x72E22F1A,0xE9476375,0x9ED9B185,0x057CFDEA,0x71E41465,0xEA41580A,0x9DDF8AFA,0x067AC695,
+                0x50A4D810,0xCB01947F,0xBC9F468F,0x273A0AE0,0x53A2E36F,0xC807AF00,0xBF997DF0,0x243C319F,
+                0x56A8AEEE,0xCD0DE281,0xBA933071,0x21367C1E,0x55AE9591,0xCE0BD9FE,0xB9950B0E,0x22304761,
+                0x5CBC35EC,0xC7197983,0xB087AB73,0x2B22E71C,0x5FBA0E93,0xC41F42FC,0xB381900C,0x2824DC63,
+                0x5AB04312,0xC1150F7D,0xB68BDD8D,0x2D2E91E2,0x59B6786D,0xC2133402,0xB58DE6F2,0x2E28AA9D,
+                0x489503E8,0xD3304F87,0xA4AE9D77,0x3F0BD118,0x4B933897,0xD03674F8,0xA7A8A608,0x3C0DEA67,
+                0x4E997516,0xD53C3979,0xA2A2EB89,0x3907A7E6,0x4D9F4E69,0xD63A0206,0xA1A4D0F6,0x3A019C99,
+                0x448DEE14,0xDF28A27B,0xA8B6708B,0x33133CE4,0x478BD56B,0xDC2E9904,0xABB04BF4,0x3015079B,
+                0x428198EA,0xD924D485,0xAEBA0675,0x351F4A1A,0x4187A395,0xDA22EFFA,0xADBC3D0A,0x36197165,
+            },
+            {
+                0x00000000,0xDD96D985,0x605CB54B,0xBDCA6CCE,0xC0B96A96,0x1D2FB313,0xA0E5DFDD,0x7D730658,
+                0x5A03D36D,0x87950AE8,0x3A5F6626,0xE7C9BFA3,0x9ABAB9FB,0x472C607E,0xFAE60CB0,0x2770D535,
+                0xB407A6DA,0x69917F5F,0xD45B1391,0x09CDCA14,0x74BECC4C,0xA92815C9,0x14E27907,0xC974A082,
+                0xEE0475B7,0x3392AC32,0x8E58C0FC,0x53CE1979,0x2EBD1F21,0xF32BC6A4,0x4EE1AA6A,0x937773EF,
+                0xB37E4BF5,0x6EE89270,0xD322FEBE,0x0EB4273B,0x73C72163,0xAE51F8E6,0x139B9428,0xCE0D4DAD,
+                0xE97D9898,0x34EB411D,0x89212DD3,0x54B7F456,0x29C4F20E,0xF4522B8B,0x49984745,0x940E9EC0,
+                0x0779ED2F,0xDAEF34AA,0x67255864,0xBAB381E1,0xC7C087B9,0x1A565E3C,0xA79C32F2,0x7A0AEB77,
+                0x5D7A3E42,0x80ECE7C7,0x3D268B09,0xE0B0528C,0x9DC354D4,0x40558D51,0xFD9FE19F,0x2009381A,
+                0xBD8D91AB,0x601B482E,0xDDD124E0,0x0047FD65,0x7D34FB3D,0xA0A222B8,0x1D684E76,0xC0FE97F3,
+                0xE78E42C6,0x3A189B43,0x87D2F78D,0x5A442E08,0x27372850,0xFAA1F1D5,0x476B9D1B,0x9AFD449E,
+                0x098A3771,0xD41CEEF4,0x69D6823A,0xB4405BBF,0xC9335DE7,0x14A58462,0xA96FE8AC,0x74F93129,
+                0x5389E41C,0x8E1F3D99,0x33D55157,0xEE4388D2,0x93308E8A,0x4EA6570F,0xF36C3BC1,0x2EFAE244,
+                0x0EF3DA5E,0xD36503DB,0x6EAF6F15,0xB339B690,0xCE4AB0C8,0x13DC694D,0xAE160583,0x7380DC06,
+                0x54F00933,0x8966D0B6,0x34ACBC78,0xE93A65FD,0x944963A5,0x49DFBA20,0xF415D6EE,0x29830F6B,
+                0xBAF47C84,0x6762A501,0xDAA8C9CF,0x073E104A,0x7A4D1612,0xA7DBCF97,0x1A11A359,0xC7877ADC,
+                0xE0F7AFE9,0x3D61766C,0x80AB1AA2,0x5D3DC327,0x204EC57F,0xFDD81CFA,0x40127034,0x9D84A9B1,
+                0xA06A2517,0x7DFCFC92,0xC036905C,0x1DA049D9,0x60D34F81,0xBD459604,0x008FFACA,0xDD19234F,
+                0xFA69F67A,0x27FF2FFF,0x9A354331,0x47A39AB4,0x3AD09CEC,0xE7464569,0x5A8C29A7,0x871AF022,
+                0x146D83CD,0xC9FB5A48,0x74313686,0xA9A7EF03,0xD4D4E95B,0x094230DE,0xB4885C10,0x691E8595,
+                0x4E6E50A0,0x93F88925,0x2E32E5EB,0xF3A43C6E,0x8ED73A36,0x5341E3B3,0xEE8B8F7D,0x331D56F8,
+                0x13146EE2,0xCE82B767,0x7348DBA9,0xAEDE022C,0xD3AD0474,0x0E3BDDF1,0xB3F1B13F,0x6E6768BA,
+                0x4917BD8F,0x9481640A,0x294B08C4,0xF4DDD141,0x89AED719,0x54380E9C,0xE9F26252,0x3464BBD7,
+                0xA713C838,0x7A8511BD,0xC74F7D73,0x1AD9A4F6,0x67AAA2AE,0xBA3C7B2B,0x07F617E5,0xDA60CE60,
+                0xFD101B55,0x2086C2D0,0x9D4CAE1E,0x40DA779B,0x3DA971C3,0xE03FA846,0x5DF5C488,0x80631D0D,
+                0x1DE7B4BC,0xC0716D39,0x7DBB01F7,0xA02DD872,0xDD5EDE2A,0x00C807AF,0xBD026B61,0x6094B2E4,
+                0x47E467D1,0x9A72BE54,0x27B8D29A,0xFA2E0B1F,0x875D0D47,0x5ACBD4C2,0xE701B80C,0x3A976189,
+                0xA9E01266,0x7476CBE3,0xC9BCA72D,0x142A7EA8,0x695978F0,0xB4CFA175,0x0905CDBB,0xD493143E,
+                0xF3E3C10B,0x2E75188E,0x93BF7440,0x4E29ADC5,0x335AAB9D,0xEECC7218,0x53061ED6,0x8E90C753,
+                0xAE99FF49,0x730F26CC,0xCEC54A02,0x13539387,0x6E2095DF,0xB3B64C5A,0x0E7C2094,0xD3EAF911,
+                0xF49A2C24,0x290CF5A1,0x94C6996F,0x495040EA,0x342346B2,0xE9B59F37,0x547FF3F9,0x89E92A7C,
+                0x1A9E5993,0xC7088016,0x7AC2ECD8,0xA754355D,0xDA273305,0x07B1EA80,0xBA7B864E,0x67ED5FCB,
+                0x409D8AFE,0x9D0B537B,0x20C13FB5,0xFD57E630,0x8024E068,0x5DB239ED,0xE0785523,0x3DEE8CA6,
+            },
+            {
+                0x00000000,0x9D0FE176,0xE16EC4AD,0x7C6125DB,0x19AC8F1B,0x84A36E6D,0xF8C24BB6,0x65CDAAC0,
+                0x33591E36,0xAE56FF40,0xD237DA9B,0x4F383BED,0x2AF5912D,0xB7FA705B,0xCB9B5580,0x5694B4F6,
+                0x66B23C6C,0xFBBDDD1A,0x87DCF8C1,0x1AD319B7,0x7F1EB377,0xE2115201,0x9E7077DA,0x037F96AC,
+                0x55EB225A,0xC8E4C32C,0xB485E6F7,0x298A0781,0x4C47AD41,0xD1484C37,0xAD2969EC,0x3026889A,
+                0xCD6478D8,0x506B99AE,0x2C0ABC75,0xB1055D03,0xD4C8F7C3,0x49C716B5,0x35A6336E,0xA8A9D218,
+                0xFE3D66EE,0x63328798,0x1F53A243,0x825C4335,0xE791E9F5,0x7A9E0883,0x06FF2D58,0x9BF0CC2E,
+                0xABD644B4,0x36D9A5C2,0x4AB88019,0xD7B7616F,0xB27ACBAF,0x2F752AD9,0x53140F02,0xCE1BEE74,
+                0x988F5A82,0x0580BBF4,0x79E19E2F,0xE4EE7F59,0x8123D599,0x1C2C34EF,0x604D1134,0xFD42F042,
+                0x41B9F7F1,0xDCB61687,0xA0D7335C,0x3DD8D22A,0x581578EA,0xC51A999C,0xB97BBC47,0x24745D31,
+                0x72E0E9C7,0xEFEF08B1,0x938E2D6A,0x0E81CC1C,0x6B4C66DC,0xF64387AA,0x8A22A271,0x172D4307,
+                0x270BCB9D,0xBA042AEB,0xC6650F30,0x5B6AEE46,0x3EA74486,0xA3A8A5F0,0xDFC9802B,0x42C6615D,
+                0x1452D5AB,0x895D34DD,0xF53C1106,0x6833F070,0x0DFE5AB0,0x90F1BBC6,0xEC909E1D,0x719F7F6B,
+                0x8CDD8F29,0x11D26E5F,0x6DB34B84,0xF0BCAAF2,0x95710032,0x087EE144,0x741FC49F,0xE91025E9,
+                0xBF84911F,0x228B7069,0x5EEA55B2,0xC3E5B4C4,0xA6281E04,0x3B27FF72,0x4746DAA9,0xDA493BDF,
+                0xEA6FB345,0x77605233,0x0B0177E8,0x960E969E,0xF3C33C5E,0x6ECCDD28,0x12ADF8F3,0x8FA21985,
+                0xD936AD73,0x44394C05,0x385869DE,0xA55788A8,0xC09A2268,0x5D95C31E,0x21F4E6C5,0xBCFB07B3,
+                0x8373EFE2,0x1E7C0E94,0x621D2B4F,0xFF12CA39,0x9ADF60F9,0x07D0818F,0x7BB1A454,0xE6BE4522,
+                0xB02AF1D4,0x2D2510A2,0x51443579,0xCC4BD40F,0xA9867ECF,0x34899FB9,0x48E8BA62,0xD5E75B14,
+                0xE5C1D38E,0x78CE32F8,0x04AF1723,0x99A0F655,0xFC6D5C95,0x6162BDE3,0x1D039838,0x800C794E,
+                0xD698CDB8,0x4B972CCE,0x37F60915,0xAAF9E863,0xCF3442A3,0x523BA3D5,0x2E5A860E,0xB3556778,
+                0x4E17973A,0xD318764C,0xAF795397,0x3276B2E1,0x57BB1821,0xCAB4F957,0xB6D5DC8C,0x2BDA3DFA,
+                0x7D4E890C,0xE041687A,0x9C204DA1,0x012FACD7,0x64E20617,0xF9EDE761,0x858CC2BA,0x188323CC,
+                0x28A5AB56,0xB5AA4A20,0xC9CB6FFB,0x54C48E8D,0x3109244D,0xAC06C53B,0xD067E0E0,0x4D680196,
+                0x1BFCB560,0x86F35416,0xFA9271CD,0x679D90BB,0x02503A7B,0x9F5FDB0D,0xE33EFED6,0x7E311FA0,
+                0xC2CA1813,0x5FC5F965,0x23A4DCBE,0xBEAB3DC8,0xDB669708,0x4669767E,0x3A0853A5,0xA707B2D3,
+                0xF1930625,0x6C9CE753,0x10FDC288,0x8DF223FE,0xE83F893E,0x75306848,0x09514D93,0x945EACE5,
+                0xA478247F,0x3977C509,0x4516E0D2,0xD81901A4,0xBDD4AB64,0x20DB4A12,0x5CBA6FC9,0xC1B58EBF,
+                0x97213A49,0x0A2EDB3F,0x764FFEE4,0xEB401F92,0x8E8DB552,0x13825424,0x6FE371FF,0xF2EC9089,
+                0x0FAE60CB,0x92A181BD,0xEEC0A466,0x73CF4510,0x1602EFD0,0x8B0D0EA6,0xF76C2B7D,0x6A63CA0B,
+                0x3CF77EFD,0xA1F89F8B,0xDD99BA50,0x40965B26,0x255BF1E6,0xB8541090,0xC435354B,0x593AD43D,
+                0x691C5CA7,0xF413BDD1,0x8872980A,0x157D797C,0x70B0D3BC,0xEDBF32CA,0x91DE1711,0x0CD1F667,
+                0x5A454291,0xC74AA3E7,0xBB2B863C,0x2624674A,0x43E9CD8A,0xDEE62CFC,0xA2870927,0x3F88E851,
+            },
+            {
+                0x00000000,0xB9FBDBE8,0xA886B191,0x117D6A79,0x8A7C6563,0x3387BE8B,0x22FAD4F2,0x9B010F1A,
+                0xCF89CC87,0x7672176F,0x670F7D16,0xDEF4A6FE,0x45F5A9E4,0xFC0E720C,0xED731875,0x5488C39D,
+                0x44629F4F,0xFD9944A7,0xECE42EDE,0x551FF536,0xCE1EFA2C,0x77E521C4,0x66984BBD,0xDF639055,
+                0x8BEB53C8,0x32108820,0x236DE259,0x9A9639B1,0x019736AB,0xB86CED43,0xA911873A,0x10EA5CD2,
+                0x88C53E9E,0x313EE576,0x20438F0F,0x99B854E7,0x02B95BFD,0xBB428015,0xAA3FEA6C,0x13C43184,
+                0x474CF219,0xFEB729F1,0xEFCA4388,0x56319860,0xCD30977A,0x74CB4C92,0x65B626EB,0xDC4DFD03,
+                0xCCA7A1D1,0x755C7A39,0x64211040,0xDDDACBA8,0x46DBC4B2,0xFF201F5A,0xEE5D7523,0x57A6AECB,
+                0x032E6D56,0xBAD5B6BE,0xABA8DCC7,0x1253072F,0x89520835,0x30A9D3DD,0x21D4B9A4,0x982F624C,
+                0xCAFB7B7D,0x7300A095,0x627DCAEC,0xDB861104,0x40871E1E,0xF97CC5F6,0xE801AF8F,0x51FA7467,
+                0x0572B7FA,0xBC896C12,0xADF4066B,0x140FDD83,0x8F0ED299,0x36F50971,0x27886308,0x9E73B8E0,
+                0x8E99E432,0x37623FDA,0x261F55A3,0x9FE48E4B,0x04E58151,0xBD1E5AB9,0xAC6330C0,0x1598EB28,
+                0x411028B5,0xF8EBF35D,0xE9969924,0x506D42CC,0xCB6C4DD6,0x7297963E,0x63EAFC47,0xDA1127AF,
+                0x423E45E3,0xFBC59E0B,0xEAB8F472,0x53432F9A,0xC8422080,0x71B9FB68,0x60C49111,0xD93F4AF9,
+                0x8DB78964,0x344C528C,0x253138F5,0x9CCAE31D,0x07CBEC07,0xBE3037EF,0xAF4D5D96,0x16B6867E,
+                0x065CDAAC,0xBFA70144,0xAEDA6B3D,0x1721B0D5,0x8C20BFCF,0x35DB6427,0x24A60E5E,0x9D5DD5B6,
+                0xC9D5162B,0x702ECDC3,0x6153A7BA,0xD8A87C52,0x43A97348,0xFA52A8A0,0xEB2FC2D9,0x52D41931,
+                0x4E87F0BB,0xF77C2B53,0xE601412A,0x5FFA9AC2,0xC4FB95D8,0x7D004E30,0x6C7D2449,0xD586FFA1,
+                0x810E3C3C,0x38F5E7D4,0x29888DAD,0x90735645,0x0B72595F,0xB28982B7,0xA3F4E8CE,0x1A0F3326,
+                0x0AE56FF4,0xB31EB41C,0xA263DE65,0x1B98058D,0x80990A97,0x3962D17F,0x281FBB06,0x91E460EE,
+                0xC56CA373,0x7C97789B,0x6DEA12E2,0xD411C90A,0x4F10C610,0xF6EB1DF8,0xE7967781,0x5E6DAC69,
+                0xC642CE25,0x7FB915CD,0x6EC47FB4,0xD73FA45C,0x4C3EAB46,0xF5C570AE,0xE4B81AD7,0x5D43C13F,
+                0x09CB02A2,0xB030D94A,0xA14DB333,0x18B668DB,0x83B767C1,0x3A4CBC29,0x2B31D650,0x92CA0DB8,
+                0x8220516A,0x3BDB8A82,0x2AA6E0FB,0x935D3B13,0x085C3409,0xB1A7EFE1,0xA0DA8598,0x19215E70,
+                0x4DA99DED,0xF4524605,0xE52F2C7C,0x5CD4F794,0xC7D5F88E,0x7E2E2366,0x6F53491F,0xD6A892F7,
+                0x847C8BC6,0x3D87502E,0x2CFA3A57,0x9501E1BF,0x0E00EEA5,0xB7FB354D,0xA6865F34,0x1F7D84DC,
+                0x4BF54741,0xF20E9CA9,0xE373F6D0,0x5A882D38,0xC1892222,0x7872F9CA,0x690F93B3,0xD0F4485B,
+                0xC01E1489,0x79E5CF61,0x6898A518,0xD1637EF0,0x4A6271EA,0xF399AA02,0xE2E4C07B,0x5B1F1B93,
+                0x0F97D80E,0xB66C03E6,0xA711699F,0x1EEAB277,0x85EBBD6D,0x3C106685,0x2D6D0CFC,0x9496D714,
+                0x0CB9B558,0xB5426EB0,0xA43F04C9,0x1DC4DF21,0x86C5D03B,0x3F3E0BD3,0x2E4361AA,0x97B8BA42,
+                0xC33079DF,0x7ACBA237,0x6BB6C84E,0xD24D13A6,0x494C1CBC,0xF0B7C754,0xE1CAAD2D,0x583176C5,
+                0x48DB2A17,0xF120F1FF,0xE05D9B86,0x59A6406E,0xC2A74F74,0x7B5C949C,0x6A21FEE5,0xD3DA250D,
+                0x8752E690,0x3EA93D78,0x2FD45701,0x962F8CE9,0x0D2E83F3,0xB4D5581B,0xA5A83262,0x1C53E98A,
+            },
+            {
+                0x00000000,0xAE689191,0x87A02563,0x29C8B4F2,0xD4314C87,0x7A59DD16,0x539169E4,0xFDF9F875,
+                0x73139F4F,0xDD7B0EDE,0xF4B3BA2C,0x5ADB2BBD,0xA722D3C8,0x094A4259,0x2082F6AB,0x8EEA673A,
+                0xE6273E9E,0x484FAF0F,0x61871BFD,0xCFEF8A6C,0x32167219,0x9C7EE388,0xB5B6577A,0x1BDEC6EB,
+                0x9534A1D1,0x3B5C3040,0x129484B2,0xBCFC1523,0x4105ED56,0xEF6D7CC7,0xC6A5C835,0x68CD59A4,
+                0x173F7B7D,0xB957EAEC,0x909F5E1E,0x3EF7CF8F,0xC30E37FA,0x6D66A66B,0x44AE1299,0xEAC68308,
+                0x642CE432,0xCA4475A3,0xE38CC151,0x4DE450C0,0xB01DA8B5,0x1E753924,0x37BD8DD6,0x99D51C47,
+                0xF11845E3,0x5F70D472,0x76B86080,0xD8D0F111,0x25290964,0x8B4198F5,0xA2892C07,0x0CE1BD96,
+                0x820BDAAC,0x2C634B3D,0x05ABFFCF,0xABC36E5E,0x563A962B,0xF85207BA,0xD19AB348,0x7FF222D9,
+                0x2E7EF6FA,0x8016676B,0xA9DED399,0x07B64208,0xFA4FBA7D,0x54272BEC,0x7DEF9F1E,0xD3870E8F,
+                0x5D6D69B5,0xF305F824,0xDACD4CD6,0x74A5DD47,0x895C2532,0x2734B4A3,0x0EFC0051,0xA09491C0,
+                0xC859C864,0x663159F5,0x4FF9ED07,0xE1917C96,0x1C6884E3,0xB2001572,0x9BC8A180,0x35A03011,
+                0xBB4A572B,0x1522C6BA,0x3CEA7248,0x9282E3D9,0x6F7B1BAC,0xC1138A3D,0xE8DB3ECF,0x46B3AF5E,
+                0x39418D87,0x97291C16,0xBEE1A8E4,0x10893975,0xED70C100,0x43185091,0x6AD0E463,0xC4B875F2,
+                0x4A5212C8,0xE43A8359,0xCDF237AB,0x639AA63A,0x9E635E4F,0x300BCFDE,0x19C37B2C,0xB7ABEABD,
+                0xDF66B319,0x710E2288,0x58C6967A,0xF6AE07EB,0x0B57FF9E,0xA53F6E0F,0x8CF7DAFD,0x229F4B6C,
+                0xAC752C56,0x021DBDC7,0x2BD50935,0x85BD98A4,0x784460D1,0xD62CF140,0xFFE445B2,0x518CD423,
+                0x5CFDEDF4,0xF2957C65,0xDB5DC897,0x75355906,0x88CCA173,0x26A430E2,0x0F6C8410,0xA1041581,
+                0x2FEE72BB,0x8186E32A,0xA84E57D8,0x0626C649,0xFBDF3E3C,0x55B7AFAD,0x7C7F1B5F,0xD2178ACE,
+                0xBADAD36A,0x14B242FB,0x3D7AF609,0x93126798,0x6EEB9FED,0xC0830E7C,0xE94BBA8E,0x47232B1F,
+                0xC9C94C25,0x67A1DDB4,0x4E696946,0xE001F8D7,0x1DF800A2,0xB3909133,0x9A5825C1,0x3430B450,
+                0x4BC29689,0xE5AA0718,0xCC62B3EA,0x620A227B,0x9FF3DA0E,0x319B4B9F,0x1853FF6D,0xB63B6EFC,
+                0x38D109C6,0x96B99857,0xBF712CA5,0x1119BD34,0xECE04541,0x4288D4D0,0x6B406022,0xC528F1B3,
+                0xADE5A817,0x038D3986,0x2A458D74,0x842D1CE5,0x79D4E490,0xD7BC7501,0xFE74C1F3,0x501C5062,
+                0xDEF63758,0x709EA6C9,0x5956123B,0xF73E83AA,0x0AC77BDF,0xA4AFEA4E,0x8D675EBC,0x230FCF2D,
+                0x72831B0E,0xDCEB8A9F,0xF5233E6D,0x5B4BAFFC,0xA6B25789,0x08DAC618,0x211272EA,0x8F7AE37B,
+                0x01908441,0xAFF815D0,0x8630A122,0x285830B3,0xD5A1C8C6,0x7BC95957,0x5201EDA5,0xFC697C34,
+                0x94A42590,0x3ACCB401,0x130400F3,0xBD6C9162,0x40956917,0xEEFDF886,0xC7354C74,0x695DDDE5,
+                0xE7B7BADF,0x49DF2B4E,0x60179FBC,0xCE7F0E2D,0x3386F658,0x9DEE67C9,0xB426D33B,0x1A4E42AA,
+                0x65BC6073,0xCBD4F1E2,0xE21C4510,0x4C74D481,0xB18D2CF4,0x1FE5BD65,0x362D0997,0x98459806,
+                0x16AFFF3C,0xB8C76EAD,0x910FDA5F,0x3F674BCE,0xC29EB3BB,0x6CF6222A,0x453E96D8,0xEB560749,
+                0x839B5EED,0x2DF3CF7C,0x043B7B8E,0xAA53EA1F,0x57AA126A,0xF9C283FB,0xD00A3709,0x7E62A698,
+                0xF088C1A2,0x5EE05033,0x7728E4C1,0xD9407550,0x24B98D25,0x8AD11CB4,0xA319A846,0x0D7139D7,
+            }
+        };
+
+        uint32_t Crc32(const void* src, size_t size)
+        {
+            const uint8_t* p8 = (const uint8_t*)src;
+            uint32_t crc = 0xFFFFFFFF;
+
+            for (; ((uintptr_t)p8 & (sizeof(uint32_t) - 1)) != 0 && size > 0; ++p8, --size)
+                crc = Crc32Table[0][(crc ^ *p8) & 0xFF] ^ (crc >> 8);
+
+            const uint32_t* p32 = (const uint32_t*)p8;
+            for (; size >= 16; size -= 16)
+            {
+#ifdef SIMD_BIG_ENDIAN
+                uint32_t v0 = *p32++ ^ Reorder32(crc);
+                uint32_t v1 = *p32++;
+                uint32_t v2 = *p32++;
+                uint32_t v3 = *p32++;
+                crc = 
+                    Crc32Table[0x0][v3 & 0xFF] ^
+                    Crc32Table[0x1][(v3 >> 8) & 0xFF] ^
+                    Crc32Table[0x2][(v3 >> 16) & 0xFF] ^
+                    Crc32Table[0x3][(v3 >> 24) & 0xFF] ^
+                    Crc32Table[0x4][v2 & 0xFF] ^
+                    Crc32Table[0x5][(v2 >> 8) & 0xFF] ^
+                    Crc32Table[0x6][(v2 >> 16) & 0xFF] ^
+                    Crc32Table[0x7][(v2 >> 24) & 0xFF] ^
+                    Crc32Table[0x8][v1 & 0xFF] ^
+                    Crc32Table[0x9][(v1 >> 8) & 0xFF] ^
+                    Crc32Table[0xA][(v1 >> 16) & 0xFF] ^
+                    Crc32Table[0xB][(v1 >> 24) & 0xFF] ^
+                    Crc32Table[0xC][v0 & 0xFF] ^
+                    Crc32Table[0xD][(v0 >> 8) & 0xFF] ^
+                    Crc32Table[0xE][(v0 >> 16) & 0xFF] ^
+                    Crc32Table[0xF][(v0 >> 24) & 0xFF];
+#else
+                uint32_t v0 = *p32++ ^ crc;
+                uint32_t v1 = *p32++;
+                uint32_t v2 = *p32++;
+                uint32_t v3 = *p32++;
+                crc = 
+                    Crc32Table[0x0][(v3 >> 24) & 0xFF] ^
+                    Crc32Table[0x1][(v3 >> 16) & 0xFF] ^
+                    Crc32Table[0x2][(v3 >> 8) & 0xFF] ^
+                    Crc32Table[0x3][v3 & 0xFF] ^
+                    Crc32Table[0x4][(v2 >> 24) & 0xFF] ^
+                    Crc32Table[0x5][(v2 >> 16) & 0xFF] ^
+                    Crc32Table[0x6][(v2 >> 8) & 0xFF] ^
+                    Crc32Table[0x7][v2 & 0xFF] ^
+                    Crc32Table[0x8][(v1 >> 24) & 0xFF] ^
+                    Crc32Table[0x9][(v1 >> 16) & 0xFF] ^
+                    Crc32Table[0xA][(v1 >> 8) & 0xFF] ^
+                    Crc32Table[0xB][v1 & 0xFF] ^
+                    Crc32Table[0xC][(v0 >> 24) & 0xFF] ^
+                    Crc32Table[0xD][(v0 >> 16) & 0xFF] ^
+                    Crc32Table[0xE][(v0 >> 8) & 0xFF] ^
+                    Crc32Table[0xF][v0 & 0xFF];
+#endif
+            }
+
+            for (p8 = (const uint8_t*)p32; size > 0; ++p8, size--)
+                crc = Crc32Table[0][(crc ^ *p8) & 0xFF] ^ (crc >> 8);
+
+            return (~crc);
+        }
+
+        //---------------------------------------------------------------------
+
+        // Precalculated CRC32c lookup table for polynomial 0x1EDC6F41 (castagnoli-crc).
+        static const uint32_t Crc32cTable[8][256] =
+        {
+            {
+                0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
+                0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
+                0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+                0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
+                0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
+                0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+                0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
+                0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
+                0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+                0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
+                0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
+                0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+                0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
+                0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
+                0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+                0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
+                0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
+                0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+                0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
+                0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
+                0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+                0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
+                0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
+                0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
+                0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
+                0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
+                0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
+                0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
+                0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
+                0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
+                0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
+                0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351
+            },
+            {
+                0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945,
+                0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd,
+                0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4,
+                0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c,
+                0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47,
+                0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff,
+                0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6,
+                0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e,
+                0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41,
+                0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9,
+                0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0,
+                0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78,
+                0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43,
+                0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb,
+                0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2,
+                0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a,
+                0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc,
+                0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004,
+                0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d,
+                0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185,
+                0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be,
+                0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306,
+                0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f,
+                0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287,
+                0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8,
+                0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600,
+                0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439,
+                0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781,
+                0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba,
+                0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502,
+                0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b,
+                0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483
+            },
+            {
+                0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469,
+                0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac,
+                0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3,
+                0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726,
+                0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d,
+                0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8,
+                0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7,
+                0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32,
+                0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0,
+                0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75,
+                0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a,
+                0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff,
+                0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4,
+                0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161,
+                0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e,
+                0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb,
+                0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a,
+                0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def,
+                0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0,
+                0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065,
+                0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e,
+                0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb,
+                0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4,
+                0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71,
+                0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3,
+                0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36,
+                0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79,
+                0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc,
+                0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7,
+                0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622,
+                0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d,
+                0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8
+            },
+            {
+                0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca,
+                0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c,
+                0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7,
+                0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11,
+                0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41,
+                0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7,
+                0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c,
+                0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a,
+                0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d,
+                0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb,
+                0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610,
+                0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6,
+                0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6,
+                0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040,
+                0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b,
+                0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d,
+                0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5,
+                0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213,
+                0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8,
+                0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e,
+                0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e,
+                0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698,
+                0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443,
+                0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5,
+                0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12,
+                0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4,
+                0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f,
+                0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9,
+                0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99,
+                0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f,
+                0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4,
+                0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842
+            },
+            {
+                0x00000000, 0x38116fac, 0x7022df58, 0x4833b0f4, 0xe045beb0, 0xd854d11c, 0x906761e8, 0xa8760e44,
+                0xc5670b91, 0xfd76643d, 0xb545d4c9, 0x8d54bb65, 0x2522b521, 0x1d33da8d, 0x55006a79, 0x6d1105d5,
+                0x8f2261d3, 0xb7330e7f, 0xff00be8b, 0xc711d127, 0x6f67df63, 0x5776b0cf, 0x1f45003b, 0x27546f97,
+                0x4a456a42, 0x725405ee, 0x3a67b51a, 0x0276dab6, 0xaa00d4f2, 0x9211bb5e, 0xda220baa, 0xe2336406,
+                0x1ba8b557, 0x23b9dafb, 0x6b8a6a0f, 0x539b05a3, 0xfbed0be7, 0xc3fc644b, 0x8bcfd4bf, 0xb3debb13,
+                0xdecfbec6, 0xe6ded16a, 0xaeed619e, 0x96fc0e32, 0x3e8a0076, 0x069b6fda, 0x4ea8df2e, 0x76b9b082,
+                0x948ad484, 0xac9bbb28, 0xe4a80bdc, 0xdcb96470, 0x74cf6a34, 0x4cde0598, 0x04edb56c, 0x3cfcdac0,
+                0x51eddf15, 0x69fcb0b9, 0x21cf004d, 0x19de6fe1, 0xb1a861a5, 0x89b90e09, 0xc18abefd, 0xf99bd151,
+                0x37516aae, 0x0f400502, 0x4773b5f6, 0x7f62da5a, 0xd714d41e, 0xef05bbb2, 0xa7360b46, 0x9f2764ea,
+                0xf236613f, 0xca270e93, 0x8214be67, 0xba05d1cb, 0x1273df8f, 0x2a62b023, 0x625100d7, 0x5a406f7b,
+                0xb8730b7d, 0x806264d1, 0xc851d425, 0xf040bb89, 0x5836b5cd, 0x6027da61, 0x28146a95, 0x10050539,
+                0x7d1400ec, 0x45056f40, 0x0d36dfb4, 0x3527b018, 0x9d51be5c, 0xa540d1f0, 0xed736104, 0xd5620ea8,
+                0x2cf9dff9, 0x14e8b055, 0x5cdb00a1, 0x64ca6f0d, 0xccbc6149, 0xf4ad0ee5, 0xbc9ebe11, 0x848fd1bd,
+                0xe99ed468, 0xd18fbbc4, 0x99bc0b30, 0xa1ad649c, 0x09db6ad8, 0x31ca0574, 0x79f9b580, 0x41e8da2c,
+                0xa3dbbe2a, 0x9bcad186, 0xd3f96172, 0xebe80ede, 0x439e009a, 0x7b8f6f36, 0x33bcdfc2, 0x0badb06e,
+                0x66bcb5bb, 0x5eadda17, 0x169e6ae3, 0x2e8f054f, 0x86f90b0b, 0xbee864a7, 0xf6dbd453, 0xcecabbff,
+                0x6ea2d55c, 0x56b3baf0, 0x1e800a04, 0x269165a8, 0x8ee76bec, 0xb6f60440, 0xfec5b4b4, 0xc6d4db18,
+                0xabc5decd, 0x93d4b161, 0xdbe70195, 0xe3f66e39, 0x4b80607d, 0x73910fd1, 0x3ba2bf25, 0x03b3d089,
+                0xe180b48f, 0xd991db23, 0x91a26bd7, 0xa9b3047b, 0x01c50a3f, 0x39d46593, 0x71e7d567, 0x49f6bacb,
+                0x24e7bf1e, 0x1cf6d0b2, 0x54c56046, 0x6cd40fea, 0xc4a201ae, 0xfcb36e02, 0xb480def6, 0x8c91b15a,
+                0x750a600b, 0x4d1b0fa7, 0x0528bf53, 0x3d39d0ff, 0x954fdebb, 0xad5eb117, 0xe56d01e3, 0xdd7c6e4f,
+                0xb06d6b9a, 0x887c0436, 0xc04fb4c2, 0xf85edb6e, 0x5028d52a, 0x6839ba86, 0x200a0a72, 0x181b65de,
+                0xfa2801d8, 0xc2396e74, 0x8a0ade80, 0xb21bb12c, 0x1a6dbf68, 0x227cd0c4, 0x6a4f6030, 0x525e0f9c,
+                0x3f4f0a49, 0x075e65e5, 0x4f6dd511, 0x777cbabd, 0xdf0ab4f9, 0xe71bdb55, 0xaf286ba1, 0x9739040d,
+                0x59f3bff2, 0x61e2d05e, 0x29d160aa, 0x11c00f06, 0xb9b60142, 0x81a76eee, 0xc994de1a, 0xf185b1b6,
+                0x9c94b463, 0xa485dbcf, 0xecb66b3b, 0xd4a70497, 0x7cd10ad3, 0x44c0657f, 0x0cf3d58b, 0x34e2ba27,
+                0xd6d1de21, 0xeec0b18d, 0xa6f30179, 0x9ee26ed5, 0x36946091, 0x0e850f3d, 0x46b6bfc9, 0x7ea7d065,
+                0x13b6d5b0, 0x2ba7ba1c, 0x63940ae8, 0x5b856544, 0xf3f36b00, 0xcbe204ac, 0x83d1b458, 0xbbc0dbf4,
+                0x425b0aa5, 0x7a4a6509, 0x3279d5fd, 0x0a68ba51, 0xa21eb415, 0x9a0fdbb9, 0xd23c6b4d, 0xea2d04e1,
+                0x873c0134, 0xbf2d6e98, 0xf71ede6c, 0xcf0fb1c0, 0x6779bf84, 0x5f68d028, 0x175b60dc, 0x2f4a0f70,
+                0xcd796b76, 0xf56804da, 0xbd5bb42e, 0x854adb82, 0x2d3cd5c6, 0x152dba6a, 0x5d1e0a9e, 0x650f6532,
+                0x081e60e7, 0x300f0f4b, 0x783cbfbf, 0x402dd013, 0xe85bde57, 0xd04ab1fb, 0x9879010f, 0xa0686ea3
+            },
+            {
+                0x00000000, 0xef306b19, 0xdb8ca0c3, 0x34bccbda, 0xb2f53777, 0x5dc55c6e, 0x697997b4, 0x8649fcad,
+                0x6006181f, 0x8f367306, 0xbb8ab8dc, 0x54bad3c5, 0xd2f32f68, 0x3dc34471, 0x097f8fab, 0xe64fe4b2,
+                0xc00c303e, 0x2f3c5b27, 0x1b8090fd, 0xf4b0fbe4, 0x72f90749, 0x9dc96c50, 0xa975a78a, 0x4645cc93,
+                0xa00a2821, 0x4f3a4338, 0x7b8688e2, 0x94b6e3fb, 0x12ff1f56, 0xfdcf744f, 0xc973bf95, 0x2643d48c,
+                0x85f4168d, 0x6ac47d94, 0x5e78b64e, 0xb148dd57, 0x370121fa, 0xd8314ae3, 0xec8d8139, 0x03bdea20,
+                0xe5f20e92, 0x0ac2658b, 0x3e7eae51, 0xd14ec548, 0x570739e5, 0xb83752fc, 0x8c8b9926, 0x63bbf23f,
+                0x45f826b3, 0xaac84daa, 0x9e748670, 0x7144ed69, 0xf70d11c4, 0x183d7add, 0x2c81b107, 0xc3b1da1e,
+                0x25fe3eac, 0xcace55b5, 0xfe729e6f, 0x1142f576, 0x970b09db, 0x783b62c2, 0x4c87a918, 0xa3b7c201,
+                0x0e045beb, 0xe13430f2, 0xd588fb28, 0x3ab89031, 0xbcf16c9c, 0x53c10785, 0x677dcc5f, 0x884da746,
+                0x6e0243f4, 0x813228ed, 0xb58ee337, 0x5abe882e, 0xdcf77483, 0x33c71f9a, 0x077bd440, 0xe84bbf59,
+                0xce086bd5, 0x213800cc, 0x1584cb16, 0xfab4a00f, 0x7cfd5ca2, 0x93cd37bb, 0xa771fc61, 0x48419778,
+                0xae0e73ca, 0x413e18d3, 0x7582d309, 0x9ab2b810, 0x1cfb44bd, 0xf3cb2fa4, 0xc777e47e, 0x28478f67,
+                0x8bf04d66, 0x64c0267f, 0x507ceda5, 0xbf4c86bc, 0x39057a11, 0xd6351108, 0xe289dad2, 0x0db9b1cb,
+                0xebf65579, 0x04c63e60, 0x307af5ba, 0xdf4a9ea3, 0x5903620e, 0xb6330917, 0x828fc2cd, 0x6dbfa9d4,
+                0x4bfc7d58, 0xa4cc1641, 0x9070dd9b, 0x7f40b682, 0xf9094a2f, 0x16392136, 0x2285eaec, 0xcdb581f5,
+                0x2bfa6547, 0xc4ca0e5e, 0xf076c584, 0x1f46ae9d, 0x990f5230, 0x763f3929, 0x4283f2f3, 0xadb399ea,
+                0x1c08b7d6, 0xf338dccf, 0xc7841715, 0x28b47c0c, 0xaefd80a1, 0x41cdebb8, 0x75712062, 0x9a414b7b,
+                0x7c0eafc9, 0x933ec4d0, 0xa7820f0a, 0x48b26413, 0xcefb98be, 0x21cbf3a7, 0x1577387d, 0xfa475364,
+                0xdc0487e8, 0x3334ecf1, 0x0788272b, 0xe8b84c32, 0x6ef1b09f, 0x81c1db86, 0xb57d105c, 0x5a4d7b45,
+                0xbc029ff7, 0x5332f4ee, 0x678e3f34, 0x88be542d, 0x0ef7a880, 0xe1c7c399, 0xd57b0843, 0x3a4b635a,
+                0x99fca15b, 0x76ccca42, 0x42700198, 0xad406a81, 0x2b09962c, 0xc439fd35, 0xf08536ef, 0x1fb55df6,
+                0xf9fab944, 0x16cad25d, 0x22761987, 0xcd46729e, 0x4b0f8e33, 0xa43fe52a, 0x90832ef0, 0x7fb345e9,
+                0x59f09165, 0xb6c0fa7c, 0x827c31a6, 0x6d4c5abf, 0xeb05a612, 0x0435cd0b, 0x308906d1, 0xdfb96dc8,
+                0x39f6897a, 0xd6c6e263, 0xe27a29b9, 0x0d4a42a0, 0x8b03be0d, 0x6433d514, 0x508f1ece, 0xbfbf75d7,
+                0x120cec3d, 0xfd3c8724, 0xc9804cfe, 0x26b027e7, 0xa0f9db4a, 0x4fc9b053, 0x7b757b89, 0x94451090,
+                0x720af422, 0x9d3a9f3b, 0xa98654e1, 0x46b63ff8, 0xc0ffc355, 0x2fcfa84c, 0x1b736396, 0xf443088f,
+                0xd200dc03, 0x3d30b71a, 0x098c7cc0, 0xe6bc17d9, 0x60f5eb74, 0x8fc5806d, 0xbb794bb7, 0x544920ae,
+                0xb206c41c, 0x5d36af05, 0x698a64df, 0x86ba0fc6, 0x00f3f36b, 0xefc39872, 0xdb7f53a8, 0x344f38b1,
+                0x97f8fab0, 0x78c891a9, 0x4c745a73, 0xa344316a, 0x250dcdc7, 0xca3da6de, 0xfe816d04, 0x11b1061d,
+                0xf7fee2af, 0x18ce89b6, 0x2c72426c, 0xc3422975, 0x450bd5d8, 0xaa3bbec1, 0x9e87751b, 0x71b71e02,
+                0x57f4ca8e, 0xb8c4a197, 0x8c786a4d, 0x63480154, 0xe501fdf9, 0x0a3196e0, 0x3e8d5d3a, 0xd1bd3623,
+                0x37f2d291, 0xd8c2b988, 0xec7e7252, 0x034e194b, 0x8507e5e6, 0x6a378eff, 0x5e8b4525, 0xb1bb2e3c
+            },
+            {
+                0x00000000, 0x68032cc8, 0xd0065990, 0xb8057558, 0xa5e0c5d1, 0xcde3e919, 0x75e69c41, 0x1de5b089,
+                0x4e2dfd53, 0x262ed19b, 0x9e2ba4c3, 0xf628880b, 0xebcd3882, 0x83ce144a, 0x3bcb6112, 0x53c84dda,
+                0x9c5bfaa6, 0xf458d66e, 0x4c5da336, 0x245e8ffe, 0x39bb3f77, 0x51b813bf, 0xe9bd66e7, 0x81be4a2f,
+                0xd27607f5, 0xba752b3d, 0x02705e65, 0x6a7372ad, 0x7796c224, 0x1f95eeec, 0xa7909bb4, 0xcf93b77c,
+                0x3d5b83bd, 0x5558af75, 0xed5dda2d, 0x855ef6e5, 0x98bb466c, 0xf0b86aa4, 0x48bd1ffc, 0x20be3334,
+                0x73767eee, 0x1b755226, 0xa370277e, 0xcb730bb6, 0xd696bb3f, 0xbe9597f7, 0x0690e2af, 0x6e93ce67,
+                0xa100791b, 0xc90355d3, 0x7106208b, 0x19050c43, 0x04e0bcca, 0x6ce39002, 0xd4e6e55a, 0xbce5c992,
+                0xef2d8448, 0x872ea880, 0x3f2bddd8, 0x5728f110, 0x4acd4199, 0x22ce6d51, 0x9acb1809, 0xf2c834c1,
+                0x7ab7077a, 0x12b42bb2, 0xaab15eea, 0xc2b27222, 0xdf57c2ab, 0xb754ee63, 0x0f519b3b, 0x6752b7f3,
+                0x349afa29, 0x5c99d6e1, 0xe49ca3b9, 0x8c9f8f71, 0x917a3ff8, 0xf9791330, 0x417c6668, 0x297f4aa0,
+                0xe6ecfddc, 0x8eefd114, 0x36eaa44c, 0x5ee98884, 0x430c380d, 0x2b0f14c5, 0x930a619d, 0xfb094d55,
+                0xa8c1008f, 0xc0c22c47, 0x78c7591f, 0x10c475d7, 0x0d21c55e, 0x6522e996, 0xdd279cce, 0xb524b006,
+                0x47ec84c7, 0x2fefa80f, 0x97eadd57, 0xffe9f19f, 0xe20c4116, 0x8a0f6dde, 0x320a1886, 0x5a09344e,
+                0x09c17994, 0x61c2555c, 0xd9c72004, 0xb1c40ccc, 0xac21bc45, 0xc422908d, 0x7c27e5d5, 0x1424c91d,
+                0xdbb77e61, 0xb3b452a9, 0x0bb127f1, 0x63b20b39, 0x7e57bbb0, 0x16549778, 0xae51e220, 0xc652cee8,
+                0x959a8332, 0xfd99affa, 0x459cdaa2, 0x2d9ff66a, 0x307a46e3, 0x58796a2b, 0xe07c1f73, 0x887f33bb,
+                0xf56e0ef4, 0x9d6d223c, 0x25685764, 0x4d6b7bac, 0x508ecb25, 0x388de7ed, 0x808892b5, 0xe88bbe7d,
+                0xbb43f3a7, 0xd340df6f, 0x6b45aa37, 0x034686ff, 0x1ea33676, 0x76a01abe, 0xcea56fe6, 0xa6a6432e,
+                0x6935f452, 0x0136d89a, 0xb933adc2, 0xd130810a, 0xccd53183, 0xa4d61d4b, 0x1cd36813, 0x74d044db,
+                0x27180901, 0x4f1b25c9, 0xf71e5091, 0x9f1d7c59, 0x82f8ccd0, 0xeafbe018, 0x52fe9540, 0x3afdb988,
+                0xc8358d49, 0xa036a181, 0x1833d4d9, 0x7030f811, 0x6dd54898, 0x05d66450, 0xbdd31108, 0xd5d03dc0,
+                0x8618701a, 0xee1b5cd2, 0x561e298a, 0x3e1d0542, 0x23f8b5cb, 0x4bfb9903, 0xf3feec5b, 0x9bfdc093,
+                0x546e77ef, 0x3c6d5b27, 0x84682e7f, 0xec6b02b7, 0xf18eb23e, 0x998d9ef6, 0x2188ebae, 0x498bc766,
+                0x1a438abc, 0x7240a674, 0xca45d32c, 0xa246ffe4, 0xbfa34f6d, 0xd7a063a5, 0x6fa516fd, 0x07a63a35,
+                0x8fd9098e, 0xe7da2546, 0x5fdf501e, 0x37dc7cd6, 0x2a39cc5f, 0x423ae097, 0xfa3f95cf, 0x923cb907,
+                0xc1f4f4dd, 0xa9f7d815, 0x11f2ad4d, 0x79f18185, 0x6414310c, 0x0c171dc4, 0xb412689c, 0xdc114454,
+                0x1382f328, 0x7b81dfe0, 0xc384aab8, 0xab878670, 0xb66236f9, 0xde611a31, 0x66646f69, 0x0e6743a1,
+                0x5daf0e7b, 0x35ac22b3, 0x8da957eb, 0xe5aa7b23, 0xf84fcbaa, 0x904ce762, 0x2849923a, 0x404abef2,
+                0xb2828a33, 0xda81a6fb, 0x6284d3a3, 0x0a87ff6b, 0x17624fe2, 0x7f61632a, 0xc7641672, 0xaf673aba,
+                0xfcaf7760, 0x94ac5ba8, 0x2ca92ef0, 0x44aa0238, 0x594fb2b1, 0x314c9e79, 0x8949eb21, 0xe14ac7e9,
+                0x2ed97095, 0x46da5c5d, 0xfedf2905, 0x96dc05cd, 0x8b39b544, 0xe33a998c, 0x5b3fecd4, 0x333cc01c,
+                0x60f48dc6, 0x08f7a10e, 0xb0f2d456, 0xd8f1f89e, 0xc5144817, 0xad1764df, 0x15121187, 0x7d113d4f
+            },
+            {
+                0x00000000, 0x493c7d27, 0x9278fa4e, 0xdb448769, 0x211d826d, 0x6821ff4a, 0xb3657823, 0xfa590504,
+                0x423b04da, 0x0b0779fd, 0xd043fe94, 0x997f83b3, 0x632686b7, 0x2a1afb90, 0xf15e7cf9, 0xb86201de,
+                0x847609b4, 0xcd4a7493, 0x160ef3fa, 0x5f328edd, 0xa56b8bd9, 0xec57f6fe, 0x37137197, 0x7e2f0cb0,
+                0xc64d0d6e, 0x8f717049, 0x5435f720, 0x1d098a07, 0xe7508f03, 0xae6cf224, 0x7528754d, 0x3c14086a,
+                0x0d006599, 0x443c18be, 0x9f789fd7, 0xd644e2f0, 0x2c1de7f4, 0x65219ad3, 0xbe651dba, 0xf759609d,
+                0x4f3b6143, 0x06071c64, 0xdd439b0d, 0x947fe62a, 0x6e26e32e, 0x271a9e09, 0xfc5e1960, 0xb5626447,
+                0x89766c2d, 0xc04a110a, 0x1b0e9663, 0x5232eb44, 0xa86bee40, 0xe1579367, 0x3a13140e, 0x732f6929,
+                0xcb4d68f7, 0x827115d0, 0x593592b9, 0x1009ef9e, 0xea50ea9a, 0xa36c97bd, 0x782810d4, 0x31146df3,
+                0x1a00cb32, 0x533cb615, 0x8878317c, 0xc1444c5b, 0x3b1d495f, 0x72213478, 0xa965b311, 0xe059ce36,
+                0x583bcfe8, 0x1107b2cf, 0xca4335a6, 0x837f4881, 0x79264d85, 0x301a30a2, 0xeb5eb7cb, 0xa262caec,
+                0x9e76c286, 0xd74abfa1, 0x0c0e38c8, 0x453245ef, 0xbf6b40eb, 0xf6573dcc, 0x2d13baa5, 0x642fc782,
+                0xdc4dc65c, 0x9571bb7b, 0x4e353c12, 0x07094135, 0xfd504431, 0xb46c3916, 0x6f28be7f, 0x2614c358,
+                0x1700aeab, 0x5e3cd38c, 0x857854e5, 0xcc4429c2, 0x361d2cc6, 0x7f2151e1, 0xa465d688, 0xed59abaf,
+                0x553baa71, 0x1c07d756, 0xc743503f, 0x8e7f2d18, 0x7426281c, 0x3d1a553b, 0xe65ed252, 0xaf62af75,
+                0x9376a71f, 0xda4ada38, 0x010e5d51, 0x48322076, 0xb26b2572, 0xfb575855, 0x2013df3c, 0x692fa21b,
+                0xd14da3c5, 0x9871dee2, 0x4335598b, 0x0a0924ac, 0xf05021a8, 0xb96c5c8f, 0x6228dbe6, 0x2b14a6c1,
+                0x34019664, 0x7d3deb43, 0xa6796c2a, 0xef45110d, 0x151c1409, 0x5c20692e, 0x8764ee47, 0xce589360,
+                0x763a92be, 0x3f06ef99, 0xe44268f0, 0xad7e15d7, 0x572710d3, 0x1e1b6df4, 0xc55fea9d, 0x8c6397ba,
+                0xb0779fd0, 0xf94be2f7, 0x220f659e, 0x6b3318b9, 0x916a1dbd, 0xd856609a, 0x0312e7f3, 0x4a2e9ad4,
+                0xf24c9b0a, 0xbb70e62d, 0x60346144, 0x29081c63, 0xd3511967, 0x9a6d6440, 0x4129e329, 0x08159e0e,
+                0x3901f3fd, 0x703d8eda, 0xab7909b3, 0xe2457494, 0x181c7190, 0x51200cb7, 0x8a648bde, 0xc358f6f9,
+                0x7b3af727, 0x32068a00, 0xe9420d69, 0xa07e704e, 0x5a27754a, 0x131b086d, 0xc85f8f04, 0x8163f223,
+                0xbd77fa49, 0xf44b876e, 0x2f0f0007, 0x66337d20, 0x9c6a7824, 0xd5560503, 0x0e12826a, 0x472eff4d,
+                0xff4cfe93, 0xb67083b4, 0x6d3404dd, 0x240879fa, 0xde517cfe, 0x976d01d9, 0x4c2986b0, 0x0515fb97,
+                0x2e015d56, 0x673d2071, 0xbc79a718, 0xf545da3f, 0x0f1cdf3b, 0x4620a21c, 0x9d642575, 0xd4585852,
+                0x6c3a598c, 0x250624ab, 0xfe42a3c2, 0xb77edee5, 0x4d27dbe1, 0x041ba6c6, 0xdf5f21af, 0x96635c88,
+                0xaa7754e2, 0xe34b29c5, 0x380faeac, 0x7133d38b, 0x8b6ad68f, 0xc256aba8, 0x19122cc1, 0x502e51e6,
+                0xe84c5038, 0xa1702d1f, 0x7a34aa76, 0x3308d751, 0xc951d255, 0x806daf72, 0x5b29281b, 0x1215553c,
+                0x230138cf, 0x6a3d45e8, 0xb179c281, 0xf845bfa6, 0x021cbaa2, 0x4b20c785, 0x906440ec, 0xd9583dcb,
+                0x613a3c15, 0x28064132, 0xf342c65b, 0xba7ebb7c, 0x4027be78, 0x091bc35f, 0xd25f4436, 0x9b633911,
+                0xa777317b, 0xee4b4c5c, 0x350fcb35, 0x7c33b612, 0x866ab316, 0xcf56ce31, 0x14124958, 0x5d2e347f,
+                0xe54c35a1, 0xac704886, 0x7734cfef, 0x3e08b2c8, 0xc451b7cc, 0x8d6dcaeb, 0x56294d82, 0x1f1530a5
+            }
+        };
+
+        uint32_t Crc32c(const void* src, size_t size)
+        {
+            const uint8_t* p8 = (const uint8_t*)src;
+            uint32_t crc = 0xFFFFFFFF;
+
+            for (; ((uintptr_t)p8 & (sizeof(uint32_t) - 1)) != 0 && size > 0; ++p8, --size)
+                crc = Crc32cTable[0][(crc ^ *p8) & 0xFF] ^ (crc >> 8);
+
+            const uint32_t* p32 = (const uint32_t*)p8;
+            for (; size >= 8; size -= 8)
+            {
+#ifdef SIMD_BIG_ENDIAN
+                uint32_t v0 = *p32++ ^ Reorder32(crc);
+                uint32_t v1 = *p32++;
+                crc =
+                    Crc32cTable[0x0][v1 & 0xFF] ^
+                    Crc32cTable[0x1][(v1 >> 8) & 0xFF] ^
+                    Crc32cTable[0x2][(v1 >> 16) & 0xFF] ^
+                    Crc32cTable[0x3][(v1 >> 24) & 0xFF] ^
+                    Crc32cTable[0x4][v0 & 0xFF] ^
+                    Crc32cTable[0x5][(v0 >> 8) & 0xFF] ^
+                    Crc32cTable[0x6][(v0 >> 16) & 0xFF] ^
+                    Crc32cTable[0x7][(v0 >> 24) & 0xFF];
+#else
+                uint32_t v0 = *p32++ ^ crc;
+                uint32_t v1 = *p32++;
+                crc =
+                    Crc32cTable[0x0][(v1 >> 24) & 0xFF] ^
+                    Crc32cTable[0x1][(v1 >> 16) & 0xFF] ^
+                    Crc32cTable[0x2][(v1 >> 8) & 0xFF] ^
+                    Crc32cTable[0x3][v1 & 0xFF] ^
+                    Crc32cTable[0x4][(v0 >> 24) & 0xFF] ^
+                    Crc32cTable[0x5][(v0 >> 16) & 0xFF] ^
+                    Crc32cTable[0x6][(v0 >> 8) & 0xFF] ^
+                    Crc32cTable[0x7][v0 & 0xFF];
+#endif
+            }
+
+            for (p8 = (const uint8_t*)p32; size > 0; ++p8, size--)
+                crc = Crc32cTable[0][(crc ^ *p8) & 0xFF] ^ (crc >> 8);
+
+            return (~crc);
+        }
+    }
+}
diff --git a/3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp b/3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp
new file mode 100644
index 0000000000..b064ca50a2
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseImageLoad.cpp
@@ -0,0 +1,371 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdArray.h"
+#include "Simd/SimdCpu.h"
+#include "Simd/SimdBase.h"
+
+#include <stdio.h>
+
+#if defined(_MSC_VER)
+#pragma warning (push)
+#pragma warning (disable: 4996)
+#endif
+
+namespace Simd
+{
+    uint8_t* ImageLoadFromFile(const ImageLoadFromMemoryPtr loader, const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+    {
+        uint8_t* data = NULL;
+        ::FILE* file = ::fopen(path, "rb");
+        if (file)
+        {
+            ::fseek(file, 0, SEEK_END);
+            Array8u buffer(::ftell(file));
+            ::fseek(file, 0, SEEK_SET);
+            if (::fread(buffer.data, 1, buffer.size, file) == buffer.size)
+                data = loader(buffer.data, buffer.size, stride, width, height, format);
+            ::fclose(file);
+        }
+        return data;
+    }
+
+    //-------------------------------------------------------------------------
+
+    ImageLoaderParam::ImageLoaderParam(const uint8_t* d, size_t s, SimdPixelFormatType f)
+        : data(d)
+        , size(s)
+        , format(f)
+        , file(SimdImageFileUndefined)
+    {
+    }
+
+    bool ImageLoaderParam::Validate()
+    {
+        if (size >= 3)
+        {
+            if (data[0] == 'P' && data[2] == '\n')
+            {
+                if (data[1] == '2')
+                    file = SimdImageFilePgmTxt;
+                if (data[1] == '3')
+                    file = SimdImageFilePpmTxt;
+                if (data[1] == '5')
+                    file = SimdImageFilePgmBin;
+                if (data[1] == '6')
+                    file = SimdImageFilePpmBin;
+            }
+        }
+        if (size >= 8)
+        {
+            const uint8_t SIGNATURE[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+            if(memcmp(data, SIGNATURE, 8) == 0)
+                file = SimdImageFilePng;
+        }
+        if (size >= 2)
+        {
+            if (data[0] == 0xFF && data[1] == 0xD8)
+                file = SimdImageFileJpeg;
+        }
+        return
+            file != SimdImageFileUndefined && 
+                (format == SimdPixelFormatNone || format == SimdPixelFormatGray8 || 
+                format == SimdPixelFormatBgr24 || format == SimdPixelFormatBgra32 || 
+                format == SimdPixelFormatRgb24 || format == SimdPixelFormatRgba32);
+    }
+        
+    namespace Base
+    {
+        ImagePxmLoader::ImagePxmLoader(const ImageLoaderParam& param)
+            : ImageLoader(param)
+            , _toAny(NULL)
+            , _toBgra(NULL)
+        {
+        }
+
+        bool ImagePxmLoader::ReadHeader(size_t version)
+        {
+            if (_stream.Size() < 3 ||
+                _stream.Data()[0] != 'P' ||
+                _stream.Data()[1] != '0' + version ||
+                _stream.Data()[2] != '\n')
+                return false;
+            _stream.Seek(3);
+            uint32_t width, height, max;
+            if (!(_stream.ReadUnsigned(width) && _stream.ReadUnsigned(height) && _stream.ReadUnsigned(max)))
+                return false;
+            if (!(width > 0 && height > 0 && max == 255))
+                return false;
+            uint8_t byte;
+            if (!(_stream.Read(byte) && byte == '\n'))
+                return false;
+            _image.Recreate(width, height, (Image::Format)_param.format);
+            _block = height;
+            if (_param.file == SimdImageFilePgmTxt || _param.file == SimdImageFilePgmBin)
+            {
+                _size = width * 1;
+                if (_param.format != SimdPixelFormatGray8)
+                {
+                    _block = Simd::RestrictRange<size_t>(Base::AlgCacheL1() / _size, 1, height);
+                    _buffer.Resize(_block * _size);
+                }
+            }
+            else if (_param.file == SimdImageFilePpmTxt || _param.file == SimdImageFilePpmBin)
+            {
+                _size = width * 3;
+                if (_param.format != SimdPixelFormatRgb24)
+                {
+                    _block = Simd::RestrictRange<size_t>(Base::AlgCacheL1() / _size, 1, height);
+                    _buffer.Resize(_block * _size);
+                }
+            }
+            else
+                return false;
+            SetConverters();
+            return true;
+        }
+
+        //-------------------------------------------------------------------------
+
+        ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param)
+            : ImagePxmLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatGray8;
+        }
+
+        bool ImagePgmTxtLoader::FromStream()
+        {
+            if (!ReadHeader(2))
+                return false;
+            size_t grayStride = _param.format == SimdPixelFormatGray8 ? _image.stride : _size;
+            for (size_t row = 0; row < _image.height;)
+            {
+                size_t block = Simd::Min(row + _block, _image.height) - row;
+                uint8_t * gray = _param.format == SimdPixelFormatGray8 ? _image.Row<uint8_t>(row) : _buffer.data;
+                for (size_t b = 0; b < block; ++b)
+                {
+                    for (size_t i = 0; i < _size; ++i)
+                    {
+                        if (!_stream.ReadUnsigned(gray[i]))
+                            return false;
+                    }
+                    gray += grayStride;
+                }
+                if(_param.format == SimdPixelFormatBgr24 || _param.format == SimdPixelFormatRgb24)
+                    _toAny(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride);
+                if (_param.format == SimdPixelFormatBgra32 || _param.format == SimdPixelFormatRgba32)
+                    _toBgra(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride, 0xFF);
+                row += block;
+            }
+            return true;
+        }
+
+        void ImagePgmTxtLoader::SetConverters()
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24: _toAny = Base::GrayToBgr; break;
+            case SimdPixelFormatBgra32: _toBgra = Base::GrayToBgra; break;
+            case SimdPixelFormatRgb24: _toAny = Base::GrayToBgr; break;
+            case SimdPixelFormatRgba32: _toBgra = Base::GrayToBgra; break;
+            default: break;
+            }
+        }
+
+        //-------------------------------------------------------------------------
+
+        ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param)
+            : ImagePxmLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatGray8;
+        }
+
+        bool ImagePgmBinLoader::FromStream()
+        {
+            if (!ReadHeader(5))
+                return false;
+            size_t grayStride = _param.format == SimdPixelFormatGray8 ? _image.stride : _size;
+            for (size_t row = 0; row < _image.height;)
+            {
+                size_t block = Simd::Min(row + _block, _image.height) - row;
+                uint8_t* gray = _param.format == SimdPixelFormatGray8 ? _image.Row<uint8_t>(row) : _buffer.data;
+                for (size_t b = 0; b < block; ++b)
+                {
+                    if (_stream.Read(_size, gray) != _size)
+                        return false;
+                    gray += grayStride;
+                }
+                if (_param.format == SimdPixelFormatBgr24 || _param.format == SimdPixelFormatRgb24)
+                    _toAny(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride);
+                if (_param.format == SimdPixelFormatBgra32 || _param.format == SimdPixelFormatRgba32)
+                    _toBgra(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride, 0xFF);
+                row += block;
+            }
+            return true;
+        }
+
+        void ImagePgmBinLoader::SetConverters()
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24: _toAny = Base::GrayToBgr; break;
+            case SimdPixelFormatBgra32: _toBgra = Base::GrayToBgra; break;
+            case SimdPixelFormatRgb24: _toAny = Base::GrayToBgr; break;
+            case SimdPixelFormatRgba32: _toBgra = Base::GrayToBgra; break;
+            default: break;
+            }
+        }
+
+        //-------------------------------------------------------------------------
+
+        ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param)
+            : ImagePxmLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatRgb24;
+        }
+
+        bool ImagePpmTxtLoader::FromStream()
+        {
+            if (!ReadHeader(3))
+                return false;
+            size_t rgbStride = _param.format == SimdPixelFormatRgb24 ? _image.stride : _size;
+            for (size_t row = 0; row < _image.height;)
+            {
+                size_t block = Simd::Min(row + _block, _image.height) - row;
+                uint8_t* rgb = _param.format == SimdPixelFormatRgb24 ? _image.Row<uint8_t>(row) : _buffer.data;
+                for (size_t b = 0; b < block; ++b)
+                {
+                    for (size_t i = 0; i < _size; ++i)
+                    {
+                        if (!_stream.ReadUnsigned(rgb[i]))
+                            return false;
+                    }
+                    rgb += rgbStride;
+                }
+                if (_param.format == SimdPixelFormatGray8 || _param.format == SimdPixelFormatBgr24)
+                    _toAny(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride);
+                if (_param.format == SimdPixelFormatBgra32 || _param.format == SimdPixelFormatRgba32)
+                    _toBgra(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride, 0xFF);
+                row += block;
+            }
+            return true;
+        }
+
+        void ImagePpmTxtLoader::SetConverters()
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatGray8: _toAny = Base::RgbToGray; break;
+            case SimdPixelFormatBgr24: _toAny = Base::BgrToRgb; break;
+            case SimdPixelFormatBgra32: _toBgra = Base::RgbToBgra; break;
+            case SimdPixelFormatRgba32: _toBgra = Base::BgrToBgra; break;
+            default: break;
+            }
+        }
+
+        //-------------------------------------------------------------------------
+
+        ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param)
+            : ImagePxmLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatRgb24;
+        }
+
+        bool ImagePpmBinLoader::FromStream()
+        {
+            if (!ReadHeader(6))
+                return false;
+            size_t rgbStride = _param.format == SimdPixelFormatRgb24 ? _image.stride : _size;
+            for (size_t row = 0; row < _image.height;)
+            {
+                size_t block = Simd::Min(row + _block, _image.height) - row;
+                uint8_t* rgb = _param.format == SimdPixelFormatRgb24 ? _image.Row<uint8_t>(row) : _buffer.data;
+                for (size_t b = 0; b < block; ++b)
+                {
+                    if (_stream.Read(_size, rgb) != _size)
+                        return false;
+                    rgb += rgbStride;
+                }
+                if (_param.format == SimdPixelFormatGray8 || _param.format == SimdPixelFormatBgr24)
+                    _toAny(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride);
+                if (_param.format == SimdPixelFormatBgra32 || _param.format == SimdPixelFormatRgba32)
+                    _toBgra(_buffer.data, _image.width, block, _size, _image.Row<uint8_t>(row), _image.stride, 0xFF);
+                row += block;
+            }
+            return true;
+        }
+
+        void ImagePpmBinLoader::SetConverters()
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatGray8: _toAny = Base::RgbToGray; break;
+            case SimdPixelFormatBgr24: _toAny = Base::BgrToRgb; break;
+            case SimdPixelFormatBgra32: _toBgra = Base::RgbToBgra; break;
+            case SimdPixelFormatRgba32: _toBgra = Base::BgrToBgra; break;
+            default: break;
+            }
+        }
+
+        //-------------------------------------------------------------------------
+
+        ImageLoader* CreateImageLoader(const ImageLoaderParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtLoader(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinLoader(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtLoader(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinLoader(param);
+            case SimdImageFilePng: return new ImagePngLoader(param);
+            case SimdImageFileJpeg: return new ImageJpegLoader(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+        {
+            ImageLoaderParam param(data, size, *format);
+            if (param.Validate())
+            {
+                Holder<ImageLoader> loader(CreateImageLoader(param));
+                if (loader)
+                {
+                    if (loader->FromStream())
+                        return loader->Release(stride, width, height, format);
+                }
+            }
+            return NULL;
+        }
+    }
+}
+
+#if defined(_MSC_VER)
+#pragma warning (pop)
+#endif
diff --git a/3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp b/3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp
new file mode 100644
index 0000000000..88c5da73d0
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseImageLoadJpeg.cpp
@@ -0,0 +1,2456 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdArray.h"
+#include "Simd/SimdCpu.h"
+#include "Simd/SimdBase.h"
+
+namespace Simd
+{
+    namespace Base
+    {
+#if defined(SIMD_X64_ENABLE) && !defined(SIMD_SSE2_DISABLE)
+#define JPEG_SSE2
+        static int jpeg__sse2_available(void)
+        {
+            return 1;
+        }
+#endif
+
+#if defined(SIMD_ARM64_ENABLE) && !defined(SIMD_NEON_DISABLE)
+#define JPEG_NEON
+#endif
+
+        typedef unsigned char jpeg_uc;
+        typedef unsigned short jpeg_us;
+        typedef unsigned short jpeg__uint16;
+        typedef   signed short jpeg__int16;
+        typedef unsigned int   jpeg__uint32;
+        typedef   signed int   jpeg__int32;
+
+        typedef struct
+        {
+            int      (*read)  (void* user, char* data, int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
+            void     (*skip)  (void* user, int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+            int      (*eof)   (void* user);                       // returns nonzero if we are at end of file/data
+        } jpeg_io_callbacks;
+
+#define jpeg_inline SIMD_INLINE
+#define JPEG_ASSERT assert
+
+#ifdef _MSC_VER
+#define JPEG_NOTUSED(v)  (void)(v)
+#else
+#define JPEG_NOTUSED(v)  (void)sizeof(v)
+#endif
+
+        typedef struct
+        {
+            jpeg__uint32 img_x, img_y;
+            int img_n, img_out_n;
+
+            jpeg_io_callbacks io;
+            void* io_user_data;
+
+            int read_from_callbacks;
+            int buflen;
+            jpeg_uc buffer_start[128];
+            int callback_already_read;
+
+            jpeg_uc* img_buffer, * img_buffer_end;
+            jpeg_uc* img_buffer_original, * img_buffer_original_end;
+        } jpeg__context;
+
+        static int jpeg__err(const char* str)
+        {
+            //jpeg__g_failure_reason = str;
+            return 0;
+        }
+
+        static int jpeg__err(const char* str1, const char* str2)
+        {
+            //jpeg__g_failure_reason = str;
+            return 0;
+        }
+
+#define jpeg__errpuc(x,y)  ((unsigned char *)(size_t) (jpeg__err(x,y)?NULL:NULL))
+
+        static void jpeg__refill_buffer(jpeg__context* s)
+        {
+            int n = (s->io.read)(s->io_user_data, (char*)s->buffer_start, s->buflen);
+            s->callback_already_read += (int)(s->img_buffer - s->img_buffer_original);
+            if (n == 0) {
+                // at end of file, treat same as if from memory, but need to handle case
+                // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
+                s->read_from_callbacks = 0;
+                s->img_buffer = s->buffer_start;
+                s->img_buffer_end = s->buffer_start + 1;
+                *s->img_buffer = 0;
+            }
+            else {
+                s->img_buffer = s->buffer_start;
+                s->img_buffer_end = s->buffer_start + n;
+            }
+        }
+
+        jpeg_inline static jpeg_uc jpeg__get8(jpeg__context* s)
+        {
+            if (s->img_buffer < s->img_buffer_end)
+                return *s->img_buffer++;
+            if (s->read_from_callbacks) {
+                jpeg__refill_buffer(s);
+                return *s->img_buffer++;
+            }
+            return 0;
+        }
+
+#define jpeg_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
+
+#define JPEG_SIMD_ALIGN(type, name) SIMD_ALIGNED(16) type name
+
+        static int jpeg__get16be(jpeg__context* s)
+        {
+            int z = jpeg__get8(s);
+            return (z << 8) + jpeg__get8(s);
+        }
+
+        static void jpeg__skip(jpeg__context* s, int n)
+        {
+            if (n == 0) return;  // already there!
+            if (n < 0) {
+                s->img_buffer = s->img_buffer_end;
+                return;
+            }
+            if (s->io.read) {
+                int blen = (int)(s->img_buffer_end - s->img_buffer);
+                if (blen < n) {
+                    s->img_buffer = s->img_buffer_end;
+                    (s->io.skip)(s->io_user_data, n - blen);
+                    return;
+                }
+            }
+            s->img_buffer += n;
+        }
+
+        jpeg_inline static int jpeg__at_eof(jpeg__context* s)
+        {
+            if (s->io.read) {
+                if (!(s->io.eof)(s->io_user_data)) return 0;
+                // if feof() is true, check if buffer = end
+                // special case: we've only got the special 0 character at the end
+                if (s->read_from_callbacks == 0) return 1;
+            }
+
+            return s->img_buffer >= s->img_buffer_end;
+        }
+
+#define JPEG_MALLOC(sz)           malloc(sz)
+#define JPEG_REALLOC(p,newsz)     realloc(p,newsz)
+#define JPEG_FREE(p)              free(p)
+
+#define JPEG_MAX_DIMENSIONS (1 << 24)
+
+        enum
+        {
+            JPEG__SCAN_load = 0,
+            JPEG__SCAN_type,
+            JPEG__SCAN_header
+        };
+
+        static void* jpeg__malloc(size_t size)
+        {
+            return JPEG_MALLOC(size);
+        }
+
+        static int jpeg__addsizes_valid(int a, int b)
+        {
+            if (b < 0) return 0;
+            // now 0 <= b <= INT_MAX, hence also
+            // 0 <= INT_MAX - b <= INTMAX.
+            // And "a + b <= INT_MAX" (which might overflow) is the
+            // same as a <= INT_MAX - b (no overflow)
+            return a <= INT_MAX - b;
+        }
+
+        static int jpeg__mul2sizes_valid(int a, int b)
+        {
+            if (a < 0 || b < 0) return 0;
+            if (b == 0) return 1; // mul-by-0 is always safe
+            // portable way to check for no overflows in a*b
+            return a <= INT_MAX / b;
+        }
+
+        static int jpeg__mad2sizes_valid(int a, int b, int add)
+        {
+            return jpeg__mul2sizes_valid(a, b) && jpeg__addsizes_valid(a * b, add);
+        }
+
+        // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
+        static int jpeg__mad3sizes_valid(int a, int b, int c, int add)
+        {
+            return jpeg__mul2sizes_valid(a, b) && jpeg__mul2sizes_valid(a * b, c) &&
+                jpeg__addsizes_valid(a * b * c, add);
+        }
+
+        static int jpeg__mad4sizes_valid(int a, int b, int c, int d, int add)
+        {
+            return jpeg__mul2sizes_valid(a, b) && jpeg__mul2sizes_valid(a * b, c) &&
+                jpeg__mul2sizes_valid(a * b * c, d) && jpeg__addsizes_valid(a * b * c * d, add);
+        }
+
+        static void* jpeg__malloc_mad2(int a, int b, int add)
+        {
+            if (!jpeg__mad2sizes_valid(a, b, add)) return NULL;
+            return jpeg__malloc(a * b + add);
+        }
+
+        static void* jpeg__malloc_mad3(int a, int b, int c, int add)
+        {
+            if (!jpeg__mad3sizes_valid(a, b, c, add)) return NULL;
+            return jpeg__malloc(a * b * c + add);
+        }
+
+        static jpeg_uc jpeg__compute_y(int r, int g, int b)
+        {
+            return (jpeg_uc)(((r * 77) + (g * 150) + (29 * b)) >> 8);
+        }
+
+        typedef struct
+        {
+            int bits_per_channel;
+            int num_channels;
+            int channel_order;
+        } jpeg__result_info;
+
+        static void jpeg__rewind(jpeg__context* s)
+        {
+            // conceptually rewind SHOULD rewind to the beginning of the stream,
+            // but we just rewind to the beginning of the initial buffer, because
+            // we only use it after doing 'test', which only ever looks at at most 92 bytes
+            s->img_buffer = s->img_buffer_original;
+            s->img_buffer_end = s->img_buffer_original_end;
+        }
+
+        //------------------------------------------------------------------------------
+
+        // huffman decoding acceleration
+#define FAST_BITS   9  // larger handles more cases; smaller stomps less cache
+
+        typedef struct
+        {
+            jpeg_uc  fast[1 << FAST_BITS];
+            // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
+            jpeg__uint16 code[256];
+            jpeg_uc  values[256];
+            jpeg_uc  size[257];
+            unsigned int maxcode[18];
+            int    delta[17];   // old 'firstsymbol' - old 'firstcode'
+        } jpeg__huffman;
+
+        typedef struct
+        {
+            jpeg__context* s;
+            jpeg__huffman huff_dc[4];
+            jpeg__huffman huff_ac[4];
+            jpeg__uint16 dequant[4][64];
+            jpeg__int16 fast_ac[4][1 << FAST_BITS];
+
+            // sizes for components, interleaved MCUs
+            int img_h_max, img_v_max;
+            int img_mcu_x, img_mcu_y;
+            int img_mcu_w, img_mcu_h;
+
+            // definition of jpeg image component
+            struct
+            {
+                int id;
+                int h, v;
+                int tq;
+                int hd, ha;
+                int dc_pred;
+
+                int x, y, w2, h2;
+                jpeg_uc* data;
+                void* raw_data, * raw_coeff;
+                jpeg_uc* linebuf;
+                short* coeff;   // progressive only
+                int      coeff_w, coeff_h; // number of 8x8 coefficient blocks
+            } img_comp[4];
+
+            jpeg__uint32   code_buffer; // jpeg entropy-coded buffer
+            int            code_bits;   // number of valid bits
+            unsigned char  marker;      // marker seen while filling entropy buffer
+            int            nomore;      // flag if we saw a marker so must stop
+
+            int            progressive;
+            int            spec_start;
+            int            spec_end;
+            int            succ_high;
+            int            succ_low;
+            int            eob_run;
+            int            jfif;
+            int            app14_color_transform; // Adobe APP14 tag
+            int            rgb;
+
+            int scan_n, order[4];
+            int restart_interval, todo;
+
+            // kernels
+            void (*idct_block_kernel)(jpeg_uc* out, int out_stride, short data[64]);
+            void (*YCbCr_to_RGB_kernel)(jpeg_uc* out, const jpeg_uc* y, const jpeg_uc* pcb, const jpeg_uc* pcr, int count, int step);
+            jpeg_uc* (*resample_row_hv_2_kernel)(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs);
+        } jpeg__jpeg;
+
+        static int jpeg__build_huffman(jpeg__huffman* h, int* count)
+        {
+            int i, j, k = 0;
+            unsigned int code;
+            // build size list for each symbol (from JPEG spec)
+            for (i = 0; i < 16; ++i)
+                for (j = 0; j < count[i]; ++j)
+                    h->size[k++] = (jpeg_uc)(i + 1);
+            h->size[k] = 0;
+
+            // compute actual symbols (from jpeg spec)
+            code = 0;
+            k = 0;
+            for (j = 1; j <= 16; ++j) {
+                // compute delta to add to code to compute symbol id
+                h->delta[j] = k - code;
+                if (h->size[k] == j) {
+                    while (h->size[k] == j)
+                        h->code[k++] = (jpeg__uint16)(code++);
+                    if (code - 1 >= (1u << j)) return jpeg__err("bad code lengths", "Corrupt JPEG");
+                }
+                // compute largest code + 1 for this size, preshifted as needed later
+                h->maxcode[j] = code << (16 - j);
+                code <<= 1;
+            }
+            h->maxcode[j] = 0xffffffff;
+
+            // build non-spec acceleration table; 255 is flag for not-accelerated
+            memset(h->fast, 255, 1 << FAST_BITS);
+            for (i = 0; i < k; ++i) {
+                int s = h->size[i];
+                if (s <= FAST_BITS) {
+                    int c = h->code[i] << (FAST_BITS - s);
+                    int m = 1 << (FAST_BITS - s);
+                    for (j = 0; j < m; ++j) {
+                        h->fast[c + j] = (jpeg_uc)i;
+                    }
+                }
+            }
+            return 1;
+        }
+
+        // build a table that decodes both magnitude and value of small ACs in
+        // one go.
+        static void jpeg__build_fast_ac(jpeg__int16* fast_ac, jpeg__huffman* h)
+        {
+            int i;
+            for (i = 0; i < (1 << FAST_BITS); ++i) {
+                jpeg_uc fast = h->fast[i];
+                fast_ac[i] = 0;
+                if (fast < 255) {
+                    int rs = h->values[fast];
+                    int run = (rs >> 4) & 15;
+                    int magbits = rs & 15;
+                    int len = h->size[fast];
+
+                    if (magbits && len + magbits <= FAST_BITS) {
+                        // magnitude code followed by receive_extend code
+                        int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
+                        int m = 1 << (magbits - 1);
+                        if (k < m) k += (~0U << magbits) + 1;
+                        // if the result is small enough, we can fit it in fast_ac table
+                        if (k >= -128 && k <= 127)
+                            fast_ac[i] = (jpeg__int16)((k * 256) + (run * 16) + (len + magbits));
+                    }
+                }
+            }
+        }
+
+        static void jpeg__grow_buffer_unsafe(jpeg__jpeg* j)
+        {
+            do {
+                unsigned int b = j->nomore ? 0 : jpeg__get8(j->s);
+                if (b == 0xff) {
+                    int c = jpeg__get8(j->s);
+                    while (c == 0xff) c = jpeg__get8(j->s); // consume fill bytes
+                    if (c != 0) {
+                        j->marker = (unsigned char)c;
+                        j->nomore = 1;
+                        return;
+                    }
+                }
+                j->code_buffer |= b << (24 - j->code_bits);
+                j->code_bits += 8;
+            } while (j->code_bits <= 24);
+        }
+
+        // (1 << n) - 1
+        static const jpeg__uint32 jpeg__bmask[17] = { 0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535 };
+
+        // decode a jpeg huffman value from the bitstream
+        jpeg_inline static int jpeg__jpeg_huff_decode(jpeg__jpeg* j, jpeg__huffman* h)
+        {
+            unsigned int temp;
+            int c, k;
+
+            if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j);
+
+            // look at the top FAST_BITS and determine what symbol ID it is,
+            // if the code is <= FAST_BITS
+            c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
+            k = h->fast[c];
+            if (k < 255) {
+                int s = h->size[k];
+                if (s > j->code_bits)
+                    return -1;
+                j->code_buffer <<= s;
+                j->code_bits -= s;
+                return h->values[k];
+            }
+
+            // naive test is to shift the code_buffer down so k bits are
+            // valid, then test against maxcode. To speed this up, we've
+            // preshifted maxcode left so that it has (16-k) 0s at the
+            // end; in other words, regardless of the number of bits, it
+            // wants to be compared against something shifted to have 16;
+            // that way we don't need to shift inside the loop.
+            temp = j->code_buffer >> 16;
+            for (k = FAST_BITS + 1; ; ++k)
+                if (temp < h->maxcode[k])
+                    break;
+            if (k == 17) {
+                // error! code not found
+                j->code_bits -= 16;
+                return -1;
+            }
+
+            if (k > j->code_bits)
+                return -1;
+
+            // convert the huffman code to the symbol id
+            c = ((j->code_buffer >> (32 - k)) & jpeg__bmask[k]) + h->delta[k];
+            JPEG_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & jpeg__bmask[h->size[c]]) == h->code[c]);
+
+            // convert the id to a symbol
+            j->code_bits -= k;
+            j->code_buffer <<= k;
+            return h->values[c];
+        }
+
+        // bias[n] = (-1<<n) + 1
+        static const int jpeg__jbias[16] = { 0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767 };
+
+        // combined JPEG 'receive' and JPEG 'extend', since baseline
+        // always extends everything it receives.
+        jpeg_inline static int jpeg__extend_receive(jpeg__jpeg* j, int n)
+        {
+            unsigned int k;
+            int sgn;
+            if (j->code_bits < n) jpeg__grow_buffer_unsafe(j);
+
+            sgn = (jpeg__int32)j->code_buffer >> 31; // sign bit is always in MSB
+            k = jpeg_lrot(j->code_buffer, n);
+            if (n < 0 || n >= (int)(sizeof(jpeg__bmask) / sizeof(*jpeg__bmask))) return 0;
+            j->code_buffer = k & ~jpeg__bmask[n];
+            k &= jpeg__bmask[n];
+            j->code_bits -= n;
+            return k + (jpeg__jbias[n] & ~sgn);
+        }
+
+        // get some unsigned bits
+        jpeg_inline static int jpeg__jpeg_get_bits(jpeg__jpeg* j, int n)
+        {
+            unsigned int k;
+            if (j->code_bits < n) jpeg__grow_buffer_unsafe(j);
+            k = jpeg_lrot(j->code_buffer, n);
+            j->code_buffer = k & ~jpeg__bmask[n];
+            k &= jpeg__bmask[n];
+            j->code_bits -= n;
+            return k;
+        }
+
+        jpeg_inline static int jpeg__jpeg_get_bit(jpeg__jpeg* j)
+        {
+            unsigned int k;
+            if (j->code_bits < 1) jpeg__grow_buffer_unsafe(j);
+            k = j->code_buffer;
+            j->code_buffer <<= 1;
+            --j->code_bits;
+            return k & 0x80000000;
+        }
+
+        // given a value that's at position X in the zigzag stream,
+        // where does it appear in the 8x8 matrix coded as row-major?
+        static const jpeg_uc jpeg__jpeg_dezigzag[64 + 15] =
+        {
+            0,  1,  8, 16,  9,  2,  3, 10,
+           17, 24, 32, 25, 18, 11,  4,  5,
+           12, 19, 26, 33, 40, 48, 41, 34,
+           27, 20, 13,  6,  7, 14, 21, 28,
+           35, 42, 49, 56, 57, 50, 43, 36,
+           29, 22, 15, 23, 30, 37, 44, 51,
+           58, 59, 52, 45, 38, 31, 39, 46,
+           53, 60, 61, 54, 47, 55, 62, 63,
+           // let corrupt input sample past end
+           63, 63, 63, 63, 63, 63, 63, 63,
+           63, 63, 63, 63, 63, 63, 63
+        };
+
+        // decode one 64-entry block--
+        static int jpeg__jpeg_decode_block(jpeg__jpeg* j, short data[64], jpeg__huffman* hdc, jpeg__huffman* hac, jpeg__int16* fac, int b, jpeg__uint16* dequant)
+        {
+            int diff, dc, k;
+            int t;
+
+            if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j);
+            t = jpeg__jpeg_huff_decode(j, hdc);
+            if (t < 0) return jpeg__err("bad huffman code", "Corrupt JPEG");
+
+            // 0 all the ac values now so we can do it 32-bits at a time
+            memset(data, 0, 64 * sizeof(data[0]));
+
+            diff = t ? jpeg__extend_receive(j, t) : 0;
+            dc = j->img_comp[b].dc_pred + diff;
+            j->img_comp[b].dc_pred = dc;
+            data[0] = (short)(dc * dequant[0]);
+
+            // decode AC components, see JPEG spec
+            k = 1;
+            do {
+                unsigned int zig;
+                int c, r, s;
+                if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j);
+                c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
+                r = fac[c];
+                if (r) { // fast-AC path
+                    k += (r >> 4) & 15; // run
+                    s = r & 15; // combined length
+                    j->code_buffer <<= s;
+                    j->code_bits -= s;
+                    // decode into unzigzag'd location
+                    zig = jpeg__jpeg_dezigzag[k++];
+                    data[zig] = (short)((r >> 8) * dequant[zig]);
+                }
+                else {
+                    int rs = jpeg__jpeg_huff_decode(j, hac);
+                    if (rs < 0) return jpeg__err("bad huffman code", "Corrupt JPEG");
+                    s = rs & 15;
+                    r = rs >> 4;
+                    if (s == 0) {
+                        if (rs != 0xf0) break; // end block
+                        k += 16;
+                    }
+                    else {
+                        k += r;
+                        // decode into unzigzag'd location
+                        zig = jpeg__jpeg_dezigzag[k++];
+                        data[zig] = (short)(jpeg__extend_receive(j, s) * dequant[zig]);
+                    }
+                }
+            } while (k < 64);
+            return 1;
+        }
+
+        static int jpeg__jpeg_decode_block_prog_dc(jpeg__jpeg* j, short data[64], jpeg__huffman* hdc, int b)
+        {
+            int diff, dc;
+            int t;
+            if (j->spec_end != 0) return jpeg__err("can't merge dc and ac", "Corrupt JPEG");
+
+            if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j);
+
+            if (j->succ_high == 0) {
+                // first scan for DC coefficient, must be first
+                memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now
+                t = jpeg__jpeg_huff_decode(j, hdc);
+                if (t == -1) return jpeg__err("can't merge dc and ac", "Corrupt JPEG");
+                diff = t ? jpeg__extend_receive(j, t) : 0;
+
+                dc = j->img_comp[b].dc_pred + diff;
+                j->img_comp[b].dc_pred = dc;
+                data[0] = (short)(dc << j->succ_low);
+            }
+            else {
+                // refinement scan for DC coefficient
+                if (jpeg__jpeg_get_bit(j))
+                    data[0] += (short)(1 << j->succ_low);
+            }
+            return 1;
+        }
+
+        // @OPTIMIZE: store non-zigzagged during the decode passes,
+        // and only de-zigzag when dequantizing
+        static int jpeg__jpeg_decode_block_prog_ac(jpeg__jpeg* j, short data[64], jpeg__huffman* hac, jpeg__int16* fac)
+        {
+            int k;
+            if (j->spec_start == 0) return jpeg__err("can't merge dc and ac", "Corrupt JPEG");
+
+            if (j->succ_high == 0) {
+                int shift = j->succ_low;
+
+                if (j->eob_run) {
+                    --j->eob_run;
+                    return 1;
+                }
+
+                k = j->spec_start;
+                do {
+                    unsigned int zig;
+                    int c, r, s;
+                    if (j->code_bits < 16) jpeg__grow_buffer_unsafe(j);
+                    c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
+                    r = fac[c];
+                    if (r) { // fast-AC path
+                        k += (r >> 4) & 15; // run
+                        s = r & 15; // combined length
+                        j->code_buffer <<= s;
+                        j->code_bits -= s;
+                        zig = jpeg__jpeg_dezigzag[k++];
+                        data[zig] = (short)((r >> 8) << shift);
+                    }
+                    else {
+                        int rs = jpeg__jpeg_huff_decode(j, hac);
+                        if (rs < 0) return jpeg__err("bad huffman code", "Corrupt JPEG");
+                        s = rs & 15;
+                        r = rs >> 4;
+                        if (s == 0) {
+                            if (r < 15) {
+                                j->eob_run = (1 << r);
+                                if (r)
+                                    j->eob_run += jpeg__jpeg_get_bits(j, r);
+                                --j->eob_run;
+                                break;
+                            }
+                            k += 16;
+                        }
+                        else {
+                            k += r;
+                            zig = jpeg__jpeg_dezigzag[k++];
+                            data[zig] = (short)(jpeg__extend_receive(j, s) << shift);
+                        }
+                    }
+                } while (k <= j->spec_end);
+            }
+            else {
+                // refinement scan for these AC coefficients
+
+                short bit = (short)(1 << j->succ_low);
+
+                if (j->eob_run) {
+                    --j->eob_run;
+                    for (k = j->spec_start; k <= j->spec_end; ++k) {
+                        short* p = &data[jpeg__jpeg_dezigzag[k]];
+                        if (*p != 0)
+                            if (jpeg__jpeg_get_bit(j))
+                                if ((*p & bit) == 0) {
+                                    if (*p > 0)
+                                        *p += bit;
+                                    else
+                                        *p -= bit;
+                                }
+                    }
+                }
+                else {
+                    k = j->spec_start;
+                    do {
+                        int r, s;
+                        int rs = jpeg__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
+                        if (rs < 0) return jpeg__err("bad huffman code", "Corrupt JPEG");
+                        s = rs & 15;
+                        r = rs >> 4;
+                        if (s == 0) {
+                            if (r < 15) {
+                                j->eob_run = (1 << r) - 1;
+                                if (r)
+                                    j->eob_run += jpeg__jpeg_get_bits(j, r);
+                                r = 64; // force end of block
+                            }
+                            else {
+                                // r=15 s=0 should write 16 0s, so we just do
+                                // a run of 15 0s and then write s (which is 0),
+                                // so we don't have to do anything special here
+                            }
+                        }
+                        else {
+                            if (s != 1) return jpeg__err("bad huffman code", "Corrupt JPEG");
+                            // sign bit
+                            if (jpeg__jpeg_get_bit(j))
+                                s = bit;
+                            else
+                                s = -bit;
+                        }
+
+                        // advance by r
+                        while (k <= j->spec_end) {
+                            short* p = &data[jpeg__jpeg_dezigzag[k++]];
+                            if (*p != 0) {
+                                if (jpeg__jpeg_get_bit(j))
+                                    if ((*p & bit) == 0) {
+                                        if (*p > 0)
+                                            *p += bit;
+                                        else
+                                            *p -= bit;
+                                    }
+                            }
+                            else {
+                                if (r == 0) {
+                                    *p = (short)s;
+                                    break;
+                                }
+                                --r;
+                            }
+                        }
+                    } while (k <= j->spec_end);
+                }
+            }
+            return 1;
+        }
+
+        // take a -128..127 value and jpeg__clamp it and convert to 0..255
+        jpeg_inline static jpeg_uc jpeg__clamp(int x)
+        {
+            // trick to use a single test to catch both cases
+            if ((unsigned int)x > 255) {
+                if (x < 0) return 0;
+                if (x > 255) return 255;
+            }
+            return (jpeg_uc)x;
+        }
+
+#define jpeg__f2f(x)  ((int) (((x) * 4096 + 0.5)))
+#define jpeg__fsh(x)  ((x) * 4096)
+
+        // derived from jidctint -- DCT_ISLOW
+#define JPEG__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
+   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
+   p2 = s2;                                    \
+   p3 = s6;                                    \
+   p1 = (p2+p3) * jpeg__f2f(0.5411961f);       \
+   t2 = p1 + p3*jpeg__f2f(-1.847759065f);      \
+   t3 = p1 + p2*jpeg__f2f( 0.765366865f);      \
+   p2 = s0;                                    \
+   p3 = s4;                                    \
+   t0 = jpeg__fsh(p2+p3);                      \
+   t1 = jpeg__fsh(p2-p3);                      \
+   x0 = t0+t3;                                 \
+   x3 = t0-t3;                                 \
+   x1 = t1+t2;                                 \
+   x2 = t1-t2;                                 \
+   t0 = s7;                                    \
+   t1 = s5;                                    \
+   t2 = s3;                                    \
+   t3 = s1;                                    \
+   p3 = t0+t2;                                 \
+   p4 = t1+t3;                                 \
+   p1 = t0+t3;                                 \
+   p2 = t1+t2;                                 \
+   p5 = (p3+p4)*jpeg__f2f( 1.175875602f);      \
+   t0 = t0*jpeg__f2f( 0.298631336f);           \
+   t1 = t1*jpeg__f2f( 2.053119869f);           \
+   t2 = t2*jpeg__f2f( 3.072711026f);           \
+   t3 = t3*jpeg__f2f( 1.501321110f);           \
+   p1 = p5 + p1*jpeg__f2f(-0.899976223f);      \
+   p2 = p5 + p2*jpeg__f2f(-2.562915447f);      \
+   p3 = p3*jpeg__f2f(-1.961570560f);           \
+   p4 = p4*jpeg__f2f(-0.390180644f);           \
+   t3 += p1+p4;                                \
+   t2 += p2+p3;                                \
+   t1 += p2+p4;                                \
+   t0 += p1+p3;
+
+        static void jpeg__idct_block(jpeg_uc* out, int out_stride, short data[64])
+        {
+            int i, val[64], * v = val;
+            jpeg_uc* o;
+            short* d = data;
+
+            // columns
+            for (i = 0; i < 8; ++i, ++d, ++v) {
+                // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
+                if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0
+                    && d[40] == 0 && d[48] == 0 && d[56] == 0) {
+                    //    no shortcut                 0     seconds
+                    //    (1|2|3|4|5|6|7)==0          0     seconds
+                    //    all separate               -0.047 seconds
+                    //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
+                    int dcterm = d[0] * 4;
+                    v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
+                }
+                else {
+                    JPEG__IDCT_1D(d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56])
+                        // constants scaled things up by 1<<12; let's bring them back
+                        // down, but keep 2 extra bits of precision
+                        x0 += 512; x1 += 512; x2 += 512; x3 += 512;
+                    v[0] = (x0 + t3) >> 10;
+                    v[56] = (x0 - t3) >> 10;
+                    v[8] = (x1 + t2) >> 10;
+                    v[48] = (x1 - t2) >> 10;
+                    v[16] = (x2 + t1) >> 10;
+                    v[40] = (x2 - t1) >> 10;
+                    v[24] = (x3 + t0) >> 10;
+                    v[32] = (x3 - t0) >> 10;
+                }
+            }
+
+            for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) {
+                // no fast case since the first 1D IDCT spread components out
+                JPEG__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7])
+                    // constants scaled things up by 1<<12, plus we had 1<<2 from first
+                    // loop, plus horizontal and vertical each scale by sqrt(8) so together
+                    // we've got an extra 1<<3, so 1<<17 total we need to remove.
+                    // so we want to round that, which means adding 0.5 * 1<<17,
+                    // aka 65536. Also, we'll end up with -128 to 127 that we want
+                    // to encode as 0..255 by adding 128, so we'll add that before the shift
+                    x0 += 65536 + (128 << 17);
+                x1 += 65536 + (128 << 17);
+                x2 += 65536 + (128 << 17);
+                x3 += 65536 + (128 << 17);
+                // tried computing the shifts into temps, or'ing the temps to see
+                // if any were out of range, but that was slower
+                o[0] = jpeg__clamp((x0 + t3) >> 17);
+                o[7] = jpeg__clamp((x0 - t3) >> 17);
+                o[1] = jpeg__clamp((x1 + t2) >> 17);
+                o[6] = jpeg__clamp((x1 - t2) >> 17);
+                o[2] = jpeg__clamp((x2 + t1) >> 17);
+                o[5] = jpeg__clamp((x2 - t1) >> 17);
+                o[3] = jpeg__clamp((x3 + t0) >> 17);
+                o[4] = jpeg__clamp((x3 - t0) >> 17);
+            }
+        }
+
+#ifdef JPEG_SSE2
+        // sse2 integer IDCT. not the fastest possible implementation but it
+        // produces bit-identical results to the generic C version so it's
+        // fully "transparent".
+        static void jpeg__idct_simd(jpeg_uc* out, int out_stride, short data[64])
+        {
+            // This is constructed to match our regular (generic) integer IDCT exactly.
+            __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+            __m128i tmp;
+
+            // dot product constant: even elems=x, odd elems=y
+#define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
+
+// out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
+// out(1) = c1[even]*x + c1[odd]*y
+#define dct_rot(out0,out1, x,y,c0,c1) \
+      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
+      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
+      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
+      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
+      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
+      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
+
+   // out = in << 12  (in 16-bit, out 32-bit)
+#define dct_widen(out, in) \
+      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
+      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
+
+   // wide add
+#define dct_wadd(out, a, b) \
+      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
+
+   // wide sub
+#define dct_wsub(out, a, b) \
+      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
+
+   // butterfly a/b, add bias, then shift by "s" and pack
+#define dct_bfly32o(out0, out1, a,b,bias,s) \
+      { \
+         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
+         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
+         dct_wadd(sum, abiased, b); \
+         dct_wsub(dif, abiased, b); \
+         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
+         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
+      }
+
+   // 8-bit interleave step (for transposes)
+#define dct_interleave8(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi8(a, b); \
+      b = _mm_unpackhi_epi8(tmp, b)
+
+   // 16-bit interleave step (for transposes)
+#define dct_interleave16(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi16(a, b); \
+      b = _mm_unpackhi_epi16(tmp, b)
+
+#define dct_pass(bias,shift) \
+      { \
+         /* even part */ \
+         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
+         __m128i sum04 = _mm_add_epi16(row0, row4); \
+         __m128i dif04 = _mm_sub_epi16(row0, row4); \
+         dct_widen(t0e, sum04); \
+         dct_widen(t1e, dif04); \
+         dct_wadd(x0, t0e, t3e); \
+         dct_wsub(x3, t0e, t3e); \
+         dct_wadd(x1, t1e, t2e); \
+         dct_wsub(x2, t1e, t2e); \
+         /* odd part */ \
+         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
+         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
+         __m128i sum17 = _mm_add_epi16(row1, row7); \
+         __m128i sum35 = _mm_add_epi16(row3, row5); \
+         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
+         dct_wadd(x4, y0o, y4o); \
+         dct_wadd(x5, y1o, y5o); \
+         dct_wadd(x6, y2o, y5o); \
+         dct_wadd(x7, y3o, y4o); \
+         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
+         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
+         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
+         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
+      }
+
+            __m128i rot0_0 = dct_const(jpeg__f2f(0.5411961f), jpeg__f2f(0.5411961f) + jpeg__f2f(-1.847759065f));
+            __m128i rot0_1 = dct_const(jpeg__f2f(0.5411961f) + jpeg__f2f(0.765366865f), jpeg__f2f(0.5411961f));
+            __m128i rot1_0 = dct_const(jpeg__f2f(1.175875602f) + jpeg__f2f(-0.899976223f), jpeg__f2f(1.175875602f));
+            __m128i rot1_1 = dct_const(jpeg__f2f(1.175875602f), jpeg__f2f(1.175875602f) + jpeg__f2f(-2.562915447f));
+            __m128i rot2_0 = dct_const(jpeg__f2f(-1.961570560f) + jpeg__f2f(0.298631336f), jpeg__f2f(-1.961570560f));
+            __m128i rot2_1 = dct_const(jpeg__f2f(-1.961570560f), jpeg__f2f(-1.961570560f) + jpeg__f2f(3.072711026f));
+            __m128i rot3_0 = dct_const(jpeg__f2f(-0.390180644f) + jpeg__f2f(2.053119869f), jpeg__f2f(-0.390180644f));
+            __m128i rot3_1 = dct_const(jpeg__f2f(-0.390180644f), jpeg__f2f(-0.390180644f) + jpeg__f2f(1.501321110f));
+
+            // rounding biases in column/row passes, see jpeg__idct_block for explanation.
+            __m128i bias_0 = _mm_set1_epi32(512);
+            __m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17));
+
+            // load
+            row0 = _mm_load_si128((const __m128i*) (data + 0 * 8));
+            row1 = _mm_load_si128((const __m128i*) (data + 1 * 8));
+            row2 = _mm_load_si128((const __m128i*) (data + 2 * 8));
+            row3 = _mm_load_si128((const __m128i*) (data + 3 * 8));
+            row4 = _mm_load_si128((const __m128i*) (data + 4 * 8));
+            row5 = _mm_load_si128((const __m128i*) (data + 5 * 8));
+            row6 = _mm_load_si128((const __m128i*) (data + 6 * 8));
+            row7 = _mm_load_si128((const __m128i*) (data + 7 * 8));
+
+            // column pass
+            dct_pass(bias_0, 10);
+
+            {
+                // 16bit 8x8 transpose pass 1
+                dct_interleave16(row0, row4);
+                dct_interleave16(row1, row5);
+                dct_interleave16(row2, row6);
+                dct_interleave16(row3, row7);
+
+                // transpose pass 2
+                dct_interleave16(row0, row2);
+                dct_interleave16(row1, row3);
+                dct_interleave16(row4, row6);
+                dct_interleave16(row5, row7);
+
+                // transpose pass 3
+                dct_interleave16(row0, row1);
+                dct_interleave16(row2, row3);
+                dct_interleave16(row4, row5);
+                dct_interleave16(row6, row7);
+            }
+
+            // row pass
+            dct_pass(bias_1, 17);
+
+            {
+                // pack
+                __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
+                __m128i p1 = _mm_packus_epi16(row2, row3);
+                __m128i p2 = _mm_packus_epi16(row4, row5);
+                __m128i p3 = _mm_packus_epi16(row6, row7);
+
+                // 8bit 8x8 transpose pass 1
+                dct_interleave8(p0, p2); // a0e0a1e1...
+                dct_interleave8(p1, p3); // c0g0c1g1...
+
+                // transpose pass 2
+                dct_interleave8(p0, p1); // a0c0e0g0...
+                dct_interleave8(p2, p3); // b0d0f0h0...
+
+                // transpose pass 3
+                dct_interleave8(p0, p2); // a0b0c0d0...
+                dct_interleave8(p1, p3); // a4b4c4d4...
+
+                // store
+                _mm_storel_epi64((__m128i*) out, p0); out += out_stride;
+                _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
+                _mm_storel_epi64((__m128i*) out, p2); out += out_stride;
+                _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
+                _mm_storel_epi64((__m128i*) out, p1); out += out_stride;
+                _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
+                _mm_storel_epi64((__m128i*) out, p3); out += out_stride;
+                _mm_storel_epi64((__m128i*) out, _mm_shuffle_epi32(p3, 0x4e));
+            }
+
+#undef dct_const
+#undef dct_rot
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_interleave8
+#undef dct_interleave16
+#undef dct_pass
+        }
+
+#endif // JPEG_SSE2
+
+#ifdef JPEG_NEON
+
+        // NEON integer IDCT. should produce bit-identical
+        // results to the generic C version.
+        static void jpeg__idct_simd(jpeg_uc* out, int out_stride, short data[64])
+        {
+            int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
+
+            int16x4_t rot0_0 = vdup_n_s16(jpeg__f2f(0.5411961f));
+            int16x4_t rot0_1 = vdup_n_s16(jpeg__f2f(-1.847759065f));
+            int16x4_t rot0_2 = vdup_n_s16(jpeg__f2f(0.765366865f));
+            int16x4_t rot1_0 = vdup_n_s16(jpeg__f2f(1.175875602f));
+            int16x4_t rot1_1 = vdup_n_s16(jpeg__f2f(-0.899976223f));
+            int16x4_t rot1_2 = vdup_n_s16(jpeg__f2f(-2.562915447f));
+            int16x4_t rot2_0 = vdup_n_s16(jpeg__f2f(-1.961570560f));
+            int16x4_t rot2_1 = vdup_n_s16(jpeg__f2f(-0.390180644f));
+            int16x4_t rot3_0 = vdup_n_s16(jpeg__f2f(0.298631336f));
+            int16x4_t rot3_1 = vdup_n_s16(jpeg__f2f(2.053119869f));
+            int16x4_t rot3_2 = vdup_n_s16(jpeg__f2f(3.072711026f));
+            int16x4_t rot3_3 = vdup_n_s16(jpeg__f2f(1.501321110f));
+
+#define dct_long_mul(out, inq, coeff) \
+   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
+
+#define dct_long_mac(out, acc, inq, coeff) \
+   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
+
+#define dct_widen(out, inq) \
+   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
+   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
+
+            // wide add
+#define dct_wadd(out, a, b) \
+   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
+
+// wide sub
+#define dct_wsub(out, a, b) \
+   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
+
+// butterfly a/b, then shift using "shiftop" by "s" and pack
+#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
+   { \
+      dct_wadd(sum, a, b); \
+      dct_wsub(dif, a, b); \
+      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
+      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
+   }
+
+#define dct_pass(shiftop, shift) \
+   { \
+      /* even part */ \
+      int16x8_t sum26 = vaddq_s16(row2, row6); \
+      dct_long_mul(p1e, sum26, rot0_0); \
+      dct_long_mac(t2e, p1e, row6, rot0_1); \
+      dct_long_mac(t3e, p1e, row2, rot0_2); \
+      int16x8_t sum04 = vaddq_s16(row0, row4); \
+      int16x8_t dif04 = vsubq_s16(row0, row4); \
+      dct_widen(t0e, sum04); \
+      dct_widen(t1e, dif04); \
+      dct_wadd(x0, t0e, t3e); \
+      dct_wsub(x3, t0e, t3e); \
+      dct_wadd(x1, t1e, t2e); \
+      dct_wsub(x2, t1e, t2e); \
+      /* odd part */ \
+      int16x8_t sum15 = vaddq_s16(row1, row5); \
+      int16x8_t sum17 = vaddq_s16(row1, row7); \
+      int16x8_t sum35 = vaddq_s16(row3, row5); \
+      int16x8_t sum37 = vaddq_s16(row3, row7); \
+      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
+      dct_long_mul(p5o, sumodd, rot1_0); \
+      dct_long_mac(p1o, p5o, sum17, rot1_1); \
+      dct_long_mac(p2o, p5o, sum35, rot1_2); \
+      dct_long_mul(p3o, sum37, rot2_0); \
+      dct_long_mul(p4o, sum15, rot2_1); \
+      dct_wadd(sump13o, p1o, p3o); \
+      dct_wadd(sump24o, p2o, p4o); \
+      dct_wadd(sump23o, p2o, p3o); \
+      dct_wadd(sump14o, p1o, p4o); \
+      dct_long_mac(x4, sump13o, row7, rot3_0); \
+      dct_long_mac(x5, sump24o, row5, rot3_1); \
+      dct_long_mac(x6, sump23o, row3, rot3_2); \
+      dct_long_mac(x7, sump14o, row1, rot3_3); \
+      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
+      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
+      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
+      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
+   }
+
+   // load
+            row0 = vld1q_s16(data + 0 * 8);
+            row1 = vld1q_s16(data + 1 * 8);
+            row2 = vld1q_s16(data + 2 * 8);
+            row3 = vld1q_s16(data + 3 * 8);
+            row4 = vld1q_s16(data + 4 * 8);
+            row5 = vld1q_s16(data + 5 * 8);
+            row6 = vld1q_s16(data + 6 * 8);
+            row7 = vld1q_s16(data + 7 * 8);
+
+            // add DC bias
+            row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
+
+            // column pass
+            dct_pass(vrshrn_n_s32, 10);
+
+            // 16bit 8x8 transpose
+            {
+                // these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
+                // whether compilers actually get this is another story, sadly.
+#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
+#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
+
+      // pass 1
+                dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
+                dct_trn16(row2, row3);
+                dct_trn16(row4, row5);
+                dct_trn16(row6, row7);
+
+                // pass 2
+                dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
+                dct_trn32(row1, row3);
+                dct_trn32(row4, row6);
+                dct_trn32(row5, row7);
+
+                // pass 3
+                dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
+                dct_trn64(row1, row5);
+                dct_trn64(row2, row6);
+                dct_trn64(row3, row7);
+
+#undef dct_trn16
+#undef dct_trn32
+#undef dct_trn64
+            }
+
+            // row pass
+            // vrshrn_n_s32 only supports shifts up to 16, we need
+            // 17. so do a non-rounding shift of 16 first then follow
+            // up with a rounding shift by 1.
+            dct_pass(vshrn_n_s32, 16);
+
+            {
+                // pack and round
+                uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
+                uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
+                uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
+                uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
+                uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
+                uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
+                uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
+                uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
+
+                // again, these can translate into one instruction, but often don't.
+#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
+#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
+
+      // sadly can't use interleaved stores here since we only write
+      // 8 bytes to each scan line!
+
+      // 8x8 8-bit transpose pass 1
+                dct_trn8_8(p0, p1);
+                dct_trn8_8(p2, p3);
+                dct_trn8_8(p4, p5);
+                dct_trn8_8(p6, p7);
+
+                // pass 2
+                dct_trn8_16(p0, p2);
+                dct_trn8_16(p1, p3);
+                dct_trn8_16(p4, p6);
+                dct_trn8_16(p5, p7);
+
+                // pass 3
+                dct_trn8_32(p0, p4);
+                dct_trn8_32(p1, p5);
+                dct_trn8_32(p2, p6);
+                dct_trn8_32(p3, p7);
+
+                // store
+                vst1_u8(out, p0); out += out_stride;
+                vst1_u8(out, p1); out += out_stride;
+                vst1_u8(out, p2); out += out_stride;
+                vst1_u8(out, p3); out += out_stride;
+                vst1_u8(out, p4); out += out_stride;
+                vst1_u8(out, p5); out += out_stride;
+                vst1_u8(out, p6); out += out_stride;
+                vst1_u8(out, p7);
+
+#undef dct_trn8_8
+#undef dct_trn8_16
+#undef dct_trn8_32
+            }
+
+#undef dct_long_mul
+#undef dct_long_mac
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_pass
+        }
+
+#endif // JPEG_NEON
+
+#define JPEG__MARKER_none  0xff
+        // if there's a pending marker from the entropy stream, return that
+        // otherwise, fetch from the stream and get a marker. if there's no
+        // marker, return 0xff, which is never a valid marker value
+        static jpeg_uc jpeg__get_marker(jpeg__jpeg* j)
+        {
+            jpeg_uc x;
+            if (j->marker != JPEG__MARKER_none) { x = j->marker; j->marker = JPEG__MARKER_none; return x; }
+            x = jpeg__get8(j->s);
+            if (x != 0xff) return JPEG__MARKER_none;
+            while (x == 0xff)
+                x = jpeg__get8(j->s); // consume repeated 0xff fill bytes
+            return x;
+        }
+
+        // in each scan, we'll have scan_n components, and the order
+        // of the components is specified by order[]
+#define JPEG__RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)
+
+// after a restart interval, jpeg__jpeg_reset the entropy decoder and
+// the dc prediction
+        static void jpeg__jpeg_reset(jpeg__jpeg* j)
+        {
+            j->code_bits = 0;
+            j->code_buffer = 0;
+            j->nomore = 0;
+            j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0;
+            j->marker = JPEG__MARKER_none;
+            j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
+            j->eob_run = 0;
+            // no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
+            // since we don't even allow 1<<30 pixels
+        }
+
+        static int jpeg__parse_entropy_coded_data(jpeg__jpeg* z)
+        {
+            jpeg__jpeg_reset(z);
+            if (!z->progressive) {
+                if (z->scan_n == 1) {
+                    int i, j;
+                    JPEG_SIMD_ALIGN(short, data[64]);
+                    int n = z->order[0];
+                    // non-interleaved data, we just need to process one block at a time,
+                    // in trivial scanline order
+                    // number of blocks to do just depends on how many actual "pixels" this
+                    // component has, independent of interleaved MCU blocking and such
+                    int w = (z->img_comp[n].x + 7) >> 3;
+                    int h = (z->img_comp[n].y + 7) >> 3;
+                    for (j = 0; j < h; ++j) {
+                        for (i = 0; i < w; ++i) {
+                            int ha = z->img_comp[n].ha;
+                            if (!jpeg__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+                            z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data);
+                            // every data block is an MCU, so countdown the restart interval
+                            if (--z->todo <= 0) {
+                                if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z);
+                                // if it's NOT a restart, then just bail, so we get corrupt data
+                                // rather than no data
+                                if (!JPEG__RESTART(z->marker)) return 1;
+                                jpeg__jpeg_reset(z);
+                            }
+                        }
+                    }
+                    return 1;
+                }
+                else { // interleaved
+                    int i, j, k, x, y;
+                    JPEG_SIMD_ALIGN(short, data[64]);
+                    for (j = 0; j < z->img_mcu_y; ++j) {
+                        for (i = 0; i < z->img_mcu_x; ++i) {
+                            // scan an interleaved mcu... process scan_n components in order
+                            for (k = 0; k < z->scan_n; ++k) {
+                                int n = z->order[k];
+                                // scan out an mcu's worth of this component; that's just determined
+                                // by the basic H and V specified for the component
+                                for (y = 0; y < z->img_comp[n].v; ++y) {
+                                    for (x = 0; x < z->img_comp[n].h; ++x) {
+                                        int x2 = (i * z->img_comp[n].h + x) * 8;
+                                        int y2 = (j * z->img_comp[n].v + y) * 8;
+                                        int ha = z->img_comp[n].ha;
+                                        if (!jpeg__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+                                        z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2, z->img_comp[n].w2, data);
+                                    }
+                                }
+                            }
+                            // after all interleaved components, that's an interleaved MCU,
+                            // so now count down the restart interval
+                            if (--z->todo <= 0) {
+                                if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z);
+                                if (!JPEG__RESTART(z->marker)) return 1;
+                                jpeg__jpeg_reset(z);
+                            }
+                        }
+                    }
+                    return 1;
+                }
+            }
+            else {
+                if (z->scan_n == 1) {
+                    int i, j;
+                    int n = z->order[0];
+                    // non-interleaved data, we just need to process one block at a time,
+                    // in trivial scanline order
+                    // number of blocks to do just depends on how many actual "pixels" this
+                    // component has, independent of interleaved MCU blocking and such
+                    int w = (z->img_comp[n].x + 7) >> 3;
+                    int h = (z->img_comp[n].y + 7) >> 3;
+                    for (j = 0; j < h; ++j) {
+                        for (i = 0; i < w; ++i) {
+                            short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
+                            if (z->spec_start == 0) {
+                                if (!jpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                                    return 0;
+                            }
+                            else {
+                                int ha = z->img_comp[n].ha;
+                                if (!jpeg__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
+                                    return 0;
+                            }
+                            // every data block is an MCU, so countdown the restart interval
+                            if (--z->todo <= 0) {
+                                if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z);
+                                if (!JPEG__RESTART(z->marker)) return 1;
+                                jpeg__jpeg_reset(z);
+                            }
+                        }
+                    }
+                    return 1;
+                }
+                else { // interleaved
+                    int i, j, k, x, y;
+                    for (j = 0; j < z->img_mcu_y; ++j) {
+                        for (i = 0; i < z->img_mcu_x; ++i) {
+                            // scan an interleaved mcu... process scan_n components in order
+                            for (k = 0; k < z->scan_n; ++k) {
+                                int n = z->order[k];
+                                // scan out an mcu's worth of this component; that's just determined
+                                // by the basic H and V specified for the component
+                                for (y = 0; y < z->img_comp[n].v; ++y) {
+                                    for (x = 0; x < z->img_comp[n].h; ++x) {
+                                        int x2 = (i * z->img_comp[n].h + x);
+                                        int y2 = (j * z->img_comp[n].v + y);
+                                        short* data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
+                                        if (!jpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                                            return 0;
+                                    }
+                                }
+                            }
+                            // after all interleaved components, that's an interleaved MCU,
+                            // so now count down the restart interval
+                            if (--z->todo <= 0) {
+                                if (z->code_bits < 24) jpeg__grow_buffer_unsafe(z);
+                                if (!JPEG__RESTART(z->marker)) return 1;
+                                jpeg__jpeg_reset(z);
+                            }
+                        }
+                    }
+                    return 1;
+                }
+            }
+        }
+
+        static void jpeg__jpeg_dequantize(short* data, jpeg__uint16* dequant)
+        {
+            int i;
+            for (i = 0; i < 64; ++i)
+                data[i] *= dequant[i];
+        }
+
+        static void jpeg__jpeg_finish(jpeg__jpeg* z)
+        {
+            if (z->progressive) {
+                // dequantize and idct the data
+                int i, j, n;
+                for (n = 0; n < z->s->img_n; ++n) {
+                    int w = (z->img_comp[n].x + 7) >> 3;
+                    int h = (z->img_comp[n].y + 7) >> 3;
+                    for (j = 0; j < h; ++j) {
+                        for (i = 0; i < w; ++i) {
+                            short* data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
+                            jpeg__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
+                            z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data);
+                        }
+                    }
+                }
+            }
+        }
+
+        static int jpeg__process_marker(jpeg__jpeg* z, int m)
+        {
+            int L;
+            switch (m) {
+            case JPEG__MARKER_none: // no marker found
+                return jpeg__err("expected marker", "Corrupt JPEG");
+
+            case 0xDD: // DRI - specify restart interval
+                if (jpeg__get16be(z->s) != 4) return jpeg__err("bad DRI len", "Corrupt JPEG");
+                z->restart_interval = jpeg__get16be(z->s);
+                return 1;
+
+            case 0xDB: // DQT - define quantization table
+                L = jpeg__get16be(z->s) - 2;
+                while (L > 0) {
+                    int q = jpeg__get8(z->s);
+                    int p = q >> 4, sixteen = (p != 0);
+                    int t = q & 15, i;
+                    if (p != 0 && p != 1) return jpeg__err("bad DQT type", "Corrupt JPEG");
+                    if (t > 3) return jpeg__err("bad DQT table", "Corrupt JPEG");
+
+                    for (i = 0; i < 64; ++i)
+                        z->dequant[t][jpeg__jpeg_dezigzag[i]] = (jpeg__uint16)(sixteen ? jpeg__get16be(z->s) : jpeg__get8(z->s));
+                    L -= (sixteen ? 129 : 65);
+                }
+                return L == 0;
+
+            case 0xC4: // DHT - define huffman table
+                L = jpeg__get16be(z->s) - 2;
+                while (L > 0) {
+                    jpeg_uc* v;
+                    int sizes[16], i, n = 0;
+                    int q = jpeg__get8(z->s);
+                    int tc = q >> 4;
+                    int th = q & 15;
+                    if (tc > 1 || th > 3) return jpeg__err("bad DHT header", "Corrupt JPEG");
+                    for (i = 0; i < 16; ++i) {
+                        sizes[i] = jpeg__get8(z->s);
+                        n += sizes[i];
+                    }
+                    L -= 17;
+                    if (tc == 0) {
+                        if (!jpeg__build_huffman(z->huff_dc + th, sizes)) return 0;
+                        v = z->huff_dc[th].values;
+                    }
+                    else {
+                        if (!jpeg__build_huffman(z->huff_ac + th, sizes)) return 0;
+                        v = z->huff_ac[th].values;
+                    }
+                    for (i = 0; i < n; ++i)
+                        v[i] = jpeg__get8(z->s);
+                    if (tc != 0)
+                        jpeg__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
+                    L -= n;
+                }
+                return L == 0;
+            }
+
+            // check for comment block or APP blocks
+            if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
+                L = jpeg__get16be(z->s);
+                if (L < 2) {
+                    if (m == 0xFE)
+                        return jpeg__err("bad COM len", "Corrupt JPEG");
+                    else
+                        return jpeg__err("bad APP len", "Corrupt JPEG");
+                }
+                L -= 2;
+
+                if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
+                    static const unsigned char tag[5] = { 'J','F','I','F','\0' };
+                    int ok = 1;
+                    int i;
+                    for (i = 0; i < 5; ++i)
+                        if (jpeg__get8(z->s) != tag[i])
+                            ok = 0;
+                    L -= 5;
+                    if (ok)
+                        z->jfif = 1;
+                }
+                else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
+                    static const unsigned char tag[6] = { 'A','d','o','b','e','\0' };
+                    int ok = 1;
+                    int i;
+                    for (i = 0; i < 6; ++i)
+                        if (jpeg__get8(z->s) != tag[i])
+                            ok = 0;
+                    L -= 6;
+                    if (ok) {
+                        jpeg__get8(z->s); // version
+                        jpeg__get16be(z->s); // flags0
+                        jpeg__get16be(z->s); // flags1
+                        z->app14_color_transform = jpeg__get8(z->s); // color transform
+                        L -= 6;
+                    }
+                }
+
+                jpeg__skip(z->s, L);
+                return 1;
+            }
+
+            return jpeg__err("unknown marker", "Corrupt JPEG");
+        }
+
+        // after we see SOS
+        static int jpeg__process_scan_header(jpeg__jpeg* z)
+        {
+            int i;
+            int Ls = jpeg__get16be(z->s);
+            z->scan_n = jpeg__get8(z->s);
+            if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n) return jpeg__err("bad SOS component count", "Corrupt JPEG");
+            if (Ls != 6 + 2 * z->scan_n) return jpeg__err("bad SOS len", "Corrupt JPEG");
+            for (i = 0; i < z->scan_n; ++i) {
+                int id = jpeg__get8(z->s), which;
+                int q = jpeg__get8(z->s);
+                for (which = 0; which < z->s->img_n; ++which)
+                    if (z->img_comp[which].id == id)
+                        break;
+                if (which == z->s->img_n) return 0; // no match
+                z->img_comp[which].hd = q >> 4;   if (z->img_comp[which].hd > 3) return jpeg__err("bad DC huff", "Corrupt JPEG");
+                z->img_comp[which].ha = q & 15;   if (z->img_comp[which].ha > 3) return jpeg__err("bad AC huff", "Corrupt JPEG");
+                z->order[i] = which;
+            }
+
+            {
+                int aa;
+                z->spec_start = jpeg__get8(z->s);
+                z->spec_end = jpeg__get8(z->s); // should be 63, but might be 0
+                aa = jpeg__get8(z->s);
+                z->succ_high = (aa >> 4);
+                z->succ_low = (aa & 15);
+                if (z->progressive) {
+                    if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
+                        return jpeg__err("bad SOS", "Corrupt JPEG");
+                }
+                else {
+                    if (z->spec_start != 0) return jpeg__err("bad SOS", "Corrupt JPEG");
+                    if (z->succ_high != 0 || z->succ_low != 0) return jpeg__err("bad SOS", "Corrupt JPEG");
+                    z->spec_end = 63;
+                }
+            }
+
+            return 1;
+        }
+
+        static int jpeg__free_jpeg_components(jpeg__jpeg* z, int ncomp, int why)
+        {
+            int i;
+            for (i = 0; i < ncomp; ++i) {
+                if (z->img_comp[i].raw_data) {
+                    JPEG_FREE(z->img_comp[i].raw_data);
+                    z->img_comp[i].raw_data = NULL;
+                    z->img_comp[i].data = NULL;
+                }
+                if (z->img_comp[i].raw_coeff) {
+                    JPEG_FREE(z->img_comp[i].raw_coeff);
+                    z->img_comp[i].raw_coeff = 0;
+                    z->img_comp[i].coeff = 0;
+                }
+                if (z->img_comp[i].linebuf) {
+                    JPEG_FREE(z->img_comp[i].linebuf);
+                    z->img_comp[i].linebuf = NULL;
+                }
+            }
+            return why;
+        }
+
+        static int jpeg__process_frame_header(jpeg__jpeg* z, int scan)
+        {
+            jpeg__context* s = z->s;
+            int Lf, p, i, q, h_max = 1, v_max = 1, c;
+            Lf = jpeg__get16be(s);         if (Lf < 11) return jpeg__err("bad SOF len", "Corrupt JPEG"); // JPEG
+            p = jpeg__get8(s);            if (p != 8) return jpeg__err("only 8-bit", "JPEG format not supported: 8-bit only"); // JPEG baseline
+            s->img_y = jpeg__get16be(s);   if (s->img_y == 0) return jpeg__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
+            s->img_x = jpeg__get16be(s);   if (s->img_x == 0) return jpeg__err("0 width", "Corrupt JPEG"); // JPEG requires
+            if (s->img_y > JPEG_MAX_DIMENSIONS) return jpeg__err("too large", "Very large image (corrupt?)");
+            if (s->img_x > JPEG_MAX_DIMENSIONS) return jpeg__err("too large", "Very large image (corrupt?)");
+            c = jpeg__get8(s);
+            if (c != 3 && c != 1 && c != 4) return jpeg__err("bad component count", "Corrupt JPEG");
+            s->img_n = c;
+            for (i = 0; i < c; ++i) {
+                z->img_comp[i].data = NULL;
+                z->img_comp[i].linebuf = NULL;
+            }
+
+            if (Lf != 8 + 3 * s->img_n) return jpeg__err("bad SOF len", "Corrupt JPEG");
+
+            z->rgb = 0;
+            for (i = 0; i < s->img_n; ++i) {
+                static const unsigned char rgb[3] = { 'R', 'G', 'B' };
+                z->img_comp[i].id = jpeg__get8(s);
+                if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
+                    ++z->rgb;
+                q = jpeg__get8(s);
+                z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return jpeg__err("bad H", "Corrupt JPEG");
+                z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return jpeg__err("bad V", "Corrupt JPEG");
+                z->img_comp[i].tq = jpeg__get8(s);  if (z->img_comp[i].tq > 3) return jpeg__err("bad TQ", "Corrupt JPEG");
+            }
+
+            if (scan != JPEG__SCAN_load) return 1;
+
+            if (!jpeg__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return jpeg__err("too large", "Image too large to decode");
+
+            for (i = 0; i < s->img_n; ++i) {
+                if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
+                if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
+            }
+
+            // compute interleaved mcu info
+            z->img_h_max = h_max;
+            z->img_v_max = v_max;
+            z->img_mcu_w = h_max * 8;
+            z->img_mcu_h = v_max * 8;
+            // these sizes can't be more than 17 bits
+            z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w;
+            z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h;
+
+            for (i = 0; i < s->img_n; ++i) {
+                // number of effective pixels (e.g. for non-interleaved MCU)
+                z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max;
+                z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max - 1) / v_max;
+                // to simplify generation, we'll allocate enough memory to decode
+                // the bogus oversized data from using interleaved MCUs and their
+                // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
+                // discard the extra data until colorspace conversion
+                //
+                // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
+                // so these muls can't overflow with 32-bit ints (which we require)
+                z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
+                z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
+                z->img_comp[i].coeff = 0;
+                z->img_comp[i].raw_coeff = 0;
+                z->img_comp[i].linebuf = NULL;
+                z->img_comp[i].raw_data = jpeg__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
+                if (z->img_comp[i].raw_data == NULL)
+                    return jpeg__free_jpeg_components(z, i + 1, jpeg__err("outofmem", "Out of memory"));
+                // align blocks for idct using mmx/sse
+                z->img_comp[i].data = (jpeg_uc*)(((size_t)z->img_comp[i].raw_data + 15) & ~15);
+                if (z->progressive) {
+                    // w2, h2 are multiples of 8 (see above)
+                    z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
+                    z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
+                    z->img_comp[i].raw_coeff = jpeg__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
+                    if (z->img_comp[i].raw_coeff == NULL)
+                        return jpeg__free_jpeg_components(z, i + 1, jpeg__err("outofmem", "Out of memory"));
+                    z->img_comp[i].coeff = (short*)(((size_t)z->img_comp[i].raw_coeff + 15) & ~15);
+                }
+            }
+
+            return 1;
+        }
+
+        // use comparisons since in some cases we handle more than one case (e.g. SOF)
+#define jpeg__DNL(x)         ((x) == 0xdc)
+#define jpeg__SOI(x)         ((x) == 0xd8)
+#define jpeg__EOI(x)         ((x) == 0xd9)
+#define jpeg__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
+#define jpeg__SOS(x)         ((x) == 0xda)
+
+#define jpeg__SOF_progressive(x)   ((x) == 0xc2)
+
+        static int jpeg__decode_jpeg_header(jpeg__jpeg* z, int scan)
+        {
+            int m;
+            z->jfif = 0;
+            z->app14_color_transform = -1; // valid values are 0,1,2
+            z->marker = JPEG__MARKER_none; // initialize cached marker to empty
+            m = jpeg__get_marker(z);
+            if (!jpeg__SOI(m)) return jpeg__err("no SOI", "Corrupt JPEG");
+            if (scan == JPEG__SCAN_type) return 1;
+            m = jpeg__get_marker(z);
+            while (!jpeg__SOF(m)) {
+                if (!jpeg__process_marker(z, m)) return 0;
+                m = jpeg__get_marker(z);
+                while (m == JPEG__MARKER_none) {
+                    // some files have extra padding after their blocks, so ok, we'll scan
+                    if (jpeg__at_eof(z->s)) return jpeg__err("no SOF", "Corrupt JPEG");
+                    m = jpeg__get_marker(z);
+                }
+            }
+            z->progressive = jpeg__SOF_progressive(m);
+            if (!jpeg__process_frame_header(z, scan)) return 0;
+            return 1;
+        }
+
+        // decode image to YCbCr format
+        static int jpeg__decode_jpeg_image(jpeg__jpeg* j)
+        {
+            int m;
+            for (m = 0; m < 4; m++) {
+                j->img_comp[m].raw_data = NULL;
+                j->img_comp[m].raw_coeff = NULL;
+            }
+            j->restart_interval = 0;
+            if (!jpeg__decode_jpeg_header(j, JPEG__SCAN_load)) return 0;
+            m = jpeg__get_marker(j);
+            while (!jpeg__EOI(m)) {
+                if (jpeg__SOS(m)) {
+                    if (!jpeg__process_scan_header(j)) return 0;
+                    if (!jpeg__parse_entropy_coded_data(j)) return 0;
+                    if (j->marker == JPEG__MARKER_none) {
+                        // handle 0s at the end of image data from IP Kamera 9060
+                        while (!jpeg__at_eof(j->s)) {
+                            int x = jpeg__get8(j->s);
+                            if (x == 255) {
+                                j->marker = jpeg__get8(j->s);
+                                break;
+                            }
+                        }
+                        // if we reach eof without hitting a marker, jpeg__get_marker() below will fail and we'll eventually return 0
+                    }
+                }
+                else if (jpeg__DNL(m)) {
+                    int Ld = jpeg__get16be(j->s);
+                    jpeg__uint32 NL = jpeg__get16be(j->s);
+                    if (Ld != 4) return jpeg__err("bad DNL len", "Corrupt JPEG");
+                    if (NL != j->s->img_y) return jpeg__err("bad DNL height", "Corrupt JPEG");
+                }
+                else {
+                    if (!jpeg__process_marker(j, m)) return 0;
+                }
+                m = jpeg__get_marker(j);
+            }
+            if (j->progressive)
+                jpeg__jpeg_finish(j);
+            return 1;
+        }
+
+        // static jfif-centered resampling (across block boundaries)
+
+        typedef jpeg_uc* (*resample_row_func)(jpeg_uc* out, jpeg_uc* in0, jpeg_uc* in1,
+            int w, int hs);
+
+#define jpeg__div4(x) ((jpeg_uc) ((x) >> 2))
+
+        static jpeg_uc* resample_row_1(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs)
+        {
+            JPEG_NOTUSED(out);
+            JPEG_NOTUSED(in_far);
+            JPEG_NOTUSED(w);
+            JPEG_NOTUSED(hs);
+            return in_near;
+        }
+
+        static jpeg_uc* jpeg__resample_row_v_2(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs)
+        {
+            // need to generate two samples vertically for every one in input
+            int i;
+            JPEG_NOTUSED(hs);
+            for (i = 0; i < w; ++i)
+                out[i] = jpeg__div4(3 * in_near[i] + in_far[i] + 2);
+            return out;
+        }
+
+        static jpeg_uc* jpeg__resample_row_h_2(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs)
+        {
+            // need to generate two samples horizontally for every one in input
+            int i;
+            jpeg_uc* input = in_near;
+
+            if (w == 1) {
+                // if only one sample, can't do any interpolation
+                out[0] = out[1] = input[0];
+                return out;
+            }
+
+            out[0] = input[0];
+            out[1] = jpeg__div4(input[0] * 3 + input[1] + 2);
+            for (i = 1; i < w - 1; ++i) {
+                int n = 3 * input[i] + 2;
+                out[i * 2 + 0] = jpeg__div4(n + input[i - 1]);
+                out[i * 2 + 1] = jpeg__div4(n + input[i + 1]);
+            }
+            out[i * 2 + 0] = jpeg__div4(input[w - 2] * 3 + input[w - 1] + 2);
+            out[i * 2 + 1] = input[w - 1];
+
+            JPEG_NOTUSED(in_far);
+            JPEG_NOTUSED(hs);
+
+            return out;
+        }
+
+#define jpeg__div16(x) ((jpeg_uc) ((x) >> 4))
+
+        static jpeg_uc* jpeg__resample_row_hv_2(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs)
+        {
+            // need to generate 2x2 samples for every one in input
+            int i, t0, t1;
+            if (w == 1) {
+                out[0] = out[1] = jpeg__div4(3 * in_near[0] + in_far[0] + 2);
+                return out;
+            }
+
+            t1 = 3 * in_near[0] + in_far[0];
+            out[0] = jpeg__div4(t1 + 2);
+            for (i = 1; i < w; ++i) {
+                t0 = t1;
+                t1 = 3 * in_near[i] + in_far[i];
+                out[i * 2 - 1] = jpeg__div16(3 * t0 + t1 + 8);
+                out[i * 2] = jpeg__div16(3 * t1 + t0 + 8);
+            }
+            out[w * 2 - 1] = jpeg__div4(t1 + 2);
+
+            JPEG_NOTUSED(hs);
+
+            return out;
+        }
+
+#if defined(JPEG_SSE2) || defined(JPEG_NEON)
+        static jpeg_uc* jpeg__resample_row_hv_2_simd(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs)
+        {
+            // need to generate 2x2 samples for every one in input
+            int i = 0, t0, t1;
+
+            if (w == 1) {
+                out[0] = out[1] = jpeg__div4(3 * in_near[0] + in_far[0] + 2);
+                return out;
+            }
+
+            t1 = 3 * in_near[0] + in_far[0];
+            // process groups of 8 pixels for as long as we can.
+            // note we can't handle the last pixel in a row in this loop
+            // because we need to handle the filter boundary conditions.
+            for (; i < ((w - 1) & ~7); i += 8) {
+#if defined(JPEG_SSE2)
+                // load and perform the vertical filtering pass
+                // this uses 3*x + y = 4*x + (y - x)
+                __m128i zero = _mm_setzero_si128();
+                __m128i farb = _mm_loadl_epi64((__m128i*) (in_far + i));
+                __m128i nearb = _mm_loadl_epi64((__m128i*) (in_near + i));
+                __m128i farw = _mm_unpacklo_epi8(farb, zero);
+                __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
+                __m128i diff = _mm_sub_epi16(farw, nearw);
+                __m128i nears = _mm_slli_epi16(nearw, 2);
+                __m128i curr = _mm_add_epi16(nears, diff); // current row
+
+                // horizontal filter works the same based on shifted vers of current
+                // row. "prev" is current row shifted right by 1 pixel; we need to
+                // insert the previous pixel value (from t1).
+                // "next" is current row shifted left by 1 pixel, with first pixel
+                // of next block of 8 pixels added in.
+                __m128i prv0 = _mm_slli_si128(curr, 2);
+                __m128i nxt0 = _mm_srli_si128(curr, 2);
+                __m128i prev = _mm_insert_epi16(prv0, t1, 0);
+                __m128i next = _mm_insert_epi16(nxt0, 3 * in_near[i + 8] + in_far[i + 8], 7);
+
+                // horizontal filter, polyphase implementation since it's convenient:
+                // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+                // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+                // note the shared term.
+                __m128i bias = _mm_set1_epi16(8);
+                __m128i curs = _mm_slli_epi16(curr, 2);
+                __m128i prvd = _mm_sub_epi16(prev, curr);
+                __m128i nxtd = _mm_sub_epi16(next, curr);
+                __m128i curb = _mm_add_epi16(curs, bias);
+                __m128i even = _mm_add_epi16(prvd, curb);
+                __m128i odd = _mm_add_epi16(nxtd, curb);
+
+                // interleave even and odd pixels, then undo scaling.
+                __m128i int0 = _mm_unpacklo_epi16(even, odd);
+                __m128i int1 = _mm_unpackhi_epi16(even, odd);
+                __m128i de0 = _mm_srli_epi16(int0, 4);
+                __m128i de1 = _mm_srli_epi16(int1, 4);
+
+                // pack and write output
+                __m128i outv = _mm_packus_epi16(de0, de1);
+                _mm_storeu_si128((__m128i*) (out + i * 2), outv);
+#elif defined(JPEG_NEON)
+                // load and perform the vertical filtering pass
+                // this uses 3*x + y = 4*x + (y - x)
+                uint8x8_t farb = vld1_u8(in_far + i);
+                uint8x8_t nearb = vld1_u8(in_near + i);
+                int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
+                int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
+                int16x8_t curr = vaddq_s16(nears, diff); // current row
+
+                // horizontal filter works the same based on shifted vers of current
+                // row. "prev" is current row shifted right by 1 pixel; we need to
+                // insert the previous pixel value (from t1).
+                // "next" is current row shifted left by 1 pixel, with first pixel
+                // of next block of 8 pixels added in.
+                int16x8_t prv0 = vextq_s16(curr, curr, 7);
+                int16x8_t nxt0 = vextq_s16(curr, curr, 1);
+                int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
+                int16x8_t next = vsetq_lane_s16(3 * in_near[i + 8] + in_far[i + 8], nxt0, 7);
+
+                // horizontal filter, polyphase implementation since it's convenient:
+                // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+                // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+                // note the shared term.
+                int16x8_t curs = vshlq_n_s16(curr, 2);
+                int16x8_t prvd = vsubq_s16(prev, curr);
+                int16x8_t nxtd = vsubq_s16(next, curr);
+                int16x8_t even = vaddq_s16(curs, prvd);
+                int16x8_t odd = vaddq_s16(curs, nxtd);
+
+                // undo scaling and round, then store with even/odd phases interleaved
+                uint8x8x2_t o;
+                o.val[0] = vqrshrun_n_s16(even, 4);
+                o.val[1] = vqrshrun_n_s16(odd, 4);
+                vst2_u8(out + i * 2, o);
+#endif
+
+                // "previous" value for next iter
+                t1 = 3 * in_near[i + 7] + in_far[i + 7];
+            }
+
+            t0 = t1;
+            t1 = 3 * in_near[i] + in_far[i];
+            out[i * 2] = jpeg__div16(3 * t1 + t0 + 8);
+
+            for (++i; i < w; ++i) {
+                t0 = t1;
+                t1 = 3 * in_near[i] + in_far[i];
+                out[i * 2 - 1] = jpeg__div16(3 * t0 + t1 + 8);
+                out[i * 2] = jpeg__div16(3 * t1 + t0 + 8);
+            }
+            out[w * 2 - 1] = jpeg__div4(t1 + 2);
+
+            JPEG_NOTUSED(hs);
+
+            return out;
+        }
+#endif
+
+        static jpeg_uc* jpeg__resample_row_generic(jpeg_uc* out, jpeg_uc* in_near, jpeg_uc* in_far, int w, int hs)
+        {
+            // resample with nearest-neighbor
+            int i, j;
+            JPEG_NOTUSED(in_far);
+            for (i = 0; i < w; ++i)
+                for (j = 0; j < hs; ++j)
+                    out[i * hs + j] = in_near[i];
+            return out;
+        }
+
+        // this is a reduced-precision calculation of YCbCr-to-RGB introduced
+        // to make sure the code produces the same results in both SIMD and scalar
+#define jpeg__float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
+        static void jpeg__YCbCr_to_RGB_row(jpeg_uc* out, const jpeg_uc* y, const jpeg_uc* pcb, const jpeg_uc* pcr, int count, int step)
+        {
+            int i;
+            for (i = 0; i < count; ++i) {
+                int y_fixed = (y[i] << 20) + (1 << 19); // rounding
+                int r, g, b;
+                int cr = pcr[i] - 128;
+                int cb = pcb[i] - 128;
+                r = y_fixed + cr * jpeg__float2fixed(1.40200f);
+                g = y_fixed + (cr * -jpeg__float2fixed(0.71414f)) + ((cb * -jpeg__float2fixed(0.34414f)) & 0xffff0000);
+                b = y_fixed + cb * jpeg__float2fixed(1.77200f);
+                r >>= 20;
+                g >>= 20;
+                b >>= 20;
+                if ((unsigned)r > 255) { if (r < 0) r = 0; else r = 255; }
+                if ((unsigned)g > 255) { if (g < 0) g = 0; else g = 255; }
+                if ((unsigned)b > 255) { if (b < 0) b = 0; else b = 255; }
+                out[0] = (jpeg_uc)r;
+                out[1] = (jpeg_uc)g;
+                out[2] = (jpeg_uc)b;
+                out[3] = 255;
+                out += step;
+            }
+        }
+
+#if defined(JPEG_SSE2) || defined(JPEG_NEON)
+        static void jpeg__YCbCr_to_RGB_simd(jpeg_uc* out, jpeg_uc const* y, jpeg_uc const* pcb, jpeg_uc const* pcr, int count, int step)
+        {
+            int i = 0;
+
+#ifdef JPEG_SSE2
+            // step == 3 is pretty ugly on the final interleave, and i'm not convinced
+            // it's useful in practice (you wouldn't use it for textures, for example).
+            // so just accelerate step == 4 case.
+            if (step == 4) {
+                // this is a fairly straightforward implementation and not super-optimized.
+                __m128i signflip = _mm_set1_epi8(-0x80);
+                __m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f));
+                __m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f));
+                __m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f));
+                __m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f));
+                __m128i y_bias = _mm_set1_epi8((char)(unsigned char)128);
+                __m128i xw = _mm_set1_epi16(255); // alpha channel
+
+                for (; i + 7 < count; i += 8) {
+                    // load
+                    __m128i y_bytes = _mm_loadl_epi64((__m128i*) (y + i));
+                    __m128i cr_bytes = _mm_loadl_epi64((__m128i*) (pcr + i));
+                    __m128i cb_bytes = _mm_loadl_epi64((__m128i*) (pcb + i));
+                    __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
+                    __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
+
+                    // unpack to short (and left-shift cr, cb by 8)
+                    __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
+                    __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
+                    __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
+
+                    // color transform
+                    __m128i yws = _mm_srli_epi16(yw, 4);
+                    __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
+                    __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
+                    __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
+                    __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
+                    __m128i rws = _mm_add_epi16(cr0, yws);
+                    __m128i gwt = _mm_add_epi16(cb0, yws);
+                    __m128i bws = _mm_add_epi16(yws, cb1);
+                    __m128i gws = _mm_add_epi16(gwt, cr1);
+
+                    // descale
+                    __m128i rw = _mm_srai_epi16(rws, 4);
+                    __m128i bw = _mm_srai_epi16(bws, 4);
+                    __m128i gw = _mm_srai_epi16(gws, 4);
+
+                    // back to byte, set up for transpose
+                    __m128i brb = _mm_packus_epi16(rw, bw);
+                    __m128i gxb = _mm_packus_epi16(gw, xw);
+
+                    // transpose to interleave channels
+                    __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
+                    __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
+                    __m128i o0 = _mm_unpacklo_epi16(t0, t1);
+                    __m128i o1 = _mm_unpackhi_epi16(t0, t1);
+
+                    // store
+                    _mm_storeu_si128((__m128i*) (out + 0), o0);
+                    _mm_storeu_si128((__m128i*) (out + 16), o1);
+                    out += 32;
+                }
+            }
+#endif
+
+#ifdef JPEG_NEON
+            // in this version, step=3 support would be easy to add. but is there demand?
+            if (step == 4) {
+                // this is a fairly straightforward implementation and not super-optimized.
+                uint8x8_t signflip = vdup_n_u8(0x80);
+                int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f));
+                int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f));
+                int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f));
+                int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f));
+
+                for (; i + 7 < count; i += 8) {
+                    // load
+                    uint8x8_t y_bytes = vld1_u8(y + i);
+                    uint8x8_t cr_bytes = vld1_u8(pcr + i);
+                    uint8x8_t cb_bytes = vld1_u8(pcb + i);
+                    int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
+                    int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
+
+                    // expand to s16
+                    int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
+                    int16x8_t crw = vshll_n_s8(cr_biased, 7);
+                    int16x8_t cbw = vshll_n_s8(cb_biased, 7);
+
+                    // color transform
+                    int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
+                    int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
+                    int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
+                    int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
+                    int16x8_t rws = vaddq_s16(yws, cr0);
+                    int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
+                    int16x8_t bws = vaddq_s16(yws, cb1);
+
+                    // undo scaling, round, convert to byte
+                    uint8x8x4_t o;
+                    o.val[0] = vqrshrun_n_s16(rws, 4);
+                    o.val[1] = vqrshrun_n_s16(gws, 4);
+                    o.val[2] = vqrshrun_n_s16(bws, 4);
+                    o.val[3] = vdup_n_u8(255);
+
+                    // store, interleaving r/g/b/a
+                    vst4_u8(out, o);
+                    out += 8 * 4;
+                }
+            }
+#endif
+
+            for (; i < count; ++i) {
+                int y_fixed = (y[i] << 20) + (1 << 19); // rounding
+                int r, g, b;
+                int cr = pcr[i] - 128;
+                int cb = pcb[i] - 128;
+                r = y_fixed + cr * jpeg__float2fixed(1.40200f);
+                g = y_fixed + cr * -jpeg__float2fixed(0.71414f) + ((cb * -jpeg__float2fixed(0.34414f)) & 0xffff0000);
+                b = y_fixed + cb * jpeg__float2fixed(1.77200f);
+                r >>= 20;
+                g >>= 20;
+                b >>= 20;
+                if ((unsigned)r > 255) { if (r < 0) r = 0; else r = 255; }
+                if ((unsigned)g > 255) { if (g < 0) g = 0; else g = 255; }
+                if ((unsigned)b > 255) { if (b < 0) b = 0; else b = 255; }
+                out[0] = (jpeg_uc)r;
+                out[1] = (jpeg_uc)g;
+                out[2] = (jpeg_uc)b;
+                out[3] = 255;
+                out += step;
+            }
+        }
+#endif
+
+        // set up the kernels
+        static void jpeg__setup_jpeg(jpeg__jpeg* j)
+        {
+            j->idct_block_kernel = jpeg__idct_block;
+            j->YCbCr_to_RGB_kernel = jpeg__YCbCr_to_RGB_row;
+            j->resample_row_hv_2_kernel = jpeg__resample_row_hv_2;
+
+#ifdef JPEG_SSE2
+            if (jpeg__sse2_available()) {
+                j->idct_block_kernel = jpeg__idct_simd;
+                j->YCbCr_to_RGB_kernel = jpeg__YCbCr_to_RGB_simd;
+                j->resample_row_hv_2_kernel = jpeg__resample_row_hv_2_simd;
+            }
+#endif
+
+#ifdef JPEG_NEON
+            j->idct_block_kernel = jpeg__idct_simd;
+            j->YCbCr_to_RGB_kernel = jpeg__YCbCr_to_RGB_simd;
+            j->resample_row_hv_2_kernel = jpeg__resample_row_hv_2_simd;
+#endif
+        }
+
+        // clean up the temporary component buffers
+        static void jpeg__cleanup_jpeg(jpeg__jpeg* j)
+        {
+            jpeg__free_jpeg_components(j, j->s->img_n, 0);
+        }
+
+        typedef struct
+        {
+            resample_row_func resample;
+            jpeg_uc* line0, * line1;
+            int hs, vs;   // expansion factor in each axis
+            int w_lores; // horizontal pixels pre-expansion
+            int ystep;   // how far through vertical expansion we are
+            int ypos;    // which pre-expansion row we're on
+        } jpeg__resample;
+
+        // fast 0..255 * 0..255 => 0..255 rounded multiplication
+        static jpeg_uc jpeg__blinn_8x8(jpeg_uc x, jpeg_uc y)
+        {
+            unsigned int t = x * y + 128;
+            return (jpeg_uc)((t + (t >> 8)) >> 8);
+        }
+
+        static jpeg_uc* load_jpeg_image(jpeg__jpeg* z, int* out_x, int* out_y, int* comp, int req_comp)
+        {
+            int n, decode_n, is_rgb;
+            z->s->img_n = 0; // make jpeg__cleanup_jpeg safe
+
+            // validate req_comp
+            if (req_comp < 0 || req_comp > 4) return jpeg__errpuc("bad req_comp", "Internal error");
+
+            // load a jpeg image from whichever source, but leave in YCbCr format
+            if (!jpeg__decode_jpeg_image(z)) { jpeg__cleanup_jpeg(z); return NULL; }
+
+            // determine actual number of components to generate
+            n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
+
+            is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
+
+            if (z->s->img_n == 3 && n < 3 && !is_rgb)
+                decode_n = 1;
+            else
+                decode_n = z->s->img_n;
+
+            // resample and color-convert
+            {
+                int k;
+                unsigned int i, j;
+                jpeg_uc* output;
+                jpeg_uc* coutput[4] = { NULL, NULL, NULL, NULL };
+
+                jpeg__resample res_comp[4];
+
+                for (k = 0; k < decode_n; ++k) {
+                    jpeg__resample* r = &res_comp[k];
+
+                    // allocate line buffer big enough for upsampling off the edges
+                    // with upsample factor of 4
+                    z->img_comp[k].linebuf = (jpeg_uc*)jpeg__malloc(z->s->img_x + 3);
+                    if (!z->img_comp[k].linebuf) { jpeg__cleanup_jpeg(z); return jpeg__errpuc("outofmem", "Out of memory"); }
+
+                    r->hs = z->img_h_max / z->img_comp[k].h;
+                    r->vs = z->img_v_max / z->img_comp[k].v;
+                    r->ystep = r->vs >> 1;
+                    r->w_lores = (z->s->img_x + r->hs - 1) / r->hs;
+                    r->ypos = 0;
+                    r->line0 = r->line1 = z->img_comp[k].data;
+
+                    if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
+                    else if (r->hs == 1 && r->vs == 2) r->resample = jpeg__resample_row_v_2;
+                    else if (r->hs == 2 && r->vs == 1) r->resample = jpeg__resample_row_h_2;
+                    else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
+                    else                               r->resample = jpeg__resample_row_generic;
+                }
+
+                // can't error after this so, this is safe
+                output = (jpeg_uc*)jpeg__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
+                if (!output) { jpeg__cleanup_jpeg(z); return jpeg__errpuc("outofmem", "Out of memory"); }
+
+                // now go ahead and resample
+                for (j = 0; j < z->s->img_y; ++j) {
+                    jpeg_uc* out = output + n * z->s->img_x * j;
+                    for (k = 0; k < decode_n; ++k) {
+                        jpeg__resample* r = &res_comp[k];
+                        int y_bot = r->ystep >= (r->vs >> 1);
+                        coutput[k] = r->resample(z->img_comp[k].linebuf,
+                            y_bot ? r->line1 : r->line0,
+                            y_bot ? r->line0 : r->line1,
+                            r->w_lores, r->hs);
+                        if (++r->ystep >= r->vs) {
+                            r->ystep = 0;
+                            r->line0 = r->line1;
+                            if (++r->ypos < z->img_comp[k].y)
+                                r->line1 += z->img_comp[k].w2;
+                        }
+                    }
+                    if (n >= 3) {
+                        jpeg_uc* y = coutput[0];
+                        if (z->s->img_n == 3) {
+                            if (is_rgb) {
+                                for (i = 0; i < z->s->img_x; ++i) {
+                                    out[0] = y[i];
+                                    out[1] = coutput[1][i];
+                                    out[2] = coutput[2][i];
+                                    out[3] = 255;
+                                    out += n;
+                                }
+                            }
+                            else {
+                                z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+                            }
+                        }
+                        else if (z->s->img_n == 4) {
+                            if (z->app14_color_transform == 0) { // CMYK
+                                for (i = 0; i < z->s->img_x; ++i) {
+                                    jpeg_uc m = coutput[3][i];
+                                    out[0] = jpeg__blinn_8x8(coutput[0][i], m);
+                                    out[1] = jpeg__blinn_8x8(coutput[1][i], m);
+                                    out[2] = jpeg__blinn_8x8(coutput[2][i], m);
+                                    out[3] = 255;
+                                    out += n;
+                                }
+                            }
+                            else if (z->app14_color_transform == 2) { // YCCK
+                                z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+                                for (i = 0; i < z->s->img_x; ++i) {
+                                    jpeg_uc m = coutput[3][i];
+                                    out[0] = jpeg__blinn_8x8(255 - out[0], m);
+                                    out[1] = jpeg__blinn_8x8(255 - out[1], m);
+                                    out[2] = jpeg__blinn_8x8(255 - out[2], m);
+                                    out += n;
+                                }
+                            }
+                            else { // YCbCr + alpha?  Ignore the fourth channel for now
+                                z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+                            }
+                        }
+                        else
+                            for (i = 0; i < z->s->img_x; ++i) {
+                                out[0] = out[1] = out[2] = y[i];
+                                out[3] = 255; // not used if n==3
+                                out += n;
+                            }
+                    }
+                    else {
+                        if (is_rgb) {
+                            if (n == 1)
+                                for (i = 0; i < z->s->img_x; ++i)
+                                    *out++ = jpeg__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+                            else {
+                                for (i = 0; i < z->s->img_x; ++i, out += 2) {
+                                    out[0] = jpeg__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+                                    out[1] = 255;
+                                }
+                            }
+                        }
+                        else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
+                            for (i = 0; i < z->s->img_x; ++i) {
+                                jpeg_uc m = coutput[3][i];
+                                jpeg_uc r = jpeg__blinn_8x8(coutput[0][i], m);
+                                jpeg_uc g = jpeg__blinn_8x8(coutput[1][i], m);
+                                jpeg_uc b = jpeg__blinn_8x8(coutput[2][i], m);
+                                out[0] = jpeg__compute_y(r, g, b);
+                                out[1] = 255;
+                                out += n;
+                            }
+                        }
+                        else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
+                            for (i = 0; i < z->s->img_x; ++i) {
+                                out[0] = jpeg__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
+                                out[1] = 255;
+                                out += n;
+                            }
+                        }
+                        else {
+                            jpeg_uc* y = coutput[0];
+                            if (n == 1)
+                                for (i = 0; i < z->s->img_x; ++i) out[i] = y[i];
+                            else
+                                for (i = 0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; }
+                        }
+                    }
+                }
+                jpeg__cleanup_jpeg(z);
+                *out_x = z->s->img_x;
+                *out_y = z->s->img_y;
+                if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
+                return output;
+            }
+        }
+
+        static void* jpeg__jpeg_load(jpeg__context* s, int* x, int* y, int* comp, int req_comp, jpeg__result_info* ri)
+        {
+            unsigned char* result;
+            jpeg__jpeg* j = (jpeg__jpeg*)jpeg__malloc(sizeof(jpeg__jpeg));
+            JPEG_NOTUSED(ri);
+            j->s = s;
+            jpeg__setup_jpeg(j);
+            result = load_jpeg_image(j, x, y, comp, req_comp);
+            JPEG_FREE(j);
+            return result;
+        }
+
+        static int jpeg__jpeg_test(jpeg__context* s)
+        {
+            int r;
+            jpeg__jpeg* j = (jpeg__jpeg*)jpeg__malloc(sizeof(jpeg__jpeg));
+            j->s = s;
+            jpeg__setup_jpeg(j);
+            r = jpeg__decode_jpeg_header(j, JPEG__SCAN_type);
+            jpeg__rewind(s);
+            JPEG_FREE(j);
+            return r;
+        }
+
+        static int jpeg__jpeg_info_raw(jpeg__jpeg* j, int* x, int* y, int* comp)
+        {
+            if (!jpeg__decode_jpeg_header(j, JPEG__SCAN_header)) {
+                jpeg__rewind(j->s);
+                return 0;
+            }
+            if (x) *x = j->s->img_x;
+            if (y) *y = j->s->img_y;
+            if (comp) *comp = j->s->img_n >= 3 ? 3 : 1;
+            return 1;
+        }
+
+        static int jpeg__jpeg_info(jpeg__context* s, int* x, int* y, int* comp)
+        {
+            int result;
+            jpeg__jpeg* j = (jpeg__jpeg*)(jpeg__malloc(sizeof(jpeg__jpeg)));
+            j->s = s;
+            result = jpeg__jpeg_info_raw(j, x, y, comp);
+            JPEG_FREE(j);
+            return result;
+        }
+
+        //------------------------------------------------------------------------
+
+        static int jpeg__stdio_read(void* user, char* data, int size)
+        {
+            InputMemoryStream* stream = (InputMemoryStream*)user;
+            return (int)stream->Read(size, data);
+        }
+
+        static void jpeg__stdio_skip(void* user, int n)
+        {
+            InputMemoryStream* stream = (InputMemoryStream*)user;
+            stream->Skip(n);
+        }
+
+        static int jpeg__stdio_eof(void* user)
+        {
+            InputMemoryStream* stream = (InputMemoryStream*)user;
+            return stream->Pos() == stream->Size() ? 1 : 0;
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageJpegLoader::ImageJpegLoader(const ImageLoaderParam& param)
+            : ImageLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatRgb24;
+        }
+
+        bool ImageJpegLoader::FromStream()
+        {
+            int x, y, comp;
+            jpeg__context s;
+            s.io.eof = jpeg__stdio_eof;
+            s.io.read = jpeg__stdio_read;
+            s.io.skip = jpeg__stdio_skip;
+            s.io_user_data = &_stream;
+            s.buflen = sizeof(s.buffer_start);
+            s.read_from_callbacks = 1;
+            s.callback_already_read = 0;
+            s.img_buffer = s.img_buffer_original = s.buffer_start;
+            jpeg__refill_buffer(&s);
+            s.img_buffer_original_end = s.img_buffer_end;
+            jpeg__result_info ri;
+            uint8_t * data = (uint8_t*)jpeg__jpeg_load(&s, &x, &y, &comp, 3, &ri);
+            if (data)
+            {
+                size_t stride = 3 * x;
+                _image.Recreate(x, y, (Image::Format)_param.format);
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8:
+                    Base::RgbToGray(data, x, y, stride, _image.data, _image.stride);
+                    break;
+                case SimdPixelFormatBgr24:
+                    Base::BgrToRgb(data, x, y, stride, _image.data, _image.stride);
+                    break;
+                case SimdPixelFormatBgra32:
+                    Base::RgbToBgra(data, x, y, stride, _image.data, _image.stride, 0xFF);
+                    break;
+                case SimdPixelFormatRgb24:
+                    Base::Copy(data, stride, x, y, 3, _image.data, _image.stride);
+                    break;
+                case SimdPixelFormatRgba32:
+                    Base::BgrToBgra(data, x, y, stride, _image.data, _image.stride, 0xFF);
+                    break;
+                default: 
+                    break;
+                }
+                JPEG_FREE(data);
+                return true;
+            }
+            return false;
+        }
+    }
+}
diff --git a/3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp b/3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp
new file mode 100644
index 0000000000..03ae0fab6f
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseImageLoadPng.cpp
@@ -0,0 +1,1317 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdImageSavePng.h"
+#include "Simd/SimdArray.h"
+#include "Simd/SimdCpu.h"
+#include "Simd/SimdBase.h"
+
+namespace Simd
+{
+    namespace Base
+    {
+#define PNG_MALLOC(sz)           malloc(sz)
+#define PNG_REALLOC(p,newsz)     realloc(p,newsz)
+#define PNG_FREE(p)              free(p)
+
+#define PNG__BYTECAST(x)  ((uint8_t) ((x) & 255))  // truncate int to byte without warnings
+
+        SIMD_INLINE int PngError(const char* str, const char* stub)
+        {
+            std::cout << "PNG load error: " << str << ", " << stub << "!" << std::endl;
+            return 0;
+        }
+
+        SIMD_INLINE uint8_t * PngErrorPtr(const char* str, const char* stub)
+        {
+            return (uint8_t*)(size_t)(PngError(str, stub) ? NULL : NULL);
+        }
+
+        static void* png__malloc(size_t size)
+        {
+            return PNG_MALLOC(size);
+        }
+
+        struct PngContext
+        {
+            uint32_t img_x, img_y;
+            int img_n, img_out_n;
+        };
+
+        static int png__addsizes_valid(int a, int b)
+        {
+            if (b < 0) return 0;
+            // now 0 <= b <= INT_MAX, hence also
+            // 0 <= INT_MAX - b <= INTMAX.
+            // And "a + b <= INT_MAX" (which might overflow) is the
+            // same as a <= INT_MAX - b (no overflow)
+            return a <= INT_MAX - b;
+        }
+
+        // returns 1 if the product is valid, 0 on overflow.
+        // negative factors are considered invalid.
+        static int png__mul2sizes_valid(int a, int b)
+        {
+            if (a < 0 || b < 0) return 0;
+            if (b == 0) return 1; // mul-by-0 is always safe
+            // portable way to check for no overflows in a*b
+            return a <= INT_MAX / b;
+        }
+
+        // returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
+        static int png__mad2sizes_valid(int a, int b, int add)
+        {
+            return png__mul2sizes_valid(a, b) && png__addsizes_valid(a * b, add);
+        }
+
+        // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
+        static int png__mad3sizes_valid(int a, int b, int c, int add)
+        {
+            return png__mul2sizes_valid(a, b) && png__mul2sizes_valid(a * b, c) &&
+                png__addsizes_valid(a * b * c, add);
+        }
+
+        // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
+        static int png__mad4sizes_valid(int a, int b, int c, int d, int add)
+        {
+            return png__mul2sizes_valid(a, b) && png__mul2sizes_valid(a * b, c) &&
+                png__mul2sizes_valid(a * b * c, d) && png__addsizes_valid(a * b * c * d, add);
+        }
+
+        // mallocs with size overflow checking
+        static void* png__malloc_mad2(int a, int b, int add)
+        {
+            if (!png__mad2sizes_valid(a, b, add)) return NULL;
+            return png__malloc(a * b + add);
+        }
+
+        static void* png__malloc_mad3(int a, int b, int c, int add)
+        {
+            if (!png__mad3sizes_valid(a, b, c, add)) return NULL;
+            return png__malloc(a * b * c + add);
+        }
+
+        static void* png__malloc_mad4(int a, int b, int c, int d, int add)
+        {
+            if (!png__mad4sizes_valid(a, b, c, d, add)) return NULL;
+            return png__malloc(a * b * c * d + add);
+        }
+
+        static uint8_t png__compute_y(int r, int g, int b)
+        {
+            return (uint8_t)(((r * 77) + (g * 150) + (29 * b)) >> 8);
+        }
+
+        static uint8_t* png__convert_format(uint8_t* data, int img_n, int req_comp, unsigned int x, unsigned int y)
+        {
+            int i, j;
+            uint8_t* good;
+
+            if (req_comp == img_n) 
+                return data;
+            assert(req_comp >= 1 && req_comp <= 4);
+
+            good = (uint8_t*)png__malloc_mad3(req_comp, x, y, 0);
+            if (good == NULL) 
+            {
+                PNG_FREE(data);
+                return PngErrorPtr("outofmem", "Out of memory");
+            }
+
+            for (j = 0; j < (int)y; ++j) 
+            {
+                uint8_t* src = data + j * x * img_n;
+                uint8_t* dest = good + j * x * req_comp;
+
+#define PNG__COMBO(a,b)  ((a)*8+(b))
+#define PNG__CASE(a,b)   case PNG__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+                // convert source image with img_n components to one with req_comp components;
+                // avoid switch per pixel, so use switch per scanline and massive macros
+                switch (PNG__COMBO(img_n, req_comp)) 
+                {
+                    PNG__CASE(1, 2) { dest[0] = src[0]; dest[1] = 255; } break;
+                    PNG__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(1, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = 255; } break;
+                    PNG__CASE(2, 1) { dest[0] = src[0]; } break;
+                    PNG__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(2, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = src[1]; } break;
+                    PNG__CASE(3, 4) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; dest[3] = 255; } break;
+                    PNG__CASE(3, 1) { dest[0] = png__compute_y(src[0], src[1], src[2]); } break;
+                    PNG__CASE(3, 2) { dest[0] = png__compute_y(src[0], src[1], src[2]); dest[1] = 255; } break;
+                    PNG__CASE(4, 1) { dest[0] = png__compute_y(src[0], src[1], src[2]); } break;
+                    PNG__CASE(4, 2) { dest[0] = png__compute_y(src[0], src[1], src[2]); dest[1] = src[3]; } break;
+                    PNG__CASE(4, 3) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; } break;
+                default: assert(0); PNG_FREE(data); PNG_FREE(good); return PngErrorPtr("unsupported", "Unsupported format conversion");
+                }
+#undef PNG__CASE
+            }
+
+            PNG_FREE(data);
+            return good;
+        }
+
+        static uint16_t png__compute_y_16(int r, int g, int b)
+        {
+            return (uint16_t)(((r * 77) + (g * 150) + (29 * b)) >> 8);
+        }
+
+        static uint16_t* png__convert_format16(uint16_t* data, int img_n, int req_comp, unsigned int x, unsigned int y)
+        {
+            int i, j;
+            uint16_t* good;
+
+            if (req_comp == img_n) 
+                return data;
+            assert(req_comp >= 1 && req_comp <= 4);
+
+            good = (uint16_t*)png__malloc(req_comp * x * y * 2);
+            if (good == NULL) 
+            {
+                PNG_FREE(data);
+                return (uint16_t*)PngErrorPtr("outofmem", "Out of memory");
+            }
+
+            for (j = 0; j < (int)y; ++j) 
+            {
+                uint16_t* src = data + j * x * img_n;
+                uint16_t* dest = good + j * x * req_comp;
+
+#define PNG__COMBO(a,b)  ((a)*8+(b))
+#define PNG__CASE(a,b)   case PNG__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+                // convert source image with img_n components to one with req_comp components;
+                // avoid switch per pixel, so use switch per scanline and massive macros
+                switch (PNG__COMBO(img_n, req_comp)) {
+                    PNG__CASE(1, 2) { dest[0] = src[0]; dest[1] = 0xffff; } break;
+                    PNG__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(1, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = 0xffff; } break;
+                    PNG__CASE(2, 1) { dest[0] = src[0]; } break;
+                    PNG__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(2, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = src[1]; } break;
+                    PNG__CASE(3, 4) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; dest[3] = 0xffff; } break;
+                    PNG__CASE(3, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break;
+                    PNG__CASE(3, 2) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); dest[1] = 0xffff; } break;
+                    PNG__CASE(4, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break;
+                    PNG__CASE(4, 2) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); dest[1] = src[3]; } break;
+                    PNG__CASE(4, 3) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; } break;
+                default: assert(0); PNG_FREE(data); PNG_FREE(good); return (uint16_t*)PngErrorPtr("unsupported", "Unsupported format conversion");
+                }
+#undef PNG__CASE
+            }
+
+            PNG_FREE(data);
+            return good;
+        }
+
+        namespace Zlib
+        {
+            const size_t ZFAST_BITS = 9;
+            const size_t ZFAST_SIZE = 1 << ZFAST_BITS;
+            const size_t ZFAST_MASK = ZFAST_SIZE - 1;
+
+            struct Zhuffman
+            {
+                uint16_t fast[ZFAST_SIZE];
+                uint16_t firstCode[16];
+                int maxCode[17];
+                uint16_t firstSymbol[16];
+                uint8_t  size[288];
+                uint16_t value[288];
+
+                bool Build(const uint8_t* sizelist, int num)
+                {
+                    int i, k = 0;
+                    int code, nextCode[16], sizes[17];
+
+                    memset(sizes, 0, sizeof(sizes));
+                    memset(fast, 0, sizeof(fast));
+                    for (i = 0; i < num; ++i)
+                        ++sizes[sizelist[i]];
+                    sizes[0] = 0;
+                    for (i = 1; i < 16; ++i)
+                        if (sizes[i] > (1 << i))
+                            return PngError("bad sizes", "Corrupt PNG");
+                    code = 0;
+                    for (i = 1; i < 16; ++i)
+                    {
+                        nextCode[i] = code;
+                        firstCode[i] = (uint16_t)code;
+                        firstSymbol[i] = (uint16_t)k;
+                        code = (code + sizes[i]);
+                        if (sizes[i] && code - 1 >= (1 << i))
+                            return PngError("bad codelengths", "Corrupt PNG");
+                        maxCode[i] = code << (16 - i); // preshift for inner loop
+                        code <<= 1;
+                        k += sizes[i];
+                    }
+                    maxCode[16] = 0x10000; // sentinel
+                    for (i = 0; i < num; ++i)
+                    {
+                        int s = sizelist[i];
+                        if (s)
+                        {
+                            int c = nextCode[s] - firstCode[s] + firstSymbol[s];
+                            uint16_t fastv = (uint16_t)((s << 9) | i);
+                            size[c] = (uint8_t)s;
+                            value[c] = (uint16_t)i;
+                            if (s <= (int)ZFAST_BITS)
+                            {
+                                int j = ZlibBitRev(nextCode[s], s);
+                                while (j < (1 << ZFAST_BITS))
+                                {
+                                    fast[j] = fastv;
+                                    j += (1 << s);
+                                }
+                            }
+                            ++nextCode[s];
+                        }
+                    }
+                    return 1;
+                }
+            };
+
+            SIMD_INLINE static int BitRev16(int n)
+            {
+                n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1);
+                n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2);
+                n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4);
+                n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8);
+                return n;
+            }
+
+            static int ZhuffmanDecode(InputMemoryStream& is, const Zhuffman& z)
+            {
+                int b, s;
+                if (is.BitCount() < 16)
+                {
+                    if (is.Eof())
+                        return -1;
+                    is.FillBits();
+                }
+                b = z.fast[is.BitBuffer() & ZFAST_MASK];
+                if (b)
+                {
+                    s = b >> 9;
+                    is.BitBuffer() >>= s;
+                    is.BitCount() -= s;
+                    return b & 511;
+                }
+                else
+                {
+                    int k;
+                    k = BitRev16(is.BitBuffer());
+                    for (s = ZFAST_BITS + 1; k >= z.maxCode[s]; ++s);
+                    if (s >= 16)
+                        return -1;
+                    b = (k >> (16 - s)) - z.firstCode[s] + z.firstSymbol[s];
+                    if (b >= sizeof(z.size) || z.size[b] != s)
+                        return -1;
+                    is.BitBuffer() >>= s;
+                    is.BitCount() -= s;
+                    return z.value[b];
+                }
+            }
+
+            static int ParseHuffmanBlock(InputMemoryStream& is, const Zhuffman& zLength, const Zhuffman& zDistance, OutputMemoryStream& os)
+            {
+                static const int zlengthBase[31] = { 3,4,5,6,7,8,9,10,11,13, 15,17,19,23,27,31,35,43,51,59, 67,83,99,115,131,163,195,227,258,0,0 };
+                static const int zlengthExtra[31] = { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
+                static const int zdistBase[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, 257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0 };
+                static const int zdistExtra[32] = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
+
+                uint8_t* beg = os.Data(), * dst = os.Current(), * end = beg + os.Capacity();
+                for (;;)
+                {
+                    ptrdiff_t z = ZhuffmanDecode(is, zLength);
+                    if (z < 256)
+                    {
+                        if (z < 0)
+                            return PngError("bad huffman code", "Corrupt PNG");
+                        if (dst >= end)
+                        {
+                            os.Reserve(end - beg + 1);
+                            beg = os.Data();
+                            dst = os.Current();
+                            end = beg + os.Capacity();
+                        }
+                        *dst++ = (uint8_t)z;
+                    }
+                    else
+                    {
+                        uint8_t* p;
+                        ptrdiff_t len, dist;
+                        if (z == 256)
+                        {
+                            os.Seek(dst - beg);
+                            return 1;
+                        }
+                        z -= 257;
+                        len = zlengthBase[z];
+                        if (zlengthExtra[z])
+                            len += is.ReadBits(zlengthExtra[z]);
+                        z = ZhuffmanDecode(is, zDistance);
+                        if (z < 0)
+                            return PngError("bad huffman code", "Corrupt PNG");
+                        dist = zdistBase[z];
+                        if (zdistExtra[z])
+                            dist += is.ReadBits(zdistExtra[z]);
+                        if (dst - beg < dist)
+                            return PngError("bad dist", "Corrupt PNG");
+                        if (dst + len > end)
+                        {
+                            os.Reserve(end - beg + 1);
+                            beg = os.Data();
+                            dst = os.Current();
+                            end = beg + os.Capacity();
+                        }
+                        uint8_t* src = dst - dist;
+                        if (dist == 1)
+                        {
+                            memset(dst, *src, len);
+                            dst += len;
+                        }
+                        else if (dist < len || len < 16)
+                        {
+                            for (; len; len--)
+                                *dst++ = *src++;
+                        }
+                        else
+                        {
+                            memcpy(dst, src, len);
+                            dst += len;
+                        }
+                    }
+                }
+            }
+
+            static int ComputeHuffmanCodes(InputMemoryStream& is, Zhuffman& zLength, Zhuffman& zDistance)
+            {
+                static const uint8_t length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
+                Zhuffman z_codelength;
+                uint8_t lencodes[286 + 32 + 137];
+                uint8_t codelength_sizes[19];
+                int i, n;
+
+                int hlit = is.ReadBits(5) + 257;
+                int hdist = is.ReadBits(5) + 1;
+                int hclen = is.ReadBits(4) + 4;
+                int ntot = hlit + hdist;
+
+                memset(codelength_sizes, 0, sizeof(codelength_sizes));
+                for (i = 0; i < hclen; ++i)
+                {
+                    int s = is.ReadBits(3);
+                    codelength_sizes[length_dezigzag[i]] = (uint8_t)s;
+                }
+                if (!z_codelength.Build(codelength_sizes, 19))
+                    return 0;
+                n = 0;
+                while (n < ntot)
+                {
+                    int c = ZhuffmanDecode(is, z_codelength);
+                    if (c < 0 || c >= 19)
+                        return PngError("bad codelengths", "Corrupt PNG");
+                    if (c < 16)
+                        lencodes[n++] = (uint8_t)c;
+                    else
+                    {
+                        uint8_t fill = 0;
+                        if (c == 16)
+                        {
+                            c = is.ReadBits(2) + 3;
+                            if (n == 0) return PngError("bad codelengths", "Corrupt PNG");
+                            fill = lencodes[n - 1];
+                        }
+                        else if (c == 17)
+                            c = is.ReadBits(3) + 3;
+                        else if (c == 18)
+                            c = is.ReadBits(7) + 11;
+                        else
+                            return PngError("bad codelengths", "Corrupt PNG");
+                        if (ntot - n < c)
+                            return PngError("bad codelengths", "Corrupt PNG");
+                        memset(lencodes + n, fill, c);
+                        n += c;
+                    }
+                }
+                if (n != ntot)
+                    return PngError("bad codelengths", "Corrupt PNG");
+                if (!zLength.Build(lencodes, hlit))
+                    return 0;
+                if (!zDistance.Build(lencodes + hlit, hdist))
+                    return 0;
+                return 1;
+            }
+
+            static int ParseUncompressedBlock(InputMemoryStream& is, OutputMemoryStream& os)
+            {
+                is.ClearBits();
+                uint16_t len, nlen;
+                if (!is.Read16u(len) || !is.Read16u(nlen) || nlen != (len ^ 0xffff))
+                    return PngError("zlib corrupt", "Corrupt PNG");
+                if (!os.Write(is, len))
+                    return PngError("read past buffer", "Corrupt PNG");
+                return 1;
+            }
+
+            static int ParseHeader(InputMemoryStream& is)
+            {
+                uint8_t cmf, flg;
+                if (!(is.Read8u(cmf) && is.Read8u(flg)))
+                    return PngError("bad zlib header", "Corrupt PNG");
+                if ((int(cmf) * 256 + flg) % 31 != 0)
+                    return PngError("bad zlib header", "Corrupt PNG");
+                if (flg & 32)
+                    return PngError("no preset dict", "Corrupt PNG");
+                if ((cmf & 15) != 8)
+                    return PngError("bad compression", "Corrupt PNG");
+                return 1;
+            }
+
+            bool Decode(InputMemoryStream& is, OutputMemoryStream& os, bool parseHeader)
+            {
+                static const uint8_t ZdefaultLength[288] = {
+                   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+                   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+                   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+                   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+                   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+                   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+                   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+                   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+                   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
+                };
+                static const uint8_t ZdefaultDistance[32] = {
+                   5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+                };
+
+                Zhuffman zLength, zDistance;
+                int final, type;
+                if (parseHeader)
+                {
+                    if (!ParseHeader(is))
+                        return false;
+                }
+                do
+                {
+                    final = is.ReadBits(1);
+                    type = is.ReadBits(2);
+                    if (type == 0)
+                    {
+                        if (!ParseUncompressedBlock(is, os))
+                            return false;
+                    }
+                    else if (type == 3)
+                        return false;
+                    else
+                    {
+                        if (type == 1)
+                        {
+                            if (!zLength.Build(ZdefaultLength, 288))
+                                return false;
+                            if (!zDistance.Build(ZdefaultDistance, 32))
+                                return false;
+                        }
+                        else
+                        {
+                            if (!ComputeHuffmanCodes(is, zLength, zDistance))
+                                return false;
+                        }
+                        if (!ParseHuffmanBlock(is, zLength, zDistance, os))
+                            return false;
+                    }
+                } while (!final);
+                return true;
+            }
+        }
+
+        typedef struct
+        {
+            PngContext* s;
+            uint8_t * out;
+            uint8_t depth;
+        } png__png;
+
+        enum 
+        {
+            PNG__F_none = 0,
+            PNG__F_sub = 1,
+            PNG__F_up = 2,
+            PNG__F_avg = 3,
+            PNG__F_paeth = 4,
+            // synthetic filters used for first scanline to avoid needing a dummy row of 0s
+            PNG__F_avg_first,
+            PNG__F_paeth_first
+        };
+
+        static uint8_t first_row_filter[5] =
+        {
+           PNG__F_none,
+           PNG__F_sub,
+           PNG__F_none,
+           PNG__F_avg_first,
+           PNG__F_paeth_first
+        };
+
+        static int png__paeth(int a, int b, int c)
+        {
+            int p = a + b - c;
+            int pa = abs(p - a);
+            int pb = abs(p - b);
+            int pc = abs(p - c);
+            if (pa <= pb && pa <= pc) return a;
+            if (pb <= pc) return b;
+            return c;
+        }
+
+        static const uint8_t png__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
+
+        // create the png data from post-deflated data
+        static int png__create_png_image_raw(png__png* a, uint8_t* raw, uint32_t raw_len, int out_n, uint32_t x, uint32_t y, int depth, int color)
+        {
+            int bytes = (depth == 16 ? 2 : 1);
+            PngContext* s = a->s;
+            uint32_t i, j, stride = x * out_n * bytes;
+            uint32_t img_len, img_width_bytes;
+            int k;
+            int img_n = s->img_n; // copy it into a local for later
+
+            int output_bytes = out_n * bytes;
+            int filter_bytes = img_n * bytes;
+            int width = x;
+
+            assert(out_n == s->img_n || out_n == s->img_n + 1);
+            a->out = (uint8_t*)png__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
+            if (!a->out) return PngError("outofmem", "Out of memory");
+
+            if (!png__mad3sizes_valid(img_n, x, depth, 7)) return PngError("too large", "Corrupt PNG");
+            img_width_bytes = (((img_n * x * depth) + 7) >> 3);
+            img_len = (img_width_bytes + 1) * y;
+
+            // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
+            // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
+            // so just check for raw_len < img_len always.
+            if (raw_len < img_len) 
+                return PngError("not enough pixels", "Corrupt PNG");
+
+            for (j = 0; j < y; ++j) 
+            {
+                uint8_t* cur = a->out + stride * j;
+                uint8_t* prior;
+                int filter = *raw++;
+
+                if (filter > 4)
+                    return PngError("invalid filter", "Corrupt PNG");
+
+                if (depth < 8) 
+                {
+                    if (img_width_bytes > x) 
+                        return PngError("invalid width", "Corrupt PNG");
+                    cur += x * out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
+                    filter_bytes = 1;
+                    width = img_width_bytes;
+                }
+                prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
+
+                // if first row, use special filter that doesn't sample previous row
+                if (j == 0) filter = first_row_filter[filter];
+
+                // handle first byte explicitly
+                for (k = 0; k < filter_bytes; ++k) 
+                {
+                    switch (filter) {
+                    case PNG__F_none: cur[k] = raw[k]; break;
+                    case PNG__F_sub: cur[k] = raw[k]; break;
+                    case PNG__F_up: cur[k] = PNG__BYTECAST(raw[k] + prior[k]); break;
+                    case PNG__F_avg: cur[k] = PNG__BYTECAST(raw[k] + (prior[k] >> 1)); break;
+                    case PNG__F_paeth: cur[k] = PNG__BYTECAST(raw[k] + png__paeth(0, prior[k], 0)); break;
+                    case PNG__F_avg_first: cur[k] = raw[k]; break;
+                    case PNG__F_paeth_first: cur[k] = raw[k]; break;
+                    }
+                }
+
+                if (depth == 8) 
+                {
+                    if (img_n != out_n)
+                        cur[img_n] = 255; // first pixel
+                    raw += img_n;
+                    cur += out_n;
+                    prior += out_n;
+                }
+                else if (depth == 16) 
+                {
+                    if (img_n != out_n) 
+                    {
+                        cur[filter_bytes] = 255; // first pixel top byte
+                        cur[filter_bytes + 1] = 255; // first pixel bottom byte
+                    }
+                    raw += filter_bytes;
+                    cur += output_bytes;
+                    prior += output_bytes;
+                }
+                else 
+                {
+                    raw += 1;
+                    cur += 1;
+                    prior += 1;
+                }
+
+                // this is a little gross, so that we don't switch per-pixel or per-component
+                if (depth < 8 || img_n == out_n) 
+                {
+                    int nk = (width - 1) * filter_bytes;
+#define PNG__CASE(f) \
+             case f:     \
+                for (k=0; k < nk; ++k)
+                    switch (filter) {
+                        // "none" filter turns into a memcpy here; make that explicit.
+                    case PNG__F_none:         memcpy(cur, raw, nk); break;
+                        PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - filter_bytes]); } break;
+                        PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break;
+                        PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes])); } break;
+                        PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], 0, 0)); } break;
+                    }
+#undef PNG__CASE
+                    raw += nk;
+                }
+                else 
+                {
+                    assert(img_n + 1 == out_n);
+#define PNG__CASE(f) \
+             case f:     \
+                for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
+                   for (k=0; k < filter_bytes; ++k)
+                    switch (filter) {
+                        PNG__CASE(PNG__F_none) { cur[k] = raw[k]; } break;
+                        PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - output_bytes]); } break;
+                        PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break;
+                        PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - output_bytes]) >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes])); } break;
+                        PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - output_bytes] >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], 0, 0)); } break;
+                    }
+#undef PNG__CASE
+
+                    // the loop above sets the high byte of the pixels' alpha, but for
+                    // 16 bit png files we also need the low byte set. we'll do that here.
+                    if (depth == 16) 
+                    {
+                        cur = a->out + stride * j; // start at the beginning of the row again
+                        for (i = 0; i < x; ++i, cur += output_bytes) 
+                            cur[filter_bytes + 1] = 255;
+                    }
+                }
+            }
+
+            // we make a separate pass to expand bits to pixels; for performance,
+            // this could run two scanlines behind the above code, so it won't
+            // intefere with filtering but will still be in the cache.
+            if (depth < 8)
+            {
+                for (j = 0; j < y; ++j)
+                {
+                    uint8_t* cur = a->out + stride * j;
+                    uint8_t* in = a->out + stride * j + x * out_n - img_width_bytes;
+                    // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
+                    // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
+                    uint8_t scale = (color == 0) ? png__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
+
+                    // note that the final byte might overshoot and write more data than desired.
+                    // we can allocate enough data that this never writes out of memory, but it
+                    // could also overwrite the next scanline. can it overwrite non-empty data
+                    // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
+                    // so we need to explicitly clamp the final ones
+
+                    if (depth == 4) 
+                    {
+                        for (k = x * img_n; k >= 2; k -= 2, ++in) 
+                        {
+                            *cur++ = scale * ((*in >> 4));
+                            *cur++ = scale * ((*in) & 0x0f);
+                        }
+                        if (k > 0) 
+                            *cur++ = scale * ((*in >> 4));
+                    }
+                    else if (depth == 2) 
+                    {
+                        for (k = x * img_n; k >= 4; k -= 4, ++in) 
+                        {
+                            *cur++ = scale * ((*in >> 6));
+                            *cur++ = scale * ((*in >> 4) & 0x03);
+                            *cur++ = scale * ((*in >> 2) & 0x03);
+                            *cur++ = scale * ((*in) & 0x03);
+                        }
+                        if (k > 0) 
+                            *cur++ = scale * ((*in >> 6));
+                        if (k > 1) 
+                            *cur++ = scale * ((*in >> 4) & 0x03);
+                        if (k > 2) 
+                            *cur++ = scale * ((*in >> 2) & 0x03);
+                    }
+                    else if (depth == 1)
+                    {
+                        for (k = x * img_n; k >= 8; k -= 8, ++in) 
+                        {
+                            *cur++ = scale * ((*in >> 7));
+                            *cur++ = scale * ((*in >> 6) & 0x01);
+                            *cur++ = scale * ((*in >> 5) & 0x01);
+                            *cur++ = scale * ((*in >> 4) & 0x01);
+                            *cur++ = scale * ((*in >> 3) & 0x01);
+                            *cur++ = scale * ((*in >> 2) & 0x01);
+                            *cur++ = scale * ((*in >> 1) & 0x01);
+                            *cur++ = scale * ((*in) & 0x01);
+                        }
+                        if (k > 0) *cur++ = scale * ((*in >> 7));
+                        if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01);
+                        if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01);
+                        if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01);
+                        if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01);
+                        if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01);
+                        if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
+                    }
+                    if (img_n != out_n) 
+                    {
+                        int q;
+                        // insert alpha = 255
+                        cur = a->out + stride * j;
+                        if (img_n == 1) 
+                        {
+                            for (q = x - 1; q >= 0; --q)
+                            {
+                                cur[q * 2 + 1] = 255;
+                                cur[q * 2 + 0] = cur[q];
+                            }
+                        }
+                        else
+                        {
+                            assert(img_n == 3);
+                            for (q = x - 1; q >= 0; --q) 
+                            {
+                                cur[q * 4 + 3] = 255;
+                                cur[q * 4 + 2] = cur[q * 3 + 2];
+                                cur[q * 4 + 1] = cur[q * 3 + 1];
+                                cur[q * 4 + 0] = cur[q * 3 + 0];
+                            }
+                        }
+                    }
+                }
+            }
+            else if (depth == 16) 
+            {
+                // force the image data from big-endian to platform-native.
+                // this is done in a separate pass due to the decoding relying
+                // on the data being untouched, but could probably be done
+                // per-line during decode if care is taken.
+                uint8_t* cur = a->out;
+                uint16_t* cur16 = (uint16_t*)cur;
+
+                for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2)
+                    *cur16 = (cur[0] << 8) | cur[1];
+            }
+
+            return 1;
+        }
+
+        static int png__create_png_image(png__png* a, uint8_t* image_data, uint32_t image_data_len, int out_n, int depth, int color, int interlaced)
+        {
+            int bytes = (depth == 16 ? 2 : 1);
+            int out_bytes = out_n * bytes;
+            uint8_t* final;
+            int p;
+            if (!interlaced)
+                return png__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
+
+            // de-interlacing
+            final = (uint8_t*)png__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+            for (p = 0; p < 7; ++p) 
+            {
+                int xorig[] = { 0,4,0,2,0,1,0 };
+                int yorig[] = { 0,0,4,0,2,0,1 };
+                int xspc[] = { 8,8,4,4,2,2,1 };
+                int yspc[] = { 8,8,8,4,4,2,2 };
+                int i, j, x, y;
+                // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
+                x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p];
+                y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p];
+                if (x && y) 
+                {
+                    uint32_t img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
+                    if (!png__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color))
+                    {
+                        PNG_FREE(final);
+                        return 0;
+                    }
+                    for (j = 0; j < y; ++j) 
+                    {
+                        for (i = 0; i < x; ++i) 
+                        {
+                            int out_y = j * yspc[p] + yorig[p];
+                            int out_x = i * xspc[p] + xorig[p];
+                            memcpy(final + out_y * a->s->img_x * out_bytes + out_x * out_bytes,
+                                a->out + (j * x + i) * out_bytes, out_bytes);
+                        }
+                    }
+                    PNG_FREE(a->out);
+                    image_data += img_len;
+                    image_data_len -= img_len;
+                }
+            }
+            a->out = final;
+
+            return 1;
+        }
+
+        static int png__compute_transparency(png__png* z, uint8_t tc[3], int out_n)
+        {
+            PngContext* s = z->s;
+            uint32_t i, pixel_count = s->img_x * s->img_y;
+            uint8_t* p = z->out;
+
+            // compute color-based transparency, assuming we've
+            // already got 255 as the alpha value in the output
+            assert(out_n == 2 || out_n == 4);
+
+            if (out_n == 2) 
+            {
+                for (i = 0; i < pixel_count; ++i) 
+                {
+                    p[1] = (p[0] == tc[0] ? 0 : 255);
+                    p += 2;
+                }
+            }
+            else 
+            {
+                for (i = 0; i < pixel_count; ++i) 
+                {
+                    if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+                        p[3] = 0;
+                    p += 4;
+                }
+            }
+            return 1;
+        }
+
+        static int png__compute_transparency16(png__png* z, uint16_t tc[3], int out_n)
+        {
+            PngContext* s = z->s;
+            uint32_t i, pixel_count = s->img_x * s->img_y;
+            uint16_t* p = (uint16_t*)z->out;
+
+            // compute color-based transparency, assuming we've
+            // already got 65535 as the alpha value in the output
+            assert(out_n == 2 || out_n == 4);
+
+            if (out_n == 2) 
+            {
+                for (i = 0; i < pixel_count; ++i)
+                {
+                    p[1] = (p[0] == tc[0] ? 0 : 65535);
+                    p += 2;
+                }
+            }
+            else 
+            {
+                for (i = 0; i < pixel_count; ++i)
+                {
+                    if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+                        p[3] = 0;
+                    p += 4;
+                }
+            }
+            return 1;
+        }
+
+        static int png__expand_png_palette(png__png* a, uint8_t* palette, int len, int pal_img_n)
+        {
+            uint32_t i, pixel_count = a->s->img_x * a->s->img_y;
+            uint8_t* p, * temp_out, * orig = a->out;
+
+            p = (uint8_t*)png__malloc_mad2(pixel_count, pal_img_n, 0);
+            if (p == NULL) 
+                return PngError("outofmem", "Out of memory");
+
+            // between here and free(out) below, exitting would leak
+            temp_out = p;
+
+            if (pal_img_n == 3) 
+            {
+                for (i = 0; i < pixel_count; ++i) 
+                {
+                    int n = orig[i] * 4;
+                    p[0] = palette[n];
+                    p[1] = palette[n + 1];
+                    p[2] = palette[n + 2];
+                    p += 3;
+                }
+            }
+            else 
+            {
+                for (i = 0; i < pixel_count; ++i) 
+                {
+                    int n = orig[i] * 4;
+                    p[0] = palette[n];
+                    p[1] = palette[n + 1];
+                    p[2] = palette[n + 2];
+                    p[3] = palette[n + 3];
+                    p += 4;
+                }
+            }
+            PNG_FREE(a->out);
+            a->out = temp_out;
+
+            return 1;
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePngLoader::ImagePngLoader(const ImageLoaderParam& param)
+            : ImageLoader(param)
+            , _toAny8(NULL)
+            , _toBgra8(NULL)
+            , _toAny16(NULL)
+            , _toBgra16(NULL)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatRgba32;
+        }
+
+        void ImagePngLoader::SetConverters()
+        {
+            _bgrToBgra = Base::BgrToBgra;
+        }
+
+        SIMD_INLINE constexpr uint32_t ChunkType(char a, char b, char c, char d)
+        {
+            return ((uint32_t(a) << 24) + (uint32_t(b) << 16) + (uint32_t(c) << 8) + uint32_t(d));
+        }
+
+        bool ImagePngLoader::FromStream()
+        {
+            const int req_comp = 4;
+            PngContext context;
+            png__png p;
+            p.s = &context;
+            png__png* z = &p;
+
+            PngContext* s = z->s;
+
+            z->out = NULL;
+
+            if (!ParseFile())
+                return false;
+
+            s->img_x = _width;
+            s->img_y = _height;
+            z->depth = _depth;
+            s->img_n = _channels;
+
+            InputMemoryStream zSrc = MergedDataStream();
+            OutputMemoryStream zDst(AlignHi(size_t(_width) * _depth, 8) * _height * _channels + _height);
+            if(!Zlib::Decode(zSrc, zDst, !_iPhone))
+                return false;
+
+            if ((req_comp == s->img_n + 1 && req_comp != 3 && !_paletteChannels) || _hasTrans)
+                s->img_out_n = s->img_n + 1;
+            else
+                s->img_out_n = s->img_n;
+            if (!png__create_png_image(z, zDst.Data(), zDst.Size(), s->img_out_n, z->depth, _color, _interlace))
+                return 0;
+            if (_hasTrans) 
+            {
+                if (z->depth == 16)
+                {
+                    if (!png__compute_transparency16(z, _tc16, s->img_out_n))
+                        return false;
+                }
+                else
+                {
+                    if (!png__compute_transparency(z, _tc, s->img_out_n))
+                        return false;
+                }
+            }
+            if (_paletteChannels)
+            {
+                s->img_n = _paletteChannels; // record the actual colors we had
+                s->img_out_n = _paletteChannels;
+                if (req_comp >= 3) 
+                    s->img_out_n = req_comp;
+                if (!png__expand_png_palette(z, _palette.data, (int)_palette.size, s->img_out_n))
+                    return false;
+            }
+            else if (_hasTrans)
+                ++s->img_n;
+
+            if (!(p.depth <= 8 || p.depth == 16))
+                return false;
+            uint8_t* data = p.out;
+            p.out = NULL;
+            if (req_comp && req_comp != p.s->img_out_n)
+            {
+                if (p.depth <= 8)
+                    data = png__convert_format((uint8_t*)data, p.s->img_out_n, req_comp, _width, _height);
+                else
+                    data = (uint8_t*)png__convert_format16((uint16_t*)data, p.s->img_out_n, req_comp, _width, _height);
+                p.s->img_out_n = req_comp;
+                if (data == NULL)
+                    return false;
+            }
+            if (p.depth == 16)
+            {
+                size_t size = context.img_x * context.img_y * req_comp;
+                const uint16_t* src = (uint16_t*)data;
+                uint8_t* dst = (uint8_t*)PNG_MALLOC(size);
+                for (size_t i = 0; i < size; ++i)
+                    dst[i] = uint8_t(src[i] >> 8);
+                PNG_FREE(data);
+                data = dst;
+            }
+            PNG_FREE(p.out);
+            if (data)
+            {
+                size_t stride = 4 * context.img_x;
+                _image.Recreate(context.img_x, context.img_y, (Image::Format)_param.format);
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8:
+                    Base::RgbaToGray(data, context.img_x, context.img_y, stride, _image.data, _image.stride);
+                    break;
+                case SimdPixelFormatBgr24:
+                    Base::BgraToRgb(data, context.img_x, context.img_y, stride, _image.data, _image.stride);
+                    break;
+                case SimdPixelFormatBgra32:
+                    Base::BgraToRgba(data, context.img_x, context.img_y, stride, _image.data, _image.stride);
+                    break;
+                case SimdPixelFormatRgb24:
+                    Base::BgraToBgr(data, context.img_x, context.img_y, stride, _image.data, _image.stride);
+                    break;
+                case SimdPixelFormatRgba32:
+                    Base::Copy(data, stride, context.img_x, context.img_y, 4, _image.data, _image.stride);
+                    break;
+                default: 
+                    break;
+                }
+                PNG_FREE(data);
+                return true;
+            }
+            return false;
+        }
+
+        bool ImagePngLoader::ParseFile()
+        {
+            _first = true, _iPhone = false, _hasTrans = false;
+            if (!CheckHeader())
+                return false;
+            for (bool run = true; run;)
+            {
+                Chunk chunk;
+                if (!ReadChunk(chunk))
+                    return 0;
+                if (chunk.type == ChunkType('C', 'g', 'B', 'I'))
+                {
+                    _iPhone = true;
+                    _stream.Skip(chunk.size);
+                }
+                else if (chunk.type == ChunkType('I', 'H', 'D', 'R'))
+                {
+                    if (!ReadHeader(chunk))
+                        return false;
+                    SetConverters();
+                }
+                else if (chunk.type == ChunkType('P', 'L', 'T', 'E'))
+                {
+                    if (!ReadPalette(chunk))
+                        return false;
+                }
+                else if (chunk.type == ChunkType('t', 'R', 'N', 'S'))
+                {
+                    if (!ReadTransparency(chunk))
+                        return false;
+                }
+                else if (chunk.type == ChunkType('I', 'D', 'A', 'T'))
+                {
+                    if (!ReadData(chunk))
+                        return false;
+                }
+                else if (chunk.type == ChunkType('I', 'E', 'N', 'D'))
+                {
+                    if (_first)
+                        return false;
+                    run = false;
+                }
+                else
+                {
+                    if (_first || (chunk.type & (1 << 29)) == 0)
+                        return false;
+                    _stream.Skip(chunk.size);
+                }
+                uint32_t crc32;
+                if (!_stream.ReadBe32u(crc32))
+                    return false;
+            }
+            return _idats.size() != 0;
+        }
+
+        bool ImagePngLoader::CheckHeader()
+        {
+            const size_t size = 8;
+            const uint8_t control[size] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+            uint8_t buffer[size];
+            return _stream.Read(size, buffer) == size && memcmp(buffer, control, size) == 0;
+        }
+
+        SIMD_INLINE bool ImagePngLoader::ReadChunk(Chunk& chunk)
+        {
+            if (_stream.ReadBe32u(chunk.size) && _stream.ReadBe32u(chunk.type))
+            {
+                chunk.offs = (uint32_t)_stream.Pos();
+                return true;
+            }
+            return false;
+        }
+
+        bool ImagePngLoader::ReadHeader(const Chunk& chunk)
+        {
+            const int MAX_SIZE = 1 << 24;
+            if (!_first)
+                return false;
+            _first = false;
+            if (!(chunk.size == 13 && _stream.CanRead(13)))
+                return false;
+            uint8_t comp, filter;
+            if (!(_stream.ReadBe32u(_width) && _stream.ReadBe32u(_height) &&
+                _stream.Read8u(_depth) && _stream.Read8u(_color) && _stream.Read8u(comp) &&
+                _stream.Read8u(filter) && _stream.Read8u(_interlace)))
+                return false;
+            if (_width == 0 || _width > MAX_SIZE || _height == 0 || _height > MAX_SIZE)
+                return false;
+            if (_depth != 1 && _depth != 2 && _depth != 4 && _depth != 8 && _depth != 16)
+                return false;
+            if (_color > 6 || (_color == 3 && _depth == 16))
+                return false;
+            _paletteChannels = 0;
+            if (_color == 3)
+                _paletteChannels = 3;
+            else if (_color & 1)
+                return false;
+            if (comp != 0 || filter != 0 || _interlace > 1)
+                return false;
+            if (!_paletteChannels)
+            {
+                _channels = (_color & 2 ? 3 : 1) + (_color & 4 ? 1 : 0);
+                if ((1 << 30) / _width / _channels < _height)
+                    return false;
+            }
+            else
+            {
+                _channels = 1;
+                if ((1 << 30) / _width / 4 < _height)
+                    return false;
+            }
+            return true;
+        }
+
+        bool ImagePngLoader::ReadPalette(const Chunk& chunk)
+        {
+            if (_first || chunk.size > 256 * 3)
+                return false;
+            size_t length = chunk.size / 3;
+            if (length * 3 != chunk.size)
+                return false;
+            if (_stream.CanRead(chunk.size))
+            {
+                _palette.Resize(length * 4);
+                _bgrToBgra(_stream.Current(), length, 1, length, _palette.data, _palette.size, 0xFF);
+                _stream.Skip(chunk.size);
+                return true;
+            }
+            else
+                return false;
+        }
+
+        bool ImagePngLoader::ReadTransparency(const Chunk& chunk)
+        {
+            if (_first)
+                return false;
+            if (_idats.size())
+                return false;
+            if (_paletteChannels)
+            {
+                if (_palette.size == 0 || chunk.size > _palette.size || !_stream.CanRead(chunk.size))
+                    return false;
+                _paletteChannels = 4;
+                for (size_t i = 0; i < chunk.size; ++i)
+                    _palette.data[i * 4 + 3] = _stream.Current()[i];
+                _stream.Skip(chunk.size);
+            }
+            else
+            {
+                if (!(_channels & 1) || chunk.size != _channels * 2)
+                    return false;
+                _hasTrans = true;
+                for (size_t k = 0; k < _channels; ++k)
+                    if (!_stream.ReadBe16u(_tc16[k]))
+                        return false;
+                if (_depth != 16)
+                {
+                    for (size_t k = 0; k < _channels; ++k)
+                        _tc[k] = uint8_t(_tc16[k]) * png__depth_scale_table[_depth];
+                }
+            }
+            return true;
+        }
+
+        bool ImagePngLoader::ReadData(const Chunk& chunk)
+        {
+            if (_first)
+                return false;
+            if (_paletteChannels && !_palette.size)
+                return false;
+            if (!_stream.CanRead(chunk.size))
+                return false;
+            _idats.push_back(chunk);
+            _stream.Skip(chunk.size);
+            return true;
+        }
+
+        InputMemoryStream ImagePngLoader::MergedDataStream()
+        {
+            if (_idats.size() == 1)
+                return InputMemoryStream((uint8_t*)_stream.Data() + _idats[0].offs, _idats[0].size);
+            else
+            {
+                size_t size = 0;
+                for (size_t i = 0; i < _idats.size(); ++i)
+                    size += _idats[i].size;
+                _idat.Resize(size);
+                for (size_t i = 0, offset = 0; i < _idats.size(); ++i)
+                {
+                    memcpy(_idat.data + offset, _stream.Data() + _idats[i].offs, _idats[i].size);
+                    offset += _idats[i].size;
+                }
+                return InputMemoryStream(_idat.data, _idat.size);
+            }
+        }
+    }
+}
diff --git a/3rdparty/simdlib/Simd/SimdBaseImageSave.cpp b/3rdparty/simdlib/Simd/SimdBaseImageSave.cpp
new file mode 100644
index 0000000000..fb5a8eacef
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseImageSave.cpp
@@ -0,0 +1,340 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdCpu.h"
+#include "Simd/SimdBase.h"
+
+#include <stdio.h>
+
+#include <memory>
+#include <sstream>
+
+#if defined(_MSC_VER)
+#pragma warning (push)
+#pragma warning (disable: 4996)
+#endif
+
+namespace Simd
+{        
+    SimdBool ImageSaveToFile(const ImageSaveToMemoryPtr saver, const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char* path)
+    {
+        SimdBool result = SimdFalse;
+        size_t size;
+        uint8_t * data = saver(src, stride, width, height, format, file, quality, &size);
+        if (data)
+        {
+            ::FILE* file = ::fopen(path, "wb");
+            if (file)
+            {
+                if (::fwrite(data, 1, size, file) == size)
+                    result = SimdTrue;
+                ::fclose(file);
+            }
+            Simd::Free(data);
+        }
+        return result;
+    }
+
+    //-------------------------------------------------------------------------
+
+    namespace Base
+    {
+        ImagePxmSaver::ImagePxmSaver(const ImageSaverParam& param)
+            : ImageSaver(param)
+            , _convert(NULL)
+        {
+            _block = _param.height;
+            if (_param.file == SimdImageFilePgmTxt || _param.file == SimdImageFilePgmBin)
+            {
+                _size = _param.width * 1;
+                if (_param.format != SimdPixelFormatGray8)
+                {
+                    _block = Simd::RestrictRange<size_t>(Base::AlgCacheL1() / _size, 1, _param.height);
+                    _buffer.Resize(_block * _size);
+                }
+            }
+            else if (_param.file == SimdImageFilePpmTxt || _param.file == SimdImageFilePpmBin)
+            {
+                _size = _param.width * 3;
+                if (_param.format != SimdPixelFormatRgb24)
+                {
+                    _block = Simd::RestrictRange<size_t>(Base::AlgCacheL1() / _size, 1, _param.height);
+                    _buffer.Resize(_block * _size);
+                }
+            }
+            else
+                assert(0);
+        }
+
+        void ImagePxmSaver::WriteHeader(size_t version)
+        {
+            std::stringstream header;
+            header << "P" << version << "\n" << _param.width << " " << _param.height << "\n255\n";
+            _stream.Write(header.str().c_str(), header.str().size());
+        }
+
+        uint8_t g_pxmPrint[256][4];
+        bool PxmPrintInit()
+        {
+            for (int i = 0; i < 256; ++i)
+            {
+                int d0 = i / 100;
+                int d1 = (i / 10) % 10;
+                int d2 = i % 10;
+                g_pxmPrint[i][0] = d0 ? '0' + d0 : ' ';
+                g_pxmPrint[i][1] = (d1 || d0) ? '0' + d1 : ' ';
+                g_pxmPrint[i][2] = '0' + d2;
+                g_pxmPrint[i][3] = ' ';
+            }
+            return true;
+        }
+        bool g_pxmPrintInited = PxmPrintInit();
+
+        //---------------------------------------------------------------------
+
+        ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param)
+            : ImagePxmSaver(param)
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24: _convert = Base::BgrToGray; break;
+            case SimdPixelFormatBgra32: _convert = Base::BgraToGray; break;
+            case SimdPixelFormatRgb24: _convert = Base::RgbToGray; break;
+            case SimdPixelFormatRgba32: _convert = Base::RgbaToGray; break;
+            default: break;
+            }
+        }
+
+        bool ImagePgmTxtSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            size_t grayStride = _param.format == SimdPixelFormatGray8 ? stride : _size;
+            _stream.Reserve(32 + _param.height * (_param.width * 4 + DivHi(_param.width, 17)));
+            WriteHeader(2);
+            for (size_t row = 0; row < _param.height;)
+            {
+                size_t block = Simd::Min(row + _block, _param.height) - row;
+                const uint8_t* gray = src;
+                if (_param.format != SimdPixelFormatGray8)
+                {
+                    _convert(src, _param.width, block, stride, _buffer.data, grayStride);
+                    gray = _buffer.data;
+                }                
+                for (size_t b = 0; b < block; ++b)
+                {
+                    uint8_t string[70];
+                    for (size_t col = 0, offset = 0; col < _param.width; ++col)
+                    {
+                        *(uint32_t*)(string + offset) = *(uint32_t*)g_pxmPrint[gray[col]];
+                        offset += 4;
+                        if (offset >= 68 || col == _param.width - 1)
+                        {
+                            string[offset++] = '\n';
+                            _stream.Write(string, offset);
+                            offset = 0;
+                        }
+                    }
+                    gray += grayStride;
+                }
+                src += stride * block;
+                row += block;
+            }
+            return true;
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param)
+            : ImagePxmSaver(param)
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24: _convert = Base::BgrToGray; break;
+            case SimdPixelFormatBgra32: _convert = Base::BgraToGray; break;
+            case SimdPixelFormatRgb24: _convert = Base::RgbToGray; break;
+            case SimdPixelFormatRgba32: _convert = Base::RgbaToGray; break;
+            default: break;
+            }
+        }
+
+        bool ImagePgmBinSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            size_t grayStride = _param.format == SimdPixelFormatGray8 ? stride : _size;
+            _stream.Reserve(32 + _param.height * _size);
+            WriteHeader(5);
+            for (size_t row = 0; row < _param.height;)
+            {
+                size_t block = Simd::Min(row + _block, _param.height) - row;
+                const uint8_t* gray = src;
+                if (_param.format != SimdPixelFormatGray8)
+                {
+                    _convert(src, _param.width, block, stride, _buffer.data, grayStride);
+                    gray = _buffer.data;
+                }
+                for (size_t b = 0; b < block; ++b)
+                {
+                    _stream.Write(gray, _size);
+                    gray += grayStride;
+                }
+                src += stride * block;
+                row += block;
+            }
+            return true;
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param)
+            : ImagePxmSaver(param)
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatGray8: _convert = Base::GrayToBgr; break;
+            case SimdPixelFormatBgr24: _convert = Base::BgrToRgb; break;
+            case SimdPixelFormatBgra32: _convert = Base::BgraToRgb; break;
+            case SimdPixelFormatRgba32: _convert = Base::BgraToBgr; break;
+            default: break;
+            }
+        }
+
+        bool ImagePpmTxtSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            size_t rgbStride = _param.format == SimdPixelFormatRgb24 ? stride : _size;
+            _stream.Reserve(32 + _param.height * (_param.width * 13 + DivHi(_param.width, 5)));
+            WriteHeader(3);
+            for (size_t row = 0; row < _param.height;)
+            {
+                size_t block = Simd::Min(row + _block, _param.height) - row;
+                const uint8_t* rgb = src;
+                if (_param.format != SimdPixelFormatRgb24)
+                {
+                    _convert(src, _param.width, block, stride, _buffer.data, rgbStride);
+                    rgb = _buffer.data;
+                }
+                for (size_t b = 0; b < block; ++b)
+                {
+                    uint8_t string[70];
+                    for (size_t col = 0, offset = 0; col < _size; col += 3)
+                    {
+                        ((uint32_t*)(string + offset))[0] = *(uint32_t*)g_pxmPrint[rgb[col + 0]];
+                        ((uint32_t*)(string + offset))[1] = *(uint32_t*)g_pxmPrint[rgb[col + 1]];
+                        ((uint32_t*)(string + offset))[2] = *(uint32_t*)g_pxmPrint[rgb[col + 2]];
+                        offset += 12;
+                        if (offset >= 68 || col == _size - 3)
+                        {
+                            string[offset++] = '\n';
+                            _stream.Write(string, offset);
+                            offset = 0;
+                        }
+                        else
+                        {
+                            string[offset++] = ' ';
+                            string[offset++] = ' ';
+                        }
+                    }
+                    rgb += rgbStride;
+                }
+                src += stride * block;
+                row += block;
+            }
+            return true;
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param)
+            : ImagePxmSaver(param)
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatGray8: _convert = Base::GrayToBgr; break;
+            case SimdPixelFormatBgr24: _convert = Base::BgrToRgb; break;
+            case SimdPixelFormatBgra32: _convert = Base::BgraToRgb; break;
+            case SimdPixelFormatRgba32: _convert = Base::BgraToBgr; break;
+            default: break;
+            }
+        }
+
+        bool ImagePpmBinSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            size_t rgbStride = _param.format == SimdPixelFormatRgb24 ? stride : _size;
+            _stream.Reserve(32 + _param.height * _size);
+            WriteHeader(6);
+            for (size_t row = 0; row < _param.height;)
+            {
+                size_t block = Simd::Min(row + _block, _param.height) - row;
+                const uint8_t* rgb = src;
+                if (_param.format != SimdPixelFormatRgb24)
+                {
+                    _convert(src, _param.width, block, stride, _buffer.data, rgbStride);
+                    rgb = _buffer.data;
+                }
+                for (size_t b = 0; b < block; ++b)
+                {
+                    _stream.Write(rgb, _size);
+                    rgb += rgbStride;
+                }
+                src += stride * block;
+                row += block;
+            }
+            return true;
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageSaver* CreateImageSaver(const ImageSaverParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinSaver(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinSaver(param);
+            case SimdImageFilePng:    return new ImagePngSaver(param);
+            case SimdImageFileJpeg:   return new ImageJpegSaver(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size)
+        {
+            ImageSaverParam param(width, height, format, file, quality);
+            if (param.Validate())
+            {
+                Holder<ImageSaver> saver(CreateImageSaver(param));
+                if (saver)
+                {
+                    if (saver->ToStream(src, stride))
+                        return saver->Release(size);
+                }
+            }
+            return NULL;
+        }
+    }
+}
+
+#if defined(_MSC_VER)
+#pragma warning (pop)
+#endif
diff --git a/3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp b/3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp
new file mode 100644
index 0000000000..f7ba583247
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseImageSaveJpeg.cpp
@@ -0,0 +1,451 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSaveJpeg.h"
+#include "Simd/SimdBase.h"
+
+namespace Simd
+{
+    namespace Base
+    {
+        const uint8_t JpegZigZagD[64] = { 
+            0, 1, 5, 6, 14, 15, 27, 28, 
+            2, 4, 7, 13, 16, 26, 29, 42, 
+            3, 8, 12, 17, 25, 30, 41, 43, 
+            9, 11, 18, 24, 31, 40, 44, 53, 
+            10, 19, 23, 32, 39, 45, 52, 54, 
+            20, 22, 33, 38, 46, 51, 55, 60, 
+            21, 34, 37, 47, 50, 56, 59, 61, 
+            35, 36, 48, 49, 57, 58, 62, 63 };
+
+        const uint8_t JpegZigZagT[64] = { 
+            0, 2, 3, 9, 10, 20, 21, 35,
+            1, 4, 8, 11, 19, 22, 34, 36,
+            5, 7, 12, 18, 23, 33, 37, 48,
+            6, 13, 17, 24, 32, 38, 47, 49,
+            14, 16, 25, 31, 39, 46, 50, 57,
+            15, 26, 30, 40, 45, 51, 56, 58,
+            27, 29, 41, 44, 52, 55, 59, 62,
+            28, 42, 43, 53, 54, 60, 61, 63 };        
+
+        const uint16_t HuffmanYdc[256][2] = { {0, 2}, {2, 3}, {3, 3}, {4, 3}, {5, 3}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9} };
+        const uint16_t HuffmanUVdc[256][2] = { {0, 2}, {1, 2}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}, {1022, 10}, {2046, 11} };
+        const uint16_t HuffmanYac[256][2] = {
+           {10, 4}, {0, 2}, {1, 2}, {4, 3}, {11, 4}, {26, 5}, {120, 7}, {248, 8}, {1014, 10}, {65410, 16}, {65411, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {12, 4}, {27, 5}, {121, 7}, {502, 9}, {2038, 11}, {65412, 16}, {65413, 16}, {65414, 16}, {65415, 16}, {65416, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {28, 5}, {249, 8}, {1015, 10}, {4084, 12}, {65417, 16}, {65418, 16}, {65419, 16}, {65420, 16}, {65421, 16}, {65422, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {58, 6}, {503, 9}, {4085, 12}, {65423, 16}, {65424, 16}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {59, 6}, {1016, 10}, {65430, 16}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {122, 7}, {2039, 11}, {65438, 16}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {123, 7}, {4086, 12}, {65446, 16}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {250, 8}, {4087, 12}, {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {504, 9}, {32704, 15}, {65462, 16}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {505, 9}, {65470, 16}, {65471, 16}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {506, 9}, {65479, 16}, {65480, 16}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {1017, 10}, {65488, 16}, {65489, 16}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {1018, 10}, {65497, 16}, {65498, 16}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {2040, 11}, {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {2041, 11}, {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+        };
+        const uint16_t HuffmanUVac[256][2] = {
+           {0, 2}, {1, 2}, {4, 3}, {10, 4}, {24, 5}, {25, 5}, {56, 6}, {120, 7}, {500, 9}, {1014, 10}, {4084, 12}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {11, 4}, {57, 6}, {246, 8}, {501, 9}, {2038, 11}, {4085, 12}, {65416, 16}, {65417, 16}, {65418, 16}, {65419, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {26, 5}, {247, 8}, {1015, 10}, {4086, 12}, {32706, 15}, {65420, 16}, {65421, 16}, {65422, 16}, {65423, 16}, {65424, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {27, 5}, {248, 8}, {1016, 10}, {4087, 12}, {65425, 16}, {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {65430, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {58, 6}, {502, 9}, {65431, 16}, {65432, 16}, {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {65438, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {59, 6}, {1017, 10}, {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {65446, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {121, 7}, {2039, 11}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, {65453, 16}, {65454, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {122, 7}, {2040, 11}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {65462, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {249, 8}, {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {65470, 16}, {65471, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {503, 9}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {65479, 16}, {65480, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {504, 9}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {65488, 16}, {65489, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {505, 9}, {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, {65497, 16}, {65498, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {506, 9}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {65506, 16}, {65507, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {2041, 11}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {65515, 16}, {65516, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {16352, 14}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {65525, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, 
+           {1018, 10}, {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16}, {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+        };
+
+#if defined(SIMD_JPEG_CALC_BITS_TABLE)
+        uint16_t JpegCalcBitsTable[JpegCalcBitsRange * 2][2];
+        bool JpegCalcBitsTableInit()
+        {
+            for (int i = 0, n = JpegCalcBitsRange * 2; i < n; ++i)
+            {
+                int val = i - JpegCalcBitsRange;
+                int tmp = val < 0 ? -val : val;
+                val = val < 0 ? val - 1 : val;
+                int cnt = 1;
+                while (tmp >>= 1)
+                    ++cnt;
+                JpegCalcBitsTable[i][0] = val & ((1 << cnt) - 1);
+                JpegCalcBitsTable[i][1] = cnt;
+            }
+            return true;
+        }
+        bool JpegCalcBitsTableInited = JpegCalcBitsTableInit();
+#endif
+
+        SIMD_INLINE void JpegDct(float* d0p, float* d1p, float* d2p, float* d3p, float* d4p, float* d5p, float* d6p, float* d7p)
+        {
+            float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p;
+            float z1, z2, z3, z4, z5, z11, z13;
+            float tmp0 = d0 + d7;
+            float tmp7 = d0 - d7;
+            float tmp1 = d1 + d6;
+            float tmp6 = d1 - d6;
+            float tmp2 = d2 + d5;
+            float tmp5 = d2 - d5;
+            float tmp3 = d3 + d4;
+            float tmp4 = d3 - d4;
+
+            float tmp10 = tmp0 + tmp3;
+            float tmp13 = tmp0 - tmp3;
+            float tmp11 = tmp1 + tmp2;
+            float tmp12 = tmp1 - tmp2;
+
+            d0 = tmp10 + tmp11;
+            d4 = tmp10 - tmp11;
+
+            z1 = (tmp12 + tmp13) * 0.707106781f;
+            d2 = tmp13 + z1;
+            d6 = tmp13 - z1;
+
+            tmp10 = tmp4 + tmp5;
+            tmp11 = tmp5 + tmp6;
+            tmp12 = tmp6 + tmp7;
+
+            z5 = (tmp10 - tmp12) * 0.382683433f;
+            z2 = tmp10 * 0.541196100f + z5;
+            z4 = tmp12 * 1.306562965f + z5;
+            z3 = tmp11 * 0.707106781f;
+
+            z11 = tmp7 + z3;
+            z13 = tmp7 - z3;
+
+            *d5p = z13 + z2;
+            *d3p = z13 - z2;
+            *d1p = z11 + z4;
+            *d7p = z11 - z4;
+
+            *d0p = d0;  *d2p = d2;  *d4p = d4;  *d6p = d6;
+        }
+
+        static int JpegProcessDu(Base::BitBuf& bitBuf, float* CDU, int stride, const float* fdtbl, int DC, const uint16_t HTDC[256][2], const uint16_t HTAC[256][2])
+        {
+            int offs, i, j, n, diff, end0pos, x, y;
+            for (offs = 0; offs < 8; ++offs) 
+                JpegDct(&CDU[offs], &CDU[offs + stride], &CDU[offs + stride * 2], &CDU[offs + stride * 3], &CDU[offs + stride * 4],
+                    &CDU[offs + stride * 5], &CDU[offs + stride * 6], &CDU[offs + stride * 7]);
+            for (offs = 0, n = stride * 8; offs < n; offs += stride)
+                JpegDct(&CDU[offs], &CDU[offs + 1], &CDU[offs + 2], &CDU[offs + 3], &CDU[offs + 4], &CDU[offs + 5], &CDU[offs + 6], &CDU[offs + 7]);
+            int DU[64];
+            for (y = 0, j = 0; y < 8; ++y) 
+            {
+                for (x = 0; x < 8; ++x, ++j) 
+                {
+                    i = y * stride + x;
+                    float v = CDU[i] * fdtbl[j];
+                    DU[JpegZigZagD[j]] = Round(v);
+                }
+            }
+            diff = DU[0] - DC;
+            if (diff == 0) 
+                bitBuf.Push(HTDC[0]);
+            else 
+            {
+                uint16_t bits[2];
+                JpegCalcBits(diff, bits);
+                bitBuf.Push(HTDC[bits[1]]);
+                bitBuf.Push(bits);
+            }
+            end0pos = 63;
+            for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos);
+            if (end0pos == 0) 
+            {
+                bitBuf.Push(HTAC[0x00]);
+                return DU[0];
+            }
+            for (i = 1; i <= end0pos; ++i)
+            {
+                int startpos = i;
+                int nrzeroes;
+                uint16_t bits[2];
+                for (; DU[i] == 0 && i <= end0pos; ++i);
+                nrzeroes = i - startpos;
+                if (nrzeroes >= 16) 
+                {
+                    int lng = nrzeroes >> 4;
+                    int nrmarker;
+                    for (nrmarker = 1; nrmarker <= lng; ++nrmarker)
+                        bitBuf.Push(HTAC[0xF0]);
+                    nrzeroes &= 15;
+                }
+                JpegCalcBits(DU[i], bits);
+                bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]);
+                bitBuf.Push(bits);
+            }
+            if (end0pos != 63) 
+                bitBuf.Push(HTAC[0x00]);
+            return DU[0];
+        }
+
+        void JpegWriteBlockSubs(OutputMemoryStream & stream, int width, int height, const uint8_t * red,
+            const uint8_t* green, const uint8_t* blue, int stride, const float * fY, const float* fUv, int dc[3])
+        {
+            int & DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            float Y[256], U[256], V[256];
+            float subU[64], subV[64];
+            bool gray = red == green && red == blue;
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 16)
+            {
+                for (int x = 0; x < width; x += 16)
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 16);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 16);
+                    DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        for (int yy = 0, pos = 0; yy < 8; ++yy)
+                        {
+                            for (int xx = 0; xx < 8; ++xx, ++pos)
+                            {
+                                int j = yy * 32 + xx * 2;
+                                subU[pos] = (U[j + 0] + U[j + 1] + U[j + 16] + U[j + 17]) * 0.25f;
+                                subV[pos] = (V[j + 0] + V[j + 1] + V[j + 16] + V[j + 17]) * 0.25f;
+                            }
+                        }
+                        DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+            }
+            Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+            bitBuf.Clear();
+        }
+
+        void JpegWriteBlockFull(OutputMemoryStream& stream, int width, int height, const uint8_t* red,
+            const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3])
+        {
+            int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            float Y[64], U[64], V[64];
+            bool gray = red == green && red == blue;
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 8)
+            {
+                for (int x = 0; x < width; x += 8)
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 8);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 8);
+                    DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+            }
+            Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+            bitBuf.Clear();
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageJpegSaver::ImageJpegSaver(const ImageSaverParam& param)
+            : ImageSaver(param)
+            , _deintBgra(NULL)
+            , _deintBgr(NULL)
+        {
+        }
+
+        void ImageJpegSaver::Init()
+        {
+            InitParams(false);
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24:
+            case SimdPixelFormatRgb24:
+                _deintBgr = Base::DeinterleaveBgr;
+                break;
+            case SimdPixelFormatBgra32:
+            case SimdPixelFormatRgba32:
+                _deintBgra = Base::DeinterleaveBgra;
+                break;
+            default: 
+                break;
+            }
+            _writeBlock = _subSample ? JpegWriteBlockSubs : JpegWriteBlockFull;
+        }
+
+        void ImageJpegSaver::InitParams(bool trans)
+        {
+            static const int YQT[] = { 16, 11, 10, 16, 24, 40, 51, 61, 12, 12, 14, 19, 26, 58, 60, 55, 14, 13, 
+                16, 24, 40, 57, 69, 56, 14, 17, 22, 29, 51, 87, 80, 62, 18, 22, 37, 56, 68, 109, 103, 77, 24, 
+                35, 55, 64, 81, 104, 113, 92, 49, 64, 78, 87, 103, 121, 120, 101, 72, 92, 95, 98, 112, 100, 103, 99 };
+            static const int UVQT[] = { 17, 18, 24, 47, 99, 99, 99, 99, 18, 21, 26, 66, 99, 99, 99, 99, 24, 
+                26, 56, 99, 99, 99, 99, 99, 47, 66, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 
+                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99 };
+            static const float AASF[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 
+                1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f, 1.0f * 2.828427125f, 
+                0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
+            _quality = _param.quality;
+            _quality = _quality ? _quality : 90;
+            _subSample = _quality <= 90 ? 1 : 0;
+            _quality = _quality < 1 ? 1 : _quality > 100 ? 100 : _quality;
+            _quality = _quality < 50 ? 5000 / _quality : 200 - _quality * 2;
+            for (size_t i = 0; i < 64; ++i)
+            {
+                int uvti, yti = (YQT[i] * _quality + 50) / 100;
+                _uY[Base::JpegZigZagD[i]] = uint8_t(yti < 1 ? 1 : yti > 255 ? 255 : yti);
+                uvti = (UVQT[i] * _quality + 50) / 100;
+                _uUv[Base::JpegZigZagD[i]] = uint8_t(uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
+            }
+            const uint8_t *ZigZag = trans ? Base::JpegZigZagT : Base::JpegZigZagD;
+            for (size_t y = 0, i = 0; y < 8; ++y)
+            {
+                for (size_t x = 0; x < 8; ++x, ++i)
+                {
+                    _fY[i] = 1.0f / (_uY[ZigZag[i]] * AASF[y] * AASF[x]);
+                    _fUv[i] = 1.0f / (_uUv[ZigZag[i]] * AASF[y] * AASF[x]);
+                }
+            }
+            _block = _subSample ? 16 : 8;
+            _width = (int)AlignHi(_param.width, _block);
+            if (_param.format != SimdPixelFormatGray8)
+                _buffer.Resize(_width * _block * 3);
+        }
+
+        void ImageJpegSaver::WriteHeader()
+        {
+            static const uint8_t DC_LUM_COD[] = { 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
+            static const uint8_t DC_LUM_VAL[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+            static const uint8_t AC_LUM_COD[] = { 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d };
+            static const uint8_t AC_LUM_VAL[] = {
+               0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08, 
+               0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0, 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28, 
+               0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 
+               0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 
+               0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 
+               0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2, 
+               0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa
+            };
+            static const uint8_t DC_CHR_COD[] = { 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
+            static const uint8_t DC_CHR_VAL[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+            static const uint8_t AC_CHR_COD[] = { 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77 };
+            static const uint8_t AC_CHR_VAL[] = {
+               0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71, 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, 
+               0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0, 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34, 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26, 
+               0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 
+               0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 
+               0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 
+               0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 
+               0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa
+            };
+            static const uint8_t head0[] = { 0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F', 'I', 'F', 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0xFF, 0xDB, 0, 0x84, 0 };
+            static const uint8_t head2[] = { 0xFF, 0xDA, 0, 0xC, 3, 1, 0, 2, 0x11, 3, 0x11, 0, 0x3F, 0 };
+            const uint8_t head1[] = { 0xFF, 0xC0, 0, 0x11, 8,  uint8_t(_param.height >> 8),  uint8_t(_param.height),  uint8_t(_param.width >> 8),  
+                uint8_t(_param.width), 3, 1, uint8_t(_subSample ? 0x22 : 0x11), 0, 2, 0x11, 1, 3, 0x11, 1, 0xFF, 0xC4, 0x01, 0xA2, 0 };
+            _stream.Write(head0, sizeof(head0));
+            _stream.Write(_uY, 64);
+            _stream.Write8u(1);
+            _stream.Write(_uUv, 64);
+            _stream.Write(head1, sizeof(head1));
+            _stream.Write(DC_LUM_COD + 1, sizeof(DC_LUM_COD) - 1);
+            _stream.Write(DC_LUM_VAL, sizeof(DC_LUM_VAL));
+            _stream.Write8u(0x10); // HTYACinfo
+            _stream.Write(AC_LUM_COD + 1, sizeof(AC_LUM_COD) - 1);
+            _stream.Write(AC_LUM_VAL, sizeof(AC_LUM_VAL));
+            _stream.Write8u(1); // HTUDCinfo
+            _stream.Write(DC_CHR_COD + 1, sizeof(DC_CHR_COD) - 1);
+            _stream.Write(DC_CHR_VAL, sizeof(DC_CHR_VAL));
+            _stream.Write8u(0x11); // HTUACinfo
+            _stream.Write(AC_CHR_COD + 1, sizeof(AC_CHR_COD) - 1);
+            _stream.Write(AC_CHR_VAL, sizeof(AC_CHR_VAL));
+            _stream.Write(head2, sizeof(head2));
+        }
+
+        bool ImageJpegSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            Init();
+            WriteHeader();
+            uint8_t* r = _buffer.data, * g = r + _width * _block,* b = g + _width * _block;
+            int dc[3] = { 0, 0, 0 };
+            for (int row = 0; row < (int)_param.height; row += _block)
+            {
+                int block = Simd::Min(row + _block, (int)_param.height) - row;
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24:
+                    _deintBgr(src, stride, _param.width, block, b, _width, g, _width, r, _width);
+                    break;
+                case SimdPixelFormatBgra32:
+                    _deintBgra(src, stride, _param.width, block, b, _width, g, _width, r, _width, NULL, 0);
+                    break;
+                case SimdPixelFormatRgb24:
+                    _deintBgr(src, stride, _param.width, block, r, _width, g, _width, b, _width);
+                    break;
+                case SimdPixelFormatRgba32:
+                    _deintBgra(src, stride, _param.width, block, r, _width, g, _width, b, _width, NULL, 0);
+                    break;
+                default: 
+                    break;
+                }
+                if(_param.format == SimdPixelFormatGray8)
+                    _writeBlock(_stream, (int)_param.width, block, src, src, src, (int)stride, _fY, _fUv, dc);
+                else
+                    _writeBlock(_stream, (int)_param.width, block, r, g, b, _width, _fY, _fUv, dc);
+                src += block * stride;
+            }
+            static const uint16_t FILL_BITS[] = { 0x7F, 7 };
+            Base::WriteBits(_stream, FILL_BITS);
+            _stream.Write8u(0xFF);
+            _stream.Write8u(0xD9);
+            return true;
+        }
+    }
+}
diff --git a/3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp b/3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp
new file mode 100644
index 0000000000..dcb8f2efbb
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdBaseImageSavePng.cpp
@@ -0,0 +1,379 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSavePng.h"
+#include "Simd/SimdBase.h"
+#include "Simd/SimdCpu.h"
+
+namespace Simd
+{
+    namespace Base
+    {
+        const uint16_t ZlibLenC[30] = { 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 259 };
+        const uint8_t  ZlibLenEb[29] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,  4,  5,  5,  5,  5,  0 };
+        const uint16_t ZlibDistC[31] = { 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 32768 };
+        const uint8_t  ZlibDistEb[30] = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13 };
+
+#if defined(SIMD_PNG_ZLIB_BIT_REV_TABLE)
+        int ZlibBitRevTable[512];
+        static bool ZlibBitRevTableInit()
+        {
+            for (int i = 0; i < 512; i++)
+            {
+                int rev = 0, val = i;
+                for (size_t b = 0; b < 9; b++)
+                {
+                    rev = (rev << 1) | (val & 1);
+                    val >>= 1;
+                }
+                ZlibBitRevTable[i] = rev;
+            }
+            return true;
+        }
+        bool ZlibBitRevTableInited = ZlibBitRevTableInit();
+
+#endif
+
+        uint32_t ZlibAdler32(uint8_t* data, int size)
+        {
+            uint32_t lo = 1, hi = 0;
+            for (int b = 0, n = (int)(size % 5552); b < size;)
+            {
+                for (int i = 0; i < n; ++i)
+                {
+                    lo += data[b + i];
+                    hi += lo;
+                }
+                lo %= 65521;
+                hi %= 65521;
+                b += n;
+                n = 5552;
+            }
+            return (hi << 16) | lo;
+        }
+
+        void ZlibCompress(uint8_t* data, int size, int quality, OutputMemoryStream& stream)
+        {
+            const int ZHASH = 16384;
+            if (quality < 5)
+                quality = 5;
+            const int basket = quality * 2;
+            Array32i hashTable(ZHASH * basket);
+            memset(hashTable.data, -1, hashTable.RawSize());
+
+            stream.Write(uint8_t(0x78));
+            stream.Write(uint8_t(0x5e));
+            stream.WriteBits(1, 1);
+            stream.WriteBits(1, 2);
+
+            int i = 0, j;
+            while (i < size - 3)
+            {
+                int h = ZlibHash(data + i) & (ZHASH - 1), best = 3;
+                uint8_t* bestLoc = 0;
+                int* hList = hashTable.data + h * basket;
+                for (j = 0; hList[j] != -1 && j < basket; ++j)
+                {
+                    if (hList[j] > i - 32768)
+                    {
+                        int d = ZlibCount(data + hList[j], data + i, size - i);
+                        if (d >= best)
+                        {
+                            best = d;
+                            bestLoc = data + hList[j];
+                        }
+                    }
+                }
+                if (j == basket)
+                {
+                    memcpy(hList, hList + quality, quality * sizeof(int));
+                    memset(hList + quality, -1, quality * sizeof(int));
+                    j = quality;
+                }
+                hList[j] = i;
+
+                if (bestLoc)
+                {
+                    h = ZlibHash(data + i + 1) & (ZHASH - 1);
+                    int* hList = hashTable.data + h * basket;
+                    for (j = 0; hList[j] != -1 && j < basket; ++j)
+                    {
+                        if (hList[j] > i - 32767)
+                        {
+                            int e = ZlibCount(data + hList[j], data + i + 1, size - i - 1);
+                            if (e > best)
+                            {
+                                bestLoc = NULL;
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                if (bestLoc)
+                {
+                    int d = (int)(data + i - bestLoc);
+                    assert(d <= 32767 && best <= 258);
+                    for (j = 0; best > Base::ZlibLenC[j + 1] - 1; ++j);
+                    Base::ZlibHuff(j + 257, stream);
+                    if (Base::ZlibLenEb[j])
+                        stream.WriteBits(best - Base::ZlibLenC[j], Base::ZlibLenEb[j]);
+                    for (j = 0; d > Base::ZlibDistC[j + 1] - 1; ++j);
+                    stream.WriteBits(Base::ZlibBitRev(j, 5), 5);
+                    if (Base::ZlibDistEb[j])
+                        stream.WriteBits(d - Base::ZlibDistC[j], Base::ZlibDistEb[j]);
+                    i += best;
+                }
+                else
+                {
+                    ZlibHuffB(data[i], stream);
+                    ++i;
+                }
+            }
+            for (; i < size; ++i)
+                ZlibHuffB(data[i], stream);
+            ZlibHuff(256, stream);
+            stream.FlushBits();
+            stream.WriteBe32u(ZlibAdler32(data, size));
+        }
+
+        uint32_t EncodeLine0(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            uint32_t sum = 0;
+            for (size_t i = 0; i < size; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine1(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            uint32_t sum = 0;
+            for (size_t i = 0; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            for (size_t i = n; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine2(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            uint32_t sum = 0;
+            for (size_t i = 0; i < n; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            for (size_t i = n; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine3(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            uint32_t sum = 0;
+            for (size_t i = 0; i < n; ++i)
+            {
+                dst[i] = src[i] - (src[i - stride] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            for (size_t i = n; i < size; ++i)
+            {
+                dst[i] = src[i] - ((src[i - n] + src[i - stride]) >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine4(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            uint32_t sum = 0;
+            for (size_t i = 0; i < n; ++i)
+            {
+                dst[i] = (int8_t)(src[i] - src[i - stride]);
+                sum += ::abs(dst[i]);
+            }
+            for (size_t i = n; i < size; ++i)
+            {
+                dst[i] = src[i] - Paeth(src[i - n], src[i - stride], src[i - stride - n]);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine5(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            uint32_t sum = 0;
+            for (size_t i = 0; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            for (size_t i = n; i < size; ++i)
+            {
+                dst[i] = src[i] - (src[i - n] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine6(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            uint32_t sum = 0;
+            for (size_t i = 0; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            for (size_t i = n; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        ImagePngSaver::ImagePngSaver(const ImageSaverParam& param)
+            : ImageSaver(param)
+            , _channels(0)
+            , _size(0)
+            , _convert(NULL)
+        {
+            switch (_param.format)
+            {
+            case SimdPixelFormatGray8:
+                _channels = 1;
+                break;
+            case SimdPixelFormatBgr24:
+                _channels = 3;
+                break;
+            case SimdPixelFormatBgra32:
+                _channels = 4;
+                break;
+            case SimdPixelFormatRgb24:
+                _channels = 3;
+                break;
+            case SimdPixelFormatRgba32:
+                _channels = 4;
+                break;
+            default: 
+                break;
+            }
+            _size = _param.width * _channels;
+            if (_param.format == SimdPixelFormatBgr24)
+            {
+                _convert = Base::BgrToRgb;
+                _buff.Resize(_param.height * _size);
+            }
+            else if (_param.format == SimdPixelFormatBgra32)
+            {
+                _convert = Base::BgraToRgba;
+                _buff.Resize(_param.height * _size);
+            }
+            _filt.Resize((_size + 1) * _param.height);
+            _line.Resize(_size * FILTERS);
+            _encode[0] = Base::EncodeLine0;
+            _encode[1] = Base::EncodeLine1;
+            _encode[2] = Base::EncodeLine2;
+            _encode[3] = Base::EncodeLine3;
+            _encode[4] = Base::EncodeLine4;
+            _encode[5] = Base::EncodeLine5;
+            _encode[6] = Base::EncodeLine6;
+            _compress = Base::ZlibCompress;
+        }
+
+        bool ImagePngSaver::ToStream(const uint8_t* src, size_t stride)
+        {
+            if (_convert)
+            {
+                _convert(src, _param.width, _param.height, stride, _buff.data, _size);
+                src = _buff.data;
+                stride = _size;
+            }
+            for (size_t row = 0; row < _param.height; ++row)
+            {
+                int bestFilter = 0, bestSum = INT_MAX;
+                for (int filter = 0; filter < FILTERS; filter++)
+                {
+                    static const int TYPES[] = { 0, 1, 0, 5, 6, 0, 1, 2, 3, 4 };
+                    int type = TYPES[filter + (row ? 1 : 0) * FILTERS];
+                    int sum = _encode[type](src + stride * row, stride, _channels, _size, _line.data + _size * filter);
+                    if (sum < bestSum)
+                    {
+                        bestSum = sum;
+                        bestFilter = filter;
+                    }
+                }
+                _filt[row * (_size + 1)] = (uint8_t)bestFilter;
+                memcpy(_filt.data + row * (_size + 1) + 1, _line.data + _size * bestFilter, _size);
+            }
+            OutputMemoryStream zlib(Min(_param.width * _param.height, Base::AlgCacheL1()));
+            _compress(_filt.data, (int)_filt.size, COMPRESSION, zlib);
+            WriteToStream(zlib.Data(), zlib.Size());
+            return true;
+        }
+
+        SIMD_INLINE void WriteCrc32(OutputMemoryStream& stream, size_t size)
+        {
+            stream.WriteBe32u(Base::Crc32(stream.Current() - size - 4, size + 4));
+        }
+
+        void ImagePngSaver::WriteToStream(const uint8_t* zlib, size_t zlen)
+        {
+            const uint8_t SIGNATURE[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+            const int8_t CTYPE[5] = { -1, 0, 4, 2, 6 };
+            _stream.Reserve(8 + 12 + 13 + 12 + zlen + 12);
+            _stream.Write(SIGNATURE, 8);
+            _stream.WriteBe32u(13);
+            _stream.Write("IHDR", 4);
+            _stream.WriteBe32u((uint32_t)_param.width);
+            _stream.WriteBe32u((uint32_t)_param.height);
+            _stream.Write8u(8);
+            _stream.Write8u(CTYPE[_channels]);
+            _stream.Write8u(0);
+            _stream.Write8u(0);
+            _stream.Write8u(0);
+            WriteCrc32(_stream, 13);
+            _stream.WriteBe32u((uint32_t)zlen);
+            _stream.Write("IDAT", 4);
+            _stream.Write(zlib, zlen);
+            WriteCrc32(_stream, zlen);
+            _stream.WriteBe32u(0);
+            _stream.Write("IEND", 4);
+            WriteCrc32(_stream, 0);
+        }
+    }
+}
diff --git a/3rdparty/simdlib/Simd/SimdImageLoad.h b/3rdparty/simdlib/Simd/SimdImageLoad.h
new file mode 100644
index 0000000000..43e44961e6
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdImageLoad.h
@@ -0,0 +1,396 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdImageLoad_h__
+#define __SimdImageLoad_h__
+
+#include "Simd/SimdMemoryStream.h"
+#include "Simd/SimdArray.h"
+#include "Simd/SimdAlignment.h"
+
+#include "Simd/SimdView.hpp"
+
+#include <vector>
+
+namespace Simd
+{
+    typedef uint8_t* (*ImageLoadFromMemoryPtr)(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format);
+
+    uint8_t* ImageLoadFromFile(const ImageLoadFromMemoryPtr loader, const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format);
+
+    //-------------------------------------------------------------------------
+
+    struct ImageLoaderParam
+    {
+        const uint8_t* data;
+        size_t size;
+        SimdImageFileType file;
+        SimdPixelFormatType format;
+
+        ImageLoaderParam(const uint8_t* d, size_t s, SimdPixelFormatType f);
+
+        bool Validate();
+    };
+
+    class ImageLoader
+    {
+    protected:
+        typedef Simd::View<Simd::Allocator> Image;
+
+        ImageLoaderParam _param;
+        InputMemoryStream _stream;
+        Image _image;
+        
+    public:
+        ImageLoader(const ImageLoaderParam& param)
+            : _param(param)
+            , _stream(_param.data, _param.size)
+        {
+        }
+
+        virtual ~ImageLoader()
+        {
+        }
+
+        virtual bool FromStream() = 0;
+
+        SIMD_INLINE uint8_t* Release(size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+        {
+            *stride = _image.stride;
+            *width = _image.width;
+            *height = _image.height;
+            *format = (SimdPixelFormatType)_image.format;
+            return _image.Release();
+        }
+    };
+
+    namespace Base
+    {
+        class ImagePxmLoader : public ImageLoader
+        {
+        public:
+            ImagePxmLoader(const ImageLoaderParam& param);
+
+        protected:
+            typedef void (*ToAnyPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride);
+            typedef void (*ToBgraPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+            ToAnyPtr _toAny;
+            ToBgraPtr _toBgra;
+            Array8u _buffer;
+            size_t _block, _size;
+
+            bool ReadHeader(size_t version);
+            virtual void SetConverters() = 0;
+        };
+
+        class ImagePgmTxtLoader : public ImagePxmLoader
+        {
+        public:
+            ImagePgmTxtLoader(const ImageLoaderParam& param);
+
+            virtual bool FromStream();
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePgmBinLoader : public ImagePxmLoader
+        {
+        public:
+            ImagePgmBinLoader(const ImageLoaderParam& param);
+
+            virtual bool FromStream();
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmTxtLoader : public ImagePxmLoader
+        {
+        public:
+            ImagePpmTxtLoader(const ImageLoaderParam& param);
+
+            virtual bool FromStream();
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmBinLoader : public ImagePxmLoader
+        {
+        public:
+            ImagePpmBinLoader(const ImageLoaderParam& param);
+
+            virtual bool FromStream();
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePngLoader : public ImageLoader
+        {
+        public:
+            ImagePngLoader(const ImageLoaderParam& param);
+
+            virtual bool FromStream();
+
+        protected:
+            typedef void (*ToAny8Ptr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride);
+            typedef void (*ToBgra8Ptr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+            typedef void (*ToAny16Ptr)(const uint16_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride);
+            typedef void (*ToBgra16Ptr)(const uint16_t* src, size_t width, size_t height, size_t srcStride, uint8_t* bgra, size_t bgraStride, uint8_t alpha);
+            ToAny8Ptr _toAny8;
+            ToBgra8Ptr _toBgra8, _bgrToBgra;
+            ToAny16Ptr _toAny16;
+            ToBgra16Ptr _toBgra16;
+
+            virtual void SetConverters();
+        private:
+            bool _first, _hasTrans, _iPhone;
+            uint32_t _width, _height, _channels;
+            uint16_t _tc16[3];
+            uint8_t _depth, _color, _interlace, _paletteChannels, _tc[3];
+            Array8u _palette, _idat;
+
+            struct Chunk
+            {
+                uint32_t size;
+                uint32_t type;
+                uint32_t offs;
+            };
+            typedef std::vector<Chunk> Chunks;
+            Chunks _idats;
+
+            bool ParseFile();
+            bool CheckHeader();
+            bool ReadChunk(Chunk& chunk);
+            bool ReadHeader(const Chunk & chunk);
+            bool ReadPalette(const Chunk& chunk);
+            bool ReadTransparency(const Chunk& chunk);
+            bool ReadData(const Chunk& chunk);
+            InputMemoryStream MergedDataStream();
+        };
+
+        class ImageJpegLoader : public ImageLoader
+        {
+        public:
+            ImageJpegLoader(const ImageLoaderParam& param);
+
+            virtual bool FromStream();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format);
+    }
+
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        class ImagePgmTxtLoader : public Base::ImagePgmTxtLoader
+        {
+        public:
+            ImagePgmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePgmBinLoader : public Base::ImagePgmBinLoader
+        {
+        public:
+            ImagePgmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmTxtLoader : public Base::ImagePpmTxtLoader
+        {
+        public:
+            ImagePpmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmBinLoader : public Base::ImagePpmBinLoader
+        {
+        public:
+            ImagePpmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePngLoader : public Base::ImagePngLoader
+        {
+        public:
+            ImagePngLoader(const ImageLoaderParam& param);
+
+            virtual bool FromStream();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format);
+    }
+#endif// SIMD_SSE41_ENABLE
+
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        class ImagePgmTxtLoader : public Sse41::ImagePgmTxtLoader
+        {
+        public:
+            ImagePgmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePgmBinLoader : public Sse41::ImagePgmBinLoader
+        {
+        public:
+            ImagePgmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmTxtLoader : public Sse41::ImagePpmTxtLoader
+        {
+        public:
+            ImagePpmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmBinLoader : public Sse41::ImagePpmBinLoader
+        {
+        public:
+            ImagePpmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format);
+    }
+#endif// SIMD_AVX2_ENABLE
+
+#ifdef SIMD_AVX512BW_ENABLE    
+    namespace Avx512bw
+    {
+        class ImagePgmTxtLoader : public Avx2::ImagePgmTxtLoader
+        {
+        public:
+            ImagePgmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePgmBinLoader : public Avx2::ImagePgmBinLoader
+        {
+        public:
+            ImagePgmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmTxtLoader : public Avx2::ImagePpmTxtLoader
+        {
+        public:
+            ImagePpmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmBinLoader : public Avx2::ImagePpmBinLoader
+        {
+        public:
+            ImagePpmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format);
+    }
+#endif// SIMD_AVX512BW_ENABLE
+
+#ifdef SIMD_NEON_ENABLE    
+    namespace Neon
+    {
+        class ImagePgmTxtLoader : public Base::ImagePgmTxtLoader
+        {
+        public:
+            ImagePgmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePgmBinLoader : public Base::ImagePgmBinLoader
+        {
+        public:
+            ImagePgmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmTxtLoader : public Base::ImagePpmTxtLoader
+        {
+        public:
+            ImagePpmTxtLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        class ImagePpmBinLoader : public Base::ImagePpmBinLoader
+        {
+        public:
+            ImagePpmBinLoader(const ImageLoaderParam& param);
+
+        protected:
+            virtual void SetConverters();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format);
+    }
+#endif// SIMD_NEON_ENABLE
+}
+
+#endif//__SimdImageLoad_h__
diff --git a/3rdparty/simdlib/Simd/SimdImageSave.h b/3rdparty/simdlib/Simd/SimdImageSave.h
new file mode 100644
index 0000000000..4e1945c077
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdImageSave.h
@@ -0,0 +1,386 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdImageSave_h__
+#define __SimdImageSave_h__
+
+#include "Simd/SimdMemoryStream.h"
+#include "Simd/SimdArray.h"
+#include "Simd/SimdPerformance.h"
+
+namespace Simd
+{
+    typedef uint8_t* (*ImageSaveToMemoryPtr)(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size);
+
+    SimdBool ImageSaveToFile(const ImageSaveToMemoryPtr saver, const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char* path);
+
+    //---------------------------------------------------------------------
+
+    struct ImageSaverParam
+    {
+        size_t width, height;
+        SimdPixelFormatType format;
+        SimdImageFileType file;
+        int quality;
+
+        SIMD_INLINE ImageSaverParam(size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality)
+        {
+            this->width = width;
+            this->height = height;
+            this->format = format;
+            this->file = file;
+            this->quality = quality;
+        }
+
+        bool Validate()
+        {
+            if (file == SimdImageFileUndefined)
+            {
+                if (format == SimdPixelFormatGray8)
+                    file = SimdImageFilePgmBin;
+                else
+                    file = SimdImageFilePpmBin;
+            }            
+            if (format < SimdPixelFormatGray8 || format > SimdPixelFormatRgba32)
+                return false;
+            if (width == 0 || height == 0)
+                return false;
+            if (file <= SimdImageFileUndefined || file > SimdImageFileJpeg)
+                return false;
+            return true;
+        }
+    };
+
+    class ImageSaver
+    {
+    protected:
+        ImageSaverParam _param;
+        OutputMemoryStream _stream;
+    public:
+        ImageSaver(const ImageSaverParam& param)
+            : _param(param)
+        {
+        }
+
+        virtual ~ImageSaver()
+        {
+        }
+
+        virtual bool ToStream(const uint8_t* src, size_t stride) = 0;
+
+        SIMD_INLINE uint8_t* Release(size_t* size)
+        {
+            return _stream.Release(size);
+        }
+    };
+       
+    namespace Base
+    {
+        class ImagePxmSaver : public ImageSaver
+        {
+        public:
+            ImagePxmSaver(const ImageSaverParam& param);
+
+        protected:
+            typedef void (*ConvertPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride);
+            ConvertPtr _convert;
+            Array8u _buffer;
+            size_t _block, _size;
+
+            void WriteHeader(size_t version);
+        };
+
+        class ImagePgmTxtSaver : public ImagePxmSaver
+        {
+        public:
+            ImagePgmTxtSaver(const ImageSaverParam& param);
+
+            virtual bool ToStream(const uint8_t* src, size_t stride);
+        };
+
+        class ImagePgmBinSaver : public ImagePxmSaver
+        {
+        public:
+            ImagePgmBinSaver(const ImageSaverParam& param);
+
+            virtual bool ToStream(const uint8_t* src, size_t stride);
+        };
+
+        class ImagePpmTxtSaver : public ImagePxmSaver
+        {
+        public:
+            ImagePpmTxtSaver(const ImageSaverParam& param);
+
+            virtual bool ToStream(const uint8_t* src, size_t stride);
+        };
+
+        class ImagePpmBinSaver : public ImagePxmSaver
+        {
+        public:
+            ImagePpmBinSaver(const ImageSaverParam& param);
+
+            virtual bool ToStream(const uint8_t* src, size_t stride);
+        };
+
+        class ImagePngSaver : public ImageSaver
+        {
+        public:
+            ImagePngSaver(const ImageSaverParam& param);
+
+            virtual bool ToStream(const uint8_t* src, size_t stride);
+        protected:
+            static const int COMPRESSION = 8;
+            static const int FILTERS = 5;
+            static const int TYPES = 7;
+            typedef void (*ConvertPtr)(const uint8_t* src, size_t width, size_t height, size_t srcStride, uint8_t* dst, size_t dstStride);
+            typedef uint32_t (*EncodePtr)(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst);
+            typedef void (*CompressPtr)(uint8_t* data, int size, int quality, OutputMemoryStream& stream);
+            ConvertPtr _convert;
+            EncodePtr _encode[TYPES];
+            CompressPtr _compress;
+            size_t _channels, _size;
+            Array8u _filt, _buff;
+            Array8i _line;
+
+            void WriteToStream(const uint8_t* zlib, size_t zlen);
+        };
+
+        class ImageJpegSaver : public ImageSaver
+        {
+        public:
+            ImageJpegSaver(const ImageSaverParam& param);
+
+            virtual bool ToStream(const uint8_t* src, size_t stride);
+        protected:
+            typedef void (*DeintBgrPtr)(const uint8_t* bgr, size_t bgrStride, size_t width, size_t height,
+                uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride);
+            typedef void (*DeintBgraPtr)(const uint8_t* bgra, size_t bgraStride, size_t width, size_t height,
+                uint8_t* b, size_t bStride, uint8_t* g, size_t gStride, uint8_t* r, size_t rStride, uint8_t* a, size_t aStride);
+            typedef void (*WriteBlockPtr)(OutputMemoryStream& stream, int width, int height, const uint8_t* red,
+                const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3]);
+
+            Array8u _buffer;
+            DeintBgrPtr _deintBgr;
+            DeintBgraPtr _deintBgra;
+            WriteBlockPtr _writeBlock;
+            bool _subSample;
+            int _quality, _block, _width;
+            float _fY[64], _fUv[64];
+            uint8_t _uY[64], _uUv[64];
+
+            virtual void Init();
+
+            void InitParams(bool trans);
+            void WriteHeader();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size);
+    }
+
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        class ImagePgmTxtSaver : public Base::ImagePgmTxtSaver
+        {
+        public:
+            ImagePgmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePgmBinSaver : public Base::ImagePgmBinSaver
+        {
+        public:
+            ImagePgmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmTxtSaver : public Base::ImagePpmTxtSaver
+        {
+        public:
+            ImagePpmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmBinSaver : public Base::ImagePpmBinSaver
+        {
+        public:
+            ImagePpmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePngSaver : public Base::ImagePngSaver
+        {
+        public:
+            ImagePngSaver(const ImageSaverParam& param);
+        };
+
+        class ImageJpegSaver : public Base::ImageJpegSaver
+        {
+        public:
+            ImageJpegSaver(const ImageSaverParam& param);
+
+        protected:
+            virtual void Init();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size);
+    }
+#endif// SIMD_SSE41_ENABLE
+
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        class ImagePgmTxtSaver : public Sse41::ImagePgmTxtSaver
+        {
+        public:
+            ImagePgmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePgmBinSaver : public Sse41::ImagePgmBinSaver
+        {
+        public:
+            ImagePgmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmTxtSaver : public Sse41::ImagePpmTxtSaver
+        {
+        public:
+            ImagePpmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmBinSaver : public Sse41::ImagePpmBinSaver
+        {
+        public:
+            ImagePpmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePngSaver : public Sse41::ImagePngSaver
+        {
+        public:
+            ImagePngSaver(const ImageSaverParam& param);
+        };
+
+        class ImageJpegSaver : public Sse41::ImageJpegSaver
+        {
+        public:
+            ImageJpegSaver(const ImageSaverParam& param);
+
+        protected:
+            virtual void Init();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size);
+    }
+#endif// SIMD_AVX2_ENABLE
+
+#ifdef SIMD_AVX512BW_ENABLE    
+    namespace Avx512bw
+    {
+        class ImagePgmTxtSaver : public Avx2::ImagePgmTxtSaver
+        {
+        public:
+            ImagePgmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePgmBinSaver : public Avx2::ImagePgmBinSaver
+        {
+        public:
+            ImagePgmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmTxtSaver : public Avx2::ImagePpmTxtSaver
+        {
+        public:
+            ImagePpmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmBinSaver : public Avx2::ImagePpmBinSaver
+        {
+        public:
+            ImagePpmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePngSaver : public Avx2::ImagePngSaver
+        {
+        public:
+            ImagePngSaver(const ImageSaverParam& param);
+        };
+
+        class ImageJpegSaver : public Avx2::ImageJpegSaver
+        {
+        public:
+            ImageJpegSaver(const ImageSaverParam& param);
+
+        protected:
+            virtual void Init();
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size);
+    }
+#endif// SIMD_AVX512BW_ENABLE
+
+#ifdef SIMD_NEON_ENABLE    
+    namespace Neon
+    {
+        class ImagePgmTxtSaver : public Base::ImagePgmTxtSaver
+        {
+        public:
+            ImagePgmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePgmBinSaver : public Base::ImagePgmBinSaver
+        {
+        public:
+            ImagePgmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmTxtSaver : public Base::ImagePpmTxtSaver
+        {
+        public:
+            ImagePpmTxtSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePpmBinSaver : public Base::ImagePpmBinSaver
+        {
+        public:
+            ImagePpmBinSaver(const ImageSaverParam& param);
+        };
+
+        class ImagePngSaver : public Base::ImagePngSaver
+        {
+        public:
+            ImagePngSaver(const ImageSaverParam& param);
+        };
+
+        //---------------------------------------------------------------------
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size);
+    }
+#endif// SIMD_NEON_ENABLE
+}
+
+#endif//__SimdImageSave_h__
diff --git a/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h
new file mode 100644
index 0000000000..d54164f7d4
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h
@@ -0,0 +1,649 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdImageSaveJpeg_h__
+#define __SimdImageSaveJpeg_h__
+
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdMath.h"
+
+#define SIMD_JPEG_CALC_BITS_TABLE
+
+namespace Simd
+{
+    namespace Base
+    {
+        struct BitBuf
+        {
+            static const uint32_t capacity = 1024;
+            uint32_t size;
+            uint16_t data[1024][2];
+
+            SIMD_INLINE BitBuf()
+                : size(0) 
+            {
+            }
+
+            SIMD_INLINE void Push(const uint16_t* bits)
+            {
+                ((uint32_t*)data)[size++] = ((uint32_t*)bits)[0];
+            }
+
+            SIMD_INLINE bool Full(uint32_t tail = capacity / 2) const
+            {
+                return size + tail >= capacity;
+            }
+
+            SIMD_INLINE uint32_t Capacity() const 
+            {
+                return capacity;
+            }
+
+            SIMD_INLINE void Clear()
+            {
+                size = 0;
+            }
+        }; 
+
+        extern const uint8_t JpegZigZagD[64];
+        extern const uint8_t JpegZigZagT[64];
+
+        extern const uint16_t HuffmanYdc[256][2];
+        extern const uint16_t HuffmanUVdc[256][2];
+        extern const uint16_t HuffmanYac[256][2];
+        extern const uint16_t HuffmanUVac[256][2];
+
+#if defined(SIMD_JPEG_CALC_BITS_TABLE)
+        const int JpegCalcBitsRange = 2048;
+        extern uint16_t JpegCalcBitsTable[JpegCalcBitsRange * 2][2];
+        SIMD_INLINE void JpegCalcBits(int val, uint16_t bits[2])
+        {
+            assert(val >= -JpegCalcBitsRange && val < JpegCalcBitsRange);
+            ((uint32_t*)bits)[0] = ((uint32_t*)JpegCalcBitsTable)[val + JpegCalcBitsRange];
+        }
+#else
+        SIMD_INLINE void JpegCalcBits(int val, uint16_t bits[2])
+        {
+            int tmp = val < 0 ? -val : val;
+            val = val < 0 ? val - 1 : val;
+            bits[1] = 1;
+            while (tmp >>= 1)
+                ++bits[1];
+            bits[0] = val & ((1 << bits[1]) - 1);
+        }
+#endif
+
+        SIMD_INLINE void RgbToYuv(const uint8_t* r, const uint8_t* g, const uint8_t* b, int stride, int height, int width, float* y, float* u, float* v, int size)
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 1)
+                {
+                    int offs = (col < width ? col : width - 1);
+                    float _r = r[offs], _g = g[offs], _b = b[offs];
+                    y[col] = +0.29900f * _r + 0.58700f * _g + 0.11400f * _b - 128.000f;
+                    u[col] = -0.16874f * _r - 0.33126f * _g + 0.50000f * _b;
+                    v[col] = +0.50000f * _r - 0.41869f * _g - 0.08131f * _b;
+                }
+                if (++row < height)
+                    r += stride, g += stride, b += stride;
+                y += size, u += size, v += size;
+            }
+        }
+
+        SIMD_INLINE void GrayToY(const uint8_t* g, int stride, int height, int width, float* y, int size)
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 1)
+                {
+                    int offs = (col < width ? col : width - 1);
+                    y[col] = g[offs] - 128.000f;
+                }
+                if (++row < height)
+                    g += stride;
+                y += size;
+            }
+        }
+
+        SIMD_INLINE void JpegProcessDuGrayUv(BitBuf & bitBuf)
+        {
+            bitBuf.Push(Base::HuffmanUVdc[0]);
+            bitBuf.Push(Base::HuffmanUVac[0]);
+            bitBuf.Push(Base::HuffmanUVdc[0]);
+            bitBuf.Push(Base::HuffmanUVac[0]);
+        }
+
+        SIMD_INLINE void WriteBits(OutputMemoryStream & stream, const uint16_t bits[2])
+        {
+            stream.BitCount() += bits[1];
+#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+            stream.BitBuffer() |= uint64_t(bits[0]) << (64 - stream.BitCount());
+            while (stream.BitCount() >= 8)
+            {
+                uint8_t byte = stream.BitBuffer() >> 56;
+                stream.Write8u(byte);
+                if (byte == 255)
+                    stream.Write8u(0);
+                stream.BitBuffer() <<= 8;
+                stream.BitCount() -= 8;
+            }
+#else
+            stream.BitBuffer() |= uint32_t(bits[0]) << (32 - stream.BitCount());
+            while (stream.BitCount() >= 8)
+            {
+                uint8_t byte = stream.BitBuffer() >> 24;
+                stream.Write8u(byte);
+                if (byte == 255)
+                    stream.Write8u(0);
+                stream.BitBuffer() <<= 8;
+                stream.BitCount() -= 8;
+            }
+#endif
+        }
+
+        SIMD_INLINE void WriteBits(OutputMemoryStream& stream, const uint16_t bits[][2], size_t size)
+        {
+            size_t pos = stream.Pos();
+            stream.Reserve(pos + size * 2);
+            uint8_t* data = stream.Data();
+            size_t & bitCount = stream.BitCount();
+            size_t i = 0;
+#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+            uint64_t &bitBuffer = stream.BitBuffer();
+            for (size_t size3 = AlignLoAny(size, 3); i < size3; i += 3, bits += 3)
+            {
+                bitCount += bits[0][1];
+                bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount);
+                bitCount += bits[1][1];
+                bitBuffer |= uint64_t(bits[1][0]) << (64 - bitCount);
+                bitCount += bits[2][1];
+                bitBuffer |= uint64_t(bits[2][0]) << (64 - bitCount);
+                assert(bitCount <= 64);
+                while (bitCount >= 16)
+                {
+                    uint8_t byte = uint8_t(bitBuffer >> 56);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    byte = uint8_t(bitBuffer >> 48);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    bitBuffer <<= 16;
+                    bitCount -= 16;
+                }
+            }
+            if(bitCount >= 8)
+            {
+                assert(bitCount < 16);
+                uint8_t byte = uint8_t(bitBuffer >> 56);
+                data[pos++] = byte;
+                if (byte == 255)
+                    data[pos++] = 0;
+                bitBuffer <<= 8;
+                bitCount -= 8;
+            }
+            for (; i < size; ++i, ++bits)
+            {
+                bitCount += bits[0][1];
+                bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount);
+                while (bitCount >= 8)
+                {
+                    uint8_t byte = uint8_t(bitBuffer >> 56);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    bitBuffer <<= 8;
+                    bitCount -= 8;
+                }
+            }
+#else
+            uint32_t &bitBuffer = stream.BitBuffer();
+            for (; i < size; ++i, ++bits)
+            {
+                bitCount += bits[0][1];
+                bitBuffer |= uint32_t(bits[0][0]) << (32 - bitCount);
+                while (bitCount >= 8)
+                {
+                    uint8_t byte = uint8_t(bitBuffer >> 24);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    bitBuffer <<= 8;
+                    bitCount -= 8;
+                }
+            }
+#endif
+            stream.Seek(pos);
+        }
+    }
+
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+    }
+#endif// SIMD_SSE41_ENABLE
+
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        extern const uint32_t JpegZigZagTi32[64];
+
+        SIMD_INLINE void JpegDctV(const float* src, size_t srcStride, float* dst, size_t dstStride)
+        {
+            static const __m256 _0_707106781 = _mm256_set1_ps(0.707106781f);
+            static const __m256 _0_382683433 = _mm256_set1_ps(0.382683433f);
+            static const __m256 _0_541196100 = _mm256_set1_ps(0.541196100f);
+            static const __m256 _1_306562965 = _mm256_set1_ps(1.306562965f);
+
+            __m256 d0 = _mm256_loadu_ps(src + 0 * srcStride);
+            __m256 d1 = _mm256_loadu_ps(src + 1 * srcStride);
+            __m256 d2 = _mm256_loadu_ps(src + 2 * srcStride);
+            __m256 d3 = _mm256_loadu_ps(src + 3 * srcStride);
+            __m256 d4 = _mm256_loadu_ps(src + 4 * srcStride);
+            __m256 d5 = _mm256_loadu_ps(src + 5 * srcStride);
+            __m256 d6 = _mm256_loadu_ps(src + 6 * srcStride);
+            __m256 d7 = _mm256_loadu_ps(src + 7 * srcStride);
+
+            __m256 tmp0 = _mm256_add_ps(d0, d7);
+            __m256 tmp7 = _mm256_sub_ps(d0, d7);
+            __m256 tmp1 = _mm256_add_ps(d1, d6);
+            __m256 tmp6 = _mm256_sub_ps(d1, d6);
+            __m256 tmp2 = _mm256_add_ps(d2, d5);
+            __m256 tmp5 = _mm256_sub_ps(d2, d5);
+            __m256 tmp3 = _mm256_add_ps(d3, d4);
+            __m256 tmp4 = _mm256_sub_ps(d3, d4);
+
+            __m256 tmp10 = _mm256_add_ps(tmp0, tmp3);
+            __m256 tmp13 = _mm256_sub_ps(tmp0, tmp3);
+            __m256 tmp11 = _mm256_add_ps(tmp1, tmp2);
+            __m256 tmp12 = _mm256_sub_ps(tmp1, tmp2);
+
+            d0 = _mm256_add_ps(tmp10, tmp11);
+            d4 = _mm256_sub_ps(tmp10, tmp11);
+
+            __m256 z1 = _mm256_mul_ps(_mm256_add_ps(tmp12, tmp13), _0_707106781);
+            d2 = _mm256_add_ps(tmp13, z1);
+            d6 = _mm256_sub_ps(tmp13, z1);
+
+            tmp10 = _mm256_add_ps(tmp4, tmp5);
+            tmp11 = _mm256_add_ps(tmp5, tmp6);
+            tmp12 = _mm256_add_ps(tmp6, tmp7);
+
+            __m256 z5 = _mm256_mul_ps(_mm256_sub_ps(tmp10, tmp12), _0_382683433);
+            __m256 z2 = _mm256_add_ps(_mm256_mul_ps(tmp10, _0_541196100), z5);
+            __m256 z4 = _mm256_add_ps(_mm256_mul_ps(tmp12, _1_306562965), z5);
+            __m256 z3 = _mm256_mul_ps(tmp11, _0_707106781);
+
+            __m256 z11 = _mm256_add_ps(tmp7, z3);
+            __m256 z13 = _mm256_sub_ps(tmp7, z3);
+
+            _mm256_storeu_ps(dst + 0 * dstStride, d0);
+            _mm256_storeu_ps(dst + 1 * dstStride, _mm256_add_ps(z11, z4));
+            _mm256_storeu_ps(dst + 2 * dstStride, d2);
+            _mm256_storeu_ps(dst + 3 * dstStride, _mm256_sub_ps(z13, z2));
+            _mm256_storeu_ps(dst + 4 * dstStride, d4);
+            _mm256_storeu_ps(dst + 5 * dstStride, _mm256_add_ps(z13, z2));
+            _mm256_storeu_ps(dst + 6 * dstStride, d6);
+            _mm256_storeu_ps(dst + 7 * dstStride, _mm256_sub_ps(z11, z4));
+        }
+
+        SIMD_INLINE void JpegDct(const float* src, size_t stride, const float* fdt, int* dst)
+        {
+            static const __m256 _0_707106781 = _mm256_set1_ps(0.707106781f);
+            static const __m256 _0_382683433 = _mm256_set1_ps(0.382683433f);
+            static const __m256 _0_541196100 = _mm256_set1_ps(0.541196100f);
+            static const __m256 _1_306562965 = _mm256_set1_ps(1.306562965f);
+
+            __m256 d0 = _mm256_loadu_ps(src + 0 * stride);
+            __m256 d1 = _mm256_loadu_ps(src + 1 * stride);
+            __m256 d2 = _mm256_loadu_ps(src + 2 * stride);
+            __m256 d3 = _mm256_loadu_ps(src + 3 * stride);
+            __m256 d4 = _mm256_loadu_ps(src + 4 * stride);
+            __m256 d5 = _mm256_loadu_ps(src + 5 * stride);
+            __m256 d6 = _mm256_loadu_ps(src + 6 * stride);
+            __m256 d7 = _mm256_loadu_ps(src + 7 * stride);
+
+            __m256 tmp0 = _mm256_add_ps(d0, d7);
+            __m256 tmp7 = _mm256_sub_ps(d0, d7);
+            __m256 tmp1 = _mm256_add_ps(d1, d6);
+            __m256 tmp6 = _mm256_sub_ps(d1, d6);
+            __m256 tmp2 = _mm256_add_ps(d2, d5);
+            __m256 tmp5 = _mm256_sub_ps(d2, d5);
+            __m256 tmp3 = _mm256_add_ps(d3, d4);
+            __m256 tmp4 = _mm256_sub_ps(d3, d4);
+
+            __m256 tmp10 = _mm256_add_ps(tmp0, tmp3);
+            __m256 tmp13 = _mm256_sub_ps(tmp0, tmp3);
+            __m256 tmp11 = _mm256_add_ps(tmp1, tmp2);
+            __m256 tmp12 = _mm256_sub_ps(tmp1, tmp2);
+
+            d0 = _mm256_add_ps(tmp10, tmp11);
+            d4 = _mm256_sub_ps(tmp10, tmp11);
+
+            __m256 z1 = _mm256_mul_ps(_mm256_add_ps(tmp12, tmp13), _0_707106781);
+            d2 = _mm256_add_ps(tmp13, z1);
+            d6 = _mm256_sub_ps(tmp13, z1);
+
+            tmp10 = _mm256_add_ps(tmp4, tmp5);
+            tmp11 = _mm256_add_ps(tmp5, tmp6);
+            tmp12 = _mm256_add_ps(tmp6, tmp7);
+
+            __m256 z5 = _mm256_mul_ps(_mm256_sub_ps(tmp10, tmp12), _0_382683433);
+            __m256 z2 = _mm256_add_ps(_mm256_mul_ps(tmp10, _0_541196100), z5);
+            __m256 z4 = _mm256_add_ps(_mm256_mul_ps(tmp12, _1_306562965), z5);
+            __m256 z3 = _mm256_mul_ps(tmp11, _0_707106781);
+
+            __m256 z11 = _mm256_add_ps(tmp7, z3);
+            __m256 z13 = _mm256_sub_ps(tmp7, z3);
+
+            d1 = _mm256_add_ps(z11, z4);
+            d3 = _mm256_sub_ps(z13, z2);
+            d5 = _mm256_add_ps(z13, z2);
+            d7 = _mm256_sub_ps(z11, z4);
+
+            tmp10 = _mm256_permute2f128_ps(d0, d4, 0x20);
+            tmp11 = _mm256_permute2f128_ps(d1, d5, 0x20);
+            tmp12 = _mm256_permute2f128_ps(d2, d6, 0x20);
+            tmp13 = _mm256_permute2f128_ps(d3, d7, 0x20);
+            d4 = _mm256_permute2f128_ps(d0, d4, 0x31);
+            d5 = _mm256_permute2f128_ps(d1, d5, 0x31);
+            d6 = _mm256_permute2f128_ps(d2, d6, 0x31);
+            d7 = _mm256_permute2f128_ps(d3, d7, 0x31);
+
+            tmp0 = _mm256_unpacklo_ps(tmp10, tmp12);
+            tmp1 = _mm256_unpackhi_ps(tmp10, tmp12);
+            tmp2 = _mm256_unpacklo_ps(tmp11, tmp13);
+            tmp3 = _mm256_unpackhi_ps(tmp11, tmp13);
+            d0 = _mm256_unpacklo_ps(tmp0, tmp2);
+            d1 = _mm256_unpackhi_ps(tmp0, tmp2);
+            d2 = _mm256_unpacklo_ps(tmp1, tmp3);
+            d3 = _mm256_unpackhi_ps(tmp1, tmp3);
+
+            tmp0 = _mm256_unpacklo_ps(d4, d6);
+            tmp1 = _mm256_unpackhi_ps(d4, d6);
+            tmp2 = _mm256_unpacklo_ps(d5, d7);
+            tmp3 = _mm256_unpackhi_ps(d5, d7);
+            d4 = _mm256_unpacklo_ps(tmp0, tmp2);
+            d5 = _mm256_unpackhi_ps(tmp0, tmp2);
+            d6 = _mm256_unpacklo_ps(tmp1, tmp3);
+            d7 = _mm256_unpackhi_ps(tmp1, tmp3);
+
+            tmp0 = _mm256_add_ps(d0, d7);
+            tmp1 = _mm256_add_ps(d1, d6);
+            tmp2 = _mm256_add_ps(d2, d5);
+            tmp3 = _mm256_add_ps(d3, d4);
+            tmp7 = _mm256_sub_ps(d0, d7);
+            tmp6 = _mm256_sub_ps(d1, d6);
+            tmp5 = _mm256_sub_ps(d2, d5);
+            tmp4 = _mm256_sub_ps(d3, d4);
+
+            tmp10 = _mm256_add_ps(tmp0, tmp3);
+            tmp13 = _mm256_sub_ps(tmp0, tmp3);
+            tmp11 = _mm256_add_ps(tmp1, tmp2);
+            tmp12 = _mm256_sub_ps(tmp1, tmp2);
+
+            d0 = _mm256_add_ps(tmp10, tmp11);
+            d4 = _mm256_sub_ps(tmp10, tmp11);
+
+            z1 = _mm256_mul_ps(_mm256_add_ps(tmp12, tmp13), _0_707106781);
+            d2 = _mm256_add_ps(tmp13, z1);
+            d6 = _mm256_sub_ps(tmp13, z1);
+
+            tmp10 = _mm256_add_ps(tmp4, tmp5);
+            tmp11 = _mm256_add_ps(tmp5, tmp6);
+            tmp12 = _mm256_add_ps(tmp6, tmp7);
+
+            z5 = _mm256_mul_ps(_mm256_sub_ps(tmp10, tmp12), _0_382683433);
+            z2 = _mm256_add_ps(_mm256_mul_ps(tmp10, _0_541196100), z5);
+            z4 = _mm256_add_ps(_mm256_mul_ps(tmp12, _1_306562965), z5);
+            z3 = _mm256_mul_ps(tmp11, _0_707106781);
+
+            z11 = _mm256_add_ps(tmp7, z3);
+            z13 = _mm256_sub_ps(tmp7, z3);
+
+            d1 = _mm256_add_ps(z11, z4);
+            d3 = _mm256_sub_ps(z13, z2);
+            d5 = _mm256_add_ps(z13, z2);
+            d7 = _mm256_sub_ps(z11, z4);
+
+            _mm256_storeu_si256((__m256i*)dst + 0, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 0), d0)));
+            _mm256_storeu_si256((__m256i*)dst + 1, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 1), d1)));
+            _mm256_storeu_si256((__m256i*)dst + 2, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 2), d2)));
+            _mm256_storeu_si256((__m256i*)dst + 3, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 3), d3)));
+            _mm256_storeu_si256((__m256i*)dst + 4, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 4), d4)));
+            _mm256_storeu_si256((__m256i*)dst + 5, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 5), d5)));
+            _mm256_storeu_si256((__m256i*)dst + 6, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 6), d6)));
+            _mm256_storeu_si256((__m256i*)dst + 7, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(fdt + F * 7), d7)));
+        }
+
+        const __m256i K32_PERM_LD = SIMD_MM256_SETR_EPI32(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1);
+
+        const __m256i K8_SHFL_VS = SIMD_MM256_SETR_EPI8(
+            0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1,
+            0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1);
+
+        const __m256i K8_SHFL_SH = SIMD_MM256_SETR_EPI8(
+            0x2, 0x3, -1, -1, 0x6, 0x7, -1, -1, 0xA, 0xB, -1, -1, -1, -1, -1, -1,
+            0x2, 0x3, -1, -1, 0x6, 0x7, -1, -1, 0xA, 0xB, -1, -1, -1, -1, -1, -1);
+
+        const __m256i K32_32 = SIMD_MM256_SET1_EPI32(32);
+
+#if defined(SIMD_X64_ENABLE)
+        SIMD_INLINE void WriteBits(uint8_t* data, size_t & pos, uint64_t & bitBuffer, size_t &bitCount, uint64_t shift, uint64_t value, uint64_t mask)
+        {
+            bitCount += shift;
+            assert(bitCount <= 64);
+            bitBuffer |= _pext_u64(value, mask) << (64 - bitCount);
+            while (bitCount >= 16)
+            {
+                uint8_t byte = uint8_t(bitBuffer >> 56);
+                data[pos++] = byte;
+                if (byte == 255)
+                    data[pos++] = 0;
+                byte = uint8_t(bitBuffer >> 48);
+                data[pos++] = byte;
+                if (byte == 255)
+                    data[pos++] = 0;
+                bitBuffer <<= 16;
+                bitCount -= 16;
+            }
+        }
+#endif
+
+        SIMD_INLINE void WriteBits(OutputMemoryStream& stream, const uint16_t bits[][2], size_t size)
+        {
+            size_t pos = stream.Pos();
+            stream.Reserve(pos + size * 2);
+            uint8_t* data = stream.Data();
+            size_t& bitCount = stream.BitCount();
+            size_t i = 0;
+#if defined(SIMD_X64_ENABLE)
+            uint64_t &bitBuffer = stream.BitBuffer();
+            size_t size12 = AlignLoAny(size, 12);
+            for (; i < size12; i += 12, bits += 12)
+            {
+                __m256i b0 = _mm256_permutevar8x32_epi32(_mm256_loadu_si256((__m256i*)(bits + 0)), K32_PERM_LD);
+                __m256i b1 = _mm256_permutevar8x32_epi32(_mm256_loadu_si256((__m256i*)(bits + 6)), K32_PERM_LD);
+                __m256i vs0 = _mm256_shuffle_epi8(b0, K8_SHFL_VS);
+                __m256i vs1 = _mm256_shuffle_epi8(b1, K8_SHFL_VS);
+                __m256i vv = Shuffle64i<0x0>(vs0, vs1);
+                __m256i ss = Shuffle64i<0xF>(vs0, vs1);
+                SIMD_ALIGNED(32) uint64_t value[4], mask[4], shift[4];
+                _mm256_storeu_si256((__m256i*)value, vv);
+                _mm256_storeu_si256((__m256i*)shift, _mm256_sad_epu8(ss, K_ZERO));
+                __m256i s0 = _mm256_sub_epi32(K32_32, _mm256_shuffle_epi8(b0, K8_SHFL_SH));
+                __m256i m0 = _mm256_srlv_epi32(K_INV_ZERO, s0);
+                __m256i s1 = _mm256_sub_epi32(K32_32, _mm256_shuffle_epi8(b1, K8_SHFL_SH));
+                __m256i m1 = _mm256_srlv_epi32(K_INV_ZERO, s1);
+                __m256i ms0 = _mm256_shuffle_epi8(m0, K8_SHFL_VS);
+                __m256i ms1 = _mm256_shuffle_epi8(m1, K8_SHFL_VS);
+                _mm256_storeu_si256((__m256i*)mask, Shuffle64i<0x0>(ms0, ms1));
+                WriteBits(data, pos, bitBuffer, bitCount, shift[0], value[0], mask[0]);
+                WriteBits(data, pos, bitBuffer, bitCount, shift[2], value[2], mask[2]);
+                WriteBits(data, pos, bitBuffer, bitCount, shift[1], value[1], mask[1]);
+                WriteBits(data, pos, bitBuffer, bitCount, shift[3], value[3], mask[3]);
+            }
+            if (bitCount >= 8)
+            {
+                assert(bitCount < 16);
+                uint8_t byte = uint8_t(bitBuffer >> 56);
+                data[pos++] = byte;
+                if (byte == 255)
+                    data[pos++] = 0;
+                bitBuffer <<= 8;
+                bitCount -= 8;
+            }
+            for (; i < size; ++i, ++bits)
+            {
+                bitCount += bits[0][1];
+                bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount);
+                while (bitCount >= 8)
+                {
+                    uint8_t byte = uint8_t(bitBuffer >> 56);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    bitBuffer <<= 8;
+                    bitCount -= 8;
+                }
+            }
+#else
+            uint32_t& bitBuffer = stream.BitBuffer();
+            for (; i < size; ++i, ++bits)
+            {
+                bitCount += bits[0][1];
+                bitBuffer |= uint32_t(bits[0][0]) << (32 - bitCount);
+                while (bitCount >= 8)
+                {
+                    uint8_t byte = uint8_t(bitBuffer >> 24);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    bitBuffer <<= 8;
+                    bitCount -= 8;
+                }
+            }
+#endif
+            stream.Seek(pos);
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+
+#ifdef SIMD_AVX512BW_ENABLE    
+    namespace Avx512bw
+    {
+        const __m512i K32_PERM_LD = SIMD_MM512_SETR_EPI32(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
+
+        const __m512i K8_SHFL_VS = SIMD_MM512_SETR_EPI8(
+            0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1,
+            0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1,
+            0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1,
+            0x8, 0x9, 0x4, 0x5, 0x0, 0x1, -1, -1, 0xA, 0xB, 0x6, 0x7, 0x2, 0x3, -1, -1);
+
+        SIMD_INLINE void WriteBits(OutputMemoryStream& stream, const uint16_t bits[][2], size_t size)
+        {
+            size_t pos = stream.Pos();
+            stream.Reserve(pos + size * 2);
+            uint8_t* data = stream.Data();
+            size_t& bitCount = stream.BitCount();
+            size_t i = 0;
+#if defined(SIMD_X64_ENABLE)
+            uint64_t &bitBuffer = stream.BitBuffer();
+            size_t size24 = AlignLoAny(size, 24);
+            for (; i < size24; i += 24, bits += 24)
+            {
+                __m512i b0 = _mm512_permutexvar_epi32(K32_PERM_LD, _mm512_loadu_si512((__m512i*)(bits + 00)));
+                __m512i b1 = _mm512_permutexvar_epi32(K32_PERM_LD, _mm512_loadu_si512((__m512i*)(bits + 12)));
+                __m512i vs0 = _mm512_shuffle_epi8(b0, K8_SHFL_VS);
+                __m512i vs1 = _mm512_shuffle_epi8(b1, K8_SHFL_VS);
+                __m512i vv = Shuffle64i<0x00>(vs0, vs1);
+                __m512i ss = Shuffle64i<0xFF>(vs0, vs1);
+                SIMD_ALIGNED(64) uint64_t value[8], mask[8], shift[8];
+                _mm512_storeu_si512((__m512i*)value, vv);
+                _mm512_storeu_si512((__m512i*)shift, _mm512_sad_epu8(ss, K_ZERO));
+                _mm512_storeu_si512((__m512i*)mask, _mm512_srlv_epi16(K_INV_ZERO, _mm512_sub_epi16(K16_0010, ss)));
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[0], value[0], mask[0]);
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[2], value[2], mask[2]);
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[4], value[4], mask[4]);
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[6], value[6], mask[6]);
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[1], value[1], mask[1]);
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[3], value[3], mask[3]);
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[5], value[5], mask[5]);
+                Avx2::WriteBits(data, pos, bitBuffer, bitCount, shift[7], value[7], mask[7]);
+            }
+            if (bitCount >= 8)
+            {
+                assert(bitCount < 16);
+                uint8_t byte = uint8_t(bitBuffer >> 56);
+                data[pos++] = byte;
+                if (byte == 255)
+                    data[pos++] = 0;
+                bitBuffer <<= 8;
+                bitCount -= 8;
+            }
+            for (; i < size; ++i, ++bits)
+            {
+                bitCount += bits[0][1];
+                bitBuffer |= uint64_t(bits[0][0]) << (64 - bitCount);
+                while (bitCount >= 8)
+                {
+                    uint8_t byte = uint8_t(bitBuffer >> 56);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    bitBuffer <<= 8;
+                    bitCount -= 8;
+                }
+            }
+#else
+            uint32_t& bitBuffer = stream.BitBuffer();
+            for (; i < size; ++i, ++bits)
+            {
+                bitCount += bits[0][1];
+                bitBuffer |= uint32_t(bits[0][0]) << (32 - bitCount);
+                while (bitCount >= 8)
+                {
+                    uint8_t byte = uint8_t(bitBuffer >> 24);
+                    data[pos++] = byte;
+                    if (byte == 255)
+                        data[pos++] = 0;
+                    bitBuffer <<= 8;
+                    bitCount -= 8;
+                }
+            }
+#endif
+            stream.Seek(pos);
+        }
+    }
+#endif// SIMD_AVX512BW_ENABLE
+
+#ifdef SIMD_NEON_ENABLE    
+    namespace Neon
+    {
+    }
+#endif// SIMD_NEON_ENABLE
+}
+
+#endif//__SimdImageSaveJpeg_h__
diff --git a/3rdparty/simdlib/Simd/SimdImageSavePng.h b/3rdparty/simdlib/Simd/SimdImageSavePng.h
new file mode 100644
index 0000000000..71efd1ca60
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdImageSavePng.h
@@ -0,0 +1,235 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdImageSavePng_h__
+#define __SimdImageSavePng_h__
+
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdLoad.h"
+
+#define SIMD_PNG_ZLIB_BIT_REV_TABLE
+
+namespace Simd
+{
+    namespace Base
+    {
+        extern const uint16_t ZlibLenC[30];
+        extern const uint8_t  ZlibLenEb[29];
+        extern const uint16_t ZlibDistC[31];
+        extern const uint8_t  ZlibDistEb[30];
+
+#if defined(SIMD_PNG_ZLIB_BIT_REV_TABLE)
+        const int ZlibBitRevShift = 9;
+        const int ZlibBitRevSize = 1 << ZlibBitRevShift;
+        extern int ZlibBitRevTable[ZlibBitRevSize];
+        SIMD_INLINE int ZlibBitRev(int bits, int count)
+        {
+            assert(bits < ZlibBitRevSize&& count <= ZlibBitRevShift);
+            return ZlibBitRevTable[bits] >> (ZlibBitRevShift - count);
+        }
+#else
+        SIMD_INLINE int ZlibBitRev(int bits, int count)
+        {
+            int rev = 0;
+            for (size_t b = 0; b < count; b++)
+            {
+                rev = (rev << 1) | (bits & 1);
+                bits >>= 1;
+            }
+            return rev;
+        }
+#endif
+
+        SIMD_INLINE uint32_t ZlibHash(const uint8_t* data)
+        {
+            uint32_t hash = data[0] + (data[1] << 8) + (data[2] << 16);
+            hash ^= hash << 3;
+            hash += hash >> 5;
+            hash ^= hash << 4;
+            hash += hash >> 17;
+            hash ^= hash << 25;
+            hash += hash >> 6;
+            return hash;
+        }
+
+        SIMD_INLINE void ZlibHuffA(int bits, int count, OutputMemoryStream& stream)
+        {
+            stream.WriteBits(ZlibBitRev(bits, count), count);
+        }
+
+        SIMD_INLINE void ZlibHuff1(int bits, OutputMemoryStream& stream)
+        {
+            ZlibHuffA(0x30 + bits, 8, stream);
+        }
+
+        SIMD_INLINE void ZlibHuff2(int bits, OutputMemoryStream& stream)
+        {
+            ZlibHuffA(0x190 + bits - 144, 9, stream);
+        }
+
+        SIMD_INLINE void ZlibHuff3(int bits, OutputMemoryStream& stream)
+        {
+            ZlibHuffA(0 + bits - 256, 7, stream);
+        }
+
+        SIMD_INLINE void ZlibHuff4(int bits, OutputMemoryStream& stream)
+        {
+            ZlibHuffA(0xc0 + bits - 280, 8, stream);
+        }
+
+        SIMD_INLINE void ZlibHuff(int bits, OutputMemoryStream& stream)
+        {
+            if (bits <= 143)
+                ZlibHuff1(bits, stream);
+            else if (bits <= 255)
+                ZlibHuff2(bits, stream);
+            else if (bits <= 279)
+                ZlibHuff3(bits, stream);
+            else
+                ZlibHuff4(bits, stream);
+        }
+
+        SIMD_INLINE void ZlibHuffB(int bits, OutputMemoryStream& stream)
+        {
+            if (bits <= 143)
+                ZlibHuff1(bits, stream);
+            else
+                ZlibHuff2(bits, stream);
+        }
+
+        SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit)
+        {
+            limit = Min(limit, 258);
+            int i = 0;
+#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+            int limit8 = limit & (~7);
+            for (; i < limit8; i += 8)
+                if (*(uint64_t*)(a + i) != *(uint64_t*)(b + i))
+                    break;
+#else
+            int limit4 = limit & (~3);
+            for (; i < limit4; i += 4)
+                if (*(uint32_t*)(a + i) != *(uint32_t*)(b + i))
+                    break;
+#endif
+            for (; i < limit; i += 1)
+                if (a[i] != b[i])
+                    break;
+            return i;
+        }
+
+        SIMD_INLINE uint8_t Paeth(int a, int b, int c)
+        {
+            int p = a + b - c, pa = abs(p - a), pb = abs(p - b), pc = abs(p - c);
+            if (pa <= pb && pa <= pc)
+                return uint8_t(a);
+            if (pb <= pc)
+                return uint8_t(b);
+            return uint8_t(c);
+        }
+    }
+
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit)
+        {
+            limit = Min(limit, 258);
+            int i = 0;
+            int limit16 = limit & (~15);
+            for (; i < limit16; i += 16)
+                if (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128((__m128i*)(a + i)), _mm_loadu_si128((__m128i*)(b + i)))) != 0xFFFF)
+                    break;
+#if defined(SIMD_X64_ENABLE)
+            int limit8 = limit & (~7);
+            for (; i < limit8; i += 8)
+                if (*(uint64_t*)(a + i) != *(uint64_t*)(b + i))
+                    break;
+#else
+            int limit4 = limit & (~3);
+            for (; i < limit4; i += 4)
+                if (*(uint32_t*)(a + i) != *(uint32_t*)(b + i))
+                    break;
+#endif
+            for (; i < limit; i += 1)
+                if (a[i] != b[i])
+                    break;
+            return i;
+        }
+    }
+#endif// SIMD_SSE41_ENABLE
+
+#ifdef SIMD_AVX2_ENABLE    
+    namespace Avx2
+    {
+        SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit)
+        {
+            limit = Min(limit, 258);
+            int i = 0;
+            for (; i < limit; i += 32)
+            {
+                __m256i _a = _mm256_loadu_si256((__m256i*)(a + i));
+                __m256i _b = _mm256_loadu_si256((__m256i*)(b + i));
+                uint32_t mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(_a, _b));
+                if (mask != 0xFFFFFFFF)
+                {
+                    i += _tzcnt_u32(~mask);
+                    break;
+                }
+            }
+            return Min(i, limit);
+        }
+    }
+#endif// SIMD_AVX2_ENABLE
+
+#ifdef SIMD_AVX512BW_ENABLE    
+    namespace Avx512bw
+    {
+        SIMD_INLINE int ZlibCount(const uint8_t* a, const uint8_t* b, int limit)
+        {
+            limit = Min(limit, 258);
+            int i = 0;
+            for (; i < limit; i += 64)
+            {
+                __m512i _a = _mm512_loadu_si512(a + i);
+                __m512i _b = _mm512_loadu_si512(b + i);
+                uint64_t mask = _mm512_cmp_epi8_mask(_a, _b, _MM_CMPINT_NE);
+                if (mask != 0)
+                {
+                    i += (int)FirstNotZero64(mask);
+                    break;
+                }
+            }
+            return Min(i, limit);
+        }
+    }
+#endif// SIMD_AVX512BW_ENABLE
+
+#ifdef SIMD_NEON_ENABLE    
+    namespace Neon
+    {
+    }
+#endif// SIMD_NEON_ENABLE
+}
+
+#endif//__SimdImageSavePng_h__
diff --git a/3rdparty/simdlib/Simd/SimdLib.cpp b/3rdparty/simdlib/Simd/SimdLib.cpp
index 89718bb80e..c168701413 100755
--- a/3rdparty/simdlib/Simd/SimdLib.cpp
+++ b/3rdparty/simdlib/Simd/SimdLib.cpp
@@ -61,8 +61,10 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved)
 #include "Simd/SimdConst.h"
 #include "Simd/SimdLog.h"
 
-#include "Simd/SimdResizer.h"
 #include "Simd/SimdGaussianBlur.h"
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdResizer.h"
 
 #include "Simd/SimdBase.h"
 #include "Simd/SimdSse2.h"
@@ -451,6 +453,34 @@ SIMD_API void SimdGrayToBgra(const uint8_t * gray, size_t width, size_t height,
         Base::GrayToBgra(gray, width, height, grayStride, bgra, bgraStride, alpha);
 }
 
+SIMD_API uint8_t* SimdImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size)
+{
+    const static Simd::ImageSaveToMemoryPtr imageSaveToMemory = SIMD_FUNC3(ImageSaveToMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC);
+
+    return imageSaveToMemory(src, stride, width, height, format, file, quality, size);
+}
+
+SIMD_API SimdBool SimdImageSaveToFile(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char* path)
+{
+    const static Simd::ImageSaveToMemoryPtr imageSaveToMemory = SIMD_FUNC3(ImageSaveToMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC);
+
+    return ImageSaveToFile(imageSaveToMemory, src, stride, width, height, format, file, quality, path);
+}
+
+SIMD_API uint8_t* SimdImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+{
+    const static Simd::ImageLoadFromMemoryPtr imageLoadFromMemory = SIMD_FUNC3(ImageLoadFromMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC);
+
+    return imageLoadFromMemory(data, size, stride, width, height, format);
+}
+
+SIMD_API uint8_t* SimdImageLoadFromFile(const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+{
+    const static Simd::ImageLoadFromMemoryPtr imageLoadFromMemory = SIMD_FUNC3(ImageLoadFromMemory, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC, SIMD_NEON_FUNC);
+
+    return ImageLoadFromFile(imageLoadFromMemory, path, stride, width, height, format);
+}
+
 SIMD_API void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride,
     size_t width, size_t height, uint8_t * bgr, size_t bgrStride)
 {
diff --git a/3rdparty/simdlib/Simd/SimdLib.h b/3rdparty/simdlib/Simd/SimdLib.h
index 4838b82261..5441805969 100755
--- a/3rdparty/simdlib/Simd/SimdLib.h
+++ b/3rdparty/simdlib/Simd/SimdLib.h
@@ -116,6 +116,27 @@ typedef enum
     SimdCpuInfoNeon, /*!< Availability of NEON (ARM). */
 } SimdCpuInfoType;
 
+/*! @ingroup c_types
+    Describes formats of image file. It is used in functions ::SimdImageSaveToMemory and ::SimdImageSaveToFile.
+*/
+typedef enum
+{
+    /*! An undefined image file format (format auto choice). */
+    SimdImageFileUndefined = 0,
+    /*! A PGM (Portable Gray Map) text (P2) image file format. */
+    SimdImageFilePgmTxt,
+    /*! A PGM (Portable Gray Map) binary (P5) image file format. */
+    SimdImageFilePgmBin,
+    /*! A PGM (Portable Pixel Map) text (P3) image file format. */
+    SimdImageFilePpmTxt,
+    /*! A PGM (Portable Pixel Map) binary (P6) image file format. */
+    SimdImageFilePpmBin,
+    /*! A PNG (Portable Network Graphics) image file format. */
+    SimdImageFilePng,
+    /*! A JPEG (Joint Photographic Experts Group) image file format. */
+    SimdImageFileJpeg,
+} SimdImageFileType;
+
 /*! @ingroup c_types
     Describes types of binary operation between two images performed by function ::SimdOperationBinary8u.
     Images must have the same format (unsigned 8-bit integer for every channel).
@@ -167,18 +188,6 @@ typedef enum
     SimdPixelFormatFloat,
     /*! A single channel 64-bit float point pixel format. */
     SimdPixelFormatDouble,
-    /*! A 8-bit Bayer pixel format (GRBG). */
-    SimdPixelFormatBayerGrbg,
-    /*! A 8-bit Bayer pixel format (GBRG). */
-    SimdPixelFormatBayerGbrg,
-    /*! A 8-bit Bayer pixel format (RGGB). */
-    SimdPixelFormatBayerRggb,
-    /*! A 8-bit Bayer pixel format (BGGR). */
-    SimdPixelFormatBayerBggr,
-    /*! A 24-bit (3 8-bit channels) HSV (Hue, Saturation, Value) pixel format. */
-    SimdPixelFormatHsv24,
-    /*! A 24-bit (3 8-bit channels) HSL (Hue, Saturation, Lightness) pixel format. */
-    SimdPixelFormatHsl24,
     /*! A 24-bit (3 8-bit channels) RGB (Red, Green, Blue) pixel format. */
     SimdPixelFormatRgb24,
     /*! A 32-bit (4 8-bit channels) RGBA (Red, Green, Blue, Alpha) pixel format. */
@@ -753,6 +762,82 @@ extern "C"
     SIMD_API void SimdGrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride,
         uint8_t *bgra, size_t bgraStride, uint8_t alpha);
 
+    /*! @ingroup image_io
+
+        \fn uint8_t* SimdImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t * size);
+
+        \short Saves an image to memory in given image file format.
+
+        \param [in] src - a pointer to pixels data of input image. 
+        \param [in] stride - a row size of input image in bytes.
+        \param [in] width - a width of input image.
+        \param [in] height - a height of input image.
+        \param [in] format - a pixel format of input image. 
+            Supported pixel formats: ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32.
+        \param [in] file - a format of output image file. To auto choise format of output file set this parameter to ::SimdImageFileUndefined.
+        \param [in] quality - a parameter of compression quality (if file format supports it).
+        \param [out] size - a pointer to the size of output image file in bytes.
+        \return a pointer to memory buffer with output image file. 
+            It has to be deleted after use by function ::SimdFree. On error it returns NULL.
+    */
+    SIMD_API uint8_t* SimdImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t * size);
+
+    /*! @ingroup image_io
+
+        \fn SimdBool SimdImageSaveToFile(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char * path);
+
+        \short Saves an image to memory in given image file format.
+
+        \param [in] src - a pointer to pixels data of input image.
+        \param [in] stride - a row size of input image in bytes.
+        \param [in] width - a width of input image.
+        \param [in] height - a height of input image.
+        \param [in] format - a pixel format of input image. 
+            Supported pixel formats: ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32.
+        \param [in] file - a format of output image file. To auto choise format of output file set this parameter to ::SimdImageFileUndefined.
+        \param [in] quality - a parameter of compression quality (if file format supports it).
+        \param [in] path - a path to output image file.
+        \return result of the operation.
+    */
+    SIMD_API SimdBool SimdImageSaveToFile(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, const char * path);
+
+    /*! @ingroup image_io
+
+        \fn uint8_t* SimdImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format);
+
+        \short Loads an image from memory buffer.
+
+        \param [in] data - a pointer to memory buffer with input image file.
+        \param [in] size - a size of input image file in bytes.
+        \param [out] stride - a pointer to row size of output image in bytes.
+        \param [out] width - a pointer to width of output image.
+        \param [out] height - a pointer to height of output image.
+        \param [in, out] format - a pointer to pixel format of output image. 
+            Here you can set desired pixel format (it can be ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32).
+            Or set ::SimdPixelFormatNone and use pixel format of input image file.
+        \return a pointer to pixels data of output image. 
+            It has to be deleted after use by function ::SimdFree. On error it returns NULL.
+    */
+    SIMD_API uint8_t* SimdImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format);
+
+    /*! @ingroup image_io
+
+        \fn uint8_t* SimdImageLoadFromFile(const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format);
+
+        \short Loads an image from file.
+
+        \param [in] path - a path to input image file.
+        \param [out] stride - a pointer to row size of output image in bytes.
+        \param [out] width - a pointer to width of output image.
+        \param [out] height - a pointer to height of output image.
+        \param [in, out] format - a pointer to pixel format of output image.
+            Here you can set desired pixel format (it can be ::SimdPixelFormatGray8, ::SimdPixelFormatBgr24, ::SimdPixelFormatBgra32, ::SimdPixelFormatRgb24, ::SimdPixelFormatRgba32).
+            Or set ::SimdPixelFormatNone and use pixel format of input image file.
+        \return a pointer to pixels data of output image.
+            It has to be deleted after use by function ::SimdFree. On error it returns NULL.
+    */
+    SIMD_API uint8_t* SimdImageLoadFromFile(const char* path, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType * format);
+
     /*! @ingroup other_conversion
 
         \fn void SimdInterleaveBgr(const uint8_t * b, size_t bStride, const uint8_t * g, size_t gStride, const uint8_t * r, size_t rStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride);
diff --git a/3rdparty/simdlib/Simd/SimdMath.h b/3rdparty/simdlib/Simd/SimdMath.h
index 0f7425f76e..f8c192a189 100755
--- a/3rdparty/simdlib/Simd/SimdMath.h
+++ b/3rdparty/simdlib/Simd/SimdMath.h
@@ -750,6 +750,11 @@ namespace Simd
             return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(lo), _mm256_castsi256_ps(hi), imm));
         }
 
+        template<int imm> SIMD_INLINE __m256i Shuffle64i(__m256i lo, __m256i hi)
+        {
+            return _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(lo), _mm256_castsi256_pd(hi), imm));
+        }
+
         template<int imm> SIMD_INLINE __m256 Permute4x64(__m256 a)
         {
             return _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(a), imm));
diff --git a/3rdparty/simdlib/Simd/SimdMemory.h b/3rdparty/simdlib/Simd/SimdMemory.h
index d7772ffa3c..f0fca8840a 100755
--- a/3rdparty/simdlib/Simd/SimdMemory.h
+++ b/3rdparty/simdlib/Simd/SimdMemory.h
@@ -35,6 +35,18 @@
 
 namespace Simd
 {
+    SIMD_INLINE size_t DivHi(size_t value, size_t divider)
+    {
+        return (value + divider - 1) / divider;
+    }
+
+    SIMD_INLINE size_t Pow2Hi(size_t value)
+    {
+        size_t pow2 = 1;
+        for (; pow2 < value; pow2 *= 2);
+        return pow2;
+    }
+
     SIMD_INLINE size_t AlignHiAny(size_t size, size_t align)
     {
         return (size + align - 1) / align * align;
@@ -108,6 +120,13 @@ namespace Simd
         return ptr;
     }
 
+    template<class T> T* Allocate(uint8_t*& buffer, size_t size, size_t align = SIMD_ALIGN)
+    {
+        T* ptr = (T*)buffer;
+        buffer = buffer + AlignHi(size * sizeof(T), align);
+        return ptr;
+    }
+
     SIMD_INLINE void Free(void * ptr)
     {
 #ifdef SIMD_NO_MANS_LAND
diff --git a/3rdparty/simdlib/Simd/SimdMemoryStream.h b/3rdparty/simdlib/Simd/SimdMemoryStream.h
new file mode 100644
index 0000000000..9665f33d63
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdMemoryStream.h
@@ -0,0 +1,510 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdMemoryStream_h__
+#define __SimdMemoryStream_h__
+
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdPerformance.h"
+
+namespace Simd
+{
+    class InputMemoryStream
+    {
+        const uint8_t* _data;
+        size_t _pos, _size, _bitCount;
+#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+        uint64_t _bitBuffer;
+#else
+        uint32_t _bitBuffer;
+#endif
+
+    public:
+        SIMD_INLINE InputMemoryStream(const uint8_t* data = NULL, size_t size = 0)
+        {
+            Init(data, size);
+        }
+
+        SIMD_INLINE void Init(const uint8_t* data, size_t size)
+        {
+            _pos = 0;
+            _data = data;
+            _size = size;
+            _bitBuffer = 0;
+            _bitCount = 0;
+        }
+
+        SIMD_INLINE bool Seek(size_t pos)
+        {
+            if (pos <= _size)
+            {
+                _pos = pos;
+                return true;
+            }
+            return false;
+        }
+
+        SIMD_INLINE size_t Size() const
+        {
+            return _size;
+        }
+
+        SIMD_INLINE const uint8_t* Data() const
+        {
+            return _data;
+        }
+
+        SIMD_INLINE size_t Pos() const
+        {
+            return _pos;
+        }
+
+        SIMD_INLINE const uint8_t* Current() const
+        {
+            return _data + _pos;
+        }
+
+        SIMD_INLINE bool Eof() const
+        {
+            return _pos >= _size;
+        }
+
+        SIMD_INLINE bool CanRead(size_t size) const
+        {
+            return _pos + size <= _size;
+        }
+        
+        SIMD_INLINE size_t Read(size_t size, void* data)
+        {
+            size = Min(_size - _pos, size);
+            memcpy(data, _data + _pos, size);
+            _pos += size;
+            return size;
+        }
+
+        template <class Value> SIMD_INLINE bool Read(Value & value)
+        {
+            return Read(sizeof(Value), &value) == sizeof(Value);
+        }
+
+        SIMD_INLINE bool Read8u(uint8_t & value)
+        {
+            if (_pos < _size)
+            {
+                value = _data[_pos++];
+                return true;
+            }
+            else
+                return false;
+        }
+
+        SIMD_INLINE bool Read16u(uint16_t& value)
+        {
+            if (_pos + 2 <= _size)
+            {
+                value = *(uint16_t*)(_data + _pos);
+                _pos += 2;
+                return true;
+            }
+            else
+                return false;
+        }
+
+        SIMD_INLINE bool Read32u(uint32_t& value)
+        {
+            if (_pos + 4 <= _size)
+            {
+                value = *(uint32_t*)(_data + _pos);
+                _pos += 4;
+                return true;
+            }
+            else
+                return false;
+        }
+
+        SIMD_INLINE bool ReadBe16u(uint16_t& value)
+        {
+            if (Read16u(value))
+            {
+#if !defined(SIMD_BIG_ENDIAN)
+                value =
+                    (value & 0x00FF) << 8 |
+                    (value & 0xFF00) >> 8;
+#endif
+                return true;
+            }
+            else
+                return false;
+        }
+
+        SIMD_INLINE bool ReadBe32u(uint32_t& value)
+        {
+            if (Read32u(value))
+            {
+#if !defined(SIMD_BIG_ENDIAN)
+                value =
+                    (value & 0x000000FF) << 24 |
+                    (value & 0x0000FF00) << 8 |
+                    (value & 0x00FF0000) >> 8 |
+                    (value & 0xFF000000) >> 24;
+#endif
+                return true;
+            }
+            else
+                return false;
+        }
+
+        template<class Unsigned> SIMD_INLINE bool ReadUnsigned(Unsigned& value)
+        {
+            if (!SkipGap())
+                return false;
+            value = 0;
+            while (!IsGap(_data[_pos]) && _pos < _size)
+            {
+                if (_data[_pos] >= '0' && _data[_pos] <= '9')
+                    value = value * 10 + Unsigned(_data[_pos] - '0');
+                else
+                    return false;
+                _pos++;
+            }
+            return true;
+        }
+
+        SIMD_INLINE bool Skip(size_t size)
+        {
+            if (_pos + size < _size)
+            {
+                _pos += size;
+                return true;
+            }
+            return false;
+        }
+
+        SIMD_INLINE bool SkipValue(uint8_t value)
+        {
+            while (_data[_pos] == value && _pos < _size)
+                _pos++;
+            return _pos < _size;
+        }
+
+        SIMD_INLINE bool SkipNotGap()
+        {
+            while (!IsGap(_data[_pos]) && _pos < _size)
+                _pos++;
+            return _pos < _size;
+        }        
+        
+        SIMD_INLINE bool SkipGap()
+        {
+            while (IsGap(_data[_pos]) && _pos < _size)
+                _pos++;
+            return _pos < _size;
+        }
+
+        static SIMD_INLINE bool IsGap(uint8_t value)
+        {
+            return value == ' ' || value == '\t' || value == '\n' || value == '\r';
+        }
+
+#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+        SIMD_INLINE uint64_t& BitBuffer()
+        {
+            return _bitBuffer;
+        }
+#else
+        SIMD_INLINE uint32_t& BitBuffer()
+        {
+            return _bitBuffer;
+        }
+#endif
+
+        SIMD_INLINE size_t& BitCount()
+        {
+            return _bitCount;
+        }
+
+        SIMD_INLINE void FillBits()
+        {
+            static const size_t canReadByte = (sizeof(_bitBuffer) - 1) * 8;
+            while (_bitCount <= canReadByte && _pos < _size)
+            {
+                _bitBuffer |= (size_t)_data[_pos++] << _bitCount;
+                _bitCount += 8;
+            }
+        }
+
+        SIMD_INLINE void ClearBits()
+        {
+            _pos -= _bitCount / 8;
+            _bitBuffer = 0;
+            _bitCount = 0;
+        }
+
+        SIMD_INLINE bool ReadBits(size_t & bits, size_t count)
+        {
+            if (_bitCount < count)
+                FillBits();
+            if (_bitCount < count)
+                return false;
+            bits = _bitBuffer & ((size_t(1) << count) - 1);
+            _bitBuffer >>= count;
+            _bitCount -= count;
+            return true;
+        }
+
+        SIMD_INLINE size_t ReadBits(size_t count)
+        {
+            if (_bitCount < count)
+                FillBits();
+            size_t bits = _bitBuffer & ((size_t(1) << count) - 1);
+            _bitBuffer >>= count;
+            _bitCount -= count;
+            return bits;
+        }
+    };
+
+    //-------------------------------------------------------------------------
+
+    class OutputMemoryStream
+    {
+        const size_t CAPACITY_MIN = 64;
+
+        uint8_t * _data;
+        size_t _pos, _size, _capacity, _bitCount;
+#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+        uint64_t _bitBuffer;
+#else
+        uint32_t _bitBuffer;
+#endif
+
+        SIMD_INLINE void Reset(bool owner)
+        {
+            if (_data && owner)
+                Free(_data);
+            _data = NULL;
+            _pos = 0;
+            _size = 0;
+            _capacity = 0;
+            _bitBuffer = 0;
+            _bitCount = 0;
+        }
+
+    public:
+        SIMD_INLINE OutputMemoryStream(size_t capacity = 0)
+        {
+            Reset(false);
+            if (capacity)
+                Reserve(capacity);
+        }
+
+        SIMD_INLINE ~OutputMemoryStream()
+        {
+            Reset(true);
+        }
+
+        SIMD_INLINE void Seek(size_t pos)
+        {
+            _pos = pos;
+            _size = Max(_size, _pos);
+            Reserve(_pos);
+        }
+
+        SIMD_INLINE size_t Pos() const
+        {
+            return _pos;
+        }
+
+        SIMD_INLINE size_t Size() const
+        {
+            return _size;
+        }
+
+        SIMD_INLINE size_t Capacity() const
+        {
+            return _capacity;
+        }
+
+        SIMD_INLINE uint8_t* Data()
+        {
+            return _data;
+        }
+
+        SIMD_INLINE const uint8_t * Data() const
+        {
+            return _data;
+        }
+
+        SIMD_INLINE uint8_t* Current()
+        {
+            return _data + _pos;
+        }
+
+        SIMD_INLINE const uint8_t* Current() const
+        {
+            return _data + _pos;
+        }
+
+        SIMD_INLINE void Write(const void * data, size_t size)
+        {
+            Reserve(_pos + size);
+            memcpy(_data + _pos, data, size);
+            _pos += size;
+            _size = Max(_size, _pos);
+        }
+
+        SIMD_INLINE bool Write(InputMemoryStream & input, size_t size)
+        {
+            if (input.CanRead(size))
+            {
+                Write(input.Current(), size);
+                input.Skip(size);
+                return true;
+            }
+            return false;
+        }
+
+        SIMD_INLINE bool WriteSelf(ptrdiff_t offset, size_t size)
+        {
+            if (offset < 0)
+                return false;
+            Reserve(_pos + size);
+            if (offset + size > _pos)
+            {
+                for (size_t i = 0; i < size; ++i)
+                    _data[_pos++] = _data[offset++];
+            }
+            else
+            {
+                memcpy(_data + _pos, _data + offset, size);
+                _pos += size;
+            }
+            _size = Max(_size, _pos);
+            return true;
+        }
+
+        template <class Value> SIMD_INLINE void Write(const Value& value)
+        {
+            Write(&value, sizeof(Value));
+        }
+
+        SIMD_INLINE void Write8u(uint8_t value)
+        {
+            Reserve(_pos + 1);
+            _data[_pos++] = value;
+            _size = Max(_size, _pos);
+        }
+
+        SIMD_INLINE void Write8u(uint8_t value, size_t count)
+        {
+            Reserve(_pos + count);
+            memset(_data + _pos, value, count);
+            _pos += count;
+            _size = Max(_size, _pos);
+        }
+
+        SIMD_INLINE void WriteBe32u(const uint32_t & value)
+        {
+#if defined(SIMD_BIG_ENDIAN)
+            Write<uint32_t>(value);
+#else
+            Write<uint32_t>(
+                (value & 0x000000FF) << 24 | 
+                (value & 0x0000FF00) << 8 |
+                (value & 0x00FF0000) >> 8 | 
+                (value & 0xFF000000) >> 24);
+#endif
+        }
+
+        SIMD_INLINE uint8_t* Release(size_t* size = NULL)
+        {
+            uint8_t* data = _data;
+            if(size)
+                *size = _size;
+            Reset(false);
+            return data;
+        }
+
+        SIMD_INLINE void Reserve(size_t size)
+        {
+            if (size > _capacity)
+            {
+                size_t capacity = Max(CAPACITY_MIN, Max(_capacity * 2, size));
+                uint8_t* data = (uint8_t*)Allocate(capacity, SIMD_ALIGN);
+                if (_data)
+                {
+                    memcpy(data, _data, _size);
+                    Free(_data);
+                }
+                _data = data;
+                _capacity = capacity;
+            }
+        }
+
+        SIMD_INLINE void WriteBits(const size_t bits, size_t count)
+        {
+            _bitBuffer |= (bits) << _bitCount;
+            _bitCount += count;
+            while (_bitCount >= 8)
+            {
+                Write8u((uint8_t)_bitBuffer);
+                _bitBuffer >>= 8;
+                _bitCount -= 8;
+            }
+        }
+
+        SIMD_INLINE void FlushBits()
+        {
+            while (_bitCount >= 8)
+            {
+                Write8u((uint8_t)_bitBuffer);
+                _bitBuffer >>= 8;
+                _bitCount -= 8;
+            }
+            if (_bitCount)
+            {
+                Write8u((uint8_t)_bitBuffer);
+                _bitBuffer = 0;
+                _bitCount = 0;
+            }
+        }
+
+#if defined(SIMD_X64_ENABLE) || defined(SIMD_ARM64_ENABLE)
+        SIMD_INLINE uint64_t & BitBuffer()
+        {
+            return _bitBuffer;
+        }
+#else
+        SIMD_INLINE uint32_t& BitBuffer()
+        {
+            return _bitBuffer;
+        }
+#endif
+
+        SIMD_INLINE size_t& BitCount()
+        {
+            return _bitCount;
+        }
+    };
+}
+
+#endif//__SimdMemoryStream_h__
diff --git a/3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp b/3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp
new file mode 100644
index 0000000000..61c5d90359
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdNeonImageLoad.cpp
@@ -0,0 +1,154 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdNeon.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_NEON_ENABLE    
+    namespace Neon
+    {
+        ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param)
+            : Base::ImagePgmTxtLoader(param)
+        {
+        }
+
+        void ImagePgmTxtLoader::SetConverters()
+        {
+            Base::ImagePgmTxtLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Neon::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Neon::GrayToBgra; break;
+                case SimdPixelFormatRgb24: _toAny = Neon::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Neon::GrayToBgra; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param)
+            : Base::ImagePgmBinLoader(param)
+        {
+        }
+
+        void ImagePgmBinLoader::SetConverters()
+        {
+            Base::ImagePgmBinLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Neon::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Neon::GrayToBgra; break;
+                case SimdPixelFormatRgb24: _toAny = Neon::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Neon::GrayToBgra; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param)
+            : Base::ImagePpmTxtLoader(param)
+        {
+        }
+
+        void ImagePpmTxtLoader::SetConverters()
+        {
+            Base::ImagePpmTxtLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Neon::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Neon::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Neon::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Neon::BgrToBgra; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param)
+            : Base::ImagePpmBinLoader(param)
+        {
+        }
+
+        void ImagePpmBinLoader::SetConverters()
+        {
+            Base::ImagePpmBinLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Neon::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Neon::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Neon::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Neon::BgrToBgra; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageLoader* CreateImageLoader(const ImageLoaderParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtLoader(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinLoader(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtLoader(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinLoader(param);
+            case SimdImageFilePng: return new Base::ImagePngLoader(param);
+            case SimdImageFileJpeg: return new Base::ImageJpegLoader(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+        {
+            ImageLoaderParam param(data, size, *format);
+            if (param.Validate())
+            {
+                Holder<ImageLoader> loader(CreateImageLoader(param));
+                if (loader)
+                {
+                    if (loader->FromStream())
+                        return loader->Release(stride, width, height, format);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_NEON_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdNeonImageSave.cpp b/3rdparty/simdlib/Simd/SimdNeonImageSave.cpp
new file mode 100644
index 0000000000..a0fbbd071a
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdNeonImageSave.cpp
@@ -0,0 +1,134 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdNeon.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_NEON_ENABLE    
+    namespace Neon
+    {
+        ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param)
+            : Base::ImagePgmTxtSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Neon::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Neon::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Neon::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Neon::RgbaToGray; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param)
+            : Base::ImagePgmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Neon::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Neon::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Neon::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Neon::RgbaToGray; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param)
+            : Base::ImagePpmTxtSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Neon::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Neon::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Neon::BgraToRgb; break;
+                case SimdPixelFormatRgba32: _convert = Neon::BgraToBgr; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param)
+            : Base::ImagePpmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Neon::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Neon::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Neon::BgraToRgb; break;
+                case SimdPixelFormatRgba32: _convert = Neon::BgraToBgr; break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageSaver* CreateImageSaver(const ImageSaverParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinSaver(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinSaver(param);
+            case SimdImageFilePng: return new ImagePngSaver(param);
+            case SimdImageFileJpeg: return new Base::ImageJpegSaver(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size)
+        {
+            ImageSaverParam param(width, height, format, file, quality);
+            if (param.Validate())
+            {
+                Holder<ImageSaver> saver(CreateImageSaver(param));
+                if (saver)
+                {
+                    if (saver->ToStream(src, stride))
+                        return saver->Release(size);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_NEON_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdPerformance.h b/3rdparty/simdlib/Simd/SimdPerformance.h
new file mode 100644
index 0000000000..e695326a69
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdPerformance.h
@@ -0,0 +1,197 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#ifndef __SimdPerformance_h__
+#define __SimdPerformance_h__
+
+#include "Simd/SimdDefs.h"
+
+#include <string>
+#include <sstream>
+
+namespace Simd
+{
+    typedef std::string String;
+
+    template <class T> SIMD_INLINE String ToStr(const T & value)
+    {
+        std::stringstream ss;
+        ss << value;
+        return ss.str();
+    }
+}
+
+#if defined(SIMD_PERFORMANCE_STATISTIC) && (defined(NDEBUG) || defined(SIMD_PERF_STAT_IN_DEBUG))
+
+#include "Simd/SimdTime.h"
+
+#include <limits>
+#include <iostream>
+#include <iomanip>
+#include <memory>
+#include <map>
+#include <thread>
+#include <mutex>
+#include <algorithm>
+
+namespace Simd
+{
+    namespace Base
+    {
+        class PerformanceMeasurer
+        {
+            String	_name;
+            int64_t _start, _current, _total, _min, _max;
+            int64_t _count, _flop;
+            bool _entered, _paused;
+
+        public:
+            PerformanceMeasurer(const String& name = "Unknown", int64_t flop = 0);
+
+            PerformanceMeasurer(const PerformanceMeasurer& pm);
+
+            void Enter();
+
+            void Leave(bool pause = false);
+
+            String Statistic() const;
+
+            void Combine(const PerformanceMeasurer& other);
+
+        private:
+            double Average() const;
+            double GFlops() const;
+        };
+
+        class PerformanceMeasurerHolder
+        {
+            PerformanceMeasurer * _pm;
+
+        public:
+            SIMD_INLINE PerformanceMeasurerHolder(PerformanceMeasurer * pm, bool enter = true)
+                : _pm(pm)
+            {
+                if (_pm && enter)
+                    _pm->Enter();
+            }
+
+            SIMD_INLINE void Enter()
+            {
+                if (_pm)
+                    _pm->Enter();
+            }
+
+            SIMD_INLINE void Leave(bool pause)
+            {
+                if (_pm)
+                    _pm->Leave(pause);
+            }
+
+            SIMD_INLINE ~PerformanceMeasurerHolder()
+            {
+                if (_pm)
+                    _pm->Leave();
+            }
+        };
+
+        class PerformanceMeasurerStorage
+        {
+            typedef PerformanceMeasurer Pm;
+            typedef std::shared_ptr<Pm> PmPtr;
+            typedef std::map<String, PmPtr> FunctionMap;
+            typedef std::map<std::thread::id, FunctionMap> ThreadMap;
+
+            ThreadMap _map;
+            mutable std::mutex _mutex;
+            String _report;
+
+            SIMD_INLINE FunctionMap & ThisThread()
+            {
+                static thread_local FunctionMap * thread = NULL;
+                if (thread == NULL)
+                {
+                    std::lock_guard<std::mutex> lock(_mutex);
+                    thread = &_map[std::this_thread::get_id()];
+                }
+                return *thread;
+            }
+
+        public:
+            static PerformanceMeasurerStorage s_storage;
+
+            PerformanceMeasurerStorage()
+            {
+            }
+
+            SIMD_INLINE PerformanceMeasurer * Get(const String & name, int64_t flop = 0)
+            {
+                FunctionMap & thread = ThisThread();
+                PerformanceMeasurer * pm = NULL;
+                FunctionMap::iterator it = thread.find(name);
+                if (it == thread.end())
+                {
+                    pm = new PerformanceMeasurer(name, flop);
+                    thread[name].reset(pm);
+                }
+                else
+                    pm = it->second.get();
+                return pm;
+            }
+
+            SIMD_INLINE PerformanceMeasurer * Get(const String func, const String & desc, int64_t flop = 0)
+            {
+                return Get(func + "{ " + desc + " }", flop);
+            }
+
+            const char* PerformanceStatistic();
+        };
+    }
+}
+#define SIMD_PERF_FUNCF(flop) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)(Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, (int64_t)(flop)))
+#define SIMD_PERF_FUNC() SIMD_PERF_FUNCF(0)
+#define SIMD_PERF_BEGF(desc, flop) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)(Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc, (int64_t)(flop)))
+#define SIMD_PERF_BEG(desc) SIMD_PERF_BEGF(desc, 0)
+#define SIMD_PERF_IFF(cond, desc, flop) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)((cond) ? Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc, (int64_t)(flop)) : NULL)
+#define SIMD_PERF_IF(cond, desc) SIMD_PERF_IFF(cond, desc, 0)
+#define SIMD_PERF_END(desc) Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc)->Leave();
+#define SIMD_PERF_INITF(name, desc, flop) Simd::Base::PerformanceMeasurerHolder name(Simd::Base::PerformanceMeasurerStorage::s_storage.Get(SIMD_FUNCTION, desc, (int64_t)(flop)), false);
+#define SIMD_PERF_INIT(name, desc)  SIMD_PERF_INITF(name, desc, 0);
+#define SIMD_PERF_START(name) name.Enter(); 
+#define SIMD_PERF_PAUSE(name) name.Leave(true);
+#define SIMD_PERF_EXT(ext) Simd::Base::PerformanceMeasurerHolder SIMD_CAT(__pmh, __LINE__)((ext)->Perf(SIMD_FUNCTION)) 
+#else//SIMD_PERFORMANCE_STATISTIC
+#define SIMD_PERF_FUNCF(flop)
+#define SIMD_PERF_FUNC()
+#define SIMD_PERF_BEGF(desc, flop)
+#define SIMD_PERF_BEG(desc)
+#define SIMD_PERF_IFF(cond, desc, flop)
+#define SIMD_PERF_IF(cond, desc)
+#define SIMD_PERF_END(desc)
+#define SIMD_PERF_INITF(name, desc, flop)
+#define SIMD_PERF_INIT(name, desc)
+#define SIMD_PERF_START(name)
+#define SIMD_PERF_PAUSE(name)
+#define SIMD_PERF_EXT(ext)
+#endif//SIMD_PERFORMANCE_STATISTIC 
+
+#endif//__SimdPerformance_h__
diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp
new file mode 100644
index 0000000000..eca83c63ed
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41ImageLoad.cpp
@@ -0,0 +1,159 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdSse2.h"
+#include "Simd/SimdSse41.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        ImagePgmTxtLoader::ImagePgmTxtLoader(const ImageLoaderParam& param)
+            : Base::ImagePgmTxtLoader(param)
+        {
+        }
+
+        void ImagePgmTxtLoader::SetConverters()
+        {
+            Base::ImagePgmTxtLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Sse41::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Sse2::GrayToBgra; break;
+                case SimdPixelFormatRgb24: _toAny = Sse41::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Sse41::GrayToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinLoader::ImagePgmBinLoader(const ImageLoaderParam& param)
+            : Base::ImagePgmBinLoader(param)
+        {
+        }
+
+        void ImagePgmBinLoader::SetConverters()
+        {
+            Base::ImagePgmBinLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _toAny = Sse41::GrayToBgr; break;
+                case SimdPixelFormatBgra32: _toBgra = Sse2::GrayToBgra; break;
+                case SimdPixelFormatRgb24: _toAny = Sse41::GrayToBgr; break;
+                case SimdPixelFormatRgba32: _toBgra = Sse41::GrayToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtLoader::ImagePpmTxtLoader(const ImageLoaderParam& param)
+            : Base::ImagePpmTxtLoader(param)
+        {
+        }
+
+        void ImagePpmTxtLoader::SetConverters()
+        {
+            Base::ImagePpmTxtLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Sse41::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Sse41::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Sse41::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Sse41::BgrToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinLoader::ImagePpmBinLoader(const ImageLoaderParam& param)
+            : Base::ImagePpmBinLoader(param)
+        {
+        }
+
+        void ImagePpmBinLoader::SetConverters()
+        {
+            Base::ImagePpmBinLoader::SetConverters();
+            if (_image.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _toAny = Sse41::RgbToGray; break;
+                case SimdPixelFormatBgr24: _toAny = Sse41::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _toBgra = Sse41::RgbToBgra; break;
+                case SimdPixelFormatRgba32: _toBgra = Sse41::BgrToBgra; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageLoader* CreateImageLoader(const ImageLoaderParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtLoader(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinLoader(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtLoader(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinLoader(param);
+            case SimdImageFilePng: return new ImagePngLoader(param);
+            case SimdImageFileJpeg: return new Base::ImageJpegLoader(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageLoadFromMemory(const uint8_t* data, size_t size, size_t* stride, size_t* width, size_t* height, SimdPixelFormatType* format)
+        {
+            ImageLoaderParam param(data, size, *format);
+            if (param.Validate())
+            {
+                std::unique_ptr<ImageLoader> loader(CreateImageLoader(param));
+                if (loader)
+                {
+                    if (loader->FromStream())
+                        return loader->Release(stride, width, height, format);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_SSE41_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp
new file mode 100644
index 0000000000..1ec6ca0118
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41ImageLoadPng.cpp
@@ -0,0 +1,1805 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdImageLoad.h"
+#include "Simd/SimdArray.h"
+#include "Simd/SimdCpu.h"
+#include "Simd/SimdBase.h"
+#include "Simd/SimdSse2.h"
+#include "Simd/SimdSse41.h"
+
+namespace Simd
+{
+#if defined(SIMD_SSE41_ENABLE) 
+    namespace Sse41
+    {
+        typedef unsigned char png_uc;
+        typedef unsigned short png_us;
+
+        typedef uint16_t png__uint16;
+        typedef uint32_t png__uint32;
+
+#define png_inline SIMD_INLINE
+#define PNG_ASSERT assert
+#define PNG_MALLOC(sz)           malloc(sz)
+#define PNG_REALLOC(p,newsz)     realloc(p,newsz)
+#define PNG_FREE(p)              free(p)
+#define PNG_REALLOC_SIZED(p,oldsz,newsz) PNG_REALLOC(p,newsz)
+#define STBIDEF static
+
+#ifdef _MSC_VER
+#define PNG_NOTUSED(v)  (void)(v)
+#else
+#define PNG_NOTUSED(v)  (void)sizeof(v)
+#endif
+
+#define PNG__BYTECAST(x)  ((png_uc) ((x) & 255))  // truncate int to byte without warnings
+#define PNG_MAX_DIMENSIONS (1 << 24)
+
+        static int png__err(const char* str, const char* stub)
+        {
+            return 0;
+        }
+
+#define png__errpuc(x,y)  ((unsigned char *)(size_t) (png__err(x,y)?NULL:NULL))
+
+        static void* png__malloc(size_t size)
+        {
+            return PNG_MALLOC(size);
+        }
+
+        typedef struct
+        {
+            int      (*read)  (void* user, char* data, int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
+            void     (*skip)  (void* user, int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+            int      (*eof)   (void* user);                       // returns nonzero if we are at end of file/data
+        } png_io_callbacks;
+
+        typedef struct
+        {
+            png__uint32 img_x, img_y;
+            int img_n, img_out_n;
+
+            png_io_callbacks io;
+            void* io_user_data;
+
+            int read_from_callbacks;
+            int buflen;
+            png_uc buffer_start[128];
+            int callback_already_read;
+
+            png_uc* img_buffer, * img_buffer_end;
+            png_uc* img_buffer_original, * img_buffer_original_end;
+        } png__context;
+
+        typedef struct
+        {
+            int bits_per_channel;
+            int num_channels;
+            int channel_order;
+        } png__result_info;
+
+        enum
+        {
+            PNG__SCAN_load = 0,
+            PNG__SCAN_type,
+            PNG__SCAN_header
+        };
+
+        enum
+        {
+            PNG_ORDER_RGB,
+            PNG_ORDER_BGR
+        };
+
+        static void png__rewind(png__context* s)
+        {
+            // conceptually rewind SHOULD rewind to the beginning of the stream,
+            // but we just rewind to the beginning of the initial buffer, because
+            // we only use it after doing 'test', which only ever looks at at most 92 bytes
+            s->img_buffer = s->img_buffer_original;
+            s->img_buffer_end = s->img_buffer_original_end;
+        }
+
+        static void png__refill_buffer(png__context* s)
+        {
+            int n = (s->io.read)(s->io_user_data, (char*)s->buffer_start, s->buflen);
+            s->callback_already_read += (int)(s->img_buffer - s->img_buffer_original);
+            if (n == 0) {
+                // at end of file, treat same as if from memory, but need to handle case
+                // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
+                s->read_from_callbacks = 0;
+                s->img_buffer = s->buffer_start;
+                s->img_buffer_end = s->buffer_start + 1;
+                *s->img_buffer = 0;
+            }
+            else {
+                s->img_buffer = s->buffer_start;
+                s->img_buffer_end = s->buffer_start + n;
+            }
+        }
+
+        png_inline static png_uc png__get8(png__context* s)
+        {
+            if (s->img_buffer < s->img_buffer_end)
+                return *s->img_buffer++;
+            if (s->read_from_callbacks) {
+                png__refill_buffer(s);
+                return *s->img_buffer++;
+            }
+            return 0;
+        }
+
+        static int png__get16be(png__context* s)
+        {
+            int z = png__get8(s);
+            return (z << 8) + png__get8(s);
+        }
+
+        static png__uint32 png__get32be(png__context* s)
+        {
+            png__uint32 z = png__get16be(s);
+            return (z << 16) + png__get16be(s);
+        }
+
+        png_inline static int png__at_eof(png__context* s)
+        {
+            if (s->io.read) {
+                if (!(s->io.eof)(s->io_user_data)) return 0;
+                // if feof() is true, check if buffer = end
+                // special case: we've only got the special 0 character at the end
+                if (s->read_from_callbacks == 0) return 1;
+            }
+
+            return s->img_buffer >= s->img_buffer_end;
+        }
+
+        static void png__skip(png__context* s, int n)
+        {
+            if (n == 0) return;  // already there!
+            if (n < 0) {
+                s->img_buffer = s->img_buffer_end;
+                return;
+            }
+            if (s->io.read) {
+                int blen = (int)(s->img_buffer_end - s->img_buffer);
+                if (blen < n) {
+                    s->img_buffer = s->img_buffer_end;
+                    (s->io.skip)(s->io_user_data, n - blen);
+                    return;
+                }
+            }
+            s->img_buffer += n;
+        }
+
+        static int png__getn(png__context* s, png_uc* buffer, int n)
+        {
+            if (s->io.read) {
+                int blen = (int)(s->img_buffer_end - s->img_buffer);
+                if (blen < n) {
+                    int res, count;
+
+                    memcpy(buffer, s->img_buffer, blen);
+
+                    count = (s->io.read)(s->io_user_data, (char*)buffer + blen, n - blen);
+                    res = (count == (n - blen));
+                    s->img_buffer = s->img_buffer_end;
+                    return res;
+                }
+            }
+
+            if (s->img_buffer + n <= s->img_buffer_end) {
+                memcpy(buffer, s->img_buffer, n);
+                s->img_buffer += n;
+                return 1;
+            }
+            else
+                return 0;
+        }
+
+        static int png__addsizes_valid(int a, int b)
+        {
+            if (b < 0) return 0;
+            // now 0 <= b <= INT_MAX, hence also
+            // 0 <= INT_MAX - b <= INTMAX.
+            // And "a + b <= INT_MAX" (which might overflow) is the
+            // same as a <= INT_MAX - b (no overflow)
+            return a <= INT_MAX - b;
+        }
+
+        // returns 1 if the product is valid, 0 on overflow.
+        // negative factors are considered invalid.
+        static int png__mul2sizes_valid(int a, int b)
+        {
+            if (a < 0 || b < 0) return 0;
+            if (b == 0) return 1; // mul-by-0 is always safe
+            // portable way to check for no overflows in a*b
+            return a <= INT_MAX / b;
+        }
+
+        // returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
+        static int png__mad2sizes_valid(int a, int b, int add)
+        {
+            return png__mul2sizes_valid(a, b) && png__addsizes_valid(a * b, add);
+        }
+
+        // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
+        static int png__mad3sizes_valid(int a, int b, int c, int add)
+        {
+            return png__mul2sizes_valid(a, b) && png__mul2sizes_valid(a * b, c) &&
+                png__addsizes_valid(a * b * c, add);
+        }
+
+        // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
+        static int png__mad4sizes_valid(int a, int b, int c, int d, int add)
+        {
+            return png__mul2sizes_valid(a, b) && png__mul2sizes_valid(a * b, c) &&
+                png__mul2sizes_valid(a * b * c, d) && png__addsizes_valid(a * b * c * d, add);
+        }
+
+        // mallocs with size overflow checking
+        static void* png__malloc_mad2(int a, int b, int add)
+        {
+            if (!png__mad2sizes_valid(a, b, add)) return NULL;
+            return png__malloc(a * b + add);
+        }
+
+        static void* png__malloc_mad3(int a, int b, int c, int add)
+        {
+            if (!png__mad3sizes_valid(a, b, c, add)) return NULL;
+            return png__malloc(a * b * c + add);
+        }
+
+        static void* png__malloc_mad4(int a, int b, int c, int d, int add)
+        {
+            if (!png__mad4sizes_valid(a, b, c, d, add)) return NULL;
+            return png__malloc(a * b * c * d + add);
+        }
+
+        static png_uc png__compute_y(int r, int g, int b)
+        {
+            return (png_uc)(((r * 77) + (g * 150) + (29 * b)) >> 8);
+        }
+
+        static unsigned char* png__convert_format(unsigned char* data, int img_n, int req_comp, unsigned int x, unsigned int y)
+        {
+            int i, j;
+            unsigned char* good;
+
+            if (req_comp == img_n) return data;
+            PNG_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+            good = (unsigned char*)png__malloc_mad3(req_comp, x, y, 0);
+            if (good == NULL) {
+                PNG_FREE(data);
+                return png__errpuc("outofmem", "Out of memory");
+            }
+
+            for (j = 0; j < (int)y; ++j) {
+                unsigned char* src = data + j * x * img_n;
+                unsigned char* dest = good + j * x * req_comp;
+
+#define PNG__COMBO(a,b)  ((a)*8+(b))
+#define PNG__CASE(a,b)   case PNG__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+                // convert source image with img_n components to one with req_comp components;
+                // avoid switch per pixel, so use switch per scanline and massive macros
+                switch (PNG__COMBO(img_n, req_comp)) {
+                    PNG__CASE(1, 2) { dest[0] = src[0]; dest[1] = 255; } break;
+                    PNG__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(1, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = 255; } break;
+                    PNG__CASE(2, 1) { dest[0] = src[0]; } break;
+                    PNG__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(2, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = src[1]; } break;
+                    PNG__CASE(3, 4) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; dest[3] = 255; } break;
+                    PNG__CASE(3, 1) { dest[0] = png__compute_y(src[0], src[1], src[2]); } break;
+                    PNG__CASE(3, 2) { dest[0] = png__compute_y(src[0], src[1], src[2]); dest[1] = 255; } break;
+                    PNG__CASE(4, 1) { dest[0] = png__compute_y(src[0], src[1], src[2]); } break;
+                    PNG__CASE(4, 2) { dest[0] = png__compute_y(src[0], src[1], src[2]); dest[1] = src[3]; } break;
+                    PNG__CASE(4, 3) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; } break;
+                default: PNG_ASSERT(0); PNG_FREE(data); PNG_FREE(good); return png__errpuc("unsupported", "Unsupported format conversion");
+                }
+#undef PNG__CASE
+            }
+
+            PNG_FREE(data);
+            return good;
+        }
+
+        static png__uint16 png__compute_y_16(int r, int g, int b)
+        {
+            return (png__uint16)(((r * 77) + (g * 150) + (29 * b)) >> 8);
+        }
+
+        static png__uint16* png__convert_format16(png__uint16* data, int img_n, int req_comp, unsigned int x, unsigned int y)
+        {
+            int i, j;
+            png__uint16* good;
+
+            if (req_comp == img_n) return data;
+            PNG_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+            good = (png__uint16*)png__malloc(req_comp * x * y * 2);
+            if (good == NULL) {
+                PNG_FREE(data);
+                return (png__uint16*)png__errpuc("outofmem", "Out of memory");
+            }
+
+            for (j = 0; j < (int)y; ++j) {
+                png__uint16* src = data + j * x * img_n;
+                png__uint16* dest = good + j * x * req_comp;
+
+#define PNG__COMBO(a,b)  ((a)*8+(b))
+#define PNG__CASE(a,b)   case PNG__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+                // convert source image with img_n components to one with req_comp components;
+                // avoid switch per pixel, so use switch per scanline and massive macros
+                switch (PNG__COMBO(img_n, req_comp)) {
+                    PNG__CASE(1, 2) { dest[0] = src[0]; dest[1] = 0xffff; } break;
+                    PNG__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(1, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = 0xffff; } break;
+                    PNG__CASE(2, 1) { dest[0] = src[0]; } break;
+                    PNG__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } break;
+                    PNG__CASE(2, 4) { dest[0] = dest[1] = dest[2] = src[0]; dest[3] = src[1]; } break;
+                    PNG__CASE(3, 4) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; dest[3] = 0xffff; } break;
+                    PNG__CASE(3, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break;
+                    PNG__CASE(3, 2) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); dest[1] = 0xffff; } break;
+                    PNG__CASE(4, 1) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); } break;
+                    PNG__CASE(4, 2) { dest[0] = png__compute_y_16(src[0], src[1], src[2]); dest[1] = src[3]; } break;
+                    PNG__CASE(4, 3) { dest[0] = src[0]; dest[1] = src[1]; dest[2] = src[2]; } break;
+                default: PNG_ASSERT(0); PNG_FREE(data); PNG_FREE(good); return (png__uint16*)png__errpuc("unsupported", "Unsupported format conversion");
+                }
+#undef PNG__CASE
+            }
+
+            PNG_FREE(data);
+            return good;
+        }
+
+        // fast-way is faster to check than jpeg huffman, but slow way is slower
+#define PNG__ZFAST_BITS  9 // accelerate all cases in default tables
+#define PNG__ZFAST_MASK  ((1 << PNG__ZFAST_BITS) - 1)
+
+// zlib-style huffman encoding
+// (jpegs packs from left, zlib from right, so can't share code)
+        typedef struct
+        {
+            png__uint16 fast[1 << PNG__ZFAST_BITS];
+            png__uint16 firstcode[16];
+            int maxcode[17];
+            png__uint16 firstsymbol[16];
+            png_uc  size[288];
+            png__uint16 value[288];
+        } png__zhuffman;
+
+        png_inline static int png__bitreverse16(int n)
+        {
+            n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1);
+            n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2);
+            n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4);
+            n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8);
+            return n;
+        }
+
+        png_inline static int png__bit_reverse(int v, int bits)
+        {
+            PNG_ASSERT(bits <= 16);
+            // to bit reverse n bits, reverse 16 and shift
+            // e.g. 11 bits, bit reverse and shift away 5
+            return png__bitreverse16(v) >> (16 - bits);
+        }
+
+        static int png__zbuild_huffman(png__zhuffman* z, const png_uc* sizelist, int num)
+        {
+            int i, k = 0;
+            int code, next_code[16], sizes[17];
+
+            // DEFLATE spec for generating codes
+            memset(sizes, 0, sizeof(sizes));
+            memset(z->fast, 0, sizeof(z->fast));
+            for (i = 0; i < num; ++i)
+                ++sizes[sizelist[i]];
+            sizes[0] = 0;
+            for (i = 1; i < 16; ++i)
+                if (sizes[i] > (1 << i))
+                    return png__err("bad sizes", "Corrupt PNG");
+            code = 0;
+            for (i = 1; i < 16; ++i) {
+                next_code[i] = code;
+                z->firstcode[i] = (png__uint16)code;
+                z->firstsymbol[i] = (png__uint16)k;
+                code = (code + sizes[i]);
+                if (sizes[i])
+                    if (code - 1 >= (1 << i)) return png__err("bad codelengths", "Corrupt PNG");
+                z->maxcode[i] = code << (16 - i); // preshift for inner loop
+                code <<= 1;
+                k += sizes[i];
+            }
+            z->maxcode[16] = 0x10000; // sentinel
+            for (i = 0; i < num; ++i) {
+                int s = sizelist[i];
+                if (s) {
+                    int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
+                    png__uint16 fastv = (png__uint16)((s << 9) | i);
+                    z->size[c] = (png_uc)s;
+                    z->value[c] = (png__uint16)i;
+                    if (s <= PNG__ZFAST_BITS) {
+                        int j = png__bit_reverse(next_code[s], s);
+                        while (j < (1 << PNG__ZFAST_BITS)) {
+                            z->fast[j] = fastv;
+                            j += (1 << s);
+                        }
+                    }
+                    ++next_code[s];
+                }
+            }
+            return 1;
+        }
+
+        // zlib-from-memory implementation for PNG reading
+        //    because PNG allows splitting the zlib stream arbitrarily,
+        //    and it's annoying structurally to have PNG call ZLIB call PNG,
+        //    we require PNG read all the IDATs and combine them into a single
+        //    memory buffer
+
+        typedef struct
+        {
+            png_uc* zbuffer, * zbuffer_end;
+            int num_bits;
+            png__uint32 code_buffer;
+
+            char* zout;
+            char* zout_start;
+            char* zout_end;
+            int   z_expandable;
+
+            png__zhuffman z_length, z_distance;
+        } png__zbuf;
+
+        png_inline static int png__zeof(png__zbuf* z)
+        {
+            return (z->zbuffer >= z->zbuffer_end);
+        }
+
+        png_inline static png_uc png__zget8(png__zbuf* z)
+        {
+            return png__zeof(z) ? 0 : *z->zbuffer++;
+        }
+
+        static void png__fill_bits(png__zbuf* z)
+        {
+            do {
+                if (z->code_buffer >= (1U << z->num_bits)) {
+                    z->zbuffer = z->zbuffer_end;  /* treat this as EOF so we fail. */
+                    return;
+                }
+                z->code_buffer |= (unsigned int)png__zget8(z) << z->num_bits;
+                z->num_bits += 8;
+            } while (z->num_bits <= 24);
+        }
+
+        png_inline static unsigned int png__zreceive(png__zbuf* z, int n)
+        {
+            unsigned int k;
+            if (z->num_bits < n) png__fill_bits(z);
+            k = z->code_buffer & ((1 << n) - 1);
+            z->code_buffer >>= n;
+            z->num_bits -= n;
+            return k;
+        }
+
+        static int png__zhuffman_decode_slowpath(png__zbuf* a, png__zhuffman* z)
+        {
+            int b, s, k;
+            // not resolved by fast table, so compute it the slow way
+            // use jpeg approach, which requires MSbits at top
+            k = png__bit_reverse(a->code_buffer, 16);
+            for (s = PNG__ZFAST_BITS + 1; ; ++s)
+                if (k < z->maxcode[s])
+                    break;
+            if (s >= 16) return -1; // invalid code!
+            // code size is s, so:
+            b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s];
+            if (b >= sizeof(z->size)) return -1; // some data was corrupt somewhere!
+            if (z->size[b] != s) return -1;  // was originally an assert, but report failure instead.
+            a->code_buffer >>= s;
+            a->num_bits -= s;
+            return z->value[b];
+        }
+
+        png_inline static int png__zhuffman_decode(png__zbuf* a, png__zhuffman* z)
+        {
+            int b, s;
+            if (a->num_bits < 16) {
+                if (png__zeof(a)) {
+                    return -1;   /* report error for unexpected end of data. */
+                }
+                png__fill_bits(a);
+            }
+            b = z->fast[a->code_buffer & PNG__ZFAST_MASK];
+            if (b) {
+                s = b >> 9;
+                a->code_buffer >>= s;
+                a->num_bits -= s;
+                return b & 511;
+            }
+            return png__zhuffman_decode_slowpath(a, z);
+        }
+
+        static int png__zexpand(png__zbuf* z, char* zout, int n)  // need to make room for n bytes
+        {
+            char* q;
+            unsigned int cur, limit, old_limit;
+            z->zout = zout;
+            if (!z->z_expandable) return png__err("output buffer limit", "Corrupt PNG");
+            cur = (unsigned int)(z->zout - z->zout_start);
+            limit = old_limit = (unsigned)(z->zout_end - z->zout_start);
+            if (UINT_MAX - cur < (unsigned)n) return png__err("outofmem", "Out of memory");
+            while (cur + n > limit) {
+                if (limit > UINT_MAX / 2) return png__err("outofmem", "Out of memory");
+                limit *= 2;
+            }
+            q = (char*)PNG_REALLOC_SIZED(z->zout_start, old_limit, limit);
+            PNG_NOTUSED(old_limit);
+            if (q == NULL) return png__err("outofmem", "Out of memory");
+            z->zout_start = q;
+            z->zout = q + cur;
+            z->zout_end = q + limit;
+            return 1;
+        }
+
+        static const int png__zlength_base[31] = {
+           3,4,5,6,7,8,9,10,11,13,
+           15,17,19,23,27,31,35,43,51,59,
+           67,83,99,115,131,163,195,227,258,0,0 };
+
+        static const int png__zlength_extra[31] =
+        { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
+
+        static const int png__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
+        257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0 };
+
+        static const int png__zdist_extra[32] =
+        { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
+
+        static int png__parse_huffman_block(png__zbuf* a)
+        {
+            char* zout = a->zout;
+            for (;;) {
+                int z = png__zhuffman_decode(a, &a->z_length);
+                if (z < 256) {
+                    if (z < 0) return png__err("bad huffman code", "Corrupt PNG"); // error in huffman codes
+                    if (zout >= a->zout_end) {
+                        if (!png__zexpand(a, zout, 1)) return 0;
+                        zout = a->zout;
+                    }
+                    *zout++ = (char)z;
+                }
+                else {
+                    png_uc* p;
+                    int len, dist;
+                    if (z == 256) {
+                        a->zout = zout;
+                        return 1;
+                    }
+                    z -= 257;
+                    len = png__zlength_base[z];
+                    if (png__zlength_extra[z]) len += png__zreceive(a, png__zlength_extra[z]);
+                    z = png__zhuffman_decode(a, &a->z_distance);
+                    if (z < 0) return png__err("bad huffman code", "Corrupt PNG");
+                    dist = png__zdist_base[z];
+                    if (png__zdist_extra[z]) dist += png__zreceive(a, png__zdist_extra[z]);
+                    if (zout - a->zout_start < dist) return png__err("bad dist", "Corrupt PNG");
+                    if (zout + len > a->zout_end) {
+                        if (!png__zexpand(a, zout, len)) return 0;
+                        zout = a->zout;
+                    }
+                    p = (png_uc*)(zout - dist);
+                    if (dist == 1) { // run of one byte; common in images.
+                        png_uc v = *p;
+                        if (len) { do *zout++ = v; while (--len); }
+                    }
+                    else {
+                        if (len) { do *zout++ = *p++; while (--len); }
+                    }
+                }
+            }
+        }
+
+        static int png__compute_huffman_codes(png__zbuf* a)
+        {
+            static const png_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
+            png__zhuffman z_codelength;
+            png_uc lencodes[286 + 32 + 137];//padding for maximum single op
+            png_uc codelength_sizes[19];
+            int i, n;
+
+            int hlit = png__zreceive(a, 5) + 257;
+            int hdist = png__zreceive(a, 5) + 1;
+            int hclen = png__zreceive(a, 4) + 4;
+            int ntot = hlit + hdist;
+
+            memset(codelength_sizes, 0, sizeof(codelength_sizes));
+            for (i = 0; i < hclen; ++i) {
+                int s = png__zreceive(a, 3);
+                codelength_sizes[length_dezigzag[i]] = (png_uc)s;
+            }
+            if (!png__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
+
+            n = 0;
+            while (n < ntot) {
+                int c = png__zhuffman_decode(a, &z_codelength);
+                if (c < 0 || c >= 19) return png__err("bad codelengths", "Corrupt PNG");
+                if (c < 16)
+                    lencodes[n++] = (png_uc)c;
+                else {
+                    png_uc fill = 0;
+                    if (c == 16) {
+                        c = png__zreceive(a, 2) + 3;
+                        if (n == 0) return png__err("bad codelengths", "Corrupt PNG");
+                        fill = lencodes[n - 1];
+                    }
+                    else if (c == 17) {
+                        c = png__zreceive(a, 3) + 3;
+                    }
+                    else if (c == 18) {
+                        c = png__zreceive(a, 7) + 11;
+                    }
+                    else {
+                        return png__err("bad codelengths", "Corrupt PNG");
+                    }
+                    if (ntot - n < c) return png__err("bad codelengths", "Corrupt PNG");
+                    memset(lencodes + n, fill, c);
+                    n += c;
+                }
+            }
+            if (n != ntot) return png__err("bad codelengths", "Corrupt PNG");
+            if (!png__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
+            if (!png__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist)) return 0;
+            return 1;
+        }
+
+        static int png__parse_uncompressed_block(png__zbuf* a)
+        {
+            png_uc header[4];
+            int len, nlen, k;
+            if (a->num_bits & 7)
+                png__zreceive(a, a->num_bits & 7); // discard
+             // drain the bit-packed data into header
+            k = 0;
+            while (a->num_bits > 0) {
+                header[k++] = (png_uc)(a->code_buffer & 255); // suppress MSVC run-time check
+                a->code_buffer >>= 8;
+                a->num_bits -= 8;
+            }
+            if (a->num_bits < 0) return png__err("zlib corrupt", "Corrupt PNG");
+            // now fill header the normal way
+            while (k < 4)
+                header[k++] = png__zget8(a);
+            len = header[1] * 256 + header[0];
+            nlen = header[3] * 256 + header[2];
+            if (nlen != (len ^ 0xffff)) return png__err("zlib corrupt", "Corrupt PNG");
+            if (a->zbuffer + len > a->zbuffer_end) return png__err("read past buffer", "Corrupt PNG");
+            if (a->zout + len > a->zout_end)
+                if (!png__zexpand(a, a->zout, len)) return 0;
+            memcpy(a->zout, a->zbuffer, len);
+            a->zbuffer += len;
+            a->zout += len;
+            return 1;
+        }
+
+        static int png__parse_zlib_header(png__zbuf* a)
+        {
+            int cmf = png__zget8(a);
+            int cm = cmf & 15;
+            /* int cinfo = cmf >> 4; */
+            int flg = png__zget8(a);
+            if (png__zeof(a)) return png__err("bad zlib header", "Corrupt PNG"); // zlib spec
+            if ((cmf * 256 + flg) % 31 != 0) return png__err("bad zlib header", "Corrupt PNG"); // zlib spec
+            if (flg & 32) return png__err("no preset dict", "Corrupt PNG"); // preset dictionary not allowed in png
+            if (cm != 8) return png__err("bad compression", "Corrupt PNG"); // DEFLATE required for png
+            // window = 1 << (8 + cinfo)... but who cares, we fully buffer output
+            return 1;
+        }
+
+        static const png_uc png__zdefault_length[288] =
+        {
+           8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+           8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+           8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+           8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+           8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+           9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+           9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+           9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+           7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
+        };
+        static const png_uc png__zdefault_distance[32] =
+        {
+           5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+        };
+        /*
+        Init algorithm:
+        {
+           int i;   // use <= to match clearly with spec
+           for (i=0; i <= 143; ++i)     png__zdefault_length[i]   = 8;
+           for (   ; i <= 255; ++i)     png__zdefault_length[i]   = 9;
+           for (   ; i <= 279; ++i)     png__zdefault_length[i]   = 7;
+           for (   ; i <= 287; ++i)     png__zdefault_length[i]   = 8;
+
+           for (i=0; i <=  31; ++i)     png__zdefault_distance[i] = 5;
+        }
+        */
+
+        static int png__parse_zlib(png__zbuf* a, int parse_header)
+        {
+            int final, type;
+            if (parse_header)
+                if (!png__parse_zlib_header(a)) return 0;
+            a->num_bits = 0;
+            a->code_buffer = 0;
+            do {
+                final = png__zreceive(a, 1);
+                type = png__zreceive(a, 2);
+                if (type == 0) {
+                    if (!png__parse_uncompressed_block(a)) return 0;
+                }
+                else if (type == 3) {
+                    return 0;
+                }
+                else {
+                    if (type == 1) {
+                        // use fixed code lengths
+                        if (!png__zbuild_huffman(&a->z_length, png__zdefault_length, 288)) return 0;
+                        if (!png__zbuild_huffman(&a->z_distance, png__zdefault_distance, 32)) return 0;
+                    }
+                    else {
+                        if (!png__compute_huffman_codes(a)) return 0;
+                    }
+                    if (!png__parse_huffman_block(a)) return 0;
+                }
+            } while (!final);
+            return 1;
+        }
+
+        static int png__do_zlib(png__zbuf* a, char* obuf, int olen, int exp, int parse_header)
+        {
+            a->zout_start = obuf;
+            a->zout = obuf;
+            a->zout_end = obuf + olen;
+            a->z_expandable = exp;
+
+            return png__parse_zlib(a, parse_header);
+        }
+
+        STBIDEF char* png_zlib_decode_malloc_guesssize(const char* buffer, int len, int initial_size, int* outlen)
+        {
+            png__zbuf a;
+            char* p = (char*)png__malloc(initial_size);
+            if (p == NULL) return NULL;
+            a.zbuffer = (png_uc*)buffer;
+            a.zbuffer_end = (png_uc*)buffer + len;
+            if (png__do_zlib(&a, p, initial_size, 1, 1)) {
+                if (outlen) *outlen = (int)(a.zout - a.zout_start);
+                return a.zout_start;
+            }
+            else {
+                PNG_FREE(a.zout_start);
+                return NULL;
+            }
+        }
+
+        STBIDEF char* png_zlib_decode_malloc(char const* buffer, int len, int* outlen)
+        {
+            return png_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen);
+        }
+
+        STBIDEF char* png_zlib_decode_malloc_guesssize_headerflag(const char* buffer, int len, int initial_size, int* outlen, int parse_header)
+        {
+            png__zbuf a;
+            char* p = (char*)png__malloc(initial_size);
+            if (p == NULL) return NULL;
+            a.zbuffer = (png_uc*)buffer;
+            a.zbuffer_end = (png_uc*)buffer + len;
+            if (png__do_zlib(&a, p, initial_size, 1, parse_header)) {
+                if (outlen) *outlen = (int)(a.zout - a.zout_start);
+                return a.zout_start;
+            }
+            else {
+                PNG_FREE(a.zout_start);
+                return NULL;
+            }
+        }
+
+        STBIDEF int png_zlib_decode_buffer(char* obuffer, int olen, char const* ibuffer, int ilen)
+        {
+            png__zbuf a;
+            a.zbuffer = (png_uc*)ibuffer;
+            a.zbuffer_end = (png_uc*)ibuffer + ilen;
+            if (png__do_zlib(&a, obuffer, olen, 0, 1))
+                return (int)(a.zout - a.zout_start);
+            else
+                return -1;
+        }
+
+        STBIDEF char* png_zlib_decode_noheader_malloc(char const* buffer, int len, int* outlen)
+        {
+            png__zbuf a;
+            char* p = (char*)png__malloc(16384);
+            if (p == NULL) return NULL;
+            a.zbuffer = (png_uc*)buffer;
+            a.zbuffer_end = (png_uc*)buffer + len;
+            if (png__do_zlib(&a, p, 16384, 1, 0)) {
+                if (outlen) *outlen = (int)(a.zout - a.zout_start);
+                return a.zout_start;
+            }
+            else {
+                PNG_FREE(a.zout_start);
+                return NULL;
+            }
+        }
+
+        STBIDEF int png_zlib_decode_noheader_buffer(char* obuffer, int olen, const char* ibuffer, int ilen)
+        {
+            png__zbuf a;
+            a.zbuffer = (png_uc*)ibuffer;
+            a.zbuffer_end = (png_uc*)ibuffer + ilen;
+            if (png__do_zlib(&a, obuffer, olen, 0, 0))
+                return (int)(a.zout - a.zout_start);
+            else
+                return -1;
+        }
+
+
+        // public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
+        //    simple implementation
+        //      - only 8-bit samples
+        //      - no CRC checking
+        //      - allocates lots of intermediate memory
+        //        - avoids problem of streaming data between subsystems
+        //        - avoids explicit window management
+        //    performance
+        //      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
+
+        typedef struct
+        {
+            png__uint32 length;
+            png__uint32 type;
+        } png__pngchunk;
+
+        static png__pngchunk png__get_chunk_header(png__context* s)
+        {
+            png__pngchunk c;
+            c.length = png__get32be(s);
+            c.type = png__get32be(s);
+            return c;
+        }
+
+        static int png__check_png_header(png__context* s)
+        {
+            static const png_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
+            int i;
+            for (i = 0; i < 8; ++i)
+                if (png__get8(s) != png_sig[i]) return png__err("bad png sig", "Not a PNG");
+            return 1;
+        }
+
+        typedef struct
+        {
+            png__context* s;
+            png_uc* idata, * expanded, * out;
+            int depth;
+        } png__png;
+
+
+        enum {
+            PNG__F_none = 0,
+            PNG__F_sub = 1,
+            PNG__F_up = 2,
+            PNG__F_avg = 3,
+            PNG__F_paeth = 4,
+            // synthetic filters used for first scanline to avoid needing a dummy row of 0s
+            PNG__F_avg_first,
+            PNG__F_paeth_first
+        };
+
+        static png_uc first_row_filter[5] =
+        {
+           PNG__F_none,
+           PNG__F_sub,
+           PNG__F_none,
+           PNG__F_avg_first,
+           PNG__F_paeth_first
+        };
+
+        static int png__paeth(int a, int b, int c)
+        {
+            int p = a + b - c;
+            int pa = abs(p - a);
+            int pb = abs(p - b);
+            int pc = abs(p - c);
+            if (pa <= pb && pa <= pc) return a;
+            if (pb <= pc) return b;
+            return c;
+        }
+
+        static const png_uc png__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
+
+        // create the png data from post-deflated data
+        static int png__create_png_image_raw(png__png* a, png_uc* raw, png__uint32 raw_len, int out_n, png__uint32 x, png__uint32 y, int depth, int color)
+        {
+            int bytes = (depth == 16 ? 2 : 1);
+            png__context* s = a->s;
+            png__uint32 i, j, stride = x * out_n * bytes;
+            png__uint32 img_len, img_width_bytes;
+            int k;
+            int img_n = s->img_n; // copy it into a local for later
+
+            int output_bytes = out_n * bytes;
+            int filter_bytes = img_n * bytes;
+            int width = x;
+
+            PNG_ASSERT(out_n == s->img_n || out_n == s->img_n + 1);
+            a->out = (png_uc*)png__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
+            if (!a->out) return png__err("outofmem", "Out of memory");
+
+            if (!png__mad3sizes_valid(img_n, x, depth, 7)) return png__err("too large", "Corrupt PNG");
+            img_width_bytes = (((img_n * x * depth) + 7) >> 3);
+            img_len = (img_width_bytes + 1) * y;
+
+            // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
+            // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
+            // so just check for raw_len < img_len always.
+            if (raw_len < img_len) return png__err("not enough pixels", "Corrupt PNG");
+
+            for (j = 0; j < y; ++j) {
+                png_uc* cur = a->out + stride * j;
+                png_uc* prior;
+                int filter = *raw++;
+
+                if (filter > 4)
+                    return png__err("invalid filter", "Corrupt PNG");
+
+                if (depth < 8) {
+                    if (img_width_bytes > x) return png__err("invalid width", "Corrupt PNG");
+                    cur += x * out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
+                    filter_bytes = 1;
+                    width = img_width_bytes;
+                }
+                prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
+
+                // if first row, use special filter that doesn't sample previous row
+                if (j == 0) filter = first_row_filter[filter];
+
+                // handle first byte explicitly
+                for (k = 0; k < filter_bytes; ++k) {
+                    switch (filter) {
+                    case PNG__F_none: cur[k] = raw[k]; break;
+                    case PNG__F_sub: cur[k] = raw[k]; break;
+                    case PNG__F_up: cur[k] = PNG__BYTECAST(raw[k] + prior[k]); break;
+                    case PNG__F_avg: cur[k] = PNG__BYTECAST(raw[k] + (prior[k] >> 1)); break;
+                    case PNG__F_paeth: cur[k] = PNG__BYTECAST(raw[k] + png__paeth(0, prior[k], 0)); break;
+                    case PNG__F_avg_first: cur[k] = raw[k]; break;
+                    case PNG__F_paeth_first: cur[k] = raw[k]; break;
+                    }
+                }
+
+                if (depth == 8) {
+                    if (img_n != out_n)
+                        cur[img_n] = 255; // first pixel
+                    raw += img_n;
+                    cur += out_n;
+                    prior += out_n;
+                }
+                else if (depth == 16) {
+                    if (img_n != out_n) {
+                        cur[filter_bytes] = 255; // first pixel top byte
+                        cur[filter_bytes + 1] = 255; // first pixel bottom byte
+                    }
+                    raw += filter_bytes;
+                    cur += output_bytes;
+                    prior += output_bytes;
+                }
+                else {
+                    raw += 1;
+                    cur += 1;
+                    prior += 1;
+                }
+
+                // this is a little gross, so that we don't switch per-pixel or per-component
+                if (depth < 8 || img_n == out_n) {
+                    int nk = (width - 1) * filter_bytes;
+#define PNG__CASE(f) \
+             case f:     \
+                for (k=0; k < nk; ++k)
+                    switch (filter) {
+                        // "none" filter turns into a memcpy here; make that explicit.
+                    case PNG__F_none:         memcpy(cur, raw, nk); break;
+                        PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - filter_bytes]); } break;
+                        PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break;
+                        PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], prior[k], prior[k - filter_bytes])); } break;
+                        PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - filter_bytes], 0, 0)); } break;
+                    }
+#undef PNG__CASE
+                    raw += nk;
+                }
+                else {
+                    PNG_ASSERT(img_n + 1 == out_n);
+#define PNG__CASE(f) \
+             case f:     \
+                for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
+                   for (k=0; k < filter_bytes; ++k)
+                    switch (filter) {
+                        PNG__CASE(PNG__F_none) { cur[k] = raw[k]; } break;
+                        PNG__CASE(PNG__F_sub) { cur[k] = PNG__BYTECAST(raw[k] + cur[k - output_bytes]); } break;
+                        PNG__CASE(PNG__F_up) { cur[k] = PNG__BYTECAST(raw[k] + prior[k]); } break;
+                        PNG__CASE(PNG__F_avg) { cur[k] = PNG__BYTECAST(raw[k] + ((prior[k] + cur[k - output_bytes]) >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], prior[k], prior[k - output_bytes])); } break;
+                        PNG__CASE(PNG__F_avg_first) { cur[k] = PNG__BYTECAST(raw[k] + (cur[k - output_bytes] >> 1)); } break;
+                        PNG__CASE(PNG__F_paeth_first) { cur[k] = PNG__BYTECAST(raw[k] + png__paeth(cur[k - output_bytes], 0, 0)); } break;
+                    }
+#undef PNG__CASE
+
+                    // the loop above sets the high byte of the pixels' alpha, but for
+                    // 16 bit png files we also need the low byte set. we'll do that here.
+                    if (depth == 16) {
+                        cur = a->out + stride * j; // start at the beginning of the row again
+                        for (i = 0; i < x; ++i, cur += output_bytes) {
+                            cur[filter_bytes + 1] = 255;
+                        }
+                    }
+                }
+            }
+
+            // we make a separate pass to expand bits to pixels; for performance,
+            // this could run two scanlines behind the above code, so it won't
+            // intefere with filtering but will still be in the cache.
+            if (depth < 8) {
+                for (j = 0; j < y; ++j) {
+                    png_uc* cur = a->out + stride * j;
+                    png_uc* in = a->out + stride * j + x * out_n - img_width_bytes;
+                    // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
+                    // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
+                    png_uc scale = (color == 0) ? png__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
+
+                    // note that the final byte might overshoot and write more data than desired.
+                    // we can allocate enough data that this never writes out of memory, but it
+                    // could also overwrite the next scanline. can it overwrite non-empty data
+                    // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
+                    // so we need to explicitly clamp the final ones
+
+                    if (depth == 4) {
+                        for (k = x * img_n; k >= 2; k -= 2, ++in) {
+                            *cur++ = scale * ((*in >> 4));
+                            *cur++ = scale * ((*in) & 0x0f);
+                        }
+                        if (k > 0) *cur++ = scale * ((*in >> 4));
+                    }
+                    else if (depth == 2) {
+                        for (k = x * img_n; k >= 4; k -= 4, ++in) {
+                            *cur++ = scale * ((*in >> 6));
+                            *cur++ = scale * ((*in >> 4) & 0x03);
+                            *cur++ = scale * ((*in >> 2) & 0x03);
+                            *cur++ = scale * ((*in) & 0x03);
+                        }
+                        if (k > 0) *cur++ = scale * ((*in >> 6));
+                        if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03);
+                        if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03);
+                    }
+                    else if (depth == 1) {
+                        for (k = x * img_n; k >= 8; k -= 8, ++in) {
+                            *cur++ = scale * ((*in >> 7));
+                            *cur++ = scale * ((*in >> 6) & 0x01);
+                            *cur++ = scale * ((*in >> 5) & 0x01);
+                            *cur++ = scale * ((*in >> 4) & 0x01);
+                            *cur++ = scale * ((*in >> 3) & 0x01);
+                            *cur++ = scale * ((*in >> 2) & 0x01);
+                            *cur++ = scale * ((*in >> 1) & 0x01);
+                            *cur++ = scale * ((*in) & 0x01);
+                        }
+                        if (k > 0) *cur++ = scale * ((*in >> 7));
+                        if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01);
+                        if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01);
+                        if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01);
+                        if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01);
+                        if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01);
+                        if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
+                    }
+                    if (img_n != out_n) {
+                        int q;
+                        // insert alpha = 255
+                        cur = a->out + stride * j;
+                        if (img_n == 1) {
+                            for (q = x - 1; q >= 0; --q) {
+                                cur[q * 2 + 1] = 255;
+                                cur[q * 2 + 0] = cur[q];
+                            }
+                        }
+                        else {
+                            PNG_ASSERT(img_n == 3);
+                            for (q = x - 1; q >= 0; --q) {
+                                cur[q * 4 + 3] = 255;
+                                cur[q * 4 + 2] = cur[q * 3 + 2];
+                                cur[q * 4 + 1] = cur[q * 3 + 1];
+                                cur[q * 4 + 0] = cur[q * 3 + 0];
+                            }
+                        }
+                    }
+                }
+            }
+            else if (depth == 16) {
+                // force the image data from big-endian to platform-native.
+                // this is done in a separate pass due to the decoding relying
+                // on the data being untouched, but could probably be done
+                // per-line during decode if care is taken.
+                png_uc* cur = a->out;
+                png__uint16* cur16 = (png__uint16*)cur;
+
+                for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) {
+                    *cur16 = (cur[0] << 8) | cur[1];
+                }
+            }
+
+            return 1;
+        }
+
+        static int png__create_png_image(png__png* a, png_uc* image_data, png__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
+        {
+            int bytes = (depth == 16 ? 2 : 1);
+            int out_bytes = out_n * bytes;
+            png_uc* final;
+            int p;
+            if (!interlaced)
+                return png__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
+
+            // de-interlacing
+            final = (png_uc*)png__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+            for (p = 0; p < 7; ++p) {
+                int xorig[] = { 0,4,0,2,0,1,0 };
+                int yorig[] = { 0,0,4,0,2,0,1 };
+                int xspc[] = { 8,8,4,4,2,2,1 };
+                int yspc[] = { 8,8,8,4,4,2,2 };
+                int i, j, x, y;
+                // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
+                x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p];
+                y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p];
+                if (x && y) {
+                    png__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
+                    if (!png__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) {
+                        PNG_FREE(final);
+                        return 0;
+                    }
+                    for (j = 0; j < y; ++j) {
+                        for (i = 0; i < x; ++i) {
+                            int out_y = j * yspc[p] + yorig[p];
+                            int out_x = i * xspc[p] + xorig[p];
+                            memcpy(final + out_y * a->s->img_x * out_bytes + out_x * out_bytes,
+                                a->out + (j * x + i) * out_bytes, out_bytes);
+                        }
+                    }
+                    PNG_FREE(a->out);
+                    image_data += img_len;
+                    image_data_len -= img_len;
+                }
+            }
+            a->out = final;
+
+            return 1;
+        }
+
+        static int png__compute_transparency(png__png* z, png_uc tc[3], int out_n)
+        {
+            png__context* s = z->s;
+            png__uint32 i, pixel_count = s->img_x * s->img_y;
+            png_uc* p = z->out;
+
+            // compute color-based transparency, assuming we've
+            // already got 255 as the alpha value in the output
+            PNG_ASSERT(out_n == 2 || out_n == 4);
+
+            if (out_n == 2) {
+                for (i = 0; i < pixel_count; ++i) {
+                    p[1] = (p[0] == tc[0] ? 0 : 255);
+                    p += 2;
+                }
+            }
+            else {
+                for (i = 0; i < pixel_count; ++i) {
+                    if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+                        p[3] = 0;
+                    p += 4;
+                }
+            }
+            return 1;
+        }
+
+        static int png__compute_transparency16(png__png* z, png__uint16 tc[3], int out_n)
+        {
+            png__context* s = z->s;
+            png__uint32 i, pixel_count = s->img_x * s->img_y;
+            png__uint16* p = (png__uint16*)z->out;
+
+            // compute color-based transparency, assuming we've
+            // already got 65535 as the alpha value in the output
+            PNG_ASSERT(out_n == 2 || out_n == 4);
+
+            if (out_n == 2) {
+                for (i = 0; i < pixel_count; ++i) {
+                    p[1] = (p[0] == tc[0] ? 0 : 65535);
+                    p += 2;
+                }
+            }
+            else {
+                for (i = 0; i < pixel_count; ++i) {
+                    if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+                        p[3] = 0;
+                    p += 4;
+                }
+            }
+            return 1;
+        }
+
+        static int png__expand_png_palette(png__png* a, png_uc* palette, int len, int pal_img_n)
+        {
+            png__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
+            png_uc* p, * temp_out, * orig = a->out;
+
+            p = (png_uc*)png__malloc_mad2(pixel_count, pal_img_n, 0);
+            if (p == NULL) return png__err("outofmem", "Out of memory");
+
+            // between here and free(out) below, exitting would leak
+            temp_out = p;
+
+            if (pal_img_n == 3) {
+                for (i = 0; i < pixel_count; ++i) {
+                    int n = orig[i] * 4;
+                    p[0] = palette[n];
+                    p[1] = palette[n + 1];
+                    p[2] = palette[n + 2];
+                    p += 3;
+                }
+            }
+            else {
+                for (i = 0; i < pixel_count; ++i) {
+                    int n = orig[i] * 4;
+                    p[0] = palette[n];
+                    p[1] = palette[n + 1];
+                    p[2] = palette[n + 2];
+                    p[3] = palette[n + 3];
+                    p += 4;
+                }
+            }
+            PNG_FREE(a->out);
+            a->out = temp_out;
+
+            PNG_NOTUSED(len);
+
+            return 1;
+        }
+
+        static int png__unpremultiply_on_load = 0;
+        static int png__de_iphone_flag = 0;
+
+        STBIDEF void png_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
+        {
+            png__unpremultiply_on_load = flag_true_if_should_unpremultiply;
+        }
+
+        STBIDEF void png_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
+        {
+            png__de_iphone_flag = flag_true_if_should_convert;
+        }
+
+        static void png__de_iphone(png__png* z)
+        {
+            png__context* s = z->s;
+            png__uint32 i, pixel_count = s->img_x * s->img_y;
+            png_uc* p = z->out;
+
+            if (s->img_out_n == 3) {  // convert bgr to rgb
+                for (i = 0; i < pixel_count; ++i) {
+                    png_uc t = p[0];
+                    p[0] = p[2];
+                    p[2] = t;
+                    p += 3;
+                }
+            }
+            else {
+                PNG_ASSERT(s->img_out_n == 4);
+                if (png__unpremultiply_on_load) {
+                    // convert bgr to rgb and unpremultiply
+                    for (i = 0; i < pixel_count; ++i) {
+                        png_uc a = p[3];
+                        png_uc t = p[0];
+                        if (a) {
+                            png_uc half = a / 2;
+                            p[0] = (p[2] * 255 + half) / a;
+                            p[1] = (p[1] * 255 + half) / a;
+                            p[2] = (t * 255 + half) / a;
+                        }
+                        else {
+                            p[0] = p[2];
+                            p[2] = t;
+                        }
+                        p += 4;
+                    }
+                }
+                else {
+                    // convert bgr to rgb
+                    for (i = 0; i < pixel_count; ++i) {
+                        png_uc t = p[0];
+                        p[0] = p[2];
+                        p[2] = t;
+                        p += 4;
+                    }
+                }
+            }
+        }
+
+#define PNG__PNG_TYPE(a,b,c,d)  (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d))
+
+        static int png__parse_png_file(png__png* z, int scan, int req_comp)
+        {
+            png_uc palette[1024], pal_img_n = 0;
+            png_uc has_trans = 0, tc[3] = { 0 };
+            png__uint16 tc16[3];
+            png__uint32 ioff = 0, idata_limit = 0, i, pal_len = 0;
+            int first = 1, k, interlace = 0, color = 0, is_iphone = 0;
+            png__context* s = z->s;
+
+            z->expanded = NULL;
+            z->idata = NULL;
+            z->out = NULL;
+
+            if (!png__check_png_header(s)) return 0;
+
+            if (scan == PNG__SCAN_type) return 1;
+
+            for (;;) {
+                png__pngchunk c = png__get_chunk_header(s);
+                switch (c.type) {
+                case PNG__PNG_TYPE('C', 'g', 'B', 'I'):
+                    is_iphone = 1;
+                    png__skip(s, c.length);
+                    break;
+                case PNG__PNG_TYPE('I', 'H', 'D', 'R'): {
+                    int comp, filter;
+                    if (!first) return png__err("multiple IHDR", "Corrupt PNG");
+                    first = 0;
+                    if (c.length != 13) return png__err("bad IHDR len", "Corrupt PNG");
+                    s->img_x = png__get32be(s);
+                    s->img_y = png__get32be(s);
+                    if (s->img_y > PNG_MAX_DIMENSIONS) return png__err("too large", "Very large image (corrupt?)");
+                    if (s->img_x > PNG_MAX_DIMENSIONS) return png__err("too large", "Very large image (corrupt?)");
+                    z->depth = png__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return png__err("1/2/4/8/16-bit only", "PNG not supported: 1/2/4/8/16-bit only");
+                    color = png__get8(s);  if (color > 6)         return png__err("bad ctype", "Corrupt PNG");
+                    if (color == 3 && z->depth == 16)                  return png__err("bad ctype", "Corrupt PNG");
+                    if (color == 3) pal_img_n = 3; else if (color & 1) return png__err("bad ctype", "Corrupt PNG");
+                    comp = png__get8(s);  if (comp) return png__err("bad comp method", "Corrupt PNG");
+                    filter = png__get8(s);  if (filter) return png__err("bad filter method", "Corrupt PNG");
+                    interlace = png__get8(s); if (interlace > 1) return png__err("bad interlace method", "Corrupt PNG");
+                    if (!s->img_x || !s->img_y) return png__err("0-pixel image", "Corrupt PNG");
+                    if (!pal_img_n) {
+                        s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
+                        if ((1 << 30) / s->img_x / s->img_n < s->img_y) return png__err("too large", "Image too large to decode");
+                        if (scan == PNG__SCAN_header) return 1;
+                    }
+                    else {
+                        // if paletted, then pal_n is our final components, and
+                        // img_n is # components to decompress/filter.
+                        s->img_n = 1;
+                        if ((1 << 30) / s->img_x / 4 < s->img_y) return png__err("too large", "Corrupt PNG");
+                        // if SCAN_header, have to scan to see if we have a tRNS
+                    }
+                    break;
+                }
+
+                case PNG__PNG_TYPE('P', 'L', 'T', 'E'): {
+                    if (first) return png__err("first not IHDR", "Corrupt PNG");
+                    if (c.length > 256 * 3) return png__err("invalid PLTE", "Corrupt PNG");
+                    pal_len = c.length / 3;
+                    if (pal_len * 3 != c.length) return png__err("invalid PLTE", "Corrupt PNG");
+                    for (i = 0; i < pal_len; ++i) {
+                        palette[i * 4 + 0] = png__get8(s);
+                        palette[i * 4 + 1] = png__get8(s);
+                        palette[i * 4 + 2] = png__get8(s);
+                        palette[i * 4 + 3] = 255;
+                    }
+                    break;
+                }
+
+                case PNG__PNG_TYPE('t', 'R', 'N', 'S'): {
+                    if (first) return png__err("first not IHDR", "Corrupt PNG");
+                    if (z->idata) return png__err("tRNS after IDAT", "Corrupt PNG");
+                    if (pal_img_n) {
+                        if (scan == PNG__SCAN_header) { s->img_n = 4; return 1; }
+                        if (pal_len == 0) return png__err("tRNS before PLTE", "Corrupt PNG");
+                        if (c.length > pal_len) return png__err("bad tRNS len", "Corrupt PNG");
+                        pal_img_n = 4;
+                        for (i = 0; i < c.length; ++i)
+                            palette[i * 4 + 3] = png__get8(s);
+                    }
+                    else {
+                        if (!(s->img_n & 1)) return png__err("tRNS with alpha", "Corrupt PNG");
+                        if (c.length != (png__uint32)s->img_n * 2) return png__err("bad tRNS len", "Corrupt PNG");
+                        has_trans = 1;
+                        if (z->depth == 16) {
+                            for (k = 0; k < s->img_n; ++k) tc16[k] = (png__uint16)png__get16be(s); // copy the values as-is
+                        }
+                        else {
+                            for (k = 0; k < s->img_n; ++k) tc[k] = (png_uc)(png__get16be(s) & 255) * png__depth_scale_table[z->depth]; // non 8-bit images will be larger
+                        }
+                    }
+                    break;
+                }
+
+                case PNG__PNG_TYPE('I', 'D', 'A', 'T'): {
+                    if (first) return png__err("first not IHDR", "Corrupt PNG");
+                    if (pal_img_n && !pal_len) return png__err("no PLTE", "Corrupt PNG");
+                    if (scan == PNG__SCAN_header) { s->img_n = pal_img_n; return 1; }
+                    if ((int)(ioff + c.length) < (int)ioff) return 0;
+                    if (ioff + c.length > idata_limit) {
+                        png__uint32 idata_limit_old = idata_limit;
+                        png_uc* p;
+                        if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
+                        while (ioff + c.length > idata_limit)
+                            idata_limit *= 2;
+                        PNG_NOTUSED(idata_limit_old);
+                        p = (png_uc*)PNG_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return png__err("outofmem", "Out of memory");
+                        z->idata = p;
+                    }
+                    if (!png__getn(s, z->idata + ioff, c.length)) return png__err("outofdata", "Corrupt PNG");
+                    ioff += c.length;
+                    break;
+                }
+
+                case PNG__PNG_TYPE('I', 'E', 'N', 'D'): {
+                    png__uint32 raw_len, bpl;
+                    if (first) return png__err("first not IHDR", "Corrupt PNG");
+                    if (scan != PNG__SCAN_load) return 1;
+                    if (z->idata == NULL) return png__err("no IDAT", "Corrupt PNG");
+                    // initial guess for decoded data size to avoid unnecessary reallocs
+                    bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
+                    raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
+                    z->expanded = (png_uc*)png_zlib_decode_malloc_guesssize_headerflag((char*)z->idata, ioff, raw_len, (int*)&raw_len, !is_iphone);
+                    if (z->expanded == NULL) return 0; // zlib should set error
+                    PNG_FREE(z->idata); z->idata = NULL;
+                    if ((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || has_trans)
+                        s->img_out_n = s->img_n + 1;
+                    else
+                        s->img_out_n = s->img_n;
+                    if (!png__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
+                    if (has_trans) {
+                        if (z->depth == 16) {
+                            if (!png__compute_transparency16(z, tc16, s->img_out_n)) return 0;
+                        }
+                        else {
+                            if (!png__compute_transparency(z, tc, s->img_out_n)) return 0;
+                        }
+                    }
+                    if (is_iphone && png__de_iphone_flag && s->img_out_n > 2)
+                        png__de_iphone(z);
+                    if (pal_img_n) {
+                        // pal_img_n == 3 or 4
+                        s->img_n = pal_img_n; // record the actual colors we had
+                        s->img_out_n = pal_img_n;
+                        if (req_comp >= 3) s->img_out_n = req_comp;
+                        if (!png__expand_png_palette(z, palette, pal_len, s->img_out_n))
+                            return 0;
+                    }
+                    else if (has_trans) {
+                        // non-paletted image with tRNS -> source image has (constant) alpha
+                        ++s->img_n;
+                    }
+                    PNG_FREE(z->expanded); z->expanded = NULL;
+                    // end of PNG chunk, read and skip CRC
+                    png__get32be(s);
+                    return 1;
+                }
+
+                default:
+                    // if critical, fail
+                    if (first) return png__err("first not IHDR", "Corrupt PNG");
+                    if ((c.type & (1 << 29)) == 0) {
+#ifndef PNG_NO_FAILURE_STRINGS
+                        // not threadsafe
+                        static char invalid_chunk[] = "XXXX PNG chunk not known";
+                        invalid_chunk[0] = PNG__BYTECAST(c.type >> 24);
+                        invalid_chunk[1] = PNG__BYTECAST(c.type >> 16);
+                        invalid_chunk[2] = PNG__BYTECAST(c.type >> 8);
+                        invalid_chunk[3] = PNG__BYTECAST(c.type >> 0);
+#endif
+                        return png__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
+                    }
+                    png__skip(s, c.length);
+                    break;
+                }
+                // end of PNG chunk, read and skip CRC
+                png__get32be(s);
+            }
+        }
+
+        static void* png__do_png(png__png* p, int* x, int* y, int* n, int req_comp, png__result_info* ri)
+        {
+            void* result = NULL;
+            if (req_comp < 0 || req_comp > 4) return png__errpuc("bad req_comp", "Internal error");
+            if (png__parse_png_file(p, PNG__SCAN_load, req_comp)) {
+                if (p->depth <= 8)
+                    ri->bits_per_channel = 8;
+                else if (p->depth == 16)
+                    ri->bits_per_channel = 16;
+                else
+                    return png__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth");
+                result = p->out;
+                p->out = NULL;
+                if (req_comp && req_comp != p->s->img_out_n) {
+                    if (ri->bits_per_channel == 8)
+                        result = png__convert_format((unsigned char*)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+                    else
+                        result = png__convert_format16((png__uint16*)result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+                    p->s->img_out_n = req_comp;
+                    if (result == NULL) return result;
+                }
+                *x = p->s->img_x;
+                *y = p->s->img_y;
+                if (n) *n = p->s->img_n;
+            }
+            PNG_FREE(p->out);      p->out = NULL;
+            PNG_FREE(p->expanded); p->expanded = NULL;
+            PNG_FREE(p->idata);    p->idata = NULL;
+
+            return result;
+        }
+
+        static void* png__png_load(png__context* s, int* x, int* y, int* comp, int req_comp, png__result_info* ri)
+        {
+            png__png p;
+            p.s = s;
+            return png__do_png(&p, x, y, comp, req_comp, ri);
+        }
+
+        static int png__png_test(png__context* s)
+        {
+            int r;
+            r = png__check_png_header(s);
+            png__rewind(s);
+            return r;
+        }
+
+        static int png__png_info_raw(png__png* p, int* x, int* y, int* comp)
+        {
+            if (!png__parse_png_file(p, PNG__SCAN_header, 0)) {
+                png__rewind(p->s);
+                return 0;
+            }
+            if (x) *x = p->s->img_x;
+            if (y) *y = p->s->img_y;
+            if (comp) *comp = p->s->img_n;
+            return 1;
+        }
+
+        static int png__png_info(png__context* s, int* x, int* y, int* comp)
+        {
+            png__png p;
+            p.s = s;
+            return png__png_info_raw(&p, x, y, comp);
+        }
+
+        static int png__png_is16(png__context* s)
+        {
+            png__png p;
+            p.s = s;
+            if (!png__png_info_raw(&p, NULL, NULL, NULL))
+                return 0;
+            if (p.depth != 16) {
+                png__rewind(p.s);
+                return 0;
+            }
+            return 1;
+        }
+
+        static void* png__load_main(png__context* s, int* x, int* y, int* comp, int req_comp, png__result_info* ri, int bpc)
+        {
+            memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
+            ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed
+            ri->channel_order = PNG_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
+            ri->num_channels = 0;
+
+            if (png__png_test(s))  return png__png_load(s, x, y, comp, req_comp, ri);
+
+            return png__errpuc("unknown image type", "Image not of any known type, or corrupt");
+        }
+
+        static png_uc* png__convert_16_to_8(png__uint16* orig, int w, int h, int channels)
+        {
+            int i;
+            int img_len = w * h * channels;
+            png_uc* reduced;
+
+            reduced = (png_uc*)png__malloc(img_len);
+            if (reduced == NULL) return png__errpuc("outofmem", "Out of memory");
+
+            for (i = 0; i < img_len; ++i)
+                reduced[i] = (png_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling
+
+            PNG_FREE(orig);
+            return reduced;
+        }
+
+        static unsigned char* png__load_and_postprocess_8bit(png__context* s, int* x, int* y, int* comp, int req_comp)
+        {
+            png__result_info ri;
+            void* result = png__load_main(s, x, y, comp, req_comp, &ri, 8);
+
+            if (result == NULL)
+                return NULL;
+
+            // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
+            PNG_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
+
+            if (ri.bits_per_channel != 8) {
+                result = png__convert_16_to_8((png__uint16*)result, *x, *y, req_comp == 0 ? *comp : req_comp);
+                ri.bits_per_channel = 8;
+            }
+
+            // @TODO: move png__convert_format to here
+
+            //if (png__vertically_flip_on_load) {
+            //    int channels = req_comp ? req_comp : *comp;
+            //    png__vertical_flip(result, *x, *y, channels * sizeof(png_uc));
+            //}
+
+            return (unsigned char*)result;
+        }
+
+        static void png__start_mem(png__context* s, png_uc const* buffer, int len)
+        {
+            s->io.read = NULL;
+            s->read_from_callbacks = 0;
+            s->callback_already_read = 0;
+            s->img_buffer = s->img_buffer_original = (png_uc*)buffer;
+            s->img_buffer_end = s->img_buffer_original_end = (png_uc*)buffer + len;
+        }
+
+        STBIDEF png_uc* png_load_from_memory(png_uc const* buffer, int len, int* x, int* y, int* comp, int req_comp)
+        {
+            png__context s;
+            png__start_mem(&s, buffer, len);
+            return png__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
+        }
+
+        //------------------------------------------------------------------------
+
+        static int png__stdio_read(void* user, char* data, int size)
+        {
+            InputMemoryStream* stream = (InputMemoryStream*)user;
+            return (int)stream->Read(size, data);
+        }
+
+        static void png__stdio_skip(void* user, int n)
+        {
+            InputMemoryStream* stream = (InputMemoryStream*)user;
+            stream->Skip(n);
+        }
+
+        static int png__stdio_eof(void* user)
+        {
+            InputMemoryStream* stream = (InputMemoryStream*)user;
+            return stream->Pos() == stream->Size() ? 1 : 0;
+        }
+
+
+        //---------------------------------------------------------------------
+
+        ImagePngLoader::ImagePngLoader(const ImageLoaderParam& param)
+            : Base::ImagePngLoader(param)
+        {
+            if (_param.format == SimdPixelFormatNone)
+                _param.format = SimdPixelFormatRgb24;
+        }
+
+        bool ImagePngLoader::FromStream()
+        {
+            const int req_comp = 4;
+            int x, y, comp;
+            png__context s;
+            s.io.eof = png__stdio_eof;
+            s.io.read = png__stdio_read;
+            s.io.skip = png__stdio_skip;
+            s.io_user_data = &_stream;
+            s.buflen = sizeof(s.buffer_start);
+            s.read_from_callbacks = 1;
+            s.callback_already_read = 0;
+            s.img_buffer = s.img_buffer_original = s.buffer_start;
+            png__refill_buffer(&s);
+            s.img_buffer_original_end = s.img_buffer_end;
+            png__result_info ri;
+            uint8_t* data = (uint8_t*)png__png_load(&s, &x, &y, &comp, req_comp, &ri);
+            if (data)
+            {
+                if (ri.bits_per_channel == 16)
+                {
+                    const uint16_t* src = (uint16_t*)data;
+                    size_t size = x * y * req_comp;
+                    uint8_t* dst = (uint8_t*)PNG_MALLOC(size);
+                    for (size_t i = 0; i < size; ++i)
+                        dst[i] = uint8_t(src[i] >> 8);
+                    PNG_FREE(data);
+                    data = dst;
+                }
+                size_t stride = 4 * x;
+                _image.Recreate(x, y, (Image::Format)_param.format);
+                if (x < A)
+                {
+                    switch (_param.format)
+                    {
+                    case SimdPixelFormatGray8:
+                        Base::RgbaToGray(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatBgr24:
+                        Base::BgraToRgb(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatBgra32:
+                        Base::BgraToRgba(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatRgb24:
+                        Base::BgraToBgr(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatRgba32:
+                        Base::Copy(data, stride, x, y, 4, _image.data, _image.stride);
+                        break;
+                    default:
+                        break;
+                    }
+                }
+                else
+                {
+                    switch (_param.format)
+                    {
+                    case SimdPixelFormatGray8:
+                        Sse2::RgbaToGray(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatBgr24:
+                        Sse41::BgraToRgb(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatBgra32:
+                        Sse41::BgraToRgba(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatRgb24:
+                        Sse41::BgraToBgr(data, x, y, stride, _image.data, _image.stride);
+                        break;
+                    case SimdPixelFormatRgba32:
+                        Base::Copy(data, stride, x, y, 4, _image.data, _image.stride);
+                        break;
+                    default:
+                        break;
+                    }
+                }
+                PNG_FREE(data);
+                return true;
+            }
+            return false;
+        }
+    }
+#endif
+}
diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp
new file mode 100644
index 0000000000..da20b395c0
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41ImageSave.cpp
@@ -0,0 +1,139 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdSse2.h"
+#include "Simd/SimdSse41.h"
+
+#include <memory>
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        ImagePgmTxtSaver::ImagePgmTxtSaver(const ImageSaverParam& param)
+            : Base::ImagePgmTxtSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Sse41::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Sse2::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Sse41::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Sse41::RgbaToGray; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePgmBinSaver::ImagePgmBinSaver(const ImageSaverParam& param)
+            : Base::ImagePgmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatBgr24: _convert = Sse41::BgrToGray; break;
+                case SimdPixelFormatBgra32: _convert = Sse2::BgraToGray; break;
+                case SimdPixelFormatRgb24: _convert = Sse41::RgbToGray; break;
+                case SimdPixelFormatRgba32: _convert = Sse41::RgbaToGray; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmTxtSaver::ImagePpmTxtSaver(const ImageSaverParam& param)
+            : Base::ImagePpmTxtSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Sse41::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Sse41::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Sse41::BgraToRgb; break;
+                case SimdPixelFormatRgba32: _convert = Sse41::BgraToBgr; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImagePpmBinSaver::ImagePpmBinSaver(const ImageSaverParam& param)
+            : Base::ImagePpmBinSaver(param)
+        {
+            if (_param.width >= A)
+            {
+                switch (_param.format)
+                {
+                case SimdPixelFormatGray8: _convert = Sse41::GrayToBgr; break;
+                case SimdPixelFormatBgr24: _convert = Sse41::BgrToRgb; break;
+                case SimdPixelFormatBgra32: _convert = Sse41::BgraToRgb; break;
+                case SimdPixelFormatRgba32: _convert = Sse41::BgraToBgr; break;
+                default: break;
+                }
+            }
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageSaver* CreateImageSaver(const ImageSaverParam& param)
+        {
+            switch (param.file)
+            {
+            case SimdImageFilePgmTxt: return new ImagePgmTxtSaver(param);
+            case SimdImageFilePgmBin: return new ImagePgmBinSaver(param);
+            case SimdImageFilePpmTxt: return new ImagePpmTxtSaver(param);
+            case SimdImageFilePpmBin: return new ImagePpmBinSaver(param);
+            case SimdImageFilePng: return new ImagePngSaver(param);
+            case SimdImageFileJpeg: return new ImageJpegSaver(param);
+            default:
+                return NULL;
+            }
+        }
+
+        uint8_t* ImageSaveToMemory(const uint8_t* src, size_t stride, size_t width, size_t height, SimdPixelFormatType format, SimdImageFileType file, int quality, size_t* size)
+        {
+            ImageSaverParam param(width, height, format, file, quality);
+            if (param.Validate())
+            {
+                std::unique_ptr<ImageSaver> saver(CreateImageSaver(param));
+                if (saver)
+                {
+                    if (saver->ToStream(src, stride))
+                        return saver->Release(size);
+                }
+            }
+            return NULL;
+        }
+    }
+#endif// SIMD_SSE41_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp
new file mode 100644
index 0000000000..3a0a2079c1
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41ImageSaveJpeg.cpp
@@ -0,0 +1,431 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSaveJpeg.h"
+#include "Simd/SimdSse41.h"
+#include "Simd/SimdBase.h"
+
+namespace Simd
+{
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        SIMD_INLINE void JpegDctV(const float* src, size_t srcStride, float *dst, size_t dstStride)
+        {
+            for (int i = 0; i < 2; i++, src += 4, dst += 4)
+            {
+                __m128 d0 = _mm_loadu_ps(src + 0 * srcStride);
+                __m128 d1 = _mm_loadu_ps(src + 1 * srcStride);
+                __m128 d2 = _mm_loadu_ps(src + 2 * srcStride);
+                __m128 d3 = _mm_loadu_ps(src + 3 * srcStride);
+                __m128 d4 = _mm_loadu_ps(src + 4 * srcStride);
+                __m128 d5 = _mm_loadu_ps(src + 5 * srcStride);
+                __m128 d6 = _mm_loadu_ps(src + 6 * srcStride);
+                __m128 d7 = _mm_loadu_ps(src + 7 * srcStride);
+
+                __m128 tmp0 = _mm_add_ps(d0, d7);
+                __m128 tmp7 = _mm_sub_ps(d0, d7);
+                __m128 tmp1 = _mm_add_ps(d1, d6);
+                __m128 tmp6 = _mm_sub_ps(d1, d6);
+                __m128 tmp2 = _mm_add_ps(d2, d5);
+                __m128 tmp5 = _mm_sub_ps(d2, d5);
+                __m128 tmp3 = _mm_add_ps(d3, d4);
+                __m128 tmp4 = _mm_sub_ps(d3, d4);
+
+                __m128 tmp10 = _mm_add_ps(tmp0, tmp3);
+                __m128 tmp13 = _mm_sub_ps(tmp0, tmp3);
+                __m128 tmp11 = _mm_add_ps(tmp1, tmp2);
+                __m128 tmp12 = _mm_sub_ps(tmp1, tmp2);
+
+                d0 = _mm_add_ps(tmp10, tmp11);
+                d4 = _mm_sub_ps(tmp10, tmp11);
+
+                __m128 z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), _mm_set1_ps(0.707106781f));
+                d2 = _mm_add_ps(tmp13, z1);
+                d6 = _mm_sub_ps(tmp13, z1);
+
+                tmp10 = _mm_add_ps(tmp4, tmp5);
+                tmp11 = _mm_add_ps(tmp5, tmp6);
+                tmp12 = _mm_add_ps(tmp6, tmp7);
+
+                __m128 z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12),  _mm_set1_ps(0.382683433f));
+                __m128 z2 = _mm_add_ps(_mm_mul_ps(tmp10, _mm_set1_ps(0.541196100f)), z5);
+                __m128 z4 = _mm_add_ps(_mm_mul_ps(tmp12, _mm_set1_ps(1.306562965f)), z5);
+                __m128 z3 = _mm_mul_ps(tmp11, _mm_set1_ps(0.707106781f));
+
+                __m128 z11 = _mm_add_ps(tmp7, z3);
+                __m128 z13 = _mm_sub_ps(tmp7, z3);
+
+                _mm_storeu_ps(dst + 0 * dstStride, d0);
+                _mm_storeu_ps(dst + 1 * dstStride, _mm_add_ps(z11, z4));
+                _mm_storeu_ps(dst + 2 * dstStride, d2);
+                _mm_storeu_ps(dst + 3 * dstStride, _mm_sub_ps(z13, z2));
+                _mm_storeu_ps(dst + 4 * dstStride, d4);
+                _mm_storeu_ps(dst + 5 * dstStride, _mm_add_ps(z13, z2));
+                _mm_storeu_ps(dst + 6 * dstStride, d6);
+                _mm_storeu_ps(dst + 7 * dstStride, _mm_sub_ps(z11, z4));
+            }
+        }
+
+        SIMD_INLINE void JpegDctH(const float* src, size_t srcStride, const float * fdt, int* dst)
+        {
+            for (int i = 0; i < 2; i++, src += 4 * srcStride, fdt += 4, dst += 4)
+            {
+                __m128 tmp0, tmp1, tmp2, tmp3;
+                __m128 d0 = _mm_loadu_ps(src + 0 * srcStride);
+                __m128 d1 = _mm_loadu_ps(src + 1 * srcStride);
+                __m128 d2 = _mm_loadu_ps(src + 2 * srcStride);
+                __m128 d3 = _mm_loadu_ps(src + 3 * srcStride);
+                tmp0 = _mm_unpacklo_ps(d0, d2);
+                tmp1 = _mm_unpackhi_ps(d0, d2);
+                tmp2 = _mm_unpacklo_ps(d1, d3);
+                tmp3 = _mm_unpackhi_ps(d1, d3);
+                d0 = _mm_unpacklo_ps(tmp0, tmp2);
+                d1 = _mm_unpackhi_ps(tmp0, tmp2);
+                d2 = _mm_unpacklo_ps(tmp1, tmp3);
+                d3 = _mm_unpackhi_ps(tmp1, tmp3);
+
+                __m128 d4 = _mm_loadu_ps(src + 0 * srcStride + 4);
+                __m128 d5 = _mm_loadu_ps(src + 1 * srcStride + 4);
+                __m128 d6 = _mm_loadu_ps(src + 2 * srcStride + 4);
+                __m128 d7 = _mm_loadu_ps(src + 3 * srcStride + 4);
+                tmp0 = _mm_unpacklo_ps(d4, d6);
+                tmp1 = _mm_unpackhi_ps(d4, d6);
+                tmp2 = _mm_unpacklo_ps(d5, d7);
+                tmp3 = _mm_unpackhi_ps(d5, d7);
+                d4 = _mm_unpacklo_ps(tmp0, tmp2);
+                d5 = _mm_unpackhi_ps(tmp0, tmp2);
+                d6 = _mm_unpacklo_ps(tmp1, tmp3);
+                d7 = _mm_unpackhi_ps(tmp1, tmp3);
+
+                tmp0 = _mm_add_ps(d0, d7);
+                tmp1 = _mm_add_ps(d1, d6);
+                tmp2 = _mm_add_ps(d2, d5);
+                tmp3 = _mm_add_ps(d3, d4);
+                __m128 tmp7 = _mm_sub_ps(d0, d7);
+                __m128 tmp6 = _mm_sub_ps(d1, d6);
+                __m128 tmp5 = _mm_sub_ps(d2, d5);
+                __m128 tmp4 = _mm_sub_ps(d3, d4);
+
+                __m128 tmp10 = _mm_add_ps(tmp0, tmp3);
+                __m128 tmp13 = _mm_sub_ps(tmp0, tmp3);
+                __m128 tmp11 = _mm_add_ps(tmp1, tmp2);
+                __m128 tmp12 = _mm_sub_ps(tmp1, tmp2);
+
+                d0 = _mm_add_ps(tmp10, tmp11);
+                d4 = _mm_sub_ps(tmp10, tmp11);
+
+                __m128 z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), _mm_set1_ps(0.707106781f));
+                d2 = _mm_add_ps(tmp13, z1);
+                d6 = _mm_sub_ps(tmp13, z1);
+
+                tmp10 = _mm_add_ps(tmp4, tmp5);
+                tmp11 = _mm_add_ps(tmp5, tmp6);
+                tmp12 = _mm_add_ps(tmp6, tmp7);
+
+                __m128 z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12), _mm_set1_ps(0.382683433f));
+                __m128 z2 = _mm_add_ps(_mm_mul_ps(tmp10, _mm_set1_ps(0.541196100f)), z5);
+                __m128 z4 = _mm_add_ps(_mm_mul_ps(tmp12, _mm_set1_ps(1.306562965f)), z5);
+                __m128 z3 = _mm_mul_ps(tmp11, _mm_set1_ps(0.707106781f));
+
+                __m128 z11 = _mm_add_ps(tmp7, z3);
+                __m128 z13 = _mm_sub_ps(tmp7, z3);
+
+                d1 = _mm_add_ps(z11, z4);
+                d3 = _mm_sub_ps(z13, z2);
+                d5 = _mm_add_ps(z13, z2);
+                d7 = _mm_sub_ps(z11, z4);
+
+                _mm_storeu_si128((__m128i*)dst + 0x0, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 0), d0)));
+                _mm_storeu_si128((__m128i*)dst + 0x2, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 1), d1)));
+                _mm_storeu_si128((__m128i*)dst + 0x4, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 2), d2)));
+                _mm_storeu_si128((__m128i*)dst + 0x6, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 3), d3)));
+                _mm_storeu_si128((__m128i*)dst + 0x8, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 4), d4)));
+                _mm_storeu_si128((__m128i*)dst + 0xA, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 5), d5)));
+                _mm_storeu_si128((__m128i*)dst + 0xC, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 6), d6)));
+                _mm_storeu_si128((__m128i*)dst + 0xE, _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(fdt + DF * 7), d7)));
+            }
+        }
+
+        static int JpegProcessDu(Base::BitBuf& bitBuf, float* CDU, int stride, const float* fdtbl, int DC, const uint16_t HTDC[256][2], const uint16_t HTAC[256][2])
+        {
+            JpegDctV(CDU, stride, CDU, stride);
+            SIMD_ALIGNED(16) int DUO[64], DU[64];
+            JpegDctH(CDU, stride, fdtbl, DUO);
+            for (int i = 0; i < 64; ++i)
+                DU[Base::JpegZigZagT[i]] = DUO[i];
+            int diff = DU[0] - DC;
+            if (diff == 0)
+                bitBuf.Push(HTDC[0]);
+            else
+            {
+                uint16_t bits[2];
+                Base::JpegCalcBits(diff, bits);
+                bitBuf.Push(HTDC[bits[1]]);
+                bitBuf.Push(bits);
+            }
+            int end0pos4 = 60;
+            for (; end0pos4 > 0 && _mm_testz_si128(_mm_loadu_si128((__m128i*)(DU + end0pos4)), Sse2::K_INV_ZERO); end0pos4 -= 4);
+            int end0pos = end0pos4 + 3;
+            for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos);
+            if (end0pos == 0)
+            {
+                bitBuf.Push(HTAC[0x00]);
+                return DU[0];
+            }
+            for (int i = 1; i <= end0pos; ++i)
+            {
+                int startpos = i;
+                for (; DU[i] == 0 && i <= end0pos; ++i);
+                int nrzeroes = i - startpos;
+                if (nrzeroes >= 16)
+                {
+                    int lng = nrzeroes >> 4;
+                    int nrmarker;
+                    for (nrmarker = 1; nrmarker <= lng; ++nrmarker)
+                        bitBuf.Push(HTAC[0xF0]);
+                    nrzeroes &= 15;
+                }
+                uint16_t bits[2];
+                Base::JpegCalcBits(DU[i], bits);
+                bitBuf.Push(HTAC[(nrzeroes << 4) + bits[1]]);
+                bitBuf.Push(bits);
+            }
+            if (end0pos != 63)
+                bitBuf.Push(HTAC[0x00]);
+            return DU[0];
+        }
+
+        SIMD_INLINE void RgbToYuvInit(__m128 k[10])
+        {
+            k[0] = _mm_set1_ps(+0.29900f);
+            k[1] = _mm_set1_ps(+0.58700f);
+            k[2] = _mm_set1_ps(+0.11400f);
+            k[3] = _mm_set1_ps(-128.000f);
+            k[4] = _mm_set1_ps(-0.16874f);
+            k[5] = _mm_set1_ps(-0.33126f);
+            k[6] = _mm_set1_ps(+0.50000f);
+            k[7] = _mm_set1_ps(+0.50000f);
+            k[8] = _mm_set1_ps(-0.41869f);
+            k[9] = _mm_set1_ps(-0.08131f);
+        }
+
+        SIMD_INLINE void RgbToYuv(const uint8_t* r, const uint8_t* g, const uint8_t* b, int stride, int height, 
+            const __m128 k[10], float* y, float* u, float* v, int size)
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 4)
+                {
+                    __m128 _r = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(r + col))));
+                    __m128 _g = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(g + col))));
+                    __m128 _b = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(b + col))));
+                    _mm_storeu_ps(y + col, _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, k[0]), _mm_mul_ps(_g, k[1])), _mm_mul_ps(_b, k[2])), k[3]));
+                    //_mm_storeu_ps(y + col, _mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, _yr), _mm_mul_ps(_g, _yg)), _mm_add_ps(_mm_mul_ps(_b, _yb), _yt)));
+                    _mm_storeu_ps(u + col, _mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, k[4]), _mm_mul_ps(_g, k[5])), _mm_mul_ps(_b, k[6])));
+                    _mm_storeu_ps(v + col, _mm_add_ps(_mm_add_ps(_mm_mul_ps(_r, k[7]), _mm_mul_ps(_g, k[8])), _mm_mul_ps(_b, k[9])));
+                }
+                if(++row < height)
+                    r += stride, g += stride, b += stride;
+                y += size, u += size, v += size;
+            }
+        }
+
+        SIMD_INLINE void GrayToY(const uint8_t* g, int stride, int height, const __m128 k[10], float* y, int size)
+        {
+            for (int row = 0; row < size;)
+            {
+                for (int col = 0; col < size; col += 4)
+                {
+                    __m128 _g = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(g + col))));
+                    _mm_storeu_ps(y + col, _mm_add_ps(_g, k[3]));
+                }
+                if (++row < height)
+                    g += stride;
+                y += size;
+            }
+        }
+
+        SIMD_INLINE void SubUv(const float * src, float * dst)
+        {
+            __m128 _0_25 = _mm_set1_ps(0.25f), s0, s1;
+            for (int yy = 0; yy < 8; yy += 1)
+            {
+                s0 = _mm_add_ps(_mm_loadu_ps(src + 0), _mm_loadu_ps(src + 16));
+                s1 = _mm_add_ps(_mm_loadu_ps(src + 4), _mm_loadu_ps(src + 20));
+                _mm_storeu_ps(dst + 0, _mm_mul_ps(_mm_hadd_ps(s0, s1), _0_25));
+                s0 = _mm_add_ps(_mm_loadu_ps(src + 8), _mm_loadu_ps(src + 24));
+                s1 = _mm_add_ps(_mm_loadu_ps(src + 12), _mm_loadu_ps(src + 28));
+                _mm_storeu_ps(dst + 4, _mm_mul_ps(_mm_hadd_ps(s0, s1), _0_25));
+                src += 32;
+                dst += 8;
+            }
+        }
+
+        void JpegWriteBlockSubs(OutputMemoryStream& stream, int width, int height, const uint8_t* red,
+            const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3])
+        {
+            __m128 k[10];
+            RgbToYuvInit(k);
+            int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            int width16 = width& (~15);
+            bool gray = red == green && red == blue;
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 16)
+            {
+                int x = 0;
+                SIMD_ALIGNED(16) float Y[256], U[256], V[256];
+                SIMD_ALIGNED(16) float subU[64], subV[64];
+                for (; x < width16; x += 16)
+                {
+                    if (gray)
+                        GrayToY(red + x, stride, height - y, k, Y, 16);
+                    else
+                        RgbToYuv(red + x, green + x, blue + x, stride, height - y, k, Y, U, V, 16);
+                    DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        SubUv(U, subU);
+                        SubUv(V, subV);
+                        DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+                for (; x < width; x += 16)
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 16);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 16);
+                    DCY = JpegProcessDu(bitBuf, Y + 0, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 8, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 128, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    DCY = JpegProcessDu(bitBuf, Y + 136, 16, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        SubUv(U, subU);
+                        SubUv(V, subV);
+                        DCU = JpegProcessDu(bitBuf, subU, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, subV, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                }
+            }
+            Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+            bitBuf.Clear();
+        }
+
+        void JpegWriteBlockFull(OutputMemoryStream& stream, int width, int height, const uint8_t* red,
+            const uint8_t* green, const uint8_t* blue, int stride, const float* fY, const float* fUv, int dc[3])
+        {
+            __m128 k[10];
+            RgbToYuvInit(k);
+            int& DCY = dc[0], & DCU = dc[1], & DCV = dc[2];
+            int width8 = width & (~7);
+            bool gray = red == green && red == blue;
+            Base::BitBuf bitBuf;
+            for (int y = 0; y < height; y += 8)
+            {
+                int x = 0;
+                SIMD_ALIGNED(16) float Y[64], U[64], V[64];
+                for (; x < width8; x += 8)
+                {
+                    if (gray)
+                        GrayToY(red + x, stride, height - y, k, Y, 8);
+                    else
+                        RgbToYuv(red + x, green + x, blue + x, stride, height - y, k, Y, U, V, 8);
+                    DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                    if (bitBuf.Full())
+                    {
+                        Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+                        bitBuf.Clear();
+                    }
+                }
+                for (; x < width; x += 8)
+                {
+                    if (gray)
+                        Base::GrayToY(red + x, stride, height - y, width - x, Y, 8);
+                    else
+                        Base::RgbToYuv(red + x, green + x, blue + x, stride, height - y, width - x, Y, U, V, 8);
+                    DCY = JpegProcessDu(bitBuf, Y, 8, fY, DCY, Base::HuffmanYdc, Base::HuffmanYac);
+                    if (gray)
+                        Base::JpegProcessDuGrayUv(bitBuf);
+                    else
+                    {
+                        DCU = JpegProcessDu(bitBuf, U, 8, fUv, DCU, Base::HuffmanUVdc, Base::HuffmanUVac);
+                        DCV = JpegProcessDu(bitBuf, V, 8, fUv, DCV, Base::HuffmanUVdc, Base::HuffmanUVac);
+                    }
+                }
+            }
+            Base::WriteBits(stream, bitBuf.data, bitBuf.size);
+            bitBuf.Clear();
+        }
+
+        //---------------------------------------------------------------------
+
+        ImageJpegSaver::ImageJpegSaver(const ImageSaverParam& param)
+            : Base::ImageJpegSaver(param)
+        {
+        }
+
+        void ImageJpegSaver::Init()
+        {
+            InitParams(true);
+            switch (_param.format)
+            {
+            case SimdPixelFormatBgr24:
+            case SimdPixelFormatRgb24:
+                _deintBgr = _param.width < 16 ? Base::DeinterleaveBgr : Sse41::DeinterleaveBgr;
+                break;
+            case SimdPixelFormatBgra32:
+            case SimdPixelFormatRgba32:
+                _deintBgra = _param.width < 16 ? Base::DeinterleaveBgra : Sse41::DeinterleaveBgra;
+                break;
+            default: 
+                break;
+            }
+            _writeBlock = _subSample ? JpegWriteBlockSubs : JpegWriteBlockFull;
+        }
+    }
+#endif// SIMD_SSE41_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp b/3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp
new file mode 100644
index 0000000000..0e1c76b710
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdSse41ImageSavePng.cpp
@@ -0,0 +1,370 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSavePng.h"
+#include "Simd/SimdBase.h"
+#include "Simd/SimdSse41.h"
+#include "Simd/SimdExtract.h"
+
+namespace Simd
+{        
+#ifdef SIMD_SSE41_ENABLE    
+    namespace Sse41
+    {
+        uint32_t ZlibAdler32(uint8_t* data, int size)
+        {
+            __m128i _i0 = _mm_setr_epi32(0, -1, -2, -3), _4 = _mm_set1_epi32(4);
+            uint32_t lo = 1, hi = 0;
+            for (int b = 0, n = (int)(size % 5552); b < size;)
+            {
+                int n4 = n & (~3), i = 0;
+                __m128i _i = _mm_add_epi32(_i0, _mm_set1_epi32(n));
+                __m128i _l = _mm_setzero_si128(), _h = _mm_setzero_si128();
+                for (; i < n4; i += 4)
+                {
+                    __m128i d = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)(data + b + i)));
+                    _l = _mm_add_epi32(_l, d);
+                    _h = _mm_add_epi32(_h, _mm_mullo_epi32(d, _i));
+                    _i = _mm_sub_epi32(_i, _4);
+                }
+                int l = Sse2::ExtractInt32Sum(_l), h = Sse2::ExtractInt32Sum(_h);
+                for (; i < n; ++i)
+                {
+                    l += data[b + i];
+                    h += data[b + i]*(n - i);
+                }
+                hi = (hi + h + lo*n) % 65521;
+                lo = (lo + l) % 65521;
+                b += n;
+                n = 5552;
+            }
+            return (hi << 16) | lo;
+        }
+
+        void ZlibCompress(uint8_t* data, int size, int quality, OutputMemoryStream& stream)
+        {
+            const int ZHASH = 16384;
+            if (quality < 5)
+                quality = 5;
+            const int basket = quality * 2;
+            Array32i hashTable(ZHASH * basket);
+            memset(hashTable.data, -1, hashTable.RawSize());
+
+            stream.Write(uint8_t(0x78));
+            stream.Write(uint8_t(0x5e));
+            stream.WriteBits(1, 1);
+            stream.WriteBits(1, 2);
+
+            int i = 0, j;
+            while (i < size - 3)
+            {
+                int h = Base::ZlibHash(data + i) & (ZHASH - 1), best = 3;
+                uint8_t* bestLoc = 0;
+                int* hList = hashTable.data + h * basket;
+                for (j = 0; hList[j] != -1 && j < basket; ++j)
+                {
+                    if (hList[j] > i - 32768)
+                    {
+                        int d = ZlibCount(data + hList[j], data + i, size - i);
+                        if (d >= best)
+                        {
+                            best = d;
+                            bestLoc = data + hList[j];
+                        }
+                    }
+                }
+                if (j == basket)
+                {
+                    memcpy(hList, hList + quality, quality * sizeof(int));
+                    memset(hList + quality, -1, quality * sizeof(int));
+                    j = quality;
+                }
+                hList[j] = i;
+
+                if (bestLoc)
+                {
+                    h = Base::ZlibHash(data + i + 1) & (ZHASH - 1);
+                    int* hList = hashTable.data + h * basket;
+                    for (j = 0; hList[j] != -1 && j < basket; ++j)
+                    {
+                        if (hList[j] > i - 32767)
+                        {
+                            int e = ZlibCount(data + hList[j], data + i + 1, size - i - 1);
+                            if (e > best)
+                            {
+                                bestLoc = NULL;
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                if (bestLoc)
+                {
+                    int d = (int)(data + i - bestLoc);
+                    assert(d <= 32767 && best <= 258);
+                    for (j = 0; best > Base::ZlibLenC[j + 1] - 1; ++j);
+                    Base::ZlibHuff(j + 257, stream);
+                    if (Base::ZlibLenEb[j])
+                        stream.WriteBits(best - Base::ZlibLenC[j], Base::ZlibLenEb[j]);
+                    for (j = 0; d > Base::ZlibDistC[j + 1] - 1; ++j);
+                    stream.WriteBits(Base::ZlibBitRev(j, 5), 5);
+                    if (Base::ZlibDistEb[j])
+                        stream.WriteBits(d - Base::ZlibDistC[j], Base::ZlibDistEb[j]);
+                    i += best;
+                }
+                else
+                {
+                    Base::ZlibHuffB(data[i], stream);
+                    ++i;
+                }
+            }
+            for (; i < size; ++i)
+                Base::ZlibHuffB(data[i], stream);
+            Base::ZlibHuff(256, stream);
+            stream.FlushBits();
+            stream.WriteBe32u(ZlibAdler32(data, size));
+        }
+
+        uint32_t EncodeLine0(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size, A);
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src = _mm_loadu_si128((__m128i*)(src + i));
+                _mm_storeu_si128((__m128i*)(dst + i), _src);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_src)));
+            }
+            uint32_t sum = Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine1(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n));
+                __m128i _dst = _mm_sub_epi8(_src0, _src1);
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine2(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - stride));
+                __m128i _dst = _mm_sub_epi8(_src0, _src1);
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine3(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i] - (src[i - stride] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n));
+                __m128i _src2 = _mm_loadu_si128((__m128i*)(src + i - stride));
+                __m128i lo = _mm_srli_epi16(_mm_add_epi16(UnpackU8<0>(_src1), UnpackU8<0>(_src2)), 1);
+                __m128i hi = _mm_srli_epi16(_mm_add_epi16(UnpackU8<1>(_src1), UnpackU8<1>(_src2)), 1);
+                __m128i _dst = _mm_sub_epi8(_src0, _mm_packus_epi16(lo, hi));
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - ((src[i - n] + src[i - stride]) >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        SIMD_INLINE __m128i Paeth(__m128i a, __m128i b, __m128i c)
+        {
+            __m128i p = _mm_sub_epi16(_mm_add_epi16(a, b), c);
+            __m128i pa = _mm_abs_epi16(_mm_sub_epi16(p, a));
+            __m128i pb = _mm_abs_epi16(_mm_sub_epi16(p, b));
+            __m128i pc = _mm_abs_epi16(_mm_sub_epi16(p, c));
+            __m128i mbc = _mm_or_si128(_mm_cmpgt_epi16(pa, pb), _mm_cmpgt_epi16(pa, pc));
+            __m128i mc = _mm_cmpgt_epi16(pb, pc);
+            return _mm_blendv_epi8(a, _mm_blendv_epi8(b, c, mc), mbc);
+        }
+
+        uint32_t EncodeLine4(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = (int8_t)(src[i] - src[i - stride]);
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n));
+                __m128i _src2 = _mm_loadu_si128((__m128i*)(src + i - stride));
+                __m128i _src3 = _mm_loadu_si128((__m128i*)(src + i - stride - n));
+                __m128i lo = Paeth(UnpackU8<0>(_src1), UnpackU8<0>(_src2), UnpackU8<0>(_src3));
+                __m128i hi = Paeth(UnpackU8<1>(_src1), UnpackU8<1>(_src2), UnpackU8<1>(_src3));
+                __m128i _dst = _mm_sub_epi8(_src0, _mm_packus_epi16(lo, hi));
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - Base::Paeth(src[i - n], src[i - stride], src[i - stride - n]);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine5(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n));
+                __m128i lo = _mm_srli_epi16(UnpackU8<0>(_src1), 1);
+                __m128i hi = _mm_srli_epi16(UnpackU8<1>(_src1), 1);
+                __m128i _dst = _mm_sub_epi8(_src0, _mm_packus_epi16(lo, hi));
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - (src[i - n] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine6(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            __m128i _sum = _mm_setzero_si128();
+            for (; i < sizeA; i += A)
+            {
+                __m128i _src0 = _mm_loadu_si128((__m128i*)(src + i));
+                __m128i _src1 = _mm_loadu_si128((__m128i*)(src + i - n));
+                __m128i _dst = _mm_sub_epi8(_src0, _src1);
+                _mm_storeu_si128((__m128i*)(dst + i), _dst);
+                _sum = _mm_add_epi32(_sum, _mm_sad_epu8(_mm_setzero_si128(), _mm_abs_epi8(_dst)));
+            }
+            sum += Sse2::ExtractInt32Sum(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        ImagePngSaver::ImagePngSaver(const ImageSaverParam& param)
+            : Base::ImagePngSaver(param)
+        {
+            if (_param.format == SimdPixelFormatBgr24)
+                _convert = Sse41::BgrToRgb;
+            else if (_param.format == SimdPixelFormatBgra32)
+                _convert = Sse41::BgraToRgba;
+            _encode[0] = Sse41::EncodeLine0;
+            _encode[1] = Sse41::EncodeLine1;
+            _encode[2] = Sse41::EncodeLine2;
+            _encode[3] = Sse41::EncodeLine3;
+            _encode[4] = Sse41::EncodeLine4;
+            _encode[5] = Sse41::EncodeLine5;
+            _encode[6] = Sse41::EncodeLine6;
+            _compress = Sse41::ZlibCompress;
+        }
+    }
+#endif// SIMD_SSE41_ENABLE
+}
diff --git a/3rdparty/simdlib/Simd/SimdView.hpp b/3rdparty/simdlib/Simd/SimdView.hpp
index 0c61a0e6e8..33629be94f 100755
--- a/3rdparty/simdlib/Simd/SimdView.hpp
+++ b/3rdparty/simdlib/Simd/SimdView.hpp
@@ -27,7 +27,6 @@
 #ifndef __SimdView_hpp__
 #define __SimdView_hpp__
 
-#include "Simd/SimdDefs.h"
 #include "Simd/SimdRectangle.hpp"
 #include "Simd/SimdAllocator.hpp"
 
@@ -493,34 +492,57 @@ namespace Simd
         /*!
             Loads image from file.
             
-            Supported formats:
-             - PGM(Portable Gray Map) text(P2) or binary(P5) (the file is loaded as 8-bit gray image).
-             - PPM(Portable Pixel Map) text(P3) or binary(P6) (the file is loaded as 32-bit BGRA image).
+            Supported formats are described by ::SimdImageFileType enumeration.
 
             \note PGM and PPM files with comments are not supported.
 
-            \param [in] path - a path to file with PGM or PPM image.
+            \param [in] path - a path to image file.
+            \param [in] format - a desired format of loaded image. 
+                Supported values are View::Gray8, View::Bgr24, View::Bgra32, View::Rgb24, View::Rgba32 and View::None.
+                Default value is View::None (loads image in native pixel format of image file).
             \return - a result of loading.
         */
-        bool Load(const std::string & path);
+        bool Load(const std::string & path, Format format = None);
+
+        /*!
+            Loads image from memory buffer.
+
+            Supported formats are described by ::SimdImageFileType enumeration.
+
+            \note PGM and PPM files with comments are not supported.
+
+            \param [in] src - a pointer to memory buffer.
+            \param [in] size - a buffer size.
+            \param [in] format - a desired format of loaded image.
+                Supported values are View::Gray8, View::Bgr24, View::Bgra32, View::Rgb24, View::Rgba32 and View::None.
+                Default value is View::None (loads image in native pixel format of image file).
+            \return - a result of loading.
+        */
+        bool Load(const uint8_t * src, size_t size, Format format = None);
 
         /*!
             Saves image to file.
  
-            Supported formats:
-             - PGM(Portable Gray Map) binary(P5) (this format is used in order to save 8-bit gray images).
-             - PPM(Portable Pixel Map) binary(P6) (this format is used in order to save 24-bit BGR and 32-bit BGRA images).
-
             \param [in] path - a path to file.
+            \param [in] type - a image file format. By default is equal to ::SimdImageFileUndefined (format auto choice).
+            \param [in] quality - a parameter of compression quality (if file format supports it).
             \return - a result of saving.
         */
-        bool Save(const std::string & path) const;
+        bool Save(const std::string & path, SimdImageFileType type = SimdImageFileUndefined, int quality = 100) const;
 
         /*!
-            Clear View structure (reset all fields) and free memory if it's owner
+            Clears View structure (reset all fields) and free memory if it's owner.
          */
         void Clear();
 
+        /*!
+            Releases pixel data and resets all fields.
+
+            \param [out] size - a pointer to the size of released pixel data. Can be NULL.
+            \return - a released pointer to pixel data. It must be deleted by function ::SimdFree.
+        */
+        uint8_t* Release(size_t* size = NULL);
+
     private:
         bool _owner;
     };
@@ -1027,6 +1049,7 @@ namespace Simd
         case Float:     return 4;
         case Double:    return 8;
         case Rgb24:     return 3;
+        case Rgba32:    return 4;
         default: assert(0); return 0;
         }
     }
@@ -1050,6 +1073,7 @@ namespace Simd
         case Float:     return 4;
         case Double:    return 8;
         case Rgb24:     return 1;
+        case Rgba32:    return 1;
         default: assert(0); return 0;
         }
     }
@@ -1073,6 +1097,7 @@ namespace Simd
         case Float:     return 1;
         case Double:    return 1;
         case Rgb24:     return 3;
+        case Rgba32:    return 4;
         default: assert(0); return 0;
         }
     }
@@ -1124,139 +1149,33 @@ namespace Simd
         std::swap((bool&)_owner, (bool&)other._owner);
     }
 
-    template <template<class> class A> SIMD_INLINE bool View<A>::Load(const std::string & path)
+    template <template<class> class A> SIMD_INLINE bool View<A>::Load(const std::string & path, Format format_)
     {
-        std::ifstream ifs(path.c_str(), std::ifstream::binary);
-        if (ifs.is_open())
-        {
-            std::string type;
-            ifs >> type;
-            if (type == "P2" || type == "P5")
-            {
-                size_t w, h, d;
-                ifs >> w >> h >> d;
-                if (d != 255)
-                    return false;
-                ifs.get();
-                Recreate(w, h, View<A>::Gray8);
-                if (type == "P2")
-                {
-                    for (size_t row = 0; row < height; ++row)
-                    {
-                        for (size_t col = 0; col < width; ++col)
-                        {
-                            int gray;
-                            ifs >> gray;
-                            data[row * stride + col] = (uint8_t)gray;
-                        }
-                    }
-                }
-                else
-                {
-                    for (size_t row = 0; row < height; ++row)
-                        ifs.read((char*)(data + row*stride), width);
-                }
-                return true;
-            }
-            if (type == "P3" || type == "P6")
-            {
-                size_t w, h, d;
-                ifs >> w >> h >> d;
-                if (d != 255)
-                    return false;
-                ifs.get();
-                Recreate(w, h, View<A>::Bgra32);
-                if (type == "P3")
-                {
-                    for (size_t row = 0; row < height; ++row)
-                    {
-                        uint8_t * bgra = data + row * stride;
-                        for (size_t col = 0; col < width; ++col, bgra += 4)
-                        {
-                            int blue, green, red;
-                            ifs >> red >> green >> blue;
-                            bgra[0] = (uint8_t)blue;
-                            bgra[1] = (uint8_t)green;
-                            bgra[2] = (uint8_t)red;
-                            bgra[3] = 0xFF;
-                        }
-                    }
-                }
-                else
-                {
-                    View buffer(width, 1, Bgr24);
-                    for (size_t row = 0; row < height; ++row)
-                    {
-                        ifs.read((char*)buffer.data, width*3);
-                        const uint8_t * rgb = buffer.data;
-                        uint8_t * bgra = data + row*stride;
-                        for (size_t col = 0; col < width; ++col, rgb += 3, bgra += 4)
-                        {
-                            bgra[0] = rgb[2];
-                            bgra[1] = rgb[1];
-                            bgra[2] = rgb[0];
-                            bgra[3] = 0xFF;
-                        }
-                    }
-                }
-                return true;
-            }
-        }
-        return false;
+        Clear();
+        (Format&)format = format_;
+        *(uint8_t**)&data = SimdImageLoadFromFile(path.c_str(), (size_t*)&stride, (size_t*)&width, (size_t*)&height, (SimdPixelFormatType*)&format);
+        if (data)
+            _owner = true;
+        else
+            (Format&)format = None;
+        return _owner;
     }
 
-    template <template<class> class A> SIMD_INLINE bool View<A>::Save(const std::string & path) const
+    template <template<class> class A> SIMD_INLINE bool View<A>::Load(const uint8_t * src, size_t size, Format format_)
     {
-        if (!(format == View<A>::Gray8 || format == View<A>::Bgr24 || format == View<A>::Bgra32))
-            return false;
-
-        std::ofstream ofs(path.c_str(), std::ofstream::binary);
-        if (ofs.is_open())
-        {
-            if (format == View<A>::Gray8)
-            {
-                ofs << "P5\n" << width << " " << height << "\n255\n";
-                for (size_t row = 0; row < height; ++row)
-                    ofs.write((const char*)(data + row*stride), width);
-            }
-            else if (format == View<A>::Bgr24)
-            {
-                ofs << "P6\n" << width << " " << height << "\n255\n";
-                View buffer(width, 1, Bgr24);
-                for (size_t row = 0; row < height; ++row)
-                {
-                    const uint8_t * bgr = data + row*stride;
-                    uint8_t * rgb = buffer.data;
-                    for (size_t col = 0; col < width; ++col, bgr += 3, rgb += 3)
-                    {
-                        rgb[0] = bgr[2];
-                        rgb[1] = bgr[1];
-                        rgb[2] = bgr[0];
-                    }
-                    ofs.write((const char*)(buffer.data), width*3);
-                }
-            }
-            else if (format == View<A>::Bgra32)
-            {
-                ofs << "P6\n" << width << " " << height << "\n255\n";
-                View buffer(width, 1, Bgr24);
-                for (size_t row = 0; row < height; ++row)
-                {
-                    const uint8_t * bgra = data + row*stride;
-                    uint8_t * rgb = buffer.data;
-                    for (size_t col = 0; col < width; ++col, bgra += 4, rgb += 3)
-                    {
-                        rgb[0] = bgra[2];
-                        rgb[1] = bgra[1];
-                        rgb[2] = bgra[0];
-                    }
-                    ofs.write((const char*)buffer.data, width * 3);
-                }
-            }
-            return true;
-        }
+        Clear();
+        (Format&)format = format_;
+        *(uint8_t**)&data = SimdImageLoadFromMemory(src, size, (size_t*)&stride, (size_t*)&width, (size_t*)&height, (SimdPixelFormatType*)&format);
+        if (data)
+            _owner = true;
         else
-            return false;
+            (Format&)format = None;
+        return _owner;
+    }
+
+    template <template<class> class A> SIMD_INLINE bool View<A>::Save(const std::string & path, SimdImageFileType type, int quality) const
+    {
+        return SimdImageSaveToFile(data, stride, width, height, (SimdPixelFormatType)format, type, quality, path.c_str()) == SimdTrue;
     }
 
     template <template<class> class A> SIMD_INLINE void View<A>::Clear()
@@ -1279,6 +1198,16 @@ namespace Simd
 #endif
     }
 
+    template <template<class> class A> SIMD_INLINE uint8_t* View<A>::Release(size_t* size)
+    {
+        uint8_t* released = data;
+        if (size)
+            *size = DataSize();
+        _owner = false;
+        Clear();
+        return released;
+    }
+
     // View utilities implementation:
 
     template <template<class> class A, class T> const T & At(const View<A> & view, size_t x, size_t y)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e61019f297..32b89aae0e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -679,6 +679,8 @@ if(NOT USE_OPENCV AND (NOT USE_PNG OR NOT USE_JPEG))
 else()
   set(WITH_STBIMAGE OFF)
 endif()
+# TODO:
+set(WITH_STBIMAGE ON)
 
 VP_OPTION(WITH_CATCH2   ""           ""    "Use catch2" "" ON IF (VISP_CXX_STANDARD GREATER VISP_CXX_STANDARD_98))
 
diff --git a/modules/io/CMakeLists.txt b/modules/io/CMakeLists.txt
index 959ee1c9b6..949ec58aef 100644
--- a/modules/io/CMakeLists.txt
+++ b/modules/io/CMakeLists.txt
@@ -57,11 +57,21 @@ if(USE_PNG)
   add_definitions(${PNG_DEFINITIONS})
 endif()
 
-if(WITH_STBIMAGE)
+# TODO:
+#if(WITH_STBIMAGE)
   # stb_image is private
   include_directories(${STBIMAGE_INCLUDE_DIRS})
+#endif()
+
+if(WITH_CATCH2)
+  # catch2 is private
+  include_directories(${CATCH2_INCLUDE_DIRS})
 endif()
 
+# simdlib is always enabled since it contains fallback code to plain C++ code
+# Simd lib is private
+include_directories(${SIMDLIB_INCLUDE_DIRS})
+
 # OpenCV
 if(USE_OPENCV)
   # On win32 since OpenCV 2.4.7 and on OSX with OpenCV 2.4.10 we cannot use OpenCV_LIBS to set ViSP 3rd party libraries.
@@ -178,7 +188,7 @@ endif()
 vp_glob_module_sources()
 vp_module_include_directories(${opt_incs})
 vp_create_module(${opt_libs})
-vp_add_tests(DEPENDS_ON visp_features)
+vp_add_tests()
 
 vp_set_source_file_compile_flag(src/tools/vpParseArgv.cpp -Wno-strict-overflow)
 
diff --git a/modules/io/include/visp3/io/vpImageIo.h b/modules/io/include/visp3/io/vpImageIo.h
index d37cad48e3..11bd9aa766 100644
--- a/modules/io/include/visp3/io/vpImageIo.h
+++ b/modules/io/include/visp3/io/vpImageIo.h
@@ -144,6 +144,10 @@ class VISP_EXPORT vpImageIo
   static void readPNG(vpImage<unsigned char> &I, const std::string &filename);
   static void readPNG(vpImage<vpRGBa> &I, const std::string &filename);
 
+  //TODO:
+  static void readSimdlib(vpImage<vpRGBa> &I, const std::string &filename);
+  static void readStb(vpImage<vpRGBa> &I, const std::string &filename);
+
   static void writePFM(const vpImage<float> &I, const std::string &filename);
 
   static void writePGM(const vpImage<unsigned char> &I, const std::string &filename);
@@ -158,5 +162,9 @@ class VISP_EXPORT vpImageIo
 
   static void writePNG(const vpImage<unsigned char> &I, const std::string &filename);
   static void writePNG(const vpImage<vpRGBa> &I, const std::string &filename);
+
+  //TODO:
+  static void writeSimdlib(vpImage<vpRGBa> &I, const std::string &filename);
+  static void writeStb(vpImage<vpRGBa> &I, const std::string &filename);
 };
 #endif
diff --git a/modules/io/src/image/vpImageIo.cpp b/modules/io/src/image/vpImageIo.cpp
index ab290fa5f7..cc7799d158 100644
--- a/modules/io/src/image/vpImageIo.cpp
+++ b/modules/io/src/image/vpImageIo.cpp
@@ -62,6 +62,15 @@
 #include <png.h>
 #endif
 
+//TODO:
+#include <Simd/SimdLib.hpp>
+//TODO:
+#define STB_IMAGE_IMPLEMENTATION
+#include <stb_image.h>
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include <stb_image_write.h>
+
 #if !defined(VISP_HAVE_OPENCV)
 #if !defined(VISP_HAVE_JPEG) || !defined(VISP_HAVE_PNG)
 
@@ -2059,6 +2068,60 @@ void vpImageIo::readPNG(vpImage<vpRGBa> &I, const std::string &filename)
   fclose(file);
 }
 
+//TODO:
+void vpImageIo::readSimdlib(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  size_t stride = 0, width = 0, height = 0;
+  SimdPixelFormatType format = SimdPixelFormatRgba32;
+  uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format);
+  const bool copyData = false;
+  I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData);
+}
+
+void vpImageIo::readStb(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  int width = 0, height = 0, channels = 0;
+  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha);
+  if (image == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
+  }
+  I.init(reinterpret_cast<vpRGBa*>(image), static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
+  stbi_image_free(image);
+}
+
+inline bool ends_with(std::string const & value, std::string const & ending)
+{
+    if (ending.size() > value.size()) return false;
+    return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
+}
+
+void vpImageIo::writeSimdlib(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  if (ends_with(filename, ".png")) {
+    SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFilePng, 90, filename.c_str());
+  } else {
+    SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, 90, filename.c_str());
+  }
+}
+
+void vpImageIo::writeStb(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  if (ends_with(filename, ".png")) {
+    const int stride_in_bytes = static_cast<int>(4 * I.getWidth());
+    int res = stbi_write_png(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
+                             reinterpret_cast<void*>(I.bitmap), stride_in_bytes);
+    if (res == 0) {
+      throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str()));
+    }
+  } else {
+    int res = stbi_write_jpg(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
+                             reinterpret_cast<void*>(I.bitmap), 90);
+    if (res == 0) {
+      throw(vpImageException(vpImageException::ioError, "JEPG write error"));
+    }
+  }
+}
+
 #elif defined(VISP_HAVE_OPENCV)
 
 /*!
diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp
new file mode 100644
index 0000000000..ce0d416b70
--- /dev/null
+++ b/modules/io/test/perfImageLoadSave.cpp
@@ -0,0 +1,461 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Benchmark color image conversion.
+ *
+ *****************************************************************************/
+
+#include <visp3/core/vpConfig.h>
+
+#ifdef VISP_HAVE_CATCH2
+#define CATCH_CONFIG_ENABLE_BENCHMARKING
+#define CATCH_CONFIG_RUNNER
+#include <catch.hpp>
+
+#include <thread>
+#include <visp3/core/vpIoTools.h>
+#include <visp3/io/vpImageIo.h>
+
+static std::string ipath = vpIoTools::getViSPImagesDataPath();
+static std::string imagePathJpeg = vpIoTools::createFilePath(ipath, "Klimt/Klimt.jpeg");
+static std::string imagePathPng = vpIoTools::createFilePath(ipath, "Klimt/Klimt.png");
+static std::string imagePathPngBig = vpIoTools::createFilePath(ipath, "Klimt/test_image_resize.png");
+static int nThreads = 0;
+
+TEST_CASE("Benchmark Jpeg image loading", "[benchmark]") {
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::read()") {
+      vpImageIo::read(I, imagePathJpeg);
+      return I;
+    };
+  }
+
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::readSimdlib()") {
+      vpImageIo::readSimdlib(I, imagePathJpeg);
+      return I;
+    };
+  }
+
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::readStb()") {
+      vpImageIo::readStb(I, imagePathJpeg);
+      return I;
+    };
+  }
+}
+
+TEST_CASE("Benchmark Png image loading", "[benchmark]") {
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::read()") {
+      vpImageIo::read(I, imagePathPng);
+      return I;
+    };
+  }
+
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::readSimdlib()") {
+      vpImageIo::readSimdlib(I, imagePathPng);
+      return I;
+    };
+  }
+
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::readStb()") {
+      vpImageIo::readStb(I, imagePathPng);
+      return I;
+    };
+  }
+}
+
+TEST_CASE("Benchmark big Png image loading", "[benchmark]") {
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::read()") {
+      vpImageIo::read(I, imagePathPngBig);
+      return I;
+    };
+  }
+
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::readSimdlib()") {
+      vpImageIo::readSimdlib(I, imagePathPngBig);
+      return I;
+    };
+  }
+
+  {
+    vpImage<vpRGBa> I;
+
+    BENCHMARK("vpImageIo::readStb()") {
+      vpImageIo::readStb(I, imagePathPngBig);
+      return I;
+    };
+  }
+}
+
+TEST_CASE("Benchmark Jpeg image saving", "[benchmark]") {
+  vpImage<vpRGBa> I;
+  vpImageIo::read(I, imagePathJpeg);
+  {
+    const std::string filename = "/tmp/Klimt_ViSP.jpg";
+
+    BENCHMARK("vpImageIo::write()") {
+      vpImageIo::write(I, filename);
+      return I;
+    };
+  }
+
+  {
+    const std::string filename = "/tmp/Klimt_Simd.jpg";
+
+    BENCHMARK("vpImageIo::writeSimdlib()") {
+      vpImageIo::writeSimdlib(I, filename);
+      return I;
+    };
+  }
+
+  {
+    const std::string filename = "/tmp/Klimt_stb.jpg";
+
+    BENCHMARK("vpImageIo::writeStb()") {
+      vpImageIo::writeStb(I, filename);
+      return I;
+    };
+  }
+}
+
+TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") {
+  vpImage<vpRGBa> I;
+  vpImageIo::read(I, imagePathPngBig);
+  {
+    const std::string filename = "/tmp/Big_images_ViSP.jpg";
+
+    BENCHMARK("vpImageIo::write()") {
+      vpImageIo::write(I, filename);
+      return I;
+    };
+  }
+
+//  {
+//    const std::string filename = "/tmp/Big_images_Simd.jpg";
+
+//    BENCHMARK("vpImageIo::writeSimdlib()") {
+//      vpImageIo::writeSimdlib(I, filename);
+//      return I;
+//    };
+//  }
+
+  {
+    const std::string filename = "/tmp/Big_images_stb.jpg";
+
+    BENCHMARK("vpImageIo::writeStb()") {
+      vpImageIo::writeStb(I, filename);
+      return I;
+    };
+  }
+}
+
+TEST_CASE("Benchmark Png image saving", "[benchmark]") {
+  vpImage<vpRGBa> I;
+  vpImageIo::read(I, imagePathPng);
+  {
+    const std::string filename = "/tmp/Klimt_ViSP.png";
+
+    BENCHMARK("vpImageIo::write()") {
+      vpImageIo::write(I, filename);
+      return I;
+    };
+  }
+
+  {
+    const std::string filename = "/tmp/Klimt_Simd.png";
+
+    BENCHMARK("vpImageIo::writeSimdlib()") {
+      vpImageIo::writeSimdlib(I, filename);
+      return I;
+    };
+  }
+
+  {
+    const std::string filename = "/tmp/Klimt_stb.png";
+
+    BENCHMARK("vpImageIo::writeStb()") {
+      vpImageIo::writeStb(I, filename);
+      return I;
+    };
+  }
+}
+
+TEST_CASE("Benchmark big Png image saving", "[benchmark]") {
+  vpImage<vpRGBa> I;
+  vpImageIo::read(I, imagePathPngBig);
+  {
+    const std::string filename = "/tmp/Big_images_ViSP.png";
+
+    BENCHMARK("vpImageIo::write()") {
+      vpImageIo::write(I, filename);
+      return I;
+    };
+  }
+
+  {
+    const std::string filename = "/tmp/Big_images_Simd.png";
+
+    BENCHMARK("vpImageIo::writeSimdlib()") {
+      vpImageIo::writeSimdlib(I, filename);
+      return I;
+    };
+  }
+
+  {
+    const std::string filename = "/tmp/Big_images_stb.png";
+
+    BENCHMARK("vpImageIo::writeStb()") {
+      vpImageIo::writeStb(I, filename);
+      return I;
+    };
+  }
+}
+
+//TEST_CASE("Benchmark bgr to grayscale (ViSP)", "[benchmark]") {
+//  vpImage<vpRGBa> I;
+//  vpImageIo::read(I, imagePathColor);
+
+//  std::vector<unsigned char> bgr;
+//  common_tools::RGBaToBGR(I, bgr);
+
+//  vpImage<unsigned char> I_gray(I.getHeight(), I.getWidth());
+
+//  BENCHMARK("Benchmark bgr to grayscale (ViSP)") {
+//    vpImageConvert::BGRToGrey(bgr.data(),
+//                              I_gray.bitmap,
+//                              I.getWidth(), I.getHeight(),
+//                              false, nThreads);
+//    return I_gray;
+//  };
+
+//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101)
+//  SECTION("OpenCV Mat type")
+//  {
+//    cv::Mat img;
+//    vpImageConvert::convert(I, img);
+
+//    BENCHMARK("Benchmark bgr to grayscale (ViSP + OpenCV Mat type)") {
+//      vpImageConvert::convert(img, I_gray, false, nThreads);
+//      return I_gray;
+//    };
+//  }
+//#endif
+//}
+//#endif
+
+//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101)
+//TEST_CASE("Benchmark bgr to grayscale (OpenCV)", "[benchmark]") {
+//  cv::Mat img = cv::imread(imagePathColor);
+//  cv::Mat img_gray(img.size(), CV_8UC1);
+
+//  BENCHMARK("Benchmark bgr to grayscale (OpenCV)") {
+//    cv::cvtColor(img, img_gray, cv::COLOR_BGR2GRAY);
+//    return img_gray;
+//  };
+//}
+//#endif
+
+//// C++11 to be able to do bgr.data()
+//#if VISP_CXX_STANDARD >= VISP_CXX_STANDARD_11
+//TEST_CASE("Benchmark bgr to rgba (naive code)", "[benchmark]") {
+//  vpImage<vpRGBa> I;
+//  vpImageIo::read(I, imagePathColor);
+
+//  std::vector<unsigned char> bgr;
+//  common_tools::RGBaToBGR(I, bgr);
+
+//  vpImage<vpRGBa> I_bench(I.getHeight(), I.getWidth());
+//  BENCHMARK("Benchmark bgr to rgba (naive code)") {
+//    common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast<unsigned char*>(I_bench.bitmap),
+//                               I.getWidth(), I.getHeight(), false);
+//    return I_bench;
+//  };
+//}
+
+//TEST_CASE("Benchmark bgr to rgba (ViSP)", "[benchmark]") {
+//  vpImage<vpRGBa> I;
+//  vpImageIo::read(I, imagePathColor);
+
+//  std::vector<unsigned char> bgr;
+//  common_tools::RGBaToBGR(I, bgr);
+
+//  SECTION("Check BGR to RGBa conversion")
+//  {
+//    vpImage<vpRGBa> ref(I.getHeight(), I.getWidth());
+//    common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast<unsigned char*>(ref.bitmap),
+//                               I.getWidth(), I.getHeight(), false);
+//    vpImage<vpRGBa> rgba(I.getHeight(), I.getWidth());
+//    vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast<unsigned char *>(rgba.bitmap),
+//                              I.getWidth(), I.getHeight(), false);
+
+//    CHECK((rgba == ref));
+//  }
+
+//  vpImage<vpRGBa> I_rgba(I.getHeight(), I.getWidth());
+//  BENCHMARK("Benchmark bgr to rgba (ViSP)") {
+//    vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast<unsigned char *>(I_rgba.bitmap),
+//                              I.getWidth(), I.getHeight(), false);
+//    return I_rgba;
+//  };
+
+//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101)
+//  SECTION("OpenCV Mat type")
+//  {
+//    cv::Mat img;
+//    vpImageConvert::convert(I, img);
+
+//    BENCHMARK("Benchmark bgr to rgba (ViSP + OpenCV Mat type)") {
+//      vpImageConvert::convert(img, I_rgba);
+//      return I_rgba;
+//    };
+//  }
+//#endif
+//}
+
+//TEST_CASE("Benchmark bgra to rgba (naive code)", "[benchmark]") {
+//  vpImage<vpRGBa> I;
+//  vpImageIo::read(I, imagePathColor);
+
+//  std::vector<unsigned char> bgra;
+//  common_tools::RGBaToBGRa(I, bgra);
+
+//  vpImage<vpRGBa> I_bench(I.getHeight(), I.getWidth());
+//  BENCHMARK("Benchmark bgra to rgba (naive code)") {
+//    common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast<unsigned char*>(I_bench.bitmap),
+//                                I.getWidth(), I.getHeight(), false);
+//    return I_bench;
+//  };
+//}
+
+//TEST_CASE("Benchmark bgra to rgba (ViSP)", "[benchmark]") {
+//  vpImage<vpRGBa> I;
+//  vpImageIo::read(I, imagePathColor);
+
+//  std::vector<unsigned char> bgra;
+//  common_tools::RGBaToBGRa(I, bgra);
+
+//  SECTION("Check BGRa to RGBa conversion")
+//  {
+//    vpImage<vpRGBa> ref(I.getHeight(), I.getWidth());
+//    common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast<unsigned char*>(ref.bitmap),
+//                                I.getWidth(), I.getHeight(), false);
+//    vpImage<vpRGBa> rgba(I.getHeight(), I.getWidth());
+//    vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast<unsigned char *>(rgba.bitmap),
+//                               I.getWidth(), I.getHeight(), false);
+
+//    CHECK((rgba == ref));
+//  }
+//  vpImage<vpRGBa> I_rgba(I.getHeight(), I.getWidth());
+//  BENCHMARK("Benchmark bgra to rgba (ViSP)") {
+//    vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast<unsigned char *>(I_rgba.bitmap),
+//                               I.getWidth(), I.getHeight(), false);
+//    return I_rgba;
+//  };
+//}
+//#endif
+
+int main(int argc, char *argv[])
+{
+  Catch::Session session; // There must be exactly one instance
+
+  bool runBenchmark = false;
+  // Build a new parser on top of Catch's
+  using namespace Catch::clara;
+  auto cli = session.cli() // Get Catch's composite command line parser
+    | Opt(runBenchmark)    // bind variable to a new option, with a hint string
+    ["--benchmark"]        // the option names it will respond to
+    ("run benchmark?")     // description string for the help output
+    | Opt(imagePathJpeg, "imagePathColor")
+    ["--imagePathColor"]
+    ("Path to color image")
+    | Opt(imagePathPng, "imagePathColor")
+    ["--imagePathGray"]
+    ("Path to gray image")
+    | Opt(nThreads, "nThreads")
+    ["--nThreads"]
+    ("Number of threads");
+
+  // Now pass the new composite back to Catch so it uses that
+  session.cli(cli);
+
+  // Let Catch (using Clara) parse the command line
+  session.applyCommandLine(argc, argv);
+
+  if (runBenchmark) {
+//    vpImage<vpRGBa> I_color;
+//    vpImageIo::read(I_color, imagePathColor);
+//    std::cout << "imagePathColor:\n\t" << imagePathColor << "\n\t" << I_color.getWidth() << "x" << I_color.getHeight() << std::endl;
+
+//    vpImage<unsigned char> I_gray;
+//    vpImageIo::read(I_gray, imagePathGray);
+//    std::cout << "imagePathGray:\n\t" << imagePathGray << "\n\t" << I_gray.getWidth() << "x" << I_gray.getHeight() << std::endl;
+    std::cout << "nThreads: " << nThreads << " / available threads: " << std::thread::hardware_concurrency() << std::endl;
+
+    int numFailed = session.run();
+
+    // numFailed is clamped to 255 as some unices only use the lower 8 bits.
+    // This clamping has already been applied, so just return it here
+    // You can also do any post run clean-up here
+    return numFailed;
+  }
+
+  return EXIT_SUCCESS;
+}
+#else
+#include <iostream>
+
+int main()
+{
+  return 0;
+}
+#endif

From 9d2183b339c0bac50d2a1a3636b434b97ca4a959 Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Thu, 4 Nov 2021 14:06:19 +0100
Subject: [PATCH 11/18] Fix issue when writing big Jpeg images.

---
 3rdparty/simdlib/Simd/SimdImageSaveJpeg.h |  5 +++--
 modules/io/test/perfImageLoadSave.cpp     | 14 +++++++-------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h
index d54164f7d4..f3d5f4a96c 100644
--- a/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h
+++ b/3rdparty/simdlib/Simd/SimdImageSaveJpeg.h
@@ -35,9 +35,9 @@ namespace Simd
     {
         struct BitBuf
         {
-            static const uint32_t capacity = 1024;
+            static const uint32_t capacity = 2048;
             uint32_t size;
-            uint16_t data[1024][2];
+            uint16_t data[capacity][2];
 
             SIMD_INLINE BitBuf()
                 : size(0) 
@@ -51,6 +51,7 @@ namespace Simd
 
             SIMD_INLINE bool Full(uint32_t tail = capacity / 2) const
             {
+                assert(size <= capacity);
                 return size + tail >= capacity;
             }
 
diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp
index ce0d416b70..8efe2c759e 100644
--- a/modules/io/test/perfImageLoadSave.cpp
+++ b/modules/io/test/perfImageLoadSave.cpp
@@ -180,14 +180,14 @@ TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") {
     };
   }
 
-//  {
-//    const std::string filename = "/tmp/Big_images_Simd.jpg";
+  {
+    const std::string filename = "/tmp/Big_images_Simd.jpg";
 
-//    BENCHMARK("vpImageIo::writeSimdlib()") {
-//      vpImageIo::writeSimdlib(I, filename);
-//      return I;
-//    };
-//  }
+    BENCHMARK("vpImageIo::writeSimdlib()") {
+      vpImageIo::writeSimdlib(I, filename);
+      return I;
+    };
+  }
 
   {
     const std::string filename = "/tmp/Big_images_stb.jpg";

From 28c034ed87d847da2aa2be2f930d7c5d0d923c46 Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Wed, 17 Nov 2021 00:51:26 +0100
Subject: [PATCH 12/18] Experimental: wip code to try adding a backend system
 for image I/O.

---
 .../core/include/visp3/core/vpImageTools.h    |    8 +-
 modules/io/include/visp3/io/vpImageIo.h       |   34 +-
 .../io/src/image/private/vpImageIoBackend.h   |  104 +
 .../io/src/image/private/vpImageIoLibjpeg.cpp |  345 +++
 .../io/src/image/private/vpImageIoLibpng.cpp  |  615 +++++
 .../io/src/image/private/vpImageIoOpenCV.cpp  |  205 ++
 .../src/image/private/vpImageIoPortable.cpp   |  569 +++++
 .../io/src/image/private/vpImageIoSimd.cpp    |   87 +
 modules/io/src/image/private/vpImageIoStb.cpp |  121 +
 modules/io/src/image/vpImageIo.cpp            | 2112 ++---------------
 modules/io/test/perfImageLoadSave.cpp         |  171 +-
 11 files changed, 2286 insertions(+), 2085 deletions(-)
 create mode 100644 modules/io/src/image/private/vpImageIoBackend.h
 create mode 100644 modules/io/src/image/private/vpImageIoLibjpeg.cpp
 create mode 100644 modules/io/src/image/private/vpImageIoLibpng.cpp
 create mode 100644 modules/io/src/image/private/vpImageIoOpenCV.cpp
 create mode 100644 modules/io/src/image/private/vpImageIoPortable.cpp
 create mode 100644 modules/io/src/image/private/vpImageIoSimd.cpp
 create mode 100644 modules/io/src/image/private/vpImageIoStb.cpp

diff --git a/modules/core/include/visp3/core/vpImageTools.h b/modules/core/include/visp3/core/vpImageTools.h
index 4dbf1a809a..d367aa5290 100644
--- a/modules/core/include/visp3/core/vpImageTools.h
+++ b/modules/core/include/visp3/core/vpImageTools.h
@@ -1489,19 +1489,19 @@ void vpImageTools::warpLinear(const vpImage<Type> &src, const vpMatrix &T, vpIma
               const Type val01 = src[y_][x_ + 1];
               const Type val10 = src[y_ + 1][x_];
               const Type val11 = src[y_ + 1][x_ + 1];
-              const float col0 = lerp(val00, val01, s);
-              const float col1 = lerp(val10, val11, s);
+              const float col0 = lerp(static_cast<float>(val00), static_cast<float>(val01), s);
+              const float col1 = lerp(static_cast<float>(val10), static_cast<float>(val11), s);
               const float interp = lerp(col0, col1, t);
               dst[i][j] = vpMath::saturate<Type>(interp);
             } else if (y_ < static_cast<int>(src.getHeight()) - 1) {
               const Type val00 = src[y_][x_];
               const Type val10 = src[y_ + 1][x_];
-              const float interp = lerp(val00, val10, t);
+              const float interp = lerp(static_cast<float>(val00), static_cast<float>(val10), t);
               dst[i][j] = vpMath::saturate<Type>(interp);
             } else if (x_ < static_cast<int>(src.getWidth()) - 1) {
               const Type val00 = src[y_][x_];
               const Type val01 = src[y_][x_ + 1];
-              const float interp = lerp(val00, val01, s);
+              const float interp = lerp(static_cast<float>(val00), static_cast<float>(val01), s);
               dst[i][j] = vpMath::saturate<Type>(interp);
             } else {
               dst[i][j] = src[y_][x_];
diff --git a/modules/io/include/visp3/io/vpImageIo.h b/modules/io/include/visp3/io/vpImageIo.h
index 11bd9aa766..fa395e3882 100644
--- a/modules/io/include/visp3/io/vpImageIo.h
+++ b/modules/io/include/visp3/io/vpImageIo.h
@@ -124,6 +124,16 @@ class VISP_EXPORT vpImageIo
   static std::string getExtension(const std::string &filename);
 
 public:
+  //TODO:
+  // Image IO backend for only jpeg and png formats
+  enum vpImageIoBackendType {
+    IO_DEFAULT_BACKEND,
+    IO_LIB_BACKEND,
+    IO_OPENCV_BACKEND,
+    IO_SIMDLIB_BACKEND,
+    IO_STB_IMAGE_BACKEND
+  };
+
   static void read(vpImage<unsigned char> &I, const std::string &filename);
   static void read(vpImage<vpRGBa> &I, const std::string &filename);
 
@@ -138,15 +148,11 @@ class VISP_EXPORT vpImageIo
   static void readPPM(vpImage<unsigned char> &I, const std::string &filename);
   static void readPPM(vpImage<vpRGBa> &I, const std::string &filename);
 
-  static void readJPEG(vpImage<unsigned char> &I, const std::string &filename);
-  static void readJPEG(vpImage<vpRGBa> &I, const std::string &filename);
-
-  static void readPNG(vpImage<unsigned char> &I, const std::string &filename);
-  static void readPNG(vpImage<vpRGBa> &I, const std::string &filename);
+  static void readJPEG(vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
+  static void readJPEG(vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
 
-  //TODO:
-  static void readSimdlib(vpImage<vpRGBa> &I, const std::string &filename);
-  static void readStb(vpImage<vpRGBa> &I, const std::string &filename);
+  static void readPNG(vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
+  static void readPNG(vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
 
   static void writePFM(const vpImage<float> &I, const std::string &filename);
 
@@ -157,14 +163,10 @@ class VISP_EXPORT vpImageIo
   static void writePPM(const vpImage<unsigned char> &I, const std::string &filename);
   static void writePPM(const vpImage<vpRGBa> &I, const std::string &filename);
 
-  static void writeJPEG(const vpImage<unsigned char> &I, const std::string &filename);
-  static void writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename);
-
-  static void writePNG(const vpImage<unsigned char> &I, const std::string &filename);
-  static void writePNG(const vpImage<vpRGBa> &I, const std::string &filename);
+  static void writeJPEG(const vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
+  static void writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
 
-  //TODO:
-  static void writeSimdlib(vpImage<vpRGBa> &I, const std::string &filename);
-  static void writeStb(vpImage<vpRGBa> &I, const std::string &filename);
+  static void writePNG(const vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
+  static void writePNG(const vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
 };
 #endif
diff --git a/modules/io/src/image/private/vpImageIoBackend.h b/modules/io/src/image/private/vpImageIoBackend.h
new file mode 100644
index 0000000000..e1b434c030
--- /dev/null
+++ b/modules/io/src/image/private/vpImageIoBackend.h
@@ -0,0 +1,104 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Read/write images.
+ *
+ * Authors:
+ * Eric Marchand
+ *
+ *****************************************************************************/
+
+/*!
+  \file vpImageIo.h
+  \brief Read/write images
+*/
+
+#ifndef vpIMAGEIOBACKEND_H
+#define vpIMAGEIOBACKEND_H
+
+#include <visp3/core/vpImage.h>
+
+
+//
+void vp_writePFM(const vpImage<float> &I, const std::string &filename);
+void vp_writePGM(const vpImage<unsigned char> &I, const std::string &filename);
+void vp_writePGM(const vpImage<short> &I, const std::string &filename);
+void vp_writePGM(const vpImage<vpRGBa> &I, const std::string &filename);
+void vp_readPFM(vpImage<float> &I, const std::string &filename);
+void vp_readPGM(vpImage<unsigned char> &I, const std::string &filename);
+void vp_readPGM(vpImage<vpRGBa> &I, const std::string &filename);
+void vp_readPPM(vpImage<unsigned char> &I, const std::string &filename);
+void vp_readPPM(vpImage<vpRGBa> &I, const std::string &filename);
+void vp_writePPM(const vpImage<unsigned char> &I, const std::string &filename);
+void vp_writePPM(const vpImage<vpRGBa> &I, const std::string &filename);
+
+//
+void readJPEGLibjpeg(vpImage<unsigned char> &I, const std::string &filename);
+void readJPEGLibjpeg(vpImage<vpRGBa> &I, const std::string &filename);
+
+void writeJPEGLibjpeg(const vpImage<unsigned char> &I, const std::string &filename);
+void writeJPEGLibjpeg(const vpImage<vpRGBa> &I, const std::string &filename);
+
+//
+void readPNGLibpng(vpImage<unsigned char> &I, const std::string &filename);
+void readPNGLibpng(vpImage<vpRGBa> &I, const std::string &filename);
+
+void writePNGLibpng(const vpImage<unsigned char> &I, const std::string &filename);
+void writePNGLibpng(const vpImage<vpRGBa> &I, const std::string &filename);
+
+//
+void readOpenCV(vpImage<unsigned char> &I, const std::string &filename);
+void readOpenCV(vpImage<vpRGBa> &I, const std::string &filename);
+
+void writeOpenCV(const vpImage<unsigned char> &I, const std::string &filename);
+void writeOpenCV(const vpImage<vpRGBa> &I, const std::string &filename);
+
+//
+void readSimdlib(vpImage<unsigned char> &I, const std::string &filename);
+void readSimdlib(vpImage<vpRGBa> &I, const std::string &filename);
+
+void writeJPEGSimdlib(const vpImage<unsigned char> &I, const std::string &filename);
+void writeJPEGSimdlib(const vpImage<vpRGBa> &I, const std::string &filename);
+
+void writePNGSimdlib(const vpImage<unsigned char> &I, const std::string &filename);
+void writePNGSimdlib(const vpImage<vpRGBa> &I, const std::string &filename);
+
+//
+void readStb(vpImage<unsigned char> &I, const std::string &filename);
+void readStb(vpImage<vpRGBa> &I, const std::string &filename);
+
+void writeJPEGStb(const vpImage<unsigned char> &I, const std::string &filename);
+void writeJPEGStb(const vpImage<vpRGBa> &I, const std::string &filename);
+
+void writePNGStb(const vpImage<unsigned char> &I, const std::string &filename);
+void writePNGStb(const vpImage<vpRGBa> &I, const std::string &filename);
+
+#endif
diff --git a/modules/io/src/image/private/vpImageIoLibjpeg.cpp b/modules/io/src/image/private/vpImageIoLibjpeg.cpp
new file mode 100644
index 0000000000..99debb3021
--- /dev/null
+++ b/modules/io/src/image/private/vpImageIoLibjpeg.cpp
@@ -0,0 +1,345 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Read/write images.
+ *
+ * Authors:
+ * Eric Marchand
+ *
+ *****************************************************************************/
+
+/*!
+  \file vpImageIo.cpp
+  \brief Read/write images
+*/
+
+#include "vpImageIoBackend.h"
+#include <visp3/core/vpImageConvert.h>
+
+//TODO:
+#if defined(_WIN32)
+// Include WinSock2.h before windows.h to ensure that winsock.h is not
+// included by windows.h since winsock.h and winsock2.h are incompatible
+#include <WinSock2.h>
+#include <windows.h>
+#endif
+
+#if defined(VISP_HAVE_JPEG)
+#include <jerror.h>
+#include <jpeglib.h>
+#endif
+
+
+//--------------------------------------------------------------------------
+// JPEG
+//--------------------------------------------------------------------------
+
+#if defined(VISP_HAVE_JPEG)
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a JPEG file.
+
+  \param I : Image to save as a JPEG file.
+  \param filename : Name of the file containing the image.
+*/
+void writeJPEGLibjpeg(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  struct jpeg_compress_struct cinfo;
+  struct jpeg_error_mgr jerr;
+  FILE *file;
+
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_compress(&cinfo);
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "wb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str()));
+  }
+
+  unsigned int width = I.getWidth();
+  unsigned int height = I.getHeight();
+
+  jpeg_stdio_dest(&cinfo, file);
+
+  cinfo.image_width = width;
+  cinfo.image_height = height;
+  cinfo.input_components = 1;
+  cinfo.in_color_space = JCS_GRAYSCALE;
+  jpeg_set_defaults(&cinfo);
+
+  jpeg_start_compress(&cinfo, TRUE);
+
+  unsigned char *line;
+  line = new unsigned char[width];
+  unsigned char *input = (unsigned char *)I.bitmap;
+  while (cinfo.next_scanline < cinfo.image_height) {
+    for (unsigned int i = 0; i < width; i++) {
+      line[i] = *(input);
+      input++;
+    }
+    jpeg_write_scanlines(&cinfo, &line, 1);
+  }
+
+  jpeg_finish_compress(&cinfo);
+  jpeg_destroy_compress(&cinfo);
+  delete[] line;
+  fclose(file);
+}
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a JPEG file.
+
+  \param I : Image to save as a JPEG file.
+  \param filename : Name of the file containing the image.
+*/
+void writeJPEGLibjpeg(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+  struct jpeg_compress_struct cinfo;
+  struct jpeg_error_mgr jerr;
+  FILE *file;
+
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_compress(&cinfo);
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "wb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str()));
+  }
+
+  unsigned int width = I.getWidth();
+  unsigned int height = I.getHeight();
+
+  jpeg_stdio_dest(&cinfo, file);
+
+  cinfo.image_width = width;
+  cinfo.image_height = height;
+  cinfo.input_components = 3;
+  cinfo.in_color_space = JCS_RGB;
+  jpeg_set_defaults(&cinfo);
+
+  jpeg_start_compress(&cinfo, TRUE);
+
+  unsigned char *line;
+  line = new unsigned char[3 * width];
+  unsigned char *input = (unsigned char *)I.bitmap;
+  while (cinfo.next_scanline < cinfo.image_height) {
+    for (unsigned int i = 0; i < width; i++) {
+      line[i * 3] = *(input);
+      input++;
+      line[i * 3 + 1] = *(input);
+      input++;
+      line[i * 3 + 2] = *(input);
+      input++;
+      input++;
+    }
+    jpeg_write_scanlines(&cinfo, &line, 1);
+  }
+
+  jpeg_finish_compress(&cinfo);
+  jpeg_destroy_compress(&cinfo);
+  delete[] line;
+  fclose(file);
+}
+
+/*!
+  Read the contents of the JPEG file, allocate memory
+  for the corresponding gray level image, if necessary convert the data in
+  gray level, and set the bitmap whith the gray level data. That means that
+  the image \e I is a "black and white" rendering of the original image in \e
+  filename, as in a black and white photograph. If necessary, the quantization
+  formula used is \f$0,299 r + 0,587 g + 0,114 b\f$.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  \param I : Image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void readJPEGLibjpeg(vpImage<unsigned char> &I, const std::string &filename)
+{
+  struct jpeg_decompress_struct cinfo;
+  struct jpeg_error_mgr jerr;
+  FILE *file;
+
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_decompress(&cinfo);
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "rb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str()));
+  }
+
+  jpeg_stdio_src(&cinfo, file);
+  jpeg_read_header(&cinfo, TRUE);
+
+  unsigned int width = cinfo.image_width;
+  unsigned int height = cinfo.image_height;
+
+  if ((width != I.getWidth()) || (height != I.getHeight()))
+    I.resize(height, width);
+
+  jpeg_start_decompress(&cinfo);
+
+  unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components);
+  JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1);
+
+  if (cinfo.out_color_space == JCS_RGB) {
+    vpImage<vpRGBa> Ic(height, width);
+    unsigned char *output = (unsigned char *)Ic.bitmap;
+    while (cinfo.output_scanline < cinfo.output_height) {
+      jpeg_read_scanlines(&cinfo, buffer, 1);
+      for (unsigned int i = 0; i < width; i++) {
+        *(output++) = buffer[0][i * 3];
+        *(output++) = buffer[0][i * 3 + 1];
+        *(output++) = buffer[0][i * 3 + 2];
+        *(output++) = vpRGBa::alpha_default;
+      }
+    }
+    vpImageConvert::convert(Ic, I);
+  }
+
+  else if (cinfo.out_color_space == JCS_GRAYSCALE) {
+    while (cinfo.output_scanline < cinfo.output_height) {
+      unsigned int row = cinfo.output_scanline;
+      jpeg_read_scanlines(&cinfo, buffer, 1);
+      memcpy(I[row], buffer[0], rowbytes);
+    }
+  }
+
+  jpeg_finish_decompress(&cinfo);
+  jpeg_destroy_decompress(&cinfo);
+  fclose(file);
+}
+
+/*!
+  Read a JPEG file and initialize a scalar image.
+
+  Read the contents of the JPEG file, allocate
+  memory for the corresponding image, and set
+  the bitmap whith the content of
+  the file.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  If the file corresponds to a grayscaled image, a conversion is done to deal
+  with \e I which is a color image.
+
+  \param I : Color image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void readJPEGLibjpeg(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  struct jpeg_decompress_struct cinfo;
+  struct jpeg_error_mgr jerr;
+  FILE *file;
+
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_decompress(&cinfo);
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "rb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str()));
+  }
+
+  jpeg_stdio_src(&cinfo, file);
+
+  jpeg_read_header(&cinfo, TRUE);
+
+  unsigned int width = cinfo.image_width;
+  unsigned int height = cinfo.image_height;
+
+  if ((width != I.getWidth()) || (height != I.getHeight()))
+    I.resize(height, width);
+
+  jpeg_start_decompress(&cinfo);
+
+  unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components);
+  JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1);
+
+  if (cinfo.out_color_space == JCS_RGB) {
+    unsigned char *output = (unsigned char *)I.bitmap;
+    while (cinfo.output_scanline < cinfo.output_height) {
+      jpeg_read_scanlines(&cinfo, buffer, 1);
+      for (unsigned int i = 0; i < width; i++) {
+        *(output++) = buffer[0][i * 3];
+        *(output++) = buffer[0][i * 3 + 1];
+        *(output++) = buffer[0][i * 3 + 2];
+        *(output++) = vpRGBa::alpha_default;
+      }
+    }
+  }
+
+  else if (cinfo.out_color_space == JCS_GRAYSCALE) {
+    vpImage<unsigned char> Ig(height, width);
+
+    while (cinfo.output_scanline < cinfo.output_height) {
+      unsigned int row = cinfo.output_scanline;
+      jpeg_read_scanlines(&cinfo, buffer, 1);
+      memcpy(Ig[row], buffer[0], rowbytes);
+    }
+
+    vpImageConvert::convert(Ig, I);
+  }
+
+  jpeg_finish_decompress(&cinfo);
+  jpeg_destroy_decompress(&cinfo);
+  fclose(file);
+}
+#endif
diff --git a/modules/io/src/image/private/vpImageIoLibpng.cpp b/modules/io/src/image/private/vpImageIoLibpng.cpp
new file mode 100644
index 0000000000..e350e4260b
--- /dev/null
+++ b/modules/io/src/image/private/vpImageIoLibpng.cpp
@@ -0,0 +1,615 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Read/write images.
+ *
+ * Authors:
+ * Eric Marchand
+ *
+ *****************************************************************************/
+
+/*!
+  \file vpImageIo.cpp
+  \brief Read/write images
+*/
+
+#include "vpImageIoBackend.h"
+#include <visp3/core/vpImageConvert.h>
+
+//TODO:
+#if defined(_WIN32)
+// Include WinSock2.h before windows.h to ensure that winsock.h is not
+// included by windows.h since winsock.h and winsock2.h are incompatible
+#include <WinSock2.h>
+#include <windows.h>
+#endif
+
+#if defined(VISP_HAVE_PNG)
+#include <png.h>
+#endif
+
+
+//--------------------------------------------------------------------------
+// PNG
+//--------------------------------------------------------------------------
+
+#if defined(VISP_HAVE_PNG)
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a PNG file.
+
+  \param I : Image to save as a PNG file.
+  \param filename : Name of the file containing the image.
+*/
+void writePNGLibpng(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  FILE *file;
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "wb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str()));
+  }
+
+  /* create a png info struct */
+  png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+  if (!png_ptr) {
+    fclose(file);
+    vpERROR_TRACE("Error during png_create_write_struct()\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  png_infop info_ptr = png_create_info_struct(png_ptr);
+  if (!info_ptr) {
+    fclose(file);
+    png_destroy_write_struct(&png_ptr, NULL);
+    vpERROR_TRACE("Error during png_create_info_struct()\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  /* initialize the setjmp for returning properly after a libpng error occured
+   */
+  if (setjmp(png_jmpbuf(png_ptr))) {
+    fclose(file);
+    png_destroy_write_struct(&png_ptr, &info_ptr);
+    vpERROR_TRACE("Error during init_io\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  /* setup libpng for using standard C fwrite() function with our FILE pointer
+   */
+  png_init_io(png_ptr, file);
+
+  unsigned int width = I.getWidth();
+  unsigned int height = I.getHeight();
+  int bit_depth = 8;
+  int color_type = PNG_COLOR_TYPE_GRAY;
+  /* set some useful information from header */
+
+  if (setjmp(png_jmpbuf(png_ptr))) {
+    fclose(file);
+    png_destroy_write_struct(&png_ptr, &info_ptr);
+    vpERROR_TRACE("Error during write header\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
+               PNG_FILTER_TYPE_BASE);
+
+  png_write_info(png_ptr, info_ptr);
+
+  png_bytep *row_ptrs = new png_bytep[height];
+  for (unsigned int i = 0; i < height; i++)
+    row_ptrs[i] = new png_byte[width];
+
+  unsigned char *input = (unsigned char *)I.bitmap;
+
+  for (unsigned int i = 0; i < height; i++) {
+    png_byte *row = row_ptrs[i];
+    for (unsigned int j = 0; j < width; j++) {
+      row[j] = *(input);
+      input++;
+    }
+  }
+
+  png_write_image(png_ptr, row_ptrs);
+
+  png_write_end(png_ptr, NULL);
+
+  for (unsigned int j = 0; j < height; j++)
+    delete[] row_ptrs[j];
+
+  delete[] row_ptrs;
+
+  png_destroy_write_struct(&png_ptr, &info_ptr);
+
+  fclose(file);
+}
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a PNG file.
+
+  \param I : Image to save as a PNG file.
+  \param filename : Name of the file containing the image.
+*/
+void writePNGLibpng(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+  FILE *file;
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "wb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str()));
+  }
+
+  /* create a png info struct */
+  png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+  if (!png_ptr) {
+    fclose(file);
+    vpERROR_TRACE("Error during png_create_write_struct()\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  png_infop info_ptr = png_create_info_struct(png_ptr);
+  if (!info_ptr) {
+    fclose(file);
+    png_destroy_write_struct(&png_ptr, NULL);
+    vpERROR_TRACE("Error during png_create_info_struct()\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  /* initialize the setjmp for returning properly after a libpng error occured
+   */
+  if (setjmp(png_jmpbuf(png_ptr))) {
+    fclose(file);
+    png_destroy_write_struct(&png_ptr, &info_ptr);
+    vpERROR_TRACE("Error during init_io\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  /* setup libpng for using standard C fwrite() function with our FILE pointer
+   */
+  png_init_io(png_ptr, file);
+
+  unsigned int width = I.getWidth();
+  unsigned int height = I.getHeight();
+  int bit_depth = 8;
+  int color_type = PNG_COLOR_TYPE_RGB;
+  /* set some useful information from header */
+
+  if (setjmp(png_jmpbuf(png_ptr))) {
+    fclose(file);
+    png_destroy_write_struct(&png_ptr, &info_ptr);
+    vpERROR_TRACE("Error during write header\n");
+    throw(vpImageException(vpImageException::ioError, "PNG write error"));
+  }
+
+  png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
+               PNG_FILTER_TYPE_BASE);
+
+  png_write_info(png_ptr, info_ptr);
+
+  png_bytep *row_ptrs = new png_bytep[height];
+  for (unsigned int i = 0; i < height; i++)
+    row_ptrs[i] = new png_byte[3 * width];
+
+  unsigned char *input = (unsigned char *)I.bitmap;
+
+  for (unsigned int i = 0; i < height; i++) {
+    png_byte *row = row_ptrs[i];
+    for (unsigned int j = 0; j < width; j++) {
+      row[3 * j] = *(input);
+      input++;
+      row[3 * j + 1] = *(input);
+      input++;
+      row[3 * j + 2] = *(input);
+      input++;
+      input++;
+    }
+  }
+
+  png_write_image(png_ptr, row_ptrs);
+
+  png_write_end(png_ptr, NULL);
+
+  for (unsigned int j = 0; j < height; j++)
+    delete[] row_ptrs[j];
+
+  delete[] row_ptrs;
+
+  png_destroy_write_struct(&png_ptr, &info_ptr);
+
+  fclose(file);
+}
+
+/*!
+  Read the contents of the PNG file, allocate memory
+  for the corresponding gray level image, if necessary convert the data in
+  gray level, and set the bitmap whith the gray level data. That means that
+  the image \e I is a "black and white" rendering of the original image in \e
+  filename, as in a black and white photograph. If necessary, the quantization
+  formula used is \f$0,299 r + 0,587 g + 0,114 b\f$.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  \param I : Image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void readPNGLibpng(vpImage<unsigned char> &I, const std::string &filename)
+{
+  FILE *file;
+  png_byte magic[8];
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "rb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str()));
+  }
+
+  /* read magic number */
+  if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) {
+    fclose(file);
+    throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str()));
+  }
+
+  /* check for valid magic number */
+  if (png_sig_cmp(magic, 0, sizeof(magic))) {
+    fclose(file);
+    throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image",
+                           filename.c_str()));
+  }
+
+  /* create a png read struct */
+  // printf("version %s\n", PNG_LIBPNG_VER_STRING);
+  png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+  if (png_ptr == NULL) {
+    fprintf(stderr, "error: can't create a png read structure!\n");
+    fclose(file);
+    throw(vpImageException(vpImageException::ioError, "error reading png file"));
+  }
+
+  /* create a png info struct */
+  png_infop info_ptr = png_create_info_struct(png_ptr);
+  if (info_ptr == NULL) {
+    fprintf(stderr, "error: can't create a png info structure!\n");
+    fclose(file);
+    png_destroy_read_struct(&png_ptr, NULL, NULL);
+    throw(vpImageException(vpImageException::ioError, "error reading png file"));
+  }
+
+  /* initialize the setjmp for returning properly after a libpng error occured
+   */
+  if (setjmp(png_jmpbuf(png_ptr))) {
+    fclose(file);
+    png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
+    vpERROR_TRACE("Error during init io\n");
+    throw(vpImageException(vpImageException::ioError, "PNG read error"));
+  }
+
+  /* setup libpng for using standard C fread() function with our FILE pointer
+   */
+  png_init_io(png_ptr, file);
+
+  /* tell libpng that we have already read the magic number */
+  png_set_sig_bytes(png_ptr, sizeof(magic));
+
+  /* read png info */
+  png_read_info(png_ptr, info_ptr);
+
+  unsigned int width = png_get_image_width(png_ptr, info_ptr);
+  unsigned int height = png_get_image_height(png_ptr, info_ptr);
+
+  unsigned int bit_depth, channels, color_type;
+  /* get some useful information from header */
+  bit_depth = png_get_bit_depth(png_ptr, info_ptr);
+  channels = png_get_channels(png_ptr, info_ptr);
+  color_type = png_get_color_type(png_ptr, info_ptr);
+
+  /* convert index color images to RGB images */
+  if (color_type == PNG_COLOR_TYPE_PALETTE)
+    png_set_palette_to_rgb(png_ptr);
+
+  /* convert 1-2-4 bits grayscale images to 8 bits grayscale. */
+  if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8)
+    png_set_expand(png_ptr);
+
+  //  if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS))
+  //    png_set_tRNS_to_alpha (png_ptr);
+
+  if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA)
+    png_set_strip_alpha(png_ptr);
+
+  if (bit_depth == 16)
+    png_set_strip_16(png_ptr);
+  else if (bit_depth < 8)
+    png_set_packing(png_ptr);
+
+  /* update info structure to apply transformations */
+  png_read_update_info(png_ptr, info_ptr);
+
+  channels = png_get_channels(png_ptr, info_ptr);
+
+  if ((width != I.getWidth()) || (height != I.getHeight()))
+    I.resize(height, width);
+
+  png_bytep *rowPtrs = new png_bytep[height];
+
+  unsigned int stride = png_get_rowbytes(png_ptr, info_ptr);
+  unsigned char *data = new unsigned char[stride * height];
+
+  for (unsigned int i = 0; i < height; i++)
+    rowPtrs[i] = (png_bytep)data + (i * stride);
+
+  png_read_image(png_ptr, rowPtrs);
+
+  vpImage<vpRGBa> Ic(height, width);
+  unsigned char *output;
+
+  switch (channels) {
+  case 1:
+    output = (unsigned char *)I.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i];
+    }
+    break;
+
+  case 2:
+    output = (unsigned char *)I.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i * 2];
+    }
+    break;
+
+  case 3:
+    output = (unsigned char *)Ic.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i * 3];
+      *(output++) = data[i * 3 + 1];
+      *(output++) = data[i * 3 + 2];
+      *(output++) = vpRGBa::alpha_default;
+    }
+    vpImageConvert::convert(Ic, I);
+    break;
+
+  case 4:
+    output = (unsigned char *)Ic.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i * 4];
+      *(output++) = data[i * 4 + 1];
+      *(output++) = data[i * 4 + 2];
+      *(output++) = data[i * 4 + 3];
+    }
+    vpImageConvert::convert(Ic, I);
+    break;
+  }
+
+  delete[](png_bytep) rowPtrs;
+  delete[] data;
+  png_read_end(png_ptr, NULL);
+  png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
+  fclose(file);
+}
+
+/*!
+  Read a PNG file and initialize a scalar image.
+
+  Read the contents of the PNG file, allocate
+  memory for the corresponding image, and set
+  the bitmap whith the content of
+  the file.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  If the file corresponds to a grayscaled image, a conversion is done to deal
+  with \e I which is a color image.
+
+  \param I : Color image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void readPNGLibpng(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  FILE *file;
+  png_byte magic[8];
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty"));
+  }
+
+  file = fopen(filename.c_str(), "rb");
+
+  if (file == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str()));
+  }
+
+  /* read magic number */
+  if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) {
+    fclose(file);
+    throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str()));
+  }
+
+  /* check for valid magic number */
+  if (png_sig_cmp(magic, 0, sizeof(magic))) {
+    fclose(file);
+    throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image",
+                           filename.c_str()));
+  }
+
+  /* create a png read struct */
+  png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+  if (!png_ptr) {
+    fclose(file);
+    vpERROR_TRACE("Error during png_create_read_struct()\n");
+    throw(vpImageException(vpImageException::ioError, "PNG read error"));
+  }
+
+  /* create a png info struct */
+  png_infop info_ptr = png_create_info_struct(png_ptr);
+  if (!info_ptr) {
+    fclose(file);
+    png_destroy_read_struct(&png_ptr, NULL, NULL);
+    vpERROR_TRACE("Error during png_create_info_struct()\n");
+    throw(vpImageException(vpImageException::ioError, "PNG read error"));
+  }
+
+  /* initialize the setjmp for returning properly after a libpng error occured
+   */
+  if (setjmp(png_jmpbuf(png_ptr))) {
+    fclose(file);
+    png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
+    vpERROR_TRACE("Error during init io\n");
+    throw(vpImageException(vpImageException::ioError, "PNG read error"));
+  }
+
+  /* setup libpng for using standard C fread() function with our FILE pointer
+   */
+  png_init_io(png_ptr, file);
+
+  /* tell libpng that we have already read the magic number */
+  png_set_sig_bytes(png_ptr, sizeof(magic));
+
+  /* read png info */
+  png_read_info(png_ptr, info_ptr);
+
+  unsigned int width = png_get_image_width(png_ptr, info_ptr);
+  unsigned int height = png_get_image_height(png_ptr, info_ptr);
+
+  unsigned int bit_depth, channels, color_type;
+  /* get some useful information from header */
+  bit_depth = png_get_bit_depth(png_ptr, info_ptr);
+  channels = png_get_channels(png_ptr, info_ptr);
+  color_type = png_get_color_type(png_ptr, info_ptr);
+
+  /* convert index color images to RGB images */
+  if (color_type == PNG_COLOR_TYPE_PALETTE)
+    png_set_palette_to_rgb(png_ptr);
+
+  /* convert 1-2-4 bits grayscale images to 8 bits grayscale. */
+  if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8)
+    png_set_expand(png_ptr);
+
+  //  if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS))
+  //    png_set_tRNS_to_alpha (png_ptr);
+
+  if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA)
+    png_set_strip_alpha(png_ptr);
+
+  if (bit_depth == 16)
+    png_set_strip_16(png_ptr);
+  else if (bit_depth < 8)
+    png_set_packing(png_ptr);
+
+  /* update info structure to apply transformations */
+  png_read_update_info(png_ptr, info_ptr);
+
+  channels = png_get_channels(png_ptr, info_ptr);
+
+  if ((width != I.getWidth()) || (height != I.getHeight()))
+    I.resize(height, width);
+
+  png_bytep *rowPtrs = new png_bytep[height];
+
+  unsigned int stride = png_get_rowbytes(png_ptr, info_ptr);
+  unsigned char *data = new unsigned char[stride * height];
+
+  for (unsigned int i = 0; i < height; i++)
+    rowPtrs[i] = (png_bytep)data + (i * stride);
+
+  png_read_image(png_ptr, rowPtrs);
+
+  vpImage<unsigned char> Ig(height, width);
+  unsigned char *output;
+
+  switch (channels) {
+  case 1:
+    output = (unsigned char *)Ig.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i];
+    }
+    vpImageConvert::convert(Ig, I);
+    break;
+
+  case 2:
+    output = (unsigned char *)Ig.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i * 2];
+    }
+    vpImageConvert::convert(Ig, I);
+    break;
+
+  case 3:
+    output = (unsigned char *)I.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i * 3];
+      *(output++) = data[i * 3 + 1];
+      *(output++) = data[i * 3 + 2];
+      *(output++) = vpRGBa::alpha_default;
+    }
+    break;
+
+  case 4:
+    output = (unsigned char *)I.bitmap;
+    for (unsigned int i = 0; i < width * height; i++) {
+      *(output++) = data[i * 4];
+      *(output++) = data[i * 4 + 1];
+      *(output++) = data[i * 4 + 2];
+      *(output++) = data[i * 4 + 3];
+    }
+    break;
+  }
+
+  delete[](png_bytep) rowPtrs;
+  delete[] data;
+  png_read_end(png_ptr, NULL);
+  png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
+  fclose(file);
+}
+#endif
diff --git a/modules/io/src/image/private/vpImageIoOpenCV.cpp b/modules/io/src/image/private/vpImageIoOpenCV.cpp
new file mode 100644
index 0000000000..93b6a1ca1d
--- /dev/null
+++ b/modules/io/src/image/private/vpImageIoOpenCV.cpp
@@ -0,0 +1,205 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Read/write images.
+ *
+ * Authors:
+ * Eric Marchand
+ *
+ *****************************************************************************/
+
+/*!
+  \file vpImageIo.cpp
+  \brief Read/write images
+*/
+
+#include "vpImageIoBackend.h"
+
+//TODO:
+#ifdef VISP_HAVE_OPENCV
+#if (VISP_HAVE_OPENCV_VERSION >= 0x030000) // Require opencv >= 3.0.0
+#  include <opencv2/imgcodecs.hpp>
+#elif (VISP_HAVE_OPENCV_VERSION >= 0x020408) // Require opencv >= 2.4.8
+#  include <opencv2/core/core.hpp>
+#  include <opencv2/highgui/highgui.hpp>
+#  include <opencv2/imgproc/imgproc.hpp>
+#elif (VISP_HAVE_OPENCV_VERSION >= 0x020101) // Require opencv >= 2.1.1
+#  include <opencv2/core/core.hpp>
+#  include <opencv2/highgui/highgui.hpp>
+#  include <opencv2/highgui/highgui_c.h>
+#  include <opencv2/legacy/legacy.hpp>
+#else
+#  include <highgui.h>
+#endif
+#endif
+
+#include <visp3/core/vpImageConvert.h>
+
+
+#if defined(VISP_HAVE_OPENCV)
+
+/*!
+  Read the contents of the JPEG file, allocate memory
+  for the corresponding gray level image, if necessary convert the data in
+  gray level, and set the bitmap whith the gray level data. That means that
+  the image \e I is a "black and white" rendering of the original image in \e
+  filename, as in a black and white photograph. If necessary, the quantization
+  formula used is \f$0,299 r + 0,587 g + 0,114 b\f$.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  If EXIF information is embedded in the image file, the EXIF orientation is ignored.
+
+  \param I : Image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+
+*/
+void readOpenCV(vpImage<unsigned char> &I, const std::string &filename)
+{
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+#if VISP_HAVE_OPENCV_VERSION >= 0x030200
+    int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION;
+#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
+    int flags = cv::IMREAD_GRAYSCALE;
+#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
+    int flags = CV_LOAD_IMAGE_GRAYSCALE;
+#endif
+  cv::Mat Ip = cv::imread(filename.c_str(), flags);
+  if (!Ip.empty())
+    vpImageConvert::convert(Ip, I);
+  else
+    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
+#else
+  IplImage *Ip = NULL;
+  Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE);
+  if (Ip != NULL)
+    vpImageConvert::convert(Ip, I);
+  else
+    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
+  cvReleaseImage(&Ip);
+#endif
+}
+
+/*!
+  Read a JPEG file and initialize a scalar image.
+
+  Read the contents of the JPEG file, allocate
+  memory for the corresponding image, and set
+  the bitmap whith the content of
+  the file.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  If the file corresponds to a grayscaled image, a conversion is done to deal
+  with \e I which is a color image.
+
+  If EXIF information is embedded in the image file, the EXIF orientation is ignored.
+
+  \param I : Color image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void readOpenCV(vpImage<vpRGBa> &I, const std::string &filename)
+{
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+#if VISP_HAVE_OPENCV_VERSION >= 0x030200
+    int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION;
+#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
+    int flags = cv::IMREAD_GRAYSCALE;
+#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
+    int flags = CV_LOAD_IMAGE_GRAYSCALE;
+#endif
+  cv::Mat Ip = cv::imread(filename.c_str(), flags);
+  if (!Ip.empty())
+    vpImageConvert::convert(Ip, I);
+  else
+    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
+#else
+  IplImage *Ip = NULL;
+  Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_COLOR);
+  if (Ip != NULL)
+    vpImageConvert::convert(Ip, I);
+  else
+    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
+  cvReleaseImage(&Ip);
+#endif
+}
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a JPEG file.
+
+  \param I : Image to save as a JPEG file.
+  \param filename : Name of the file containing the image.
+*/
+void writeOpenCV(const vpImage<unsigned char> &I, const std::string &filename)
+{
+#if (VISP_HAVE_OPENCV_VERSION >= 0x020408)
+  cv::Mat Ip;
+  vpImageConvert::convert(I, Ip);
+  cv::imwrite(filename.c_str(), Ip);
+#else
+  IplImage *Ip = NULL;
+  vpImageConvert::convert(I, Ip);
+
+  cvSaveImage(filename.c_str(), Ip);
+
+  cvReleaseImage(&Ip);
+#endif
+}
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a JPEG file.
+
+  \param I : Image to save as a JPEG file.
+  \param filename : Name of the file containing the image.
+*/
+void writeOpenCV(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+#if (VISP_HAVE_OPENCV_VERSION >= 0x020408)
+  cv::Mat Ip;
+  vpImageConvert::convert(I, Ip);
+  cv::imwrite(filename.c_str(), Ip);
+#else
+  IplImage *Ip = NULL;
+  vpImageConvert::convert(I, Ip);
+
+  cvSaveImage(filename.c_str(), Ip);
+
+  cvReleaseImage(&Ip);
+#endif
+}
+
+#endif
diff --git a/modules/io/src/image/private/vpImageIoPortable.cpp b/modules/io/src/image/private/vpImageIoPortable.cpp
new file mode 100644
index 0000000000..0031e4c96a
--- /dev/null
+++ b/modules/io/src/image/private/vpImageIoPortable.cpp
@@ -0,0 +1,569 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Read/write images.
+ *
+ * Authors:
+ * Eric Marchand
+ *
+ *****************************************************************************/
+
+/*!
+  \file vpImageIo.cpp
+  \brief Read/write images
+*/
+
+#include "vpImageIoBackend.h"
+#include <visp3/core/vpIoTools.h>
+#include <visp3/core/vpImageConvert.h>
+
+//TODO:
+#if defined(_WIN32)
+// Include WinSock2.h before windows.h to ensure that winsock.h is not
+// included by windows.h since winsock.h and winsock2.h are incompatible
+#include <WinSock2.h>
+#include <windows.h>
+#endif
+
+
+void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w,
+                        unsigned int &h, unsigned int &maxval);
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+/*!
+ * Decode the PNM image header.
+ * \param filename[in] : File name.
+ * \param fd[in] : File desdcriptor.
+ * \param magic[in] : Magic number for identifying the file type.
+ * \param w[out] : Image width.
+ * \param h[out] : Image height.
+ * \param maxval[out] : Maximum pixel value.
+ */
+void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w,
+                        unsigned int &h, unsigned int &maxval)
+{
+  std::string line;
+  unsigned int nb_elt = 4, cpt_elt = 0;
+  while (cpt_elt != nb_elt) {
+    // Skip empty lines or lines starting with # (comment)
+    while (std::getline(fd, line) && (line.compare(0, 1, "#") == 0 || line.size() == 0)) {
+    }
+
+    if (fd.eof()) {
+      fd.close();
+      throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str()));
+    }
+
+    std::vector<std::string> header = vpIoTools::splitChain(line, std::string(" "));
+
+    if (header.size() == 0) {
+      fd.close();
+      throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str()));
+    }
+
+    if (cpt_elt == 0) { // decode magic
+      if (header[0].compare(0, magic.size(), magic) != 0) {
+        fd.close();
+        throw(vpImageException(vpImageException::ioError, "\"%s\" is not a PNM file with magic number %s",
+                               filename.c_str(), magic.c_str()));
+      }
+      cpt_elt++;
+      header.erase(header.begin(),
+                   header.begin() + 1); // erase first element that is processed
+    }
+    while (header.size()) {
+      if (cpt_elt == 1) { // decode width
+        std::istringstream ss(header[0]);
+        ss >> w;
+        cpt_elt++;
+        header.erase(header.begin(),
+                     header.begin() + 1); // erase first element that is processed
+      } else if (cpt_elt == 2) {          // decode height
+        std::istringstream ss(header[0]);
+        ss >> h;
+        cpt_elt++;
+        header.erase(header.begin(),
+                     header.begin() + 1); // erase first element that is processed
+      } else if (cpt_elt == 3) {          // decode maxval
+        std::istringstream ss(header[0]);
+        ss >> maxval;
+        cpt_elt++;
+        header.erase(header.begin(),
+                     header.begin() + 1); // erase first element that is processed
+      }
+    }
+  }
+}
+#endif
+
+//--------------------------------------------------------------------------
+// PFM
+//--------------------------------------------------------------------------
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function is built like portable gray pixmap (eg PGM P5) file.
+  but considers float image data.
+
+  \param I : Image to save as a (PFM P8) file.
+  \param filename : Name of the file containing the image.
+*/
+void vp_writePFM(const vpImage<float> &I, const std::string &filename)
+{
+  FILE *fd;
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot write PFM image: filename empty"));
+  }
+
+  fd = fopen(filename.c_str(), "wb");
+
+  if (fd == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PFM file \"%s\"", filename.c_str()));
+  }
+
+  // Write the head
+  fprintf(fd, "P8\n");                                 // Magic number
+  fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
+  fprintf(fd, "255\n");                                // Max level
+
+  // Write the bitmap
+  size_t ierr;
+  size_t nbyte = I.getWidth() * I.getHeight();
+
+  ierr = fwrite(I.bitmap, sizeof(float), nbyte, fd);
+  if (ierr != nbyte) {
+    fclose(fd);
+    throw(vpImageException(vpImageException::ioError, "Cannot save PFM file \"%s\": only %d bytes over %d saved ",
+                           filename.c_str(), ierr, nbyte));
+  }
+
+  fflush(fd);
+  fclose(fd);
+}
+
+//--------------------------------------------------------------------------
+// PGM
+//--------------------------------------------------------------------------
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a portable gray pixmap (PGM P5) file.
+
+  \param I : Image to save as a (PGM P5) file.
+  \param filename : Name of the file containing the image.
+*/
+void vp_writePGM(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  FILE *fd;
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty"));
+  }
+
+  fd = fopen(filename.c_str(), "wb");
+
+  if (fd == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str()));
+  }
+
+  // Write the head
+  fprintf(fd, "P5\n");                                 // Magic number
+  fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
+  fprintf(fd, "255\n");                                // Max level
+
+  // Write the bitmap
+  size_t ierr;
+  size_t nbyte = I.getWidth() * I.getHeight();
+
+  ierr = fwrite(I.bitmap, sizeof(unsigned char), nbyte, fd);
+  if (ierr != nbyte) {
+    fclose(fd);
+    throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %d over %d bytes saved",
+                           filename.c_str(), ierr, nbyte));
+  }
+
+  fflush(fd);
+  fclose(fd);
+}
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a portable gray pixmap (PGM P5) file.
+
+  \param I : Image to save as a (PGM P5) file.
+  \param filename : Name of the file containing the image.
+*/
+void vp_writePGM(const vpImage<short> &I, const std::string &filename)
+{
+  vpImage<unsigned char> Iuc;
+  unsigned int nrows = I.getHeight();
+  unsigned int ncols = I.getWidth();
+
+  Iuc.resize(nrows, ncols);
+
+  for (unsigned int i = 0; i < nrows * ncols; i++)
+    Iuc.bitmap[i] = (unsigned char)I.bitmap[i];
+
+  vp_writePGM(Iuc, filename);
+}
+
+/*!
+  Write the content of the image bitmap in the file which name is given by \e
+  filename. This function writes a portable gray pixmap (PGM P5) file.
+  Color image is converted into a grayscale image.
+
+  \param I : Image to save as a (PGM P5) file.
+  \param filename : Name of the file containing the image.
+*/
+void vp_writePGM(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+
+  FILE *fd;
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty"));
+  }
+
+  fd = fopen(filename.c_str(), "wb");
+
+  if (fd == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str()));
+  }
+
+  // Write the head
+  fprintf(fd, "P5\n");                                 // Magic number
+  fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
+  fprintf(fd, "255\n");                                // Max level
+
+  // Write the bitmap
+  size_t ierr;
+  size_t nbyte = I.getWidth() * I.getHeight();
+
+  vpImage<unsigned char> Itmp;
+  vpImageConvert::convert(I, Itmp);
+
+  ierr = fwrite(Itmp.bitmap, sizeof(unsigned char), nbyte, fd);
+  if (ierr != nbyte) {
+    fclose(fd);
+    throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %d over %d bytes saved",
+                           filename.c_str(), ierr, nbyte));
+  }
+
+  fflush(fd);
+  fclose(fd);
+}
+
+/*!
+  Read a PFM P8 file and initialize a float image.
+
+  Read the contents of the portable gray pixmap (PFM P8) filename, allocate
+  memory for the corresponding image, and set the bitmap whith the content of
+  the file.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  \param I : Image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void vp_readPFM(vpImage<float> &I, const std::string &filename)
+{
+  unsigned int w = 0, h = 0, maxval = 0;
+  unsigned int w_max = 100000, h_max = 100000, maxval_max = 255;
+  std::string magic("P8");
+
+  std::ifstream fd(filename.c_str(), std::ios::binary);
+
+  // Open the filename
+  if (!fd.is_open()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str()));
+  }
+
+  vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval);
+
+  if (w > w_max || h > h_max) {
+    fd.close();
+    throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str()));
+  }
+  if (maxval > maxval_max) {
+    fd.close();
+    throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str()));
+  }
+
+  if ((h != I.getHeight()) || (w != I.getWidth())) {
+    I.resize(h, w);
+  }
+
+  unsigned int nbyte = I.getHeight() * I.getWidth();
+  fd.read((char *)I.bitmap, sizeof(float) * nbyte);
+  if (!fd) {
+    fd.close();
+    throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", fd.gcount(), nbyte,
+                           filename.c_str()));
+  }
+
+  fd.close();
+}
+
+/*!
+  Read a PGM P5 file and initialize a scalar image.
+
+  Read the contents of the portable gray pixmap (PGM P5) filename, allocate
+  memory for the corresponding image, and set the bitmap whith the content of
+  the file.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  \param I : Image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void vp_readPGM(vpImage<unsigned char> &I, const std::string &filename)
+{
+  unsigned int w = 0, h = 0, maxval = 0;
+  unsigned int w_max = 100000, h_max = 100000, maxval_max = 255;
+  std::string magic("P5");
+
+  std::ifstream fd(filename.c_str(), std::ios::binary);
+
+  // Open the filename
+  if (!fd.is_open()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str()));
+  }
+
+  vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval);
+
+  if (w > w_max || h > h_max) {
+    fd.close();
+    throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str()));
+  }
+  if (maxval > maxval_max) {
+    fd.close();
+    throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str()));
+  }
+
+  if ((h != I.getHeight()) || (w != I.getWidth())) {
+    I.resize(h, w);
+  }
+
+  unsigned int nbyte = I.getHeight() * I.getWidth();
+  fd.read((char *)I.bitmap, nbyte);
+  if (!fd) {
+    fd.close();
+    throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", fd.gcount(), nbyte,
+                           filename.c_str()));
+  }
+
+  fd.close();
+}
+
+/*!
+  Read a PGM P5 file and initialize a scalar image.
+
+  Read the contents of the portable gray pixmap (PGM P5) filename, allocate
+  memory for the corresponding image, and set the bitmap whith the content of
+  the file.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  The gray level image contained in the \e filename is converted in a
+  color image in \e I.
+
+  \param I : Color image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void vp_readPGM(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  vpImage<unsigned char> Itmp;
+
+  vp_readPGM(Itmp, filename);
+
+  vpImageConvert::convert(Itmp, I);
+}
+
+//--------------------------------------------------------------------------
+// PPM
+//--------------------------------------------------------------------------
+
+/*!
+  Read the contents of the portable pixmap (PPM P6) filename, allocate memory
+  for the corresponding gray level image, convert the data in gray level, and
+  set the bitmap whith the gray level data. That means that the image \e I is
+  a "black and white" rendering of the original image in \e filename, as in a
+  black and white photograph. The quantization formula used is \f$0,299 r +
+  0,587 g + 0,114 b\f$.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  \param I : Image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void vp_readPPM(vpImage<unsigned char> &I, const std::string &filename)
+{
+  vpImage<vpRGBa> Itmp;
+
+  vp_readPPM(Itmp, filename);
+
+  vpImageConvert::convert(Itmp, I);
+}
+
+/*!
+  Read the contents of the portable pixmap (PPM P6) filename,
+  allocate memory for the corresponding vpRGBa image.
+
+  If the image has been already initialized, memory allocation is done
+  only if the new image size is different, else we re-use the same
+  memory space.
+
+  \param I : Image to set with the \e filename content.
+  \param filename : Name of the file containing the image.
+*/
+void vp_readPPM(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  unsigned int w = 0, h = 0, maxval = 0;
+  unsigned int w_max = 100000, h_max = 100000, maxval_max = 255;
+  std::string magic("P6");
+
+  std::ifstream fd(filename.c_str(), std::ios::binary);
+
+  // Open the filename
+  if (!fd.is_open()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str()));
+  }
+
+  vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval);
+
+  if (w > w_max || h > h_max) {
+    fd.close();
+    throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str()));
+  }
+  if (maxval > maxval_max) {
+    fd.close();
+    throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str()));
+  }
+
+  if ((h != I.getHeight()) || (w != I.getWidth())) {
+    I.resize(h, w);
+  }
+
+  for (unsigned int i = 0; i < I.getHeight(); i++) {
+    for (unsigned int j = 0; j < I.getWidth(); j++) {
+      unsigned char rgb[3];
+      fd.read((char *)&rgb, 3);
+
+      if (!fd) {
+        fd.close();
+        throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"",
+                               (i * I.getWidth() + j) * 3 + fd.gcount(), I.getSize() * 3, filename.c_str()));
+      }
+
+      I[i][j].R = rgb[0];
+      I[i][j].G = rgb[1];
+      I[i][j].B = rgb[2];
+      I[i][j].A = vpRGBa::alpha_default;
+    }
+  }
+
+  fd.close();
+}
+
+/*!
+  Write the content of the bitmap in the file which name is given by \e
+  filename. This function writes a portable gray pixmap (PPM P6) file.
+  grayscale image is converted into a color image vpRGBa.
+
+  \param I : Image to save as a (PPM P6) file.
+  \param filename : Name of the file containing the image.
+*/
+void vp_writePPM(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  vpImage<vpRGBa> Itmp;
+
+  vpImageConvert::convert(I, Itmp);
+
+  vp_writePPM(Itmp, filename);
+}
+
+/*!
+  Write the content of the bitmap in the file which name is given by \e
+  filename. This function writes a portable gray pixmap (PPM P6) file.
+
+  \param I : Image to save as a (PPM P6) file.
+  \param filename : Name of the file containing the image.
+*/
+void vp_writePPM(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+  FILE *f;
+
+  // Test the filename
+  if (filename.empty()) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PPM file: filename empty"));
+  }
+
+  f = fopen(filename.c_str(), "wb");
+
+  if (f == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Cannot create PPM file \"%s\"", filename.c_str()));
+  }
+
+  fprintf(f, "P6\n");                                 // Magic number
+  fprintf(f, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
+  fprintf(f, "%d\n", 255);                            // Max level
+
+  for (unsigned int i = 0; i < I.getHeight(); i++) {
+    for (unsigned int j = 0; j < I.getWidth(); j++) {
+      vpRGBa v = I[i][j];
+      unsigned char rgb[3];
+      rgb[0] = v.R;
+      rgb[1] = v.G;
+      rgb[2] = v.B;
+
+      size_t res = fwrite(&rgb, 1, 3, f);
+      if (res != 3) {
+        fclose(f);
+        throw(vpImageException(vpImageException::ioError, "cannot write file \"%s\"", filename.c_str()));
+      }
+    }
+  }
+
+  fflush(f);
+  fclose(f);
+}
diff --git a/modules/io/src/image/private/vpImageIoSimd.cpp b/modules/io/src/image/private/vpImageIoSimd.cpp
new file mode 100644
index 0000000000..40986bf743
--- /dev/null
+++ b/modules/io/src/image/private/vpImageIoSimd.cpp
@@ -0,0 +1,87 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Read/write images.
+ *
+ * Authors:
+ * Eric Marchand
+ *
+ *****************************************************************************/
+
+/*!
+  \file vpImageIo.cpp
+  \brief Read/write images
+*/
+
+#include "vpImageIoBackend.h"
+
+//TODO:
+#include <Simd/SimdLib.hpp>
+
+
+//TODO:
+void readSimdlib(vpImage<unsigned char> &I, const std::string &filename)
+{
+  size_t stride = 0, width = 0, height = 0;
+  SimdPixelFormatType format = SimdPixelFormatGray8;
+  uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format);
+  const bool copyData = false;
+  I.init(data, (unsigned int)height, (unsigned int)width, copyData);
+}
+
+void readSimdlib(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  size_t stride = 0, width = 0, height = 0;
+  SimdPixelFormatType format = SimdPixelFormatRgba32;
+  uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format);
+  const bool copyData = false;
+  I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData);
+}
+
+void writeJPEGSimdlib(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFileJpeg, 90, filename.c_str());
+}
+
+void writeJPEGSimdlib(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+  SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, 90, filename.c_str());
+}
+
+void writePNGSimdlib(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFileJpeg, 90, filename.c_str());
+}
+
+void writePNGSimdlib(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+  SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFilePng, 90, filename.c_str());
+}
diff --git a/modules/io/src/image/private/vpImageIoStb.cpp b/modules/io/src/image/private/vpImageIoStb.cpp
new file mode 100644
index 0000000000..97b453d841
--- /dev/null
+++ b/modules/io/src/image/private/vpImageIoStb.cpp
@@ -0,0 +1,121 @@
+/****************************************************************************
+ *
+ * ViSP, open source Visual Servoing Platform software.
+ * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
+ *
+ * This software is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * See the file LICENSE.txt at the root directory of this source
+ * distribution for additional information about the GNU GPL.
+ *
+ * For using ViSP with software that can not be combined with the GNU
+ * GPL, please contact Inria about acquiring a ViSP Professional
+ * Edition License.
+ *
+ * See http://visp.inria.fr for more information.
+ *
+ * This software was developed at:
+ * Inria Rennes - Bretagne Atlantique
+ * Campus Universitaire de Beaulieu
+ * 35042 Rennes Cedex
+ * France
+ *
+ * If you have questions regarding the use of this file, please contact
+ * Inria at visp@inria.fr
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+ * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Description:
+ * Read/write images.
+ *
+ * Authors:
+ * Eric Marchand
+ *
+ *****************************************************************************/
+
+/*!
+  \file vpImageIo.cpp
+  \brief Read/write images
+*/
+
+#include "vpImageIoBackend.h"
+
+//TODO:
+#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
+#  define VISP_HAVE_SSE2 1
+#endif
+
+#ifndef VISP_HAVE_SSE2
+#  define STBI_NO_SIMD
+#endif
+
+#define STB_IMAGE_IMPLEMENTATION
+#include <stb_image.h>
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include <stb_image_write.h>
+
+
+//TODO:
+void readStb(vpImage<unsigned char> &I, const std::string &filename)
+{
+  int width = 0, height = 0, channels = 0;
+  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_grey);
+  if (image == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
+  }
+  I.init(image, static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
+  stbi_image_free(image);
+}
+
+void readStb(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  int width = 0, height = 0, channels = 0;
+  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha);
+  if (image == NULL) {
+    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
+  }
+  I.init(reinterpret_cast<vpRGBa*>(image), static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
+  stbi_image_free(image);
+}
+
+void writeJPEGStb(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  int res = stbi_write_jpg(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_grey,
+                           reinterpret_cast<void*>(I.bitmap), 90);
+  if (res == 0) {
+    throw(vpImageException(vpImageException::ioError, "JEPG write error"));
+  }
+}
+
+void writeJPEGStb(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+  int res = stbi_write_jpg(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
+                           reinterpret_cast<void*>(I.bitmap), 90);
+  if (res == 0) {
+    throw(vpImageException(vpImageException::ioError, "JEPG write error"));
+  }
+}
+
+void writePNGStb(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  const int stride_in_bytes = static_cast<int>(I.getWidth());
+  int res = stbi_write_png(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_grey,
+                           reinterpret_cast<void*>(I.bitmap), stride_in_bytes);
+  if (res == 0) {
+    throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str()));
+  }
+}
+
+void writePNGStb(const vpImage<vpRGBa> &I, const std::string &filename)
+{
+  const int stride_in_bytes = static_cast<int>(4 * I.getWidth());
+  int res = stbi_write_png(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
+                           reinterpret_cast<void*>(I.bitmap), stride_in_bytes);
+  if (res == 0) {
+    throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str()));
+  }
+}
diff --git a/modules/io/src/image/vpImageIo.cpp b/modules/io/src/image/vpImageIo.cpp
index cc7799d158..e8b221049e 100644
--- a/modules/io/src/image/vpImageIo.cpp
+++ b/modules/io/src/image/vpImageIo.cpp
@@ -46,119 +46,9 @@
 #include <visp3/core/vpIoTools.h>
 #include <visp3/io/vpImageIo.h>
 
-#if defined(_WIN32)
-// Include WinSock2.h before windows.h to ensure that winsock.h is not
-// included by windows.h since winsock.h and winsock2.h are incompatible
-#include <WinSock2.h>
-#include <windows.h>
-#endif
-
-#if defined(VISP_HAVE_JPEG)
-#include <jerror.h>
-#include <jpeglib.h>
-#endif
-
-#if defined(VISP_HAVE_PNG)
-#include <png.h>
-#endif
-
 //TODO:
-#include <Simd/SimdLib.hpp>
-//TODO:
-#define STB_IMAGE_IMPLEMENTATION
-#include <stb_image.h>
-
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-#include <stb_image_write.h>
-
-#if !defined(VISP_HAVE_OPENCV)
-#if !defined(VISP_HAVE_JPEG) || !defined(VISP_HAVE_PNG)
-
-#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
-#  define VISP_HAVE_SSE2 1
-#endif
-
-#ifndef VISP_HAVE_SSE2
-#  define STBI_NO_SIMD
-#endif
-
-#define STB_IMAGE_IMPLEMENTATION
-#include <stb_image.h>
+#include "private/vpImageIoBackend.h"
 
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-#include <stb_image_write.h>
-#endif
-#endif
-
-void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w,
-                        unsigned int &h, unsigned int &maxval);
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS
-/*!
- * Decode the PNM image header.
- * \param filename[in] : File name.
- * \param fd[in] : File desdcriptor.
- * \param magic[in] : Magic number for identifying the file type.
- * \param w[out] : Image width.
- * \param h[out] : Image height.
- * \param maxval[out] : Maximum pixel value.
- */
-void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w,
-                        unsigned int &h, unsigned int &maxval)
-{
-  std::string line;
-  unsigned int nb_elt = 4, cpt_elt = 0;
-  while (cpt_elt != nb_elt) {
-    // Skip empty lines or lines starting with # (comment)
-    while (std::getline(fd, line) && (line.compare(0, 1, "#") == 0 || line.size() == 0)) {
-    }
-
-    if (fd.eof()) {
-      fd.close();
-      throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str()));
-    }
-
-    std::vector<std::string> header = vpIoTools::splitChain(line, std::string(" "));
-
-    if (header.size() == 0) {
-      fd.close();
-      throw(vpImageException(vpImageException::ioError, "Cannot read header of file \"%s\"", filename.c_str()));
-    }
-
-    if (cpt_elt == 0) { // decode magic
-      if (header[0].compare(0, magic.size(), magic) != 0) {
-        fd.close();
-        throw(vpImageException(vpImageException::ioError, "\"%s\" is not a PNM file with magic number %s",
-                               filename.c_str(), magic.c_str()));
-      }
-      cpt_elt++;
-      header.erase(header.begin(),
-                   header.begin() + 1); // erase first element that is processed
-    }
-    while (header.size()) {
-      if (cpt_elt == 1) { // decode width
-        std::istringstream ss(header[0]);
-        ss >> w;
-        cpt_elt++;
-        header.erase(header.begin(),
-                     header.begin() + 1); // erase first element that is processed
-      } else if (cpt_elt == 2) {          // decode height
-        std::istringstream ss(header[0]);
-        ss >> h;
-        cpt_elt++;
-        header.erase(header.begin(),
-                     header.begin() + 1); // erase first element that is processed
-      } else if (cpt_elt == 3) {          // decode maxval
-        std::istringstream ss(header[0]);
-        ss >> maxval;
-        cpt_elt++;
-        header.erase(header.begin(),
-                     header.begin() + 1); // erase first element that is processed
-      }
-    }
-  }
-}
-#endif
 
 vpImageIo::vpImageFormatType vpImageIo::getFormat(const std::string &filename)
 {
@@ -271,18 +161,10 @@ void vpImageIo::read(vpImage<unsigned char> &I, const std::string &filename)
     readPPM(I, final_filename);
     break;
   case FORMAT_JPEG:
-#ifdef VISP_HAVE_JPEG
     readJPEG(I, final_filename);
-#else
-    try_opencv_reader = true;
-#endif
     break;
   case FORMAT_PNG:
-#if defined(VISP_HAVE_PNG)
     readPNG(I, final_filename);
-#else
-    try_opencv_reader = true;
-#endif
     break;
   case FORMAT_TIFF:
   case FORMAT_BMP:
@@ -297,39 +179,10 @@ void vpImageIo::read(vpImage<unsigned char> &I, const std::string &filename)
 
   if (try_opencv_reader) {
 #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-#if VISP_HAVE_OPENCV_VERSION >= 0x030200
-    int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
-    int flags = cv::IMREAD_GRAYSCALE;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
-    int flags = CV_LOAD_IMAGE_GRAYSCALE;
-#endif
-    // std::cout << "Use opencv to read the image" << std::endl;
-    cv::Mat cvI = cv::imread(final_filename, flags);
-    if (cvI.cols == 0 && cvI.rows == 0) {
-      std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported";
-      throw(vpImageException(vpImageException::ioError, message));
-    }
-    vpImageConvert::convert(cvI, I);
+    readOpenCV(I, filename);
 #else
-    switch (getFormat(final_filename)) {
-    case FORMAT_JPEG:
-      readJPEG(I, final_filename);
-      break;
-    case FORMAT_PNG:
-      readPNG(I, final_filename);
-      break;
-    case FORMAT_BMP:
-    case FORMAT_TIFF:
-    case FORMAT_DIB:
-    case FORMAT_PBM:
-    case FORMAT_RASTER:
-    case FORMAT_JPEG2000:
-    case FORMAT_UNKNOWN:
-    default:
-      std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported";
-      throw(vpImageException(vpImageException::ioError, message));
-    }
+    std::string message = "Cannot read file \"" + filename + "\": No backend able to support this image format";
+    throw(vpImageException(vpImageException::ioError, message));
 #endif
   }
 }
@@ -374,18 +227,10 @@ void vpImageIo::read(vpImage<vpRGBa> &I, const std::string &filename)
     readPPM(I, final_filename);
     break;
   case FORMAT_JPEG:
-#ifdef VISP_HAVE_JPEG
     readJPEG(I, final_filename);
-#else
-    try_opencv_reader = true;
-#endif
     break;
   case FORMAT_PNG:
-#if defined(VISP_HAVE_PNG)
     readPNG(I, final_filename);
-#else
-    try_opencv_reader = true;
-#endif
     break;
   case FORMAT_TIFF:
   case FORMAT_BMP:
@@ -400,39 +245,10 @@ void vpImageIo::read(vpImage<vpRGBa> &I, const std::string &filename)
 
   if (try_opencv_reader) {
 #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-#if VISP_HAVE_OPENCV_VERSION >= 0x030200
-    int flags = cv::IMREAD_COLOR | cv::IMREAD_IGNORE_ORIENTATION;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
-    int flags = cv::IMREAD_COLOR;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
-    int flags = CV_LOAD_IMAGE_COLOR;
-#endif
-    // std::cout << "Use opencv to read the image" << std::endl;
-    cv::Mat cvI = cv::imread(final_filename, flags);
-    if (cvI.cols == 0 && cvI.rows == 0) {
-      std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported";
-      throw(vpImageException(vpImageException::ioError, message));
-    }
-    vpImageConvert::convert(cvI, I);
+    readOpenCV(I, filename);
 #else
-    switch (getFormat(final_filename)) {
-    case FORMAT_JPEG:
-      readJPEG(I, final_filename);
-      break;
-    case FORMAT_PNG:
-      readPNG(I, final_filename);
-      break;
-    case FORMAT_BMP:
-    case FORMAT_TIFF:
-    case FORMAT_DIB:
-    case FORMAT_PBM:
-    case FORMAT_RASTER:
-    case FORMAT_JPEG2000:
-    case FORMAT_UNKNOWN:
-    default:
-      std::string message = "Cannot read file \"" + std::string(final_filename) + "\": Image format not supported";
-      throw(vpImageException(vpImageException::ioError, message));
-    }
+    std::string message = "Cannot read file \"" + filename + "\": No backend able to support this image format";
+    throw(vpImageException(vpImageException::ioError, message));
 #endif
   }
 }
@@ -463,18 +279,10 @@ void vpImageIo::write(const vpImage<unsigned char> &I, const std::string &filena
     writePPM(I, filename);
     break;
   case FORMAT_JPEG:
-#ifdef VISP_HAVE_JPEG
     writeJPEG(I, filename);
-#else
-    try_opencv_writer = true;
-#endif
     break;
   case FORMAT_PNG:
-#ifdef VISP_HAVE_PNG
     writePNG(I, filename);
-#else
-    try_opencv_writer = true;
-#endif
     break;
   case FORMAT_TIFF:
   case FORMAT_BMP:
@@ -488,30 +296,11 @@ void vpImageIo::write(const vpImage<unsigned char> &I, const std::string &filena
   }
 
   if (try_opencv_writer) {
-#if VISP_HAVE_OPENCV_VERSION >= 0x020100
-    // std::cout << "Use opencv to write the image" << std::endl;
-    cv::Mat cvI;
-    vpImageConvert::convert(I, cvI);
-    cv::imwrite(filename, cvI);
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    writeOpenCV(I, filename);
 #else
-    switch (getFormat(filename)) {
-    case FORMAT_JPEG:
-      writeJPEG(I, filename);
-      break;
-    case FORMAT_PNG:
-      writePNG(I, filename);
-      break;
-    case FORMAT_BMP:
-    case FORMAT_TIFF:
-    case FORMAT_DIB:
-    case FORMAT_PBM:
-    case FORMAT_RASTER:
-    case FORMAT_JPEG2000:
-    case FORMAT_UNKNOWN:
-    default:
-      vpCERROR << "Cannot write file: Image format not supported..." << std::endl;
-      throw(vpImageException(vpImageException::ioError, "Cannot write file: Image format not supported"));
-    }
+    std::string message = "Cannot write file \"" + filename + "\": No backend able to support this image format";
+    throw(vpImageException(vpImageException::ioError, message));
 #endif
   }
 }
@@ -542,18 +331,10 @@ void vpImageIo::write(const vpImage<vpRGBa> &I, const std::string &filename)
     writePPM(I, filename);
     break;
   case FORMAT_JPEG:
-#ifdef VISP_HAVE_JPEG
     writeJPEG(I, filename);
-#else
-    try_opencv_writer = true;
-#endif
     break;
   case FORMAT_PNG:
-#ifdef VISP_HAVE_PNG
     writePNG(I, filename);
-#else
-    try_opencv_writer = true;
-#endif
     break;
   case FORMAT_TIFF:
   case FORMAT_BMP:
@@ -567,1735 +348,250 @@ void vpImageIo::write(const vpImage<vpRGBa> &I, const std::string &filename)
   }
 
   if (try_opencv_writer) {
-#if VISP_HAVE_OPENCV_VERSION >= 0x020100
-    // std::cout << "Use opencv to write the image" << std::endl;
-    cv::Mat cvI;
-    vpImageConvert::convert(I, cvI);
-    cv::imwrite(filename, cvI);
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    writeOpenCV(I, filename);
 #else
-    switch (getFormat(filename)) {
-    case FORMAT_JPEG:
-      writeJPEG(I, filename);
-      break;
-    case FORMAT_PNG:
-      writePNG(I, filename);
-      break;
-    case FORMAT_BMP:
-    case FORMAT_TIFF:
-    case FORMAT_DIB:
-    case FORMAT_PBM:
-    case FORMAT_RASTER:
-    case FORMAT_JPEG2000:
-    case FORMAT_UNKNOWN:
-    default:
-      vpCERROR << "Cannot write file: Image format not supported..." << std::endl;
-      throw(vpImageException(vpImageException::ioError, "Cannot write file: Image format not supported"));
-  }
+    std::string message = "Cannot write file \"" + filename + "\": No backend able to support this image format";
+    throw(vpImageException(vpImageException::ioError, message));
 #endif
   }
 }
 
-//--------------------------------------------------------------------------
-// PFM
-//--------------------------------------------------------------------------
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function is built like portable gray pixmap (eg PGM P5) file.
-  but considers float image data.
-
-  \param I : Image to save as a (PFM P8) file.
-  \param filename : Name of the file containing the image.
-*/
-
-void vpImageIo::writePFM(const vpImage<float> &I, const std::string &filename)
+void vpImageIo::readJPEG(vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend)
 {
-  FILE *fd;
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot write PFM image: filename empty"));
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_JPEG)
+    readJPEGLibjpeg(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": Libjpeg backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    readOpenCV(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    readSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    readStb(I, filename);
   }
+}
 
-  fd = fopen(filename.c_str(), "wb");
-
-  if (fd == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PFM file \"%s\"", filename.c_str()));
+void vpImageIo::readJPEG(vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend)
+{
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_JPEG)
+    readJPEGLibjpeg(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": Libjpeg backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    readOpenCV(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    readSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    readStb(I, filename);
   }
+}
 
-  // Write the head
-  fprintf(fd, "P8\n");                                 // Magic number
-  fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
-  fprintf(fd, "255\n");                                // Max level
-
-  // Write the bitmap
-  size_t ierr;
-  size_t nbyte = I.getWidth() * I.getHeight();
-
-  ierr = fwrite(I.bitmap, sizeof(float), nbyte, fd);
-  if (ierr != nbyte) {
-    fclose(fd);
-    throw(vpImageException(vpImageException::ioError, "Cannot save PFM file \"%s\": only %d bytes over %d saved ",
-                           filename.c_str(), ierr, nbyte));
+void vpImageIo::readPNG(vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend)
+{
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_PNG)
+    readPNGLibpng(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": Libpng backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    readOpenCV(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    readSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    readStb(I, filename);
   }
-
-  fflush(fd);
-  fclose(fd);
 }
-//--------------------------------------------------------------------------
-// PGM
-//--------------------------------------------------------------------------
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a portable gray pixmap (PGM P5) file.
-
-  \param I : Image to save as a (PGM P5) file.
-  \param filename : Name of the file containing the image.
-*/
 
-void vpImageIo::writePGM(const vpImage<unsigned char> &I, const std::string &filename)
+void vpImageIo::readPNG(vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend)
 {
-
-  FILE *fd;
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty"));
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_PNG)
+    readPNGLibpng(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": Libpng backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    readOpenCV(I, filename);
+#else
+    std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    readSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    readStb(I, filename);
   }
+}
 
-  fd = fopen(filename.c_str(), "wb");
-
-  if (fd == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str()));
+void vpImageIo::writeJPEG(const vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend)
+{
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_JPEG)
+    writeJPEGLibjpeg(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": Libjpeg backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    writeOpenCV(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    writeJPEGSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    writeJPEGStb(I, filename);
   }
+}
 
-  // Write the head
-  fprintf(fd, "P5\n");                                 // Magic number
-  fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
-  fprintf(fd, "255\n");                                // Max level
+void vpImageIo::writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend)
+{
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_JPEG)
+    writeJPEGLibjpeg(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": Libjpeg backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    writeOpenCV(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    writeJPEGSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    writeJPEGStb(I, filename);
+  }
+}
 
-  // Write the bitmap
-  size_t ierr;
-  size_t nbyte = I.getWidth() * I.getHeight();
+void vpImageIo::writePNG(const vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend)
+{
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_PNG)
+    writePNGLibpng(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": Libpng backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    writeOpenCV(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    writePNGSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    writePNGStb(I, filename);
+  }
+}
 
-  ierr = fwrite(I.bitmap, sizeof(unsigned char), nbyte, fd);
-  if (ierr != nbyte) {
-    fclose(fd);
-    throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %d over %d bytes saved",
-                           filename.c_str(), ierr, nbyte));
+void vpImageIo::writePNG(const vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend)
+{
+  if (backend == IO_LIB_BACKEND) {
+#if defined(VISP_HAVE_PNG)
+    writePNGLibpng(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": Libpng backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_OPENCV_BACKEND) {
+#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
+    writeOpenCV(I, filename);
+#else
+    std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available";
+    throw(vpImageException(vpImageException::ioError, message));
+#endif
+  } else if (backend == IO_SIMDLIB_BACKEND) {
+    writePNGSimdlib(I, filename);
+  } else if (backend == IO_STB_IMAGE_BACKEND) {
+    writePNGStb(I, filename);
   }
+}
 
-  fflush(fd);
-  fclose(fd);
+void vpImageIo::writePFM(const vpImage<float> &I, const std::string &filename)
+{
+  vp_writePFM(I, filename);
 }
 
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a portable gray pixmap (PGM P5) file.
+void vpImageIo::writePGM(const vpImage<unsigned char> &I, const std::string &filename)
+{
+  vp_writePGM(I, filename);
+}
 
-  \param I : Image to save as a (PGM P5) file.
-  \param filename : Name of the file containing the image.
-*/
 void vpImageIo::writePGM(const vpImage<short> &I, const std::string &filename)
 {
-  vpImage<unsigned char> Iuc;
-  unsigned int nrows = I.getHeight();
-  unsigned int ncols = I.getWidth();
-
-  Iuc.resize(nrows, ncols);
-
-  for (unsigned int i = 0; i < nrows * ncols; i++)
-    Iuc.bitmap[i] = (unsigned char)I.bitmap[i];
-
-  vpImageIo::writePGM(Iuc, filename);
+  vp_writePGM(I, filename);
 }
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a portable gray pixmap (PGM P5) file.
-  Color image is converted into a grayscale image.
-
-  \param I : Image to save as a (PGM P5) file.
-  \param filename : Name of the file containing the image.
-*/
 
 void vpImageIo::writePGM(const vpImage<vpRGBa> &I, const std::string &filename)
 {
-
-  FILE *fd;
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file: filename empty"));
-  }
-
-  fd = fopen(filename.c_str(), "wb");
-
-  if (fd == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PGM file \"%s\"", filename.c_str()));
-  }
-
-  // Write the head
-  fprintf(fd, "P5\n");                                 // Magic number
-  fprintf(fd, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
-  fprintf(fd, "255\n");                                // Max level
-
-  // Write the bitmap
-  size_t ierr;
-  size_t nbyte = I.getWidth() * I.getHeight();
-
-  vpImage<unsigned char> Itmp;
-  vpImageConvert::convert(I, Itmp);
-
-  ierr = fwrite(Itmp.bitmap, sizeof(unsigned char), nbyte, fd);
-  if (ierr != nbyte) {
-    fclose(fd);
-    throw(vpImageException(vpImageException::ioError, "Cannot save PGM file \"%s\": only %d over %d bytes saved",
-                           filename.c_str(), ierr, nbyte));
-  }
-
-  fflush(fd);
-  fclose(fd);
+  vp_writePGM(I, filename);
 }
 
-/*!
-  Read a PFM P8 file and initialize a float image.
-
-  Read the contents of the portable gray pixmap (PFM P8) filename, allocate
-  memory for the corresponding image, and set the bitmap whith the content of
-  the file.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-
-*/
-
 void vpImageIo::readPFM(vpImage<float> &I, const std::string &filename)
 {
-  unsigned int w = 0, h = 0, maxval = 0;
-  unsigned int w_max = 100000, h_max = 100000, maxval_max = 255;
-  std::string magic("P8");
-
-  std::ifstream fd(filename.c_str(), std::ios::binary);
-
-  // Open the filename
-  if (!fd.is_open()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str()));
-  }
-
-  vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval);
-
-  if (w > w_max || h > h_max) {
-    fd.close();
-    throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str()));
-  }
-  if (maxval > maxval_max) {
-    fd.close();
-    throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str()));
-  }
-
-  if ((h != I.getHeight()) || (w != I.getWidth())) {
-    I.resize(h, w);
-  }
-
-  unsigned int nbyte = I.getHeight() * I.getWidth();
-  fd.read((char *)I.bitmap, sizeof(float) * nbyte);
-  if (!fd) {
-    fd.close();
-    throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", fd.gcount(), nbyte,
-                           filename.c_str()));
-  }
-
-  fd.close();
+  vp_readPFM(I, filename);
 }
 
-/*!
-  Read a PGM P5 file and initialize a scalar image.
-
-  Read the contents of the portable gray pixmap (PGM P5) filename, allocate
-  memory for the corresponding image, and set the bitmap whith the content of
-  the file.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-*/
-
 void vpImageIo::readPGM(vpImage<unsigned char> &I, const std::string &filename)
 {
-  unsigned int w = 0, h = 0, maxval = 0;
-  unsigned int w_max = 100000, h_max = 100000, maxval_max = 255;
-  std::string magic("P5");
-
-  std::ifstream fd(filename.c_str(), std::ios::binary);
-
-  // Open the filename
-  if (!fd.is_open()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str()));
-  }
-
-  vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval);
-
-  if (w > w_max || h > h_max) {
-    fd.close();
-    throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str()));
-  }
-  if (maxval > maxval_max) {
-    fd.close();
-    throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str()));
-  }
-
-  if ((h != I.getHeight()) || (w != I.getWidth())) {
-    I.resize(h, w);
-  }
-
-  unsigned int nbyte = I.getHeight() * I.getWidth();
-  fd.read((char *)I.bitmap, nbyte);
-  if (!fd) {
-    fd.close();
-    throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"", fd.gcount(), nbyte,
-                           filename.c_str()));
-  }
-
-  fd.close();
+  vp_readPGM(I, filename);
 }
 
-/*!
-  Read a PGM P5 file and initialize a scalar image.
-
-  Read the contents of the portable gray pixmap (PGM P5) filename, allocate
-  memory for the corresponding image, and set the bitmap whith the content of
-  the file.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  The gray level image contained in the \e filename is converted in a
-  color image in \e I.
-
-  \param I : Color image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-*/
-
 void vpImageIo::readPGM(vpImage<vpRGBa> &I, const std::string &filename)
 {
-  vpImage<unsigned char> Itmp;
+  vp_readPGM(I, filename);
+}
 
-  vpImageIo::readPGM(Itmp, filename);
+void vpImageIo::readPPM(vpImage<unsigned char> &I, const std::string &filename)
+{
+  vp_readPPM(I, filename);
+}
 
-  vpImageConvert::convert(Itmp, I);
+void vpImageIo::readPPM(vpImage<vpRGBa> &I, const std::string &filename)
+{
+  vp_readPPM(I, filename);
 }
 
-//--------------------------------------------------------------------------
-// PPM
-//--------------------------------------------------------------------------
-
-/*!
-  Read the contents of the portable pixmap (PPM P6) filename, allocate memory
-  for the corresponding gray level image, convert the data in gray level, and
-  set the bitmap whith the gray level data. That means that the image \e I is
-  a "black and white" rendering of the original image in \e filename, as in a
-  black and white photograph. The quantization formula used is \f$0,299 r +
-  0,587 g + 0,114 b\f$.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-
-*/
-void vpImageIo::readPPM(vpImage<unsigned char> &I, const std::string &filename)
-{
-  vpImage<vpRGBa> Itmp;
-
-  vpImageIo::readPPM(Itmp, filename);
-
-  vpImageConvert::convert(Itmp, I);
-}
-
-/*!
-  Read the contents of the portable pixmap (PPM P6) filename,
-  allocate memory for the corresponding vpRGBa image.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::readPPM(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  unsigned int w = 0, h = 0, maxval = 0;
-  unsigned int w_max = 100000, h_max = 100000, maxval_max = 255;
-  std::string magic("P6");
-
-  std::ifstream fd(filename.c_str(), std::ios::binary);
-
-  // Open the filename
-  if (!fd.is_open()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot open file \"%s\"", filename.c_str()));
-  }
-
-  vp_decodeHeaderPNM(filename, fd, magic, w, h, maxval);
-
-  if (w > w_max || h > h_max) {
-    fd.close();
-    throw(vpException(vpException::badValue, "Bad image size in \"%s\"", filename.c_str()));
-  }
-  if (maxval > maxval_max) {
-    fd.close();
-    throw(vpImageException(vpImageException::ioError, "Bad maxval in \"%s\"", filename.c_str()));
-  }
-
-  if ((h != I.getHeight()) || (w != I.getWidth())) {
-    I.resize(h, w);
-  }
-
-  for (unsigned int i = 0; i < I.getHeight(); i++) {
-    for (unsigned int j = 0; j < I.getWidth(); j++) {
-      unsigned char rgb[3];
-      fd.read((char *)&rgb, 3);
-
-      if (!fd) {
-        fd.close();
-        throw(vpImageException(vpImageException::ioError, "Read only %d of %d bytes in file \"%s\"",
-                               (i * I.getWidth() + j) * 3 + fd.gcount(), I.getSize() * 3, filename.c_str()));
-      }
-
-      I[i][j].R = rgb[0];
-      I[i][j].G = rgb[1];
-      I[i][j].B = rgb[2];
-      I[i][j].A = vpRGBa::alpha_default;
-    }
-  }
-
-  fd.close();
-}
-
-/*!
-  Write the content of the bitmap in the file which name is given by \e
-  filename. This function writes a portable gray pixmap (PPM P6) file.
-  grayscale image is converted into a color image vpRGBa.
-
-  \param I : Image to save as a (PPM P6) file.
-  \param filename : Name of the file containing the image.
-
-*/
-
 void vpImageIo::writePPM(const vpImage<unsigned char> &I, const std::string &filename)
 {
-  vpImage<vpRGBa> Itmp;
-
-  vpImageConvert::convert(I, Itmp);
-
-  vpImageIo::writePPM(Itmp, filename);
+  vp_writePPM(I, filename);
 }
 
-/*!
-  Write the content of the bitmap in the file which name is given by \e
-  filename. This function writes a portable gray pixmap (PPM P6) file.
-
-  \param I : Image to save as a (PPM P6) file.
-  \param filename : Name of the file containing the image.
-*/
 void vpImageIo::writePPM(const vpImage<vpRGBa> &I, const std::string &filename)
 {
-  FILE *f;
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PPM file: filename empty"));
-  }
-
-  f = fopen(filename.c_str(), "wb");
-
-  if (f == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PPM file \"%s\"", filename.c_str()));
-  }
-
-  fprintf(f, "P6\n");                                 // Magic number
-  fprintf(f, "%u %u\n", I.getWidth(), I.getHeight()); // Image size
-  fprintf(f, "%d\n", 255);                            // Max level
-
-  for (unsigned int i = 0; i < I.getHeight(); i++) {
-    for (unsigned int j = 0; j < I.getWidth(); j++) {
-      vpRGBa v = I[i][j];
-      unsigned char rgb[3];
-      rgb[0] = v.R;
-      rgb[1] = v.G;
-      rgb[2] = v.B;
-
-      size_t res = fwrite(&rgb, 1, 3, f);
-      if (res != 3) {
-        fclose(f);
-        throw(vpImageException(vpImageException::ioError, "cannot write file \"%s\"", filename.c_str()));
-      }
-    }
-  }
-
-  fflush(f);
-  fclose(f);
-}
-
-//--------------------------------------------------------------------------
-// JPEG
-//--------------------------------------------------------------------------
-
-#if defined(VISP_HAVE_JPEG)
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a JPEG file.
-
-  \param I : Image to save as a JPEG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writeJPEG(const vpImage<unsigned char> &I, const std::string &filename)
-{
-  struct jpeg_compress_struct cinfo;
-  struct jpeg_error_mgr jerr;
-  FILE *file;
-
-  cinfo.err = jpeg_std_error(&jerr);
-  jpeg_create_compress(&cinfo);
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "wb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str()));
-  }
-
-  unsigned int width = I.getWidth();
-  unsigned int height = I.getHeight();
-
-  jpeg_stdio_dest(&cinfo, file);
-
-  cinfo.image_width = width;
-  cinfo.image_height = height;
-  cinfo.input_components = 1;
-  cinfo.in_color_space = JCS_GRAYSCALE;
-  jpeg_set_defaults(&cinfo);
-
-  jpeg_start_compress(&cinfo, TRUE);
-
-  unsigned char *line;
-  line = new unsigned char[width];
-  unsigned char *input = (unsigned char *)I.bitmap;
-  while (cinfo.next_scanline < cinfo.image_height) {
-    for (unsigned int i = 0; i < width; i++) {
-      line[i] = *(input);
-      input++;
-    }
-    jpeg_write_scanlines(&cinfo, &line, 1);
-  }
-
-  jpeg_finish_compress(&cinfo);
-  jpeg_destroy_compress(&cinfo);
-  delete[] line;
-  fclose(file);
-}
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a JPEG file.
-
-  \param I : Image to save as a JPEG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename)
-{
-  struct jpeg_compress_struct cinfo;
-  struct jpeg_error_mgr jerr;
-  FILE *file;
-
-  cinfo.err = jpeg_std_error(&jerr);
-  jpeg_create_compress(&cinfo);
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "wb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create JPEG file \"%s\"", filename.c_str()));
-  }
-
-  unsigned int width = I.getWidth();
-  unsigned int height = I.getHeight();
-
-  jpeg_stdio_dest(&cinfo, file);
-
-  cinfo.image_width = width;
-  cinfo.image_height = height;
-  cinfo.input_components = 3;
-  cinfo.in_color_space = JCS_RGB;
-  jpeg_set_defaults(&cinfo);
-
-  jpeg_start_compress(&cinfo, TRUE);
-
-  unsigned char *line;
-  line = new unsigned char[3 * width];
-  unsigned char *input = (unsigned char *)I.bitmap;
-  while (cinfo.next_scanline < cinfo.image_height) {
-    for (unsigned int i = 0; i < width; i++) {
-      line[i * 3] = *(input);
-      input++;
-      line[i * 3 + 1] = *(input);
-      input++;
-      line[i * 3 + 2] = *(input);
-      input++;
-      input++;
-    }
-    jpeg_write_scanlines(&cinfo, &line, 1);
-  }
-
-  jpeg_finish_compress(&cinfo);
-  jpeg_destroy_compress(&cinfo);
-  delete[] line;
-  fclose(file);
-}
-
-/*!
-  Read the contents of the JPEG file, allocate memory
-  for the corresponding gray level image, if necessary convert the data in
-  gray level, and set the bitmap whith the gray level data. That means that
-  the image \e I is a "black and white" rendering of the original image in \e
-  filename, as in a black and white photograph. If necessary, the quantization
-  formula used is \f$0,299 r + 0,587 g + 0,114 b\f$.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-
-*/
-void vpImageIo::readJPEG(vpImage<unsigned char> &I, const std::string &filename)
-{
-  struct jpeg_decompress_struct cinfo;
-  struct jpeg_error_mgr jerr;
-  FILE *file;
-
-  cinfo.err = jpeg_std_error(&jerr);
-  jpeg_create_decompress(&cinfo);
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "rb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str()));
-  }
-
-  jpeg_stdio_src(&cinfo, file);
-  jpeg_read_header(&cinfo, TRUE);
-
-  unsigned int width = cinfo.image_width;
-  unsigned int height = cinfo.image_height;
-
-  if ((width != I.getWidth()) || (height != I.getHeight()))
-    I.resize(height, width);
-
-  jpeg_start_decompress(&cinfo);
-
-  unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components);
-  JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1);
-
-  if (cinfo.out_color_space == JCS_RGB) {
-    vpImage<vpRGBa> Ic(height, width);
-    unsigned char *output = (unsigned char *)Ic.bitmap;
-    while (cinfo.output_scanline < cinfo.output_height) {
-      jpeg_read_scanlines(&cinfo, buffer, 1);
-      for (unsigned int i = 0; i < width; i++) {
-        *(output++) = buffer[0][i * 3];
-        *(output++) = buffer[0][i * 3 + 1];
-        *(output++) = buffer[0][i * 3 + 2];
-        *(output++) = vpRGBa::alpha_default;
-      }
-    }
-    vpImageConvert::convert(Ic, I);
-  }
-
-  else if (cinfo.out_color_space == JCS_GRAYSCALE) {
-    while (cinfo.output_scanline < cinfo.output_height) {
-      unsigned int row = cinfo.output_scanline;
-      jpeg_read_scanlines(&cinfo, buffer, 1);
-      memcpy(I[row], buffer[0], rowbytes);
-    }
-  }
-
-  jpeg_finish_decompress(&cinfo);
-  jpeg_destroy_decompress(&cinfo);
-  fclose(file);
-}
-
-/*!
-  Read a JPEG file and initialize a scalar image.
-
-  Read the contents of the JPEG file, allocate
-  memory for the corresponding image, and set
-  the bitmap whith the content of
-  the file.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  If the file corresponds to a grayscaled image, a conversion is done to deal
-  with \e I which is a color image.
-
-  \param I : Color image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::readJPEG(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  struct jpeg_decompress_struct cinfo;
-  struct jpeg_error_mgr jerr;
-  FILE *file;
-
-  cinfo.err = jpeg_std_error(&jerr);
-  jpeg_create_decompress(&cinfo);
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG image: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "rb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read JPEG file \"%s\"", filename.c_str()));
-  }
-
-  jpeg_stdio_src(&cinfo, file);
-
-  jpeg_read_header(&cinfo, TRUE);
-
-  unsigned int width = cinfo.image_width;
-  unsigned int height = cinfo.image_height;
-
-  if ((width != I.getWidth()) || (height != I.getHeight()))
-    I.resize(height, width);
-
-  jpeg_start_decompress(&cinfo);
-
-  unsigned int rowbytes = cinfo.output_width * (unsigned int)(cinfo.output_components);
-  JSAMPARRAY buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, rowbytes, 1);
-
-  if (cinfo.out_color_space == JCS_RGB) {
-    unsigned char *output = (unsigned char *)I.bitmap;
-    while (cinfo.output_scanline < cinfo.output_height) {
-      jpeg_read_scanlines(&cinfo, buffer, 1);
-      for (unsigned int i = 0; i < width; i++) {
-        *(output++) = buffer[0][i * 3];
-        *(output++) = buffer[0][i * 3 + 1];
-        *(output++) = buffer[0][i * 3 + 2];
-        *(output++) = vpRGBa::alpha_default;
-      }
-    }
-  }
-
-  else if (cinfo.out_color_space == JCS_GRAYSCALE) {
-    vpImage<unsigned char> Ig(height, width);
-
-    while (cinfo.output_scanline < cinfo.output_height) {
-      unsigned int row = cinfo.output_scanline;
-      jpeg_read_scanlines(&cinfo, buffer, 1);
-      memcpy(Ig[row], buffer[0], rowbytes);
-    }
-
-    vpImageConvert::convert(Ig, I);
-  }
-
-  jpeg_finish_decompress(&cinfo);
-  jpeg_destroy_decompress(&cinfo);
-  fclose(file);
-}
-
-#elif defined(VISP_HAVE_OPENCV)
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a JPEG file.
-
-  \param I : Image to save as a JPEG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writeJPEG(const vpImage<unsigned char> &I, const std::string &filename)
-{
-#if (VISP_HAVE_OPENCV_VERSION >= 0x020408)
-  cv::Mat Ip;
-  vpImageConvert::convert(I, Ip);
-  cv::imwrite(filename.c_str(), Ip);
-#else
-  IplImage *Ip = NULL;
-  vpImageConvert::convert(I, Ip);
-
-  cvSaveImage(filename.c_str(), Ip);
-
-  cvReleaseImage(&Ip);
-#endif
-}
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a JPEG file.
-
-  \param I : Image to save as a JPEG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename)
-{
-#if (VISP_HAVE_OPENCV_VERSION >= 0x020408)
-  cv::Mat Ip;
-  vpImageConvert::convert(I, Ip);
-  cv::imwrite(filename.c_str(), Ip);
-#else
-  IplImage *Ip = NULL;
-  vpImageConvert::convert(I, Ip);
-
-  cvSaveImage(filename.c_str(), Ip);
-
-  cvReleaseImage(&Ip);
-#endif
-}
-
-/*!
-  Read the contents of the JPEG file, allocate memory
-  for the corresponding gray level image, if necessary convert the data in
-  gray level, and set the bitmap whith the gray level data. That means that
-  the image \e I is a "black and white" rendering of the original image in \e
-  filename, as in a black and white photograph. If necessary, the quantization
-  formula used is \f$0,299 r + 0,587 g + 0,114 b\f$.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  If EXIF information is embedded in the image file, the EXIF orientation is ignored.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-
-*/
-void vpImageIo::readJPEG(vpImage<unsigned char> &I, const std::string &filename)
-{
-#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-#if VISP_HAVE_OPENCV_VERSION >= 0x030200
-    int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
-    int flags = cv::IMREAD_GRAYSCALE;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
-    int flags = CV_LOAD_IMAGE_GRAYSCALE;
-#endif
-  cv::Mat Ip = cv::imread(filename.c_str(), flags);
-  if (!Ip.empty())
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-#else
-  IplImage *Ip = NULL;
-  Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE);
-  if (Ip != NULL)
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-  cvReleaseImage(&Ip);
-#endif
-}
-
-/*!
-  Read a JPEG file and initialize a scalar image.
-
-  Read the contents of the JPEG file, allocate
-  memory for the corresponding image, and set
-  the bitmap whith the content of
-  the file.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  If the file corresponds to a grayscaled image, a conversion is done to deal
-  with \e I which is a color image.
-
-  If EXIF information is embedded in the image file, the EXIF orientation is ignored.
-
-  \param I : Color image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::readJPEG(vpImage<vpRGBa> &I, const std::string &filename)
-{
-#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-#if VISP_HAVE_OPENCV_VERSION >= 0x030200
-    int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
-    int flags = cv::IMREAD_GRAYSCALE;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
-    int flags = CV_LOAD_IMAGE_GRAYSCALE;
-#endif
-  cv::Mat Ip = cv::imread(filename.c_str(), flags);
-  if (!Ip.empty())
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-#else
-  IplImage *Ip = NULL;
-  Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_COLOR);
-  if (Ip != NULL)
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-  cvReleaseImage(&Ip);
-#endif
-}
-#else
-void vpImageIo::readJPEG(vpImage<unsigned char> &I, const std::string &filename)
-{
-  int width = 0, height = 0, channels = 0;
-  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_grey);
-  if (image == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
-  }
-  I.init(image, static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
-  stbi_image_free(image);
-}
-void vpImageIo::readJPEG(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  int width = 0, height = 0, channels = 0;
-  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha);
-  if (image == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
-  }
-  I.init(reinterpret_cast<vpRGBa*>(image), static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
-  stbi_image_free(image);
-}
-void vpImageIo::writeJPEG(const vpImage<unsigned char> &I, const std::string &filename)
-{
-  int res = stbi_write_jpg(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_grey,
-                           reinterpret_cast<void*>(I.bitmap), 90);
-  if (res == 0) {
-    throw(vpImageException(vpImageException::ioError, "JPEG write error"));
-  }
-}
-void vpImageIo::writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename)
-{
-  int res = stbi_write_jpg(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
-                           reinterpret_cast<void*>(I.bitmap), 90);
-  if (res == 0) {
-    throw(vpImageException(vpImageException::ioError, "JEPG write error"));
-  }
+  vp_writePPM(I, filename);
 }
-#endif
-
-//--------------------------------------------------------------------------
-// PNG
-//--------------------------------------------------------------------------
-
-#if defined(VISP_HAVE_PNG)
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a PNG file.
-
-  \param I : Image to save as a PNG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writePNG(const vpImage<unsigned char> &I, const std::string &filename)
-{
-  FILE *file;
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "wb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str()));
-  }
-
-  /* create a png info struct */
-  png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
-  if (!png_ptr) {
-    fclose(file);
-    vpERROR_TRACE("Error during png_create_write_struct()\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  png_infop info_ptr = png_create_info_struct(png_ptr);
-  if (!info_ptr) {
-    fclose(file);
-    png_destroy_write_struct(&png_ptr, NULL);
-    vpERROR_TRACE("Error during png_create_info_struct()\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  /* initialize the setjmp for returning properly after a libpng error occured
-   */
-  if (setjmp(png_jmpbuf(png_ptr))) {
-    fclose(file);
-    png_destroy_write_struct(&png_ptr, &info_ptr);
-    vpERROR_TRACE("Error during init_io\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  /* setup libpng for using standard C fwrite() function with our FILE pointer
-   */
-  png_init_io(png_ptr, file);
-
-  unsigned int width = I.getWidth();
-  unsigned int height = I.getHeight();
-  int bit_depth = 8;
-  int color_type = PNG_COLOR_TYPE_GRAY;
-  /* set some useful information from header */
-
-  if (setjmp(png_jmpbuf(png_ptr))) {
-    fclose(file);
-    png_destroy_write_struct(&png_ptr, &info_ptr);
-    vpERROR_TRACE("Error during write header\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
-               PNG_FILTER_TYPE_BASE);
-
-  png_write_info(png_ptr, info_ptr);
-
-  png_bytep *row_ptrs = new png_bytep[height];
-  for (unsigned int i = 0; i < height; i++)
-    row_ptrs[i] = new png_byte[width];
-
-  unsigned char *input = (unsigned char *)I.bitmap;
-
-  for (unsigned int i = 0; i < height; i++) {
-    png_byte *row = row_ptrs[i];
-    for (unsigned int j = 0; j < width; j++) {
-      row[j] = *(input);
-      input++;
-    }
-  }
-
-  png_write_image(png_ptr, row_ptrs);
-
-  png_write_end(png_ptr, NULL);
-
-  for (unsigned int j = 0; j < height; j++)
-    delete[] row_ptrs[j];
-
-  delete[] row_ptrs;
-
-  png_destroy_write_struct(&png_ptr, &info_ptr);
-
-  fclose(file);
-}
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a PNG file.
-
-  \param I : Image to save as a PNG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writePNG(const vpImage<vpRGBa> &I, const std::string &filename)
-{
-  FILE *file;
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "wb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot create PNG file \"%s\"", filename.c_str()));
-  }
-
-  /* create a png info struct */
-  png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
-  if (!png_ptr) {
-    fclose(file);
-    vpERROR_TRACE("Error during png_create_write_struct()\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  png_infop info_ptr = png_create_info_struct(png_ptr);
-  if (!info_ptr) {
-    fclose(file);
-    png_destroy_write_struct(&png_ptr, NULL);
-    vpERROR_TRACE("Error during png_create_info_struct()\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  /* initialize the setjmp for returning properly after a libpng error occured
-   */
-  if (setjmp(png_jmpbuf(png_ptr))) {
-    fclose(file);
-    png_destroy_write_struct(&png_ptr, &info_ptr);
-    vpERROR_TRACE("Error during init_io\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  /* setup libpng for using standard C fwrite() function with our FILE pointer
-   */
-  png_init_io(png_ptr, file);
-
-  unsigned int width = I.getWidth();
-  unsigned int height = I.getHeight();
-  int bit_depth = 8;
-  int color_type = PNG_COLOR_TYPE_RGB;
-  /* set some useful information from header */
-
-  if (setjmp(png_jmpbuf(png_ptr))) {
-    fclose(file);
-    png_destroy_write_struct(&png_ptr, &info_ptr);
-    vpERROR_TRACE("Error during write header\n");
-    throw(vpImageException(vpImageException::ioError, "PNG write error"));
-  }
-
-  png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
-               PNG_FILTER_TYPE_BASE);
-
-  png_write_info(png_ptr, info_ptr);
-
-  png_bytep *row_ptrs = new png_bytep[height];
-  for (unsigned int i = 0; i < height; i++)
-    row_ptrs[i] = new png_byte[3 * width];
-
-  unsigned char *input = (unsigned char *)I.bitmap;
-  ;
-
-  for (unsigned int i = 0; i < height; i++) {
-    png_byte *row = row_ptrs[i];
-    for (unsigned int j = 0; j < width; j++) {
-      row[3 * j] = *(input);
-      input++;
-      row[3 * j + 1] = *(input);
-      input++;
-      row[3 * j + 2] = *(input);
-      input++;
-      input++;
-    }
-  }
-
-  png_write_image(png_ptr, row_ptrs);
-
-  png_write_end(png_ptr, NULL);
-
-  for (unsigned int j = 0; j < height; j++)
-    delete[] row_ptrs[j];
-
-  delete[] row_ptrs;
-
-  png_destroy_write_struct(&png_ptr, &info_ptr);
-
-  fclose(file);
-}
-
-/*!
-  Read the contents of the PNG file, allocate memory
-  for the corresponding gray level image, if necessary convert the data in
-  gray level, and set the bitmap whith the gray level data. That means that
-  the image \e I is a "black and white" rendering of the original image in \e
-  filename, as in a black and white photograph. If necessary, the quantization
-  formula used is \f$0,299 r + 0,587 g + 0,114 b\f$.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-
-*/
-void vpImageIo::readPNG(vpImage<unsigned char> &I, const std::string &filename)
-{
-  FILE *file;
-  png_byte magic[8];
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "rb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str()));
-  }
-
-  /* read magic number */
-  if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) {
-    fclose(file);
-    throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str()));
-  }
-
-  /* check for valid magic number */
-  if (png_sig_cmp(magic, 0, sizeof(magic))) {
-    fclose(file);
-    throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image",
-                           filename.c_str()));
-  }
-
-  /* create a png read struct */
-  // printf("version %s\n", PNG_LIBPNG_VER_STRING);
-  png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
-  if (png_ptr == NULL) {
-    fprintf(stderr, "error: can't create a png read structure!\n");
-    fclose(file);
-    throw(vpImageException(vpImageException::ioError, "error reading png file"));
-  }
-
-  /* create a png info struct */
-  png_infop info_ptr = png_create_info_struct(png_ptr);
-  if (info_ptr == NULL) {
-    fprintf(stderr, "error: can't create a png info structure!\n");
-    fclose(file);
-    png_destroy_read_struct(&png_ptr, NULL, NULL);
-    throw(vpImageException(vpImageException::ioError, "error reading png file"));
-  }
-
-  /* initialize the setjmp for returning properly after a libpng error occured
-   */
-  if (setjmp(png_jmpbuf(png_ptr))) {
-    fclose(file);
-    png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
-    vpERROR_TRACE("Error during init io\n");
-    throw(vpImageException(vpImageException::ioError, "PNG read error"));
-  }
-
-  /* setup libpng for using standard C fread() function with our FILE pointer
-   */
-  png_init_io(png_ptr, file);
-
-  /* tell libpng that we have already read the magic number */
-  png_set_sig_bytes(png_ptr, sizeof(magic));
-
-  /* read png info */
-  png_read_info(png_ptr, info_ptr);
-
-  unsigned int width = png_get_image_width(png_ptr, info_ptr);
-  unsigned int height = png_get_image_height(png_ptr, info_ptr);
-
-  unsigned int bit_depth, channels, color_type;
-  /* get some useful information from header */
-  bit_depth = png_get_bit_depth(png_ptr, info_ptr);
-  channels = png_get_channels(png_ptr, info_ptr);
-  color_type = png_get_color_type(png_ptr, info_ptr);
-
-  /* convert index color images to RGB images */
-  if (color_type == PNG_COLOR_TYPE_PALETTE)
-    png_set_palette_to_rgb(png_ptr);
-
-  /* convert 1-2-4 bits grayscale images to 8 bits grayscale. */
-  if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8)
-    png_set_expand(png_ptr);
-
-  //  if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS))
-  //    png_set_tRNS_to_alpha (png_ptr);
-
-  if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA)
-    png_set_strip_alpha(png_ptr);
-
-  if (bit_depth == 16)
-    png_set_strip_16(png_ptr);
-  else if (bit_depth < 8)
-    png_set_packing(png_ptr);
-
-  /* update info structure to apply transformations */
-  png_read_update_info(png_ptr, info_ptr);
-
-  channels = png_get_channels(png_ptr, info_ptr);
-
-  if ((width != I.getWidth()) || (height != I.getHeight()))
-    I.resize(height, width);
-
-  png_bytep *rowPtrs = new png_bytep[height];
-
-  unsigned int stride = png_get_rowbytes(png_ptr, info_ptr);
-  unsigned char *data = new unsigned char[stride * height];
-
-  for (unsigned int i = 0; i < height; i++)
-    rowPtrs[i] = (png_bytep)data + (i * stride);
-
-  png_read_image(png_ptr, rowPtrs);
-
-  vpImage<vpRGBa> Ic(height, width);
-  unsigned char *output;
-
-  switch (channels) {
-  case 1:
-    output = (unsigned char *)I.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i];
-    }
-    break;
-
-  case 2:
-    output = (unsigned char *)I.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i * 2];
-    }
-    break;
-
-  case 3:
-    output = (unsigned char *)Ic.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i * 3];
-      *(output++) = data[i * 3 + 1];
-      *(output++) = data[i * 3 + 2];
-      *(output++) = vpRGBa::alpha_default;
-    }
-    vpImageConvert::convert(Ic, I);
-    break;
-
-  case 4:
-    output = (unsigned char *)Ic.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i * 4];
-      *(output++) = data[i * 4 + 1];
-      *(output++) = data[i * 4 + 2];
-      *(output++) = data[i * 4 + 3];
-    }
-    vpImageConvert::convert(Ic, I);
-    break;
-  }
-
-  delete[](png_bytep) rowPtrs;
-  delete[] data;
-  png_read_end(png_ptr, NULL);
-  png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
-  fclose(file);
-}
-
-/*!
-  Read a PNG file and initialize a scalar image.
-
-  Read the contents of the PNG file, allocate
-  memory for the corresponding image, and set
-  the bitmap whith the content of
-  the file.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  If the file corresponds to a grayscaled image, a conversion is done to deal
-  with \e I which is a color image.
-
-  \param I : Color image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::readPNG(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  FILE *file;
-  png_byte magic[8];
-
-  // Test the filename
-  if (filename.empty()) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read PNG image: filename empty"));
-  }
-
-  file = fopen(filename.c_str(), "rb");
-
-  if (file == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Cannot read file \"%s\"", filename.c_str()));
-  }
-
-  /* read magic number */
-  if (fread(magic, 1, sizeof(magic), file) != sizeof(magic)) {
-    fclose(file);
-    throw(vpImageException(vpImageException::ioError, "Cannot read magic number in file \"%s\"", filename.c_str()));
-  }
-
-  /* check for valid magic number */
-  if (png_sig_cmp(magic, 0, sizeof(magic))) {
-    fclose(file);
-    throw(vpImageException(vpImageException::ioError, "Cannot read PNG file: \"%s\" is not a valid PNG image",
-                           filename.c_str()));
-  }
-
-  /* create a png read struct */
-  png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
-  if (!png_ptr) {
-    fclose(file);
-    vpERROR_TRACE("Error during png_create_read_struct()\n");
-    throw(vpImageException(vpImageException::ioError, "PNG read error"));
-  }
-
-  /* create a png info struct */
-  png_infop info_ptr = png_create_info_struct(png_ptr);
-  if (!info_ptr) {
-    fclose(file);
-    png_destroy_read_struct(&png_ptr, NULL, NULL);
-    vpERROR_TRACE("Error during png_create_info_struct()\n");
-    throw(vpImageException(vpImageException::ioError, "PNG read error"));
-  }
-
-  /* initialize the setjmp for returning properly after a libpng error occured
-   */
-  if (setjmp(png_jmpbuf(png_ptr))) {
-    fclose(file);
-    png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
-    vpERROR_TRACE("Error during init io\n");
-    throw(vpImageException(vpImageException::ioError, "PNG read error"));
-  }
-
-  /* setup libpng for using standard C fread() function with our FILE pointer
-   */
-  png_init_io(png_ptr, file);
-
-  /* tell libpng that we have already read the magic number */
-  png_set_sig_bytes(png_ptr, sizeof(magic));
-
-  /* read png info */
-  png_read_info(png_ptr, info_ptr);
-
-  unsigned int width = png_get_image_width(png_ptr, info_ptr);
-  unsigned int height = png_get_image_height(png_ptr, info_ptr);
-
-  unsigned int bit_depth, channels, color_type;
-  /* get some useful information from header */
-  bit_depth = png_get_bit_depth(png_ptr, info_ptr);
-  channels = png_get_channels(png_ptr, info_ptr);
-  color_type = png_get_color_type(png_ptr, info_ptr);
-
-  /* convert index color images to RGB images */
-  if (color_type == PNG_COLOR_TYPE_PALETTE)
-    png_set_palette_to_rgb(png_ptr);
-
-  /* convert 1-2-4 bits grayscale images to 8 bits grayscale. */
-  if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8)
-    png_set_expand(png_ptr);
-
-  //  if (png_get_valid (png_ptr, info_ptr, PNG_INFO_tRNS))
-  //    png_set_tRNS_to_alpha (png_ptr);
-
-  if (color_type == PNG_COLOR_TYPE_GRAY_ALPHA)
-    png_set_strip_alpha(png_ptr);
-
-  if (bit_depth == 16)
-    png_set_strip_16(png_ptr);
-  else if (bit_depth < 8)
-    png_set_packing(png_ptr);
-
-  /* update info structure to apply transformations */
-  png_read_update_info(png_ptr, info_ptr);
-
-  channels = png_get_channels(png_ptr, info_ptr);
-
-  if ((width != I.getWidth()) || (height != I.getHeight()))
-    I.resize(height, width);
-
-  png_bytep *rowPtrs = new png_bytep[height];
-
-  unsigned int stride = png_get_rowbytes(png_ptr, info_ptr);
-  unsigned char *data = new unsigned char[stride * height];
-
-  for (unsigned int i = 0; i < height; i++)
-    rowPtrs[i] = (png_bytep)data + (i * stride);
-
-  png_read_image(png_ptr, rowPtrs);
-
-  vpImage<unsigned char> Ig(height, width);
-  unsigned char *output;
-
-  switch (channels) {
-  case 1:
-    output = (unsigned char *)Ig.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i];
-    }
-    vpImageConvert::convert(Ig, I);
-    break;
-
-  case 2:
-    output = (unsigned char *)Ig.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i * 2];
-    }
-    vpImageConvert::convert(Ig, I);
-    break;
-
-  case 3:
-    output = (unsigned char *)I.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i * 3];
-      *(output++) = data[i * 3 + 1];
-      *(output++) = data[i * 3 + 2];
-      *(output++) = vpRGBa::alpha_default;
-    }
-    break;
-
-  case 4:
-    output = (unsigned char *)I.bitmap;
-    for (unsigned int i = 0; i < width * height; i++) {
-      *(output++) = data[i * 4];
-      *(output++) = data[i * 4 + 1];
-      *(output++) = data[i * 4 + 2];
-      *(output++) = data[i * 4 + 3];
-    }
-    break;
-  }
-
-  delete[](png_bytep) rowPtrs;
-  delete[] data;
-  png_read_end(png_ptr, NULL);
-  png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
-  fclose(file);
-}
-
-//TODO:
-void vpImageIo::readSimdlib(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  size_t stride = 0, width = 0, height = 0;
-  SimdPixelFormatType format = SimdPixelFormatRgba32;
-  uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format);
-  const bool copyData = false;
-  I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData);
-}
-
-void vpImageIo::readStb(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  int width = 0, height = 0, channels = 0;
-  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha);
-  if (image == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
-  }
-  I.init(reinterpret_cast<vpRGBa*>(image), static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
-  stbi_image_free(image);
-}
-
-inline bool ends_with(std::string const & value, std::string const & ending)
-{
-    if (ending.size() > value.size()) return false;
-    return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
-}
-
-void vpImageIo::writeSimdlib(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  if (ends_with(filename, ".png")) {
-    SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFilePng, 90, filename.c_str());
-  } else {
-    SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, 90, filename.c_str());
-  }
-}
-
-void vpImageIo::writeStb(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  if (ends_with(filename, ".png")) {
-    const int stride_in_bytes = static_cast<int>(4 * I.getWidth());
-    int res = stbi_write_png(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
-                             reinterpret_cast<void*>(I.bitmap), stride_in_bytes);
-    if (res == 0) {
-      throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str()));
-    }
-  } else {
-    int res = stbi_write_jpg(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
-                             reinterpret_cast<void*>(I.bitmap), 90);
-    if (res == 0) {
-      throw(vpImageException(vpImageException::ioError, "JEPG write error"));
-    }
-  }
-}
-
-#elif defined(VISP_HAVE_OPENCV)
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a PNG file.
-
-  \param I : Image to save as a PNG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writePNG(const vpImage<unsigned char> &I, const std::string &filename)
-{
-#if (VISP_HAVE_OPENCV_VERSION >= 0x020408)
-  cv::Mat Ip;
-  vpImageConvert::convert(I, Ip);
-  cv::imwrite(filename.c_str(), Ip);
-#else
-  IplImage *Ip = NULL;
-  vpImageConvert::convert(I, Ip);
-
-  cvSaveImage(filename.c_str(), Ip);
-
-  cvReleaseImage(&Ip);
-#endif
-}
-
-/*!
-  Write the content of the image bitmap in the file which name is given by \e
-  filename. This function writes a PNG file.
-
-  \param I : Image to save as a PNG file.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::writePNG(const vpImage<vpRGBa> &I, const std::string &filename)
-{
-#if (VISP_HAVE_OPENCV_VERSION >= 0x020408)
-  cv::Mat Ip;
-  vpImageConvert::convert(I, Ip);
-  cv::imwrite(filename.c_str(), Ip);
-#else
-  IplImage *Ip = NULL;
-  vpImageConvert::convert(I, Ip);
-
-  cvSaveImage(filename.c_str(), Ip);
-
-  cvReleaseImage(&Ip);
-#endif
-}
-
-/*!
-  Read the contents of the PNG file, allocate memory
-  for the corresponding gray level image, if necessary convert the data in
-  gray level, and set the bitmap whith the gray level data. That means that
-  the image \e I is a "black and white" rendering of the original image in \e
-  filename, as in a black and white photograph. If necessary, the quantization
-  formula used is \f$0,299 r + 0,587 g + 0,114 b\f$.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  If EXIF information is embedded in the image file, the EXIF orientation is ignored.
-
-  \param I : Image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-
-*/
-void vpImageIo::readPNG(vpImage<unsigned char> &I, const std::string &filename)
-{
-#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-#if VISP_HAVE_OPENCV_VERSION >= 0x030200
-    int flags = cv::IMREAD_GRAYSCALE | cv::IMREAD_IGNORE_ORIENTATION;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
-    int flags = cv::IMREAD_GRAYSCALE;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
-    int flags = CV_LOAD_IMAGE_GRAYSCALE;
-#endif
-  cv::Mat Ip = cv::imread(filename.c_str(), flags);
-  if (!Ip.empty())
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-#else
-  IplImage *Ip = NULL;
-  Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE);
-  if (Ip != NULL)
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-  cvReleaseImage(&Ip);
-#endif
-}
-
-/*!
-  Read a PNG file and initialize a scalar image.
-
-  Read the contents of the PNG file, allocate
-  memory for the corresponding image, and set
-  the bitmap whith the content of
-  the file.
-
-  If the image has been already initialized, memory allocation is done
-  only if the new image size is different, else we re-use the same
-  memory space.
-
-  If the file corresponds to a grayscaled image, a conversion is done to deal
-  with \e I which is a color image.
-
-  If EXIF information is embedded in the image file, the EXIF orientation is ignored.
-
-  \param I : Color image to set with the \e filename content.
-  \param filename : Name of the file containing the image.
-*/
-void vpImageIo::readPNG(vpImage<vpRGBa> &I, const std::string &filename)
-{
-#if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-#if VISP_HAVE_OPENCV_VERSION >= 0x030200
-    int flags = cv::IMREAD_COLOR | cv::IMREAD_IGNORE_ORIENTATION;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x030000
-    int flags = cv::IMREAD_COLOR;
-#elif VISP_HAVE_OPENCV_VERSION >= 0x020100
-    int flags = CV_LOAD_IMAGE_COLOR;
-#endif
-  cv::Mat Ip = cv::imread(filename.c_str(), flags);
-  if (!Ip.empty())
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-#else
-  IplImage *Ip = NULL;
-  Ip = cvLoadImage(filename.c_str(), CV_LOAD_IMAGE_COLOR);
-  if (Ip != NULL)
-    vpImageConvert::convert(Ip, I);
-  else
-    throw(vpImageException(vpImageException::ioError, "Can't read the image"));
-  cvReleaseImage(&Ip);
-#endif
-}
-#else
-void vpImageIo::readPNG(vpImage<unsigned char> &I, const std::string &filename)
-{
-  int width = 0, height = 0, channels = 0;
-  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_grey);
-  if (image == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
-  }
-  I.init(image, static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
-  stbi_image_free(image);
-}
-void vpImageIo::readPNG(vpImage<vpRGBa> &I, const std::string &filename)
-{
-  int width = 0, height = 0, channels = 0;
-  unsigned char *image = stbi_load(filename.c_str(), &width, &height, &channels, STBI_rgb_alpha);
-  if (image == NULL) {
-    throw(vpImageException(vpImageException::ioError, "Can't read the image: %s", filename.c_str()));
-  }
-  I.init(reinterpret_cast<vpRGBa*>(image), static_cast<unsigned int>(height), static_cast<unsigned int>(width), true);
-  stbi_image_free(image);
-}
-void vpImageIo::writePNG(const vpImage<unsigned char> &I, const std::string &filename)
-{
-  const int stride_in_bytes = static_cast<int>(I.getWidth());
-  int res = stbi_write_png(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_grey,
-                           reinterpret_cast<void*>(I.bitmap), stride_in_bytes);
-  if (res == 0) {
-    throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str()));
-  }
-}
-void vpImageIo::writePNG(const vpImage<vpRGBa> &I, const std::string &filename)
-{
-  const int stride_in_bytes = static_cast<int>(4 * I.getWidth());
-  int res = stbi_write_png(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
-                           reinterpret_cast<void*>(I.bitmap), stride_in_bytes);
-  if (res == 0) {
-    throw(vpImageException(vpImageException::ioError, "PNG write error: %s", filename.c_str()));
-  }
-}
-#endif
diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp
index 8efe2c759e..3bf19a465e 100644
--- a/modules/io/test/perfImageLoadSave.cpp
+++ b/modules/io/test/perfImageLoadSave.cpp
@@ -64,7 +64,7 @@ TEST_CASE("Benchmark Jpeg image loading", "[benchmark]") {
     vpImage<vpRGBa> I;
 
     BENCHMARK("vpImageIo::readSimdlib()") {
-      vpImageIo::readSimdlib(I, imagePathJpeg);
+      vpImageIo::readJPEG(I, imagePathJpeg, vpImageIo::IO_SIMDLIB_BACKEND);
       return I;
     };
   }
@@ -73,7 +73,7 @@ TEST_CASE("Benchmark Jpeg image loading", "[benchmark]") {
     vpImage<vpRGBa> I;
 
     BENCHMARK("vpImageIo::readStb()") {
-      vpImageIo::readStb(I, imagePathJpeg);
+      vpImageIo::readJPEG(I, imagePathJpeg, vpImageIo::IO_STB_IMAGE_BACKEND);
       return I;
     };
   }
@@ -93,7 +93,7 @@ TEST_CASE("Benchmark Png image loading", "[benchmark]") {
     vpImage<vpRGBa> I;
 
     BENCHMARK("vpImageIo::readSimdlib()") {
-      vpImageIo::readSimdlib(I, imagePathPng);
+      vpImageIo::readPNG(I, imagePathPng, vpImageIo::IO_SIMDLIB_BACKEND);
       return I;
     };
   }
@@ -102,7 +102,7 @@ TEST_CASE("Benchmark Png image loading", "[benchmark]") {
     vpImage<vpRGBa> I;
 
     BENCHMARK("vpImageIo::readStb()") {
-      vpImageIo::readStb(I, imagePathPng);
+      vpImageIo::readPNG(I, imagePathPng, vpImageIo::IO_STB_IMAGE_BACKEND);
       return I;
     };
   }
@@ -122,7 +122,7 @@ TEST_CASE("Benchmark big Png image loading", "[benchmark]") {
     vpImage<vpRGBa> I;
 
     BENCHMARK("vpImageIo::readSimdlib()") {
-      vpImageIo::readSimdlib(I, imagePathPngBig);
+      vpImageIo::readPNG(I, imagePathPngBig, vpImageIo::IO_SIMDLIB_BACKEND);
       return I;
     };
   }
@@ -131,7 +131,7 @@ TEST_CASE("Benchmark big Png image loading", "[benchmark]") {
     vpImage<vpRGBa> I;
 
     BENCHMARK("vpImageIo::readStb()") {
-      vpImageIo::readStb(I, imagePathPngBig);
+      vpImageIo::readPNG(I, imagePathPngBig, vpImageIo::IO_STB_IMAGE_BACKEND);
       return I;
     };
   }
@@ -153,7 +153,7 @@ TEST_CASE("Benchmark Jpeg image saving", "[benchmark]") {
     const std::string filename = "/tmp/Klimt_Simd.jpg";
 
     BENCHMARK("vpImageIo::writeSimdlib()") {
-      vpImageIo::writeSimdlib(I, filename);
+      vpImageIo::writeJPEG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND);
       return I;
     };
   }
@@ -162,7 +162,7 @@ TEST_CASE("Benchmark Jpeg image saving", "[benchmark]") {
     const std::string filename = "/tmp/Klimt_stb.jpg";
 
     BENCHMARK("vpImageIo::writeStb()") {
-      vpImageIo::writeStb(I, filename);
+      vpImageIo::writeJPEG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND);
       return I;
     };
   }
@@ -184,7 +184,7 @@ TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") {
     const std::string filename = "/tmp/Big_images_Simd.jpg";
 
     BENCHMARK("vpImageIo::writeSimdlib()") {
-      vpImageIo::writeSimdlib(I, filename);
+      vpImageIo::writeJPEG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND);
       return I;
     };
   }
@@ -193,7 +193,7 @@ TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") {
     const std::string filename = "/tmp/Big_images_stb.jpg";
 
     BENCHMARK("vpImageIo::writeStb()") {
-      vpImageIo::writeStb(I, filename);
+      vpImageIo::writeJPEG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND);
       return I;
     };
   }
@@ -215,7 +215,7 @@ TEST_CASE("Benchmark Png image saving", "[benchmark]") {
     const std::string filename = "/tmp/Klimt_Simd.png";
 
     BENCHMARK("vpImageIo::writeSimdlib()") {
-      vpImageIo::writeSimdlib(I, filename);
+      vpImageIo::writePNG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND);
       return I;
     };
   }
@@ -224,7 +224,7 @@ TEST_CASE("Benchmark Png image saving", "[benchmark]") {
     const std::string filename = "/tmp/Klimt_stb.png";
 
     BENCHMARK("vpImageIo::writeStb()") {
-      vpImageIo::writeStb(I, filename);
+      vpImageIo::writePNG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND);
       return I;
     };
   }
@@ -246,7 +246,7 @@ TEST_CASE("Benchmark big Png image saving", "[benchmark]") {
     const std::string filename = "/tmp/Big_images_Simd.png";
 
     BENCHMARK("vpImageIo::writeSimdlib()") {
-      vpImageIo::writeSimdlib(I, filename);
+      vpImageIo::writePNG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND);
       return I;
     };
   }
@@ -255,155 +255,12 @@ TEST_CASE("Benchmark big Png image saving", "[benchmark]") {
     const std::string filename = "/tmp/Big_images_stb.png";
 
     BENCHMARK("vpImageIo::writeStb()") {
-      vpImageIo::writeStb(I, filename);
+      vpImageIo::writePNG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND);
       return I;
     };
   }
 }
 
-//TEST_CASE("Benchmark bgr to grayscale (ViSP)", "[benchmark]") {
-//  vpImage<vpRGBa> I;
-//  vpImageIo::read(I, imagePathColor);
-
-//  std::vector<unsigned char> bgr;
-//  common_tools::RGBaToBGR(I, bgr);
-
-//  vpImage<unsigned char> I_gray(I.getHeight(), I.getWidth());
-
-//  BENCHMARK("Benchmark bgr to grayscale (ViSP)") {
-//    vpImageConvert::BGRToGrey(bgr.data(),
-//                              I_gray.bitmap,
-//                              I.getWidth(), I.getHeight(),
-//                              false, nThreads);
-//    return I_gray;
-//  };
-
-//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101)
-//  SECTION("OpenCV Mat type")
-//  {
-//    cv::Mat img;
-//    vpImageConvert::convert(I, img);
-
-//    BENCHMARK("Benchmark bgr to grayscale (ViSP + OpenCV Mat type)") {
-//      vpImageConvert::convert(img, I_gray, false, nThreads);
-//      return I_gray;
-//    };
-//  }
-//#endif
-//}
-//#endif
-
-//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101)
-//TEST_CASE("Benchmark bgr to grayscale (OpenCV)", "[benchmark]") {
-//  cv::Mat img = cv::imread(imagePathColor);
-//  cv::Mat img_gray(img.size(), CV_8UC1);
-
-//  BENCHMARK("Benchmark bgr to grayscale (OpenCV)") {
-//    cv::cvtColor(img, img_gray, cv::COLOR_BGR2GRAY);
-//    return img_gray;
-//  };
-//}
-//#endif
-
-//// C++11 to be able to do bgr.data()
-//#if VISP_CXX_STANDARD >= VISP_CXX_STANDARD_11
-//TEST_CASE("Benchmark bgr to rgba (naive code)", "[benchmark]") {
-//  vpImage<vpRGBa> I;
-//  vpImageIo::read(I, imagePathColor);
-
-//  std::vector<unsigned char> bgr;
-//  common_tools::RGBaToBGR(I, bgr);
-
-//  vpImage<vpRGBa> I_bench(I.getHeight(), I.getWidth());
-//  BENCHMARK("Benchmark bgr to rgba (naive code)") {
-//    common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast<unsigned char*>(I_bench.bitmap),
-//                               I.getWidth(), I.getHeight(), false);
-//    return I_bench;
-//  };
-//}
-
-//TEST_CASE("Benchmark bgr to rgba (ViSP)", "[benchmark]") {
-//  vpImage<vpRGBa> I;
-//  vpImageIo::read(I, imagePathColor);
-
-//  std::vector<unsigned char> bgr;
-//  common_tools::RGBaToBGR(I, bgr);
-
-//  SECTION("Check BGR to RGBa conversion")
-//  {
-//    vpImage<vpRGBa> ref(I.getHeight(), I.getWidth());
-//    common_tools::BGRToRGBaRef(bgr.data(), reinterpret_cast<unsigned char*>(ref.bitmap),
-//                               I.getWidth(), I.getHeight(), false);
-//    vpImage<vpRGBa> rgba(I.getHeight(), I.getWidth());
-//    vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast<unsigned char *>(rgba.bitmap),
-//                              I.getWidth(), I.getHeight(), false);
-
-//    CHECK((rgba == ref));
-//  }
-
-//  vpImage<vpRGBa> I_rgba(I.getHeight(), I.getWidth());
-//  BENCHMARK("Benchmark bgr to rgba (ViSP)") {
-//    vpImageConvert::BGRToRGBa(bgr.data(), reinterpret_cast<unsigned char *>(I_rgba.bitmap),
-//                              I.getWidth(), I.getHeight(), false);
-//    return I_rgba;
-//  };
-
-//#if (VISP_HAVE_OPENCV_VERSION >= 0x020101)
-//  SECTION("OpenCV Mat type")
-//  {
-//    cv::Mat img;
-//    vpImageConvert::convert(I, img);
-
-//    BENCHMARK("Benchmark bgr to rgba (ViSP + OpenCV Mat type)") {
-//      vpImageConvert::convert(img, I_rgba);
-//      return I_rgba;
-//    };
-//  }
-//#endif
-//}
-
-//TEST_CASE("Benchmark bgra to rgba (naive code)", "[benchmark]") {
-//  vpImage<vpRGBa> I;
-//  vpImageIo::read(I, imagePathColor);
-
-//  std::vector<unsigned char> bgra;
-//  common_tools::RGBaToBGRa(I, bgra);
-
-//  vpImage<vpRGBa> I_bench(I.getHeight(), I.getWidth());
-//  BENCHMARK("Benchmark bgra to rgba (naive code)") {
-//    common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast<unsigned char*>(I_bench.bitmap),
-//                                I.getWidth(), I.getHeight(), false);
-//    return I_bench;
-//  };
-//}
-
-//TEST_CASE("Benchmark bgra to rgba (ViSP)", "[benchmark]") {
-//  vpImage<vpRGBa> I;
-//  vpImageIo::read(I, imagePathColor);
-
-//  std::vector<unsigned char> bgra;
-//  common_tools::RGBaToBGRa(I, bgra);
-
-//  SECTION("Check BGRa to RGBa conversion")
-//  {
-//    vpImage<vpRGBa> ref(I.getHeight(), I.getWidth());
-//    common_tools::BGRaToRGBaRef(bgra.data(), reinterpret_cast<unsigned char*>(ref.bitmap),
-//                                I.getWidth(), I.getHeight(), false);
-//    vpImage<vpRGBa> rgba(I.getHeight(), I.getWidth());
-//    vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast<unsigned char *>(rgba.bitmap),
-//                               I.getWidth(), I.getHeight(), false);
-
-//    CHECK((rgba == ref));
-//  }
-//  vpImage<vpRGBa> I_rgba(I.getHeight(), I.getWidth());
-//  BENCHMARK("Benchmark bgra to rgba (ViSP)") {
-//    vpImageConvert::BGRaToRGBa(bgra.data(), reinterpret_cast<unsigned char *>(I_rgba.bitmap),
-//                               I.getWidth(), I.getHeight(), false);
-//    return I_rgba;
-//  };
-//}
-//#endif
-
 int main(int argc, char *argv[])
 {
   Catch::Session session; // There must be exactly one instance

From 7dcc2a1d02ffe3b5bd27777d0e55930e44b9e6eb Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Sun, 21 Nov 2021 21:11:01 +0100
Subject: [PATCH 13/18] Update vpImageIo backend option. Add JPEG compression
 quality. Update perfImageLoadSave.cpp.

---
 modules/io/include/visp3/io/vpImageIo.h       |  12 +-
 .../io/src/image/private/vpImageIoBackend.h   |  39 +-
 .../io/src/image/private/vpImageIoLibjpeg.cpp |  37 +-
 .../io/src/image/private/vpImageIoLibpng.cpp  |  23 +-
 .../io/src/image/private/vpImageIoOpenCV.cpp  |  25 +-
 .../src/image/private/vpImageIoPortable.cpp   |  23 +-
 .../io/src/image/private/vpImageIoSimd.cpp    |  19 +-
 modules/io/src/image/private/vpImageIoStb.cpp |  15 +-
 modules/io/src/image/vpImageIo.cpp            | 117 +++---
 modules/io/test/perfImageLoadSave.cpp         | 356 ++++++++----------
 10 files changed, 305 insertions(+), 361 deletions(-)

diff --git a/modules/io/include/visp3/io/vpImageIo.h b/modules/io/include/visp3/io/vpImageIo.h
index fa395e3882..7edbb765e7 100644
--- a/modules/io/include/visp3/io/vpImageIo.h
+++ b/modules/io/include/visp3/io/vpImageIo.h
@@ -134,11 +134,11 @@ class VISP_EXPORT vpImageIo
     IO_STB_IMAGE_BACKEND
   };
 
-  static void read(vpImage<unsigned char> &I, const std::string &filename);
-  static void read(vpImage<vpRGBa> &I, const std::string &filename);
+  static void read(vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
+  static void read(vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
 
-  static void write(const vpImage<unsigned char> &I, const std::string &filename);
-  static void write(const vpImage<vpRGBa> &I, const std::string &filename);
+  static void write(const vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
+  static void write(const vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
 
   static void readPFM(vpImage<float> &I, const std::string &filename);
 
@@ -163,8 +163,8 @@ class VISP_EXPORT vpImageIo
   static void writePPM(const vpImage<unsigned char> &I, const std::string &filename);
   static void writePPM(const vpImage<vpRGBa> &I, const std::string &filename);
 
-  static void writeJPEG(const vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
-  static void writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
+  static void writeJPEG(const vpImage<unsigned char> &I, const std::string &filename, int quality=90, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
+  static void writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename, int quality=90, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
 
   static void writePNG(const vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
   static void writePNG(const vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend=IO_DEFAULT_BACKEND);
diff --git a/modules/io/src/image/private/vpImageIoBackend.h b/modules/io/src/image/private/vpImageIoBackend.h
index e1b434c030..75a33d1793 100644
--- a/modules/io/src/image/private/vpImageIoBackend.h
+++ b/modules/io/src/image/private/vpImageIoBackend.h
@@ -29,16 +29,13 @@
  * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  *
  * Description:
- * Read/write images.
- *
- * Authors:
- * Eric Marchand
+ * Backend functions implementation for image I/O operations.
  *
  *****************************************************************************/
 
 /*!
-  \file vpImageIo.h
-  \brief Read/write images
+  \file vpImageIoBackend.h
+  \brief Backend functions implementation for image I/O operations.
 */
 
 #ifndef vpIMAGEIOBACKEND_H
@@ -47,7 +44,9 @@
 #include <visp3/core/vpImage.h>
 
 
-//
+// Portable FloatMap format (PFM)
+// Portable Graymap format (PGM)
+// Portable Pixmap format (PPM)
 void vp_writePFM(const vpImage<float> &I, const std::string &filename);
 void vp_writePGM(const vpImage<unsigned char> &I, const std::string &filename);
 void vp_writePGM(const vpImage<short> &I, const std::string &filename);
@@ -60,43 +59,43 @@ void vp_readPPM(vpImage<vpRGBa> &I, const std::string &filename);
 void vp_writePPM(const vpImage<unsigned char> &I, const std::string &filename);
 void vp_writePPM(const vpImage<vpRGBa> &I, const std::string &filename);
 
-//
+// libjpeg
 void readJPEGLibjpeg(vpImage<unsigned char> &I, const std::string &filename);
 void readJPEGLibjpeg(vpImage<vpRGBa> &I, const std::string &filename);
 
-void writeJPEGLibjpeg(const vpImage<unsigned char> &I, const std::string &filename);
-void writeJPEGLibjpeg(const vpImage<vpRGBa> &I, const std::string &filename);
+void writeJPEGLibjpeg(const vpImage<unsigned char> &I, const std::string &filename, int quality);
+void writeJPEGLibjpeg(const vpImage<vpRGBa> &I, const std::string &filename, int quality);
 
-//
+// libpng
 void readPNGLibpng(vpImage<unsigned char> &I, const std::string &filename);
 void readPNGLibpng(vpImage<vpRGBa> &I, const std::string &filename);
 
 void writePNGLibpng(const vpImage<unsigned char> &I, const std::string &filename);
 void writePNGLibpng(const vpImage<vpRGBa> &I, const std::string &filename);
 
-//
+// OpenCV
 void readOpenCV(vpImage<unsigned char> &I, const std::string &filename);
 void readOpenCV(vpImage<vpRGBa> &I, const std::string &filename);
 
-void writeOpenCV(const vpImage<unsigned char> &I, const std::string &filename);
-void writeOpenCV(const vpImage<vpRGBa> &I, const std::string &filename);
+void writeOpenCV(const vpImage<unsigned char> &I, const std::string &filename, int quality);
+void writeOpenCV(const vpImage<vpRGBa> &I, const std::string &filename, int quality);
 
-//
+// Simd lib
 void readSimdlib(vpImage<unsigned char> &I, const std::string &filename);
 void readSimdlib(vpImage<vpRGBa> &I, const std::string &filename);
 
-void writeJPEGSimdlib(const vpImage<unsigned char> &I, const std::string &filename);
-void writeJPEGSimdlib(const vpImage<vpRGBa> &I, const std::string &filename);
+void writeJPEGSimdlib(const vpImage<unsigned char> &I, const std::string &filename, int quality);
+void writeJPEGSimdlib(const vpImage<vpRGBa> &I, const std::string &filename, int quality);
 
 void writePNGSimdlib(const vpImage<unsigned char> &I, const std::string &filename);
 void writePNGSimdlib(const vpImage<vpRGBa> &I, const std::string &filename);
 
-//
+// stb lib
 void readStb(vpImage<unsigned char> &I, const std::string &filename);
 void readStb(vpImage<vpRGBa> &I, const std::string &filename);
 
-void writeJPEGStb(const vpImage<unsigned char> &I, const std::string &filename);
-void writeJPEGStb(const vpImage<vpRGBa> &I, const std::string &filename);
+void writeJPEGStb(const vpImage<unsigned char> &I, const std::string &filename, int quality);
+void writeJPEGStb(const vpImage<vpRGBa> &I, const std::string &filename, int quality);
 
 void writePNGStb(const vpImage<unsigned char> &I, const std::string &filename);
 void writePNGStb(const vpImage<vpRGBa> &I, const std::string &filename);
diff --git a/modules/io/src/image/private/vpImageIoLibjpeg.cpp b/modules/io/src/image/private/vpImageIoLibjpeg.cpp
index 99debb3021..8f5b021c8c 100644
--- a/modules/io/src/image/private/vpImageIoLibjpeg.cpp
+++ b/modules/io/src/image/private/vpImageIoLibjpeg.cpp
@@ -29,28 +29,25 @@
  * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  *
  * Description:
- * Read/write images.
- *
- * Authors:
- * Eric Marchand
+ * Libjpeg backend for JPEG image I/O operations.
  *
  *****************************************************************************/
 
 /*!
-  \file vpImageIo.cpp
-  \brief Read/write images
+  \file vpImageIoLibjpeg.cpp
+  \brief Libjpeg backend for JPEG image I/O operations.
 */
 
 #include "vpImageIoBackend.h"
 #include <visp3/core/vpImageConvert.h>
 
-//TODO:
-#if defined(_WIN32)
-// Include WinSock2.h before windows.h to ensure that winsock.h is not
-// included by windows.h since winsock.h and winsock2.h are incompatible
-#include <WinSock2.h>
-#include <windows.h>
-#endif
+//TODO: is it needed?
+//#if defined(_WIN32)
+//// Include WinSock2.h before windows.h to ensure that winsock.h is not
+//// included by windows.h since winsock.h and winsock2.h are incompatible
+//#include <WinSock2.h>
+//#include <windows.h>
+//#endif
 
 #if defined(VISP_HAVE_JPEG)
 #include <jerror.h>
@@ -70,8 +67,9 @@
 
   \param I : Image to save as a JPEG file.
   \param filename : Name of the file containing the image.
+  \param quality : JPEG quality for compression.
 */
-void writeJPEGLibjpeg(const vpImage<unsigned char> &I, const std::string &filename)
+void writeJPEGLibjpeg(const vpImage<unsigned char> &I, const std::string &filename, int quality)
 {
   struct jpeg_compress_struct cinfo;
   struct jpeg_error_mgr jerr;
@@ -96,11 +94,13 @@ void writeJPEGLibjpeg(const vpImage<unsigned char> &I, const std::string &filena
 
   jpeg_stdio_dest(&cinfo, file);
 
+  jpeg_set_defaults(&cinfo);
   cinfo.image_width = width;
   cinfo.image_height = height;
   cinfo.input_components = 1;
   cinfo.in_color_space = JCS_GRAYSCALE;
-  jpeg_set_defaults(&cinfo);
+  //TODO:
+  jpeg_set_quality(&cinfo, quality, TRUE);
 
   jpeg_start_compress(&cinfo, TRUE);
 
@@ -127,8 +127,9 @@ void writeJPEGLibjpeg(const vpImage<unsigned char> &I, const std::string &filena
 
   \param I : Image to save as a JPEG file.
   \param filename : Name of the file containing the image.
+  \param quality : JPEG quality for compression.
 */
-void writeJPEGLibjpeg(const vpImage<vpRGBa> &I, const std::string &filename)
+void writeJPEGLibjpeg(const vpImage<vpRGBa> &I, const std::string &filename, int quality)
 {
   struct jpeg_compress_struct cinfo;
   struct jpeg_error_mgr jerr;
@@ -153,11 +154,13 @@ void writeJPEGLibjpeg(const vpImage<vpRGBa> &I, const std::string &filename)
 
   jpeg_stdio_dest(&cinfo, file);
 
+  jpeg_set_defaults(&cinfo);
   cinfo.image_width = width;
   cinfo.image_height = height;
   cinfo.input_components = 3;
   cinfo.in_color_space = JCS_RGB;
-  jpeg_set_defaults(&cinfo);
+  //TODO:
+  jpeg_set_quality(&cinfo, quality, TRUE);
 
   jpeg_start_compress(&cinfo, TRUE);
 
diff --git a/modules/io/src/image/private/vpImageIoLibpng.cpp b/modules/io/src/image/private/vpImageIoLibpng.cpp
index e350e4260b..e87a956a28 100644
--- a/modules/io/src/image/private/vpImageIoLibpng.cpp
+++ b/modules/io/src/image/private/vpImageIoLibpng.cpp
@@ -29,28 +29,25 @@
  * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  *
  * Description:
- * Read/write images.
- *
- * Authors:
- * Eric Marchand
+ * Libpng backend for PNG image I/O operations.
  *
  *****************************************************************************/
 
 /*!
-  \file vpImageIo.cpp
-  \brief Read/write images
+  \file vpImageIoLibpng.cpp
+  \brief Libpng backend for PNG image I/O operations.
 */
 
 #include "vpImageIoBackend.h"
 #include <visp3/core/vpImageConvert.h>
 
-//TODO:
-#if defined(_WIN32)
-// Include WinSock2.h before windows.h to ensure that winsock.h is not
-// included by windows.h since winsock.h and winsock2.h are incompatible
-#include <WinSock2.h>
-#include <windows.h>
-#endif
+//TODO: is it needed?
+//#if defined(_WIN32)
+//// Include WinSock2.h before windows.h to ensure that winsock.h is not
+//// included by windows.h since winsock.h and winsock2.h are incompatible
+//#include <WinSock2.h>
+//#include <windows.h>
+//#endif
 
 #if defined(VISP_HAVE_PNG)
 #include <png.h>
diff --git a/modules/io/src/image/private/vpImageIoOpenCV.cpp b/modules/io/src/image/private/vpImageIoOpenCV.cpp
index 93b6a1ca1d..d13ed07216 100644
--- a/modules/io/src/image/private/vpImageIoOpenCV.cpp
+++ b/modules/io/src/image/private/vpImageIoOpenCV.cpp
@@ -29,16 +29,13 @@
  * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  *
  * Description:
- * Read/write images.
- *
- * Authors:
- * Eric Marchand
+ * OpenCV backend for image I/O operations.
  *
  *****************************************************************************/
 
 /*!
-  \file vpImageIo.cpp
-  \brief Read/write images
+  \file vpImageIoOpenCV.cpp
+  \brief OpenCV backend for image I/O operations.
 */
 
 #include "vpImageIoBackend.h"
@@ -163,12 +160,16 @@ void readOpenCV(vpImage<vpRGBa> &I, const std::string &filename)
   \param I : Image to save as a JPEG file.
   \param filename : Name of the file containing the image.
 */
-void writeOpenCV(const vpImage<unsigned char> &I, const std::string &filename)
+void writeOpenCV(const vpImage<unsigned char> &I, const std::string &filename, int quality)
 {
 #if (VISP_HAVE_OPENCV_VERSION >= 0x020408)
   cv::Mat Ip;
   vpImageConvert::convert(I, Ip);
-  cv::imwrite(filename.c_str(), Ip);
+
+  std::vector<int> compression_params;
+  compression_params.push_back(cv::IMWRITE_JPEG_QUALITY);
+  compression_params.push_back(quality);
+  cv::imwrite(filename.c_str(), Ip, compression_params);
 #else
   IplImage *Ip = NULL;
   vpImageConvert::convert(I, Ip);
@@ -186,12 +187,16 @@ void writeOpenCV(const vpImage<unsigned char> &I, const std::string &filename)
   \param I : Image to save as a JPEG file.
   \param filename : Name of the file containing the image.
 */
-void writeOpenCV(const vpImage<vpRGBa> &I, const std::string &filename)
+void writeOpenCV(const vpImage<vpRGBa> &I, const std::string &filename, int quality)
 {
 #if (VISP_HAVE_OPENCV_VERSION >= 0x020408)
   cv::Mat Ip;
   vpImageConvert::convert(I, Ip);
-  cv::imwrite(filename.c_str(), Ip);
+
+  std::vector<int> compression_params;
+  compression_params.push_back(cv::IMWRITE_JPEG_QUALITY);
+  compression_params.push_back(quality);
+  cv::imwrite(filename.c_str(), Ip, compression_params);
 #else
   IplImage *Ip = NULL;
   vpImageConvert::convert(I, Ip);
diff --git a/modules/io/src/image/private/vpImageIoPortable.cpp b/modules/io/src/image/private/vpImageIoPortable.cpp
index 0031e4c96a..10a4a35fcd 100644
--- a/modules/io/src/image/private/vpImageIoPortable.cpp
+++ b/modules/io/src/image/private/vpImageIoPortable.cpp
@@ -29,29 +29,26 @@
  * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  *
  * Description:
- * Read/write images.
- *
- * Authors:
- * Eric Marchand
+ * Backend for portable image format I/O operations.
  *
  *****************************************************************************/
 
 /*!
-  \file vpImageIo.cpp
-  \brief Read/write images
+  \file vpImageIoPortable.cpp
+  \brief Backend for portable image format I/O operations.
 */
 
 #include "vpImageIoBackend.h"
 #include <visp3/core/vpIoTools.h>
 #include <visp3/core/vpImageConvert.h>
 
-//TODO:
-#if defined(_WIN32)
-// Include WinSock2.h before windows.h to ensure that winsock.h is not
-// included by windows.h since winsock.h and winsock2.h are incompatible
-#include <WinSock2.h>
-#include <windows.h>
-#endif
+//TODO: is it needed?
+//#if defined(_WIN32)
+//// Include WinSock2.h before windows.h to ensure that winsock.h is not
+//// included by windows.h since winsock.h and winsock2.h are incompatible
+//#include <WinSock2.h>
+//#include <windows.h>
+//#endif
 
 
 void vp_decodeHeaderPNM(const std::string &filename, std::ifstream &fd, const std::string &magic, unsigned int &w,
diff --git a/modules/io/src/image/private/vpImageIoSimd.cpp b/modules/io/src/image/private/vpImageIoSimd.cpp
index 40986bf743..4612aa5f7f 100644
--- a/modules/io/src/image/private/vpImageIoSimd.cpp
+++ b/modules/io/src/image/private/vpImageIoSimd.cpp
@@ -29,21 +29,16 @@
  * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  *
  * Description:
- * Read/write images.
- *
- * Authors:
- * Eric Marchand
+ * Simd backend for JPEG and PNG image I/O operations.
  *
  *****************************************************************************/
 
 /*!
   \file vpImageIo.cpp
-  \brief Read/write images
+  \brief Simd backend for JPEG and PNG image I/O operations.
 */
 
 #include "vpImageIoBackend.h"
-
-//TODO:
 #include <Simd/SimdLib.hpp>
 
 
@@ -66,19 +61,19 @@ void readSimdlib(vpImage<vpRGBa> &I, const std::string &filename)
   I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData);
 }
 
-void writeJPEGSimdlib(const vpImage<unsigned char> &I, const std::string &filename)
+void writeJPEGSimdlib(const vpImage<unsigned char> &I, const std::string &filename, int quality)
 {
-  SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFileJpeg, 90, filename.c_str());
+  SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFileJpeg, quality, filename.c_str());
 }
 
-void writeJPEGSimdlib(const vpImage<vpRGBa> &I, const std::string &filename)
+void writeJPEGSimdlib(const vpImage<vpRGBa> &I, const std::string &filename, int quality)
 {
-  SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, 90, filename.c_str());
+  SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatRgba32, SimdImageFileJpeg, quality, filename.c_str());
 }
 
 void writePNGSimdlib(const vpImage<unsigned char> &I, const std::string &filename)
 {
-  SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFileJpeg, 90, filename.c_str());
+  SimdImageSaveToFile((const uint8_t *)I.bitmap, I.getWidth()*4, I.getWidth(), I.getHeight(), SimdPixelFormatGray8, SimdImageFilePng, 90, filename.c_str());
 }
 
 void writePNGSimdlib(const vpImage<vpRGBa> &I, const std::string &filename)
diff --git a/modules/io/src/image/private/vpImageIoStb.cpp b/modules/io/src/image/private/vpImageIoStb.cpp
index 97b453d841..4b6626b0cc 100644
--- a/modules/io/src/image/private/vpImageIoStb.cpp
+++ b/modules/io/src/image/private/vpImageIoStb.cpp
@@ -29,16 +29,13 @@
  * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  *
  * Description:
- * Read/write images.
- *
- * Authors:
- * Eric Marchand
+ * stb backend for JPEG and PNG image I/O operations.
  *
  *****************************************************************************/
 
 /*!
   \file vpImageIo.cpp
-  \brief Read/write images
+  \brief stb backend for JPEG and PNG image I/O operations.
 */
 
 #include "vpImageIoBackend.h"
@@ -82,19 +79,19 @@ void readStb(vpImage<vpRGBa> &I, const std::string &filename)
   stbi_image_free(image);
 }
 
-void writeJPEGStb(const vpImage<unsigned char> &I, const std::string &filename)
+void writeJPEGStb(const vpImage<unsigned char> &I, const std::string &filename, int quality)
 {
   int res = stbi_write_jpg(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_grey,
-                           reinterpret_cast<void*>(I.bitmap), 90);
+                           reinterpret_cast<void*>(I.bitmap), quality);
   if (res == 0) {
     throw(vpImageException(vpImageException::ioError, "JEPG write error"));
   }
 }
 
-void writeJPEGStb(const vpImage<vpRGBa> &I, const std::string &filename)
+void writeJPEGStb(const vpImage<vpRGBa> &I, const std::string &filename, int quality)
 {
   int res = stbi_write_jpg(filename.c_str(), static_cast<int>(I.getWidth()), static_cast<int>(I.getHeight()), STBI_rgb_alpha,
-                           reinterpret_cast<void*>(I.bitmap), 90);
+                           reinterpret_cast<void*>(I.bitmap), quality);
   if (res == 0) {
     throw(vpImageException(vpImageException::ioError, "JEPG write error"));
   }
diff --git a/modules/io/src/image/vpImageIo.cpp b/modules/io/src/image/vpImageIo.cpp
index e8b221049e..241a408e27 100644
--- a/modules/io/src/image/vpImageIo.cpp
+++ b/modules/io/src/image/vpImageIo.cpp
@@ -41,14 +41,19 @@
   \brief Read/write images
 */
 
-#include <visp3/core/vpImage.h>
-#include <visp3/core/vpImageConvert.h> //image  conversion
 #include <visp3/core/vpIoTools.h>
 #include <visp3/io/vpImageIo.h>
 
 //TODO:
 #include "private/vpImageIoBackend.h"
 
+//TODO:
+// priority order for backend selection is:
+//  - libjpeg / libpng if available
+//  - OpenCV if available
+//  - stb backend for image reading / Simd backend for image writing
+//  - Simd backend for image reading / stb backend for image writing
+
 
 vpImageIo::vpImageFormatType vpImageIo::getFormat(const std::string &filename)
 {
@@ -140,7 +145,7 @@ std::string vpImageIo::getExtension(const std::string &filename)
   \param I : Image to set with the \e filename content.
   \param filename : Name of the file containing the image.
  */
-void vpImageIo::read(vpImage<unsigned char> &I, const std::string &filename)
+void vpImageIo::read(vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend)
 {
   bool exist = vpIoTools::checkFilename(filename);
   if (!exist) {
@@ -161,10 +166,10 @@ void vpImageIo::read(vpImage<unsigned char> &I, const std::string &filename)
     readPPM(I, final_filename);
     break;
   case FORMAT_JPEG:
-    readJPEG(I, final_filename);
+    readJPEG(I, final_filename, backend);
     break;
   case FORMAT_PNG:
-    readPNG(I, final_filename);
+    readPNG(I, final_filename, backend);
     break;
   case FORMAT_TIFF:
   case FORMAT_BMP:
@@ -207,7 +212,7 @@ void vpImageIo::read(vpImage<unsigned char> &I, const std::string &filename)
   \param I : Image to set with the \e filename content.
   \param filename : Name of the file containing the image.
  */
-void vpImageIo::read(vpImage<vpRGBa> &I, const std::string &filename)
+void vpImageIo::read(vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend)
 {
   bool exist = vpIoTools::checkFilename(filename);
   if (!exist) {
@@ -227,10 +232,10 @@ void vpImageIo::read(vpImage<vpRGBa> &I, const std::string &filename)
     readPPM(I, final_filename);
     break;
   case FORMAT_JPEG:
-    readJPEG(I, final_filename);
+    readJPEG(I, final_filename, backend);
     break;
   case FORMAT_PNG:
-    readPNG(I, final_filename);
+    readPNG(I, final_filename, backend);
     break;
   case FORMAT_TIFF:
   case FORMAT_BMP:
@@ -267,7 +272,7 @@ void vpImageIo::read(vpImage<vpRGBa> &I, const std::string &filename)
   \param I : Image to write.
   \param filename : Name of the file containing the image.
  */
-void vpImageIo::write(const vpImage<unsigned char> &I, const std::string &filename)
+void vpImageIo::write(const vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend)
 {
   bool try_opencv_writer = false;
 
@@ -279,10 +284,10 @@ void vpImageIo::write(const vpImage<unsigned char> &I, const std::string &filena
     writePPM(I, filename);
     break;
   case FORMAT_JPEG:
-    writeJPEG(I, filename);
+    writeJPEG(I, filename, backend);
     break;
   case FORMAT_PNG:
-    writePNG(I, filename);
+    writePNG(I, filename, backend);
     break;
   case FORMAT_TIFF:
   case FORMAT_BMP:
@@ -297,7 +302,7 @@ void vpImageIo::write(const vpImage<unsigned char> &I, const std::string &filena
 
   if (try_opencv_writer) {
 #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-    writeOpenCV(I, filename);
+    writeOpenCV(I, filename, 90);
 #else
     std::string message = "Cannot write file \"" + filename + "\": No backend able to support this image format";
     throw(vpImageException(vpImageException::ioError, message));
@@ -319,7 +324,7 @@ void vpImageIo::write(const vpImage<unsigned char> &I, const std::string &filena
   \param I : Image to write.
   \param filename : Name of the file containing the image.
  */
-void vpImageIo::write(const vpImage<vpRGBa> &I, const std::string &filename)
+void vpImageIo::write(const vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend)
 {
   bool try_opencv_writer = false;
 
@@ -331,10 +336,10 @@ void vpImageIo::write(const vpImage<vpRGBa> &I, const std::string &filename)
     writePPM(I, filename);
     break;
   case FORMAT_JPEG:
-    writeJPEG(I, filename);
+    writeJPEG(I, filename, backend);
     break;
   case FORMAT_PNG:
-    writePNG(I, filename);
+    writePNG(I, filename, backend);
     break;
   case FORMAT_TIFF:
   case FORMAT_BMP:
@@ -349,7 +354,7 @@ void vpImageIo::write(const vpImage<vpRGBa> &I, const std::string &filename)
 
   if (try_opencv_writer) {
 #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-    writeOpenCV(I, filename);
+    writeOpenCV(I, filename, 90);
 #else
     std::string message = "Cannot write file \"" + filename + "\": No backend able to support this image format";
     throw(vpImageException(vpImageException::ioError, message));
@@ -359,159 +364,159 @@ void vpImageIo::write(const vpImage<vpRGBa> &I, const std::string &filename)
 
 void vpImageIo::readJPEG(vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend)
 {
-  if (backend == IO_LIB_BACKEND) {
+  if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_JPEG)
     readJPEGLibjpeg(I, filename);
 #else
     std::string message = "Cannot read file \"" + filename + "\": Libjpeg backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
-  } else if (backend == IO_OPENCV_BACKEND) {
+  } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
     readOpenCV(I, filename);
 #else
     std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
+  } else if (backend == IO_STB_IMAGE_BACKEND || backend == IO_DEFAULT_BACKEND) {
+    readStb(I, filename);
   } else if (backend == IO_SIMDLIB_BACKEND) {
     readSimdlib(I, filename);
-  } else if (backend == IO_STB_IMAGE_BACKEND) {
-    readStb(I, filename);
   }
 }
 
 void vpImageIo::readJPEG(vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend)
 {
-  if (backend == IO_LIB_BACKEND) {
+  if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_JPEG)
     readJPEGLibjpeg(I, filename);
 #else
     std::string message = "Cannot read file \"" + filename + "\": Libjpeg backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
-  } else if (backend == IO_OPENCV_BACKEND) {
+  } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
     readOpenCV(I, filename);
 #else
     std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
+  } else if (backend == IO_STB_IMAGE_BACKEND || backend == IO_DEFAULT_BACKEND) {
+    readStb(I, filename);
   } else if (backend == IO_SIMDLIB_BACKEND) {
     readSimdlib(I, filename);
-  } else if (backend == IO_STB_IMAGE_BACKEND) {
-    readStb(I, filename);
   }
 }
 
 void vpImageIo::readPNG(vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend)
 {
-  if (backend == IO_LIB_BACKEND) {
+  if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_PNG)
     readPNGLibpng(I, filename);
 #else
     std::string message = "Cannot read file \"" + filename + "\": Libpng backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
-  } else if (backend == IO_OPENCV_BACKEND) {
+  } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
     readOpenCV(I, filename);
 #else
     std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
+  } else if (backend == IO_STB_IMAGE_BACKEND || backend == IO_DEFAULT_BACKEND) {
+    readStb(I, filename);
   } else if (backend == IO_SIMDLIB_BACKEND) {
     readSimdlib(I, filename);
-  } else if (backend == IO_STB_IMAGE_BACKEND) {
-    readStb(I, filename);
   }
 }
 
 void vpImageIo::readPNG(vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend)
 {
-  if (backend == IO_LIB_BACKEND) {
+  if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_PNG)
     readPNGLibpng(I, filename);
 #else
     std::string message = "Cannot read file \"" + filename + "\": Libpng backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
-  } else if (backend == IO_OPENCV_BACKEND) {
+  } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
     readOpenCV(I, filename);
 #else
     std::string message = "Cannot read file \"" + filename + "\": OpenCV backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
+  } else if (backend == IO_STB_IMAGE_BACKEND || backend == IO_DEFAULT_BACKEND) {
+    readStb(I, filename);
   } else if (backend == IO_SIMDLIB_BACKEND) {
     readSimdlib(I, filename);
-  } else if (backend == IO_STB_IMAGE_BACKEND) {
-    readStb(I, filename);
   }
 }
 
-void vpImageIo::writeJPEG(const vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend)
+void vpImageIo::writeJPEG(const vpImage<unsigned char> &I, const std::string &filename, int quality, const vpImageIoBackendType& backend)
 {
-  if (backend == IO_LIB_BACKEND) {
+  if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_JPEG)
-    writeJPEGLibjpeg(I, filename);
+    writeJPEGLibjpeg(I, filename, quality);
 #else
     std::string message = "Cannot write file \"" + filename + "\": Libjpeg backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
-  } else if (backend == IO_OPENCV_BACKEND) {
+  } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-    writeOpenCV(I, filename);
+    writeOpenCV(I, filename, quality);
 #else
     std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
-  } else if (backend == IO_SIMDLIB_BACKEND) {
-    writeJPEGSimdlib(I, filename);
+  } else if (backend == IO_SIMDLIB_BACKEND || backend == IO_DEFAULT_BACKEND) {
+    writeJPEGSimdlib(I, filename, quality);
   } else if (backend == IO_STB_IMAGE_BACKEND) {
-    writeJPEGStb(I, filename);
+    writeJPEGStb(I, filename, quality);
   }
 }
 
-void vpImageIo::writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend)
+void vpImageIo::writeJPEG(const vpImage<vpRGBa> &I, const std::string &filename, int quality, const vpImageIoBackendType& backend)
 {
-  if (backend == IO_LIB_BACKEND) {
+  if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_JPEG)
-    writeJPEGLibjpeg(I, filename);
+    writeJPEGLibjpeg(I, filename, quality);
 #else
     std::string message = "Cannot write file \"" + filename + "\": Libjpeg backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
-  } else if (backend == IO_OPENCV_BACKEND) {
+  } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-    writeOpenCV(I, filename);
+    writeOpenCV(I, filename, quality);
 #else
     std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
-  } else if (backend == IO_SIMDLIB_BACKEND) {
-    writeJPEGSimdlib(I, filename);
+  } else if (backend == IO_SIMDLIB_BACKEND || backend == IO_DEFAULT_BACKEND) {
+    writeJPEGSimdlib(I, filename, quality);
   } else if (backend == IO_STB_IMAGE_BACKEND) {
-    writeJPEGStb(I, filename);
+    writeJPEGStb(I, filename, quality);
   }
 }
 
 void vpImageIo::writePNG(const vpImage<unsigned char> &I, const std::string &filename, const vpImageIoBackendType& backend)
 {
-  if (backend == IO_LIB_BACKEND) {
+  if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_PNG)
     writePNGLibpng(I, filename);
 #else
     std::string message = "Cannot write file \"" + filename + "\": Libpng backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
-  } else if (backend == IO_OPENCV_BACKEND) {
+  } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-    writeOpenCV(I, filename);
+    writeOpenCV(I, filename, 90);
 #else
     std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
-  } else if (backend == IO_SIMDLIB_BACKEND) {
+  } else if (backend == IO_SIMDLIB_BACKEND || backend == IO_DEFAULT_BACKEND) {
     writePNGSimdlib(I, filename);
   } else if (backend == IO_STB_IMAGE_BACKEND) {
     writePNGStb(I, filename);
@@ -520,21 +525,21 @@ void vpImageIo::writePNG(const vpImage<unsigned char> &I, const std::string &fil
 
 void vpImageIo::writePNG(const vpImage<vpRGBa> &I, const std::string &filename, const vpImageIoBackendType& backend)
 {
-  if (backend == IO_LIB_BACKEND) {
+  if (backend == IO_LIB_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_PNG)
     writePNGLibpng(I, filename);
 #else
     std::string message = "Cannot write file \"" + filename + "\": Libpng backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
-  } else if (backend == IO_OPENCV_BACKEND) {
+  } else if (backend == IO_OPENCV_BACKEND || backend == IO_DEFAULT_BACKEND) {
 #if defined(VISP_HAVE_OPENCV) && VISP_HAVE_OPENCV_VERSION >= 0x020100
-    writeOpenCV(I, filename);
+    writeOpenCV(I, filename, 90);
 #else
     std::string message = "Cannot write file \"" + filename + "\": OpenCV backend is not available";
     throw(vpImageException(vpImageException::ioError, message));
 #endif
-  } else if (backend == IO_SIMDLIB_BACKEND) {
+  } else if (backend == IO_SIMDLIB_BACKEND || backend == IO_DEFAULT_BACKEND) {
     writePNGSimdlib(I, filename);
   } else if (backend == IO_STB_IMAGE_BACKEND) {
     writePNGStb(I, filename);
diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp
index 3bf19a465e..6182df06e4 100644
--- a/modules/io/test/perfImageLoadSave.cpp
+++ b/modules/io/test/perfImageLoadSave.cpp
@@ -45,221 +45,179 @@
 #include <visp3/io/vpImageIo.h>
 
 static std::string ipath = vpIoTools::getViSPImagesDataPath();
-static std::string imagePathJpeg = vpIoTools::createFilePath(ipath, "Klimt/Klimt.jpeg");
-static std::string imagePathPng = vpIoTools::createFilePath(ipath, "Klimt/Klimt.png");
-static std::string imagePathPngBig = vpIoTools::createFilePath(ipath, "Klimt/test_image_resize.png");
+static std::vector<std::string> paths {
+  ipath + "/Solvay/Solvay_conference_1927_Version2_640x440",
+  ipath + "/Solvay/Solvay_conference_1927_Version2_1024x705",
+  ipath + "/Solvay/Solvay_conference_1927_Version2_1280x881",
+  ipath + "/Solvay/Solvay_conference_1927_Version2_2126x1463",
+};
+static std::vector<std::string> names {
+  "Solvay (640x440)", "Solvay (1024x705)", "Solvay (1280x881)", "Solvay (2126x1463)"
+};
+static std::vector<vpImageIo::vpImageIoBackendType> backends {
+  vpImageIo::IO_LIB_BACKEND, vpImageIo::IO_OPENCV_BACKEND, vpImageIo::IO_SIMDLIB_BACKEND, vpImageIo::IO_STB_IMAGE_BACKEND
+};
+static std::vector<std::string> backendNamesJpeg {
+  "libjpeg", "OpenCV", "simd", "stb"
+};
+static std::vector<std::string> backendNamesPng {
+  "libpng", "OpenCV", "simd", "stb"
+};
 static int nThreads = 0;
 
-TEST_CASE("Benchmark Jpeg image loading", "[benchmark]") {
-  {
-    vpImage<vpRGBa> I;
-
-    BENCHMARK("vpImageIo::read()") {
-      vpImageIo::read(I, imagePathJpeg);
-      return I;
-    };
+TEST_CASE("Benchmark JPEG image loading", "[benchmark]") {
+  SECTION("Grayscale") {
+    for (size_t i = 0; i < paths.size(); i++) {
+      SECTION(names[i]) {
+        for (size_t j = 0; j < backends.size(); j++) {
+          vpImage<vpRGBa> I;
+
+          BENCHMARK(backendNamesJpeg[j] + " backend") {
+            vpImageIo::read(I, paths[i] + ".jpg", backends[j]);
+            return I;
+          };
+        }
+      }
+    }
   }
 
-  {
-    vpImage<vpRGBa> I;
-
-    BENCHMARK("vpImageIo::readSimdlib()") {
-      vpImageIo::readJPEG(I, imagePathJpeg, vpImageIo::IO_SIMDLIB_BACKEND);
-      return I;
-    };
-  }
-
-  {
-    vpImage<vpRGBa> I;
-
-    BENCHMARK("vpImageIo::readStb()") {
-      vpImageIo::readJPEG(I, imagePathJpeg, vpImageIo::IO_STB_IMAGE_BACKEND);
-      return I;
-    };
+  SECTION("vpRGBa") {
+    for (size_t i = 0; i < paths.size(); i++) {
+      SECTION(names[i]) {
+        for (size_t j = 0; j < backends.size(); j++) {
+          vpImage<unsigned char> I;
+
+          BENCHMARK(backendNamesJpeg[j] + " backend") {
+            vpImageIo::read(I, paths[i] + ".jpg", backends[j]);
+            return I;
+          };
+        }
+      }
+    }
   }
 }
 
-TEST_CASE("Benchmark Png image loading", "[benchmark]") {
-  {
-    vpImage<vpRGBa> I;
-
-    BENCHMARK("vpImageIo::read()") {
-      vpImageIo::read(I, imagePathPng);
-      return I;
-    };
+TEST_CASE("Benchmark PNG image loading", "[benchmark]") {
+  SECTION("Grayscale") {
+    for (size_t i = 0; i < paths.size(); i++) {
+      SECTION(names[i]) {
+        for (size_t j = 0; j < backends.size(); j++) {
+          vpImage<vpRGBa> I;
+
+          BENCHMARK(backendNamesPng[j] + " backend") {
+            vpImageIo::read(I, paths[i] + ".png", backends[j]);
+            return I;
+          };
+        }
+      }
+    }
   }
 
-  {
-    vpImage<vpRGBa> I;
-
-    BENCHMARK("vpImageIo::readSimdlib()") {
-      vpImageIo::readPNG(I, imagePathPng, vpImageIo::IO_SIMDLIB_BACKEND);
-      return I;
-    };
-  }
-
-  {
-    vpImage<vpRGBa> I;
-
-    BENCHMARK("vpImageIo::readStb()") {
-      vpImageIo::readPNG(I, imagePathPng, vpImageIo::IO_STB_IMAGE_BACKEND);
-      return I;
-    };
+  SECTION("vpRGBa") {
+    for (size_t i = 0; i < paths.size(); i++) {
+      SECTION(names[i]) {
+        for (size_t j = 0; j < backends.size(); j++) {
+          vpImage<unsigned char> I;
+
+          BENCHMARK(backendNamesPng[j] + " backend") {
+            vpImageIo::read(I, paths[i] + ".png", backends[j]);
+            return I;
+          };
+        }
+      }
+    }
   }
 }
 
-TEST_CASE("Benchmark big Png image loading", "[benchmark]") {
-  {
-    vpImage<vpRGBa> I;
-
-    BENCHMARK("vpImageIo::read()") {
-      vpImageIo::read(I, imagePathPngBig);
-      return I;
-    };
+#if !defined(_WIN32) && (defined(__unix__) || defined(__unix) || (defined(__APPLE__) && defined(__MACH__))) // UNIX
+// makeTempDirectory is only implemented for Unix platform
+
+std::string username, directory_filename_tmp;
+
+TEST_CASE("Benchmark JPEG image saving", "[benchmark]") {
+  vpIoTools::getUserName(username);
+  std::string tmp_dir = "/tmp/" + username;
+  vpIoTools::makeDirectory(tmp_dir);
+  directory_filename_tmp = tmp_dir + "/" + "vpIoTools_perfImageLoadSave_XXXXXX";
+  std::string converted_dirname_tmp = vpIoTools::makeTempDirectory(directory_filename_tmp);
+  REQUIRE(vpIoTools::checkDirectory(converted_dirname_tmp));
+
+  SECTION("Grayscale") {
+    for (size_t i = 0; i < paths.size(); i++) {
+      vpImage<unsigned char> I;
+      vpImageIo::read(I, paths[i] + ".png");
+
+      SECTION(names[i]) {
+        for (size_t j = 0; j < backends.size(); j++) {
+          BENCHMARK(backendNamesJpeg[j] + " backend") {
+            vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.jpg", backends[j]);
+            return I;
+          };
+        }
+      }
+    }
   }
 
-  {
-    vpImage<vpRGBa> I;
-
-    BENCHMARK("vpImageIo::readSimdlib()") {
-      vpImageIo::readPNG(I, imagePathPngBig, vpImageIo::IO_SIMDLIB_BACKEND);
-      return I;
-    };
+  SECTION("vpRGBa") {
+    for (size_t i = 0; i < paths.size(); i++) {
+      vpImage<vpRGBa> I;
+      vpImageIo::read(I, paths[i] + ".png");
+
+      SECTION(names[i]) {
+        for (size_t j = 0; j < backends.size(); j++) {
+          BENCHMARK(backendNamesJpeg[j] + " backend") {
+            vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.jpg", backends[j]);
+            return I;
+          };
+        }
+      }
+    }
   }
 
-  {
-    vpImage<vpRGBa> I;
-
-    BENCHMARK("vpImageIo::readStb()") {
-      vpImageIo::readPNG(I, imagePathPngBig, vpImageIo::IO_STB_IMAGE_BACKEND);
-      return I;
-    };
-  }
+  REQUIRE(vpIoTools::remove(converted_dirname_tmp));
 }
 
-TEST_CASE("Benchmark Jpeg image saving", "[benchmark]") {
-  vpImage<vpRGBa> I;
-  vpImageIo::read(I, imagePathJpeg);
-  {
-    const std::string filename = "/tmp/Klimt_ViSP.jpg";
-
-    BENCHMARK("vpImageIo::write()") {
-      vpImageIo::write(I, filename);
-      return I;
-    };
+TEST_CASE("Benchmark PNG image saving", "[benchmark]") {
+  vpIoTools::getUserName(username);
+  std::string tmp_dir = "/tmp/" + username;
+  vpIoTools::makeDirectory(tmp_dir);
+  directory_filename_tmp = tmp_dir + "/" + "vpIoTools_perfImageLoadSave_XXXXXX";
+  std::string converted_dirname_tmp = vpIoTools::makeTempDirectory(directory_filename_tmp);
+  REQUIRE(vpIoTools::checkDirectory(converted_dirname_tmp));
+
+  SECTION("Grayscale") {
+    for (size_t i = 0; i < paths.size(); i++) {
+      vpImage<unsigned char> I;
+      vpImageIo::read(I, paths[i] + ".png");
+
+      SECTION(names[i]) {
+        for (size_t j = 0; j < backends.size(); j++) {
+          BENCHMARK(backendNamesPng[j] + " backend") {
+            vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.png", backends[j]);
+            return I;
+          };
+        }
+      }
+    }
   }
 
-  {
-    const std::string filename = "/tmp/Klimt_Simd.jpg";
-
-    BENCHMARK("vpImageIo::writeSimdlib()") {
-      vpImageIo::writeJPEG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND);
-      return I;
-    };
-  }
-
-  {
-    const std::string filename = "/tmp/Klimt_stb.jpg";
-
-    BENCHMARK("vpImageIo::writeStb()") {
-      vpImageIo::writeJPEG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND);
-      return I;
-    };
-  }
-}
-
-TEST_CASE("Benchmark big Jpeg image saving", "[benchmark]") {
-  vpImage<vpRGBa> I;
-  vpImageIo::read(I, imagePathPngBig);
-  {
-    const std::string filename = "/tmp/Big_images_ViSP.jpg";
-
-    BENCHMARK("vpImageIo::write()") {
-      vpImageIo::write(I, filename);
-      return I;
-    };
-  }
-
-  {
-    const std::string filename = "/tmp/Big_images_Simd.jpg";
-
-    BENCHMARK("vpImageIo::writeSimdlib()") {
-      vpImageIo::writeJPEG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND);
-      return I;
-    };
-  }
-
-  {
-    const std::string filename = "/tmp/Big_images_stb.jpg";
-
-    BENCHMARK("vpImageIo::writeStb()") {
-      vpImageIo::writeJPEG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND);
-      return I;
-    };
-  }
-}
-
-TEST_CASE("Benchmark Png image saving", "[benchmark]") {
-  vpImage<vpRGBa> I;
-  vpImageIo::read(I, imagePathPng);
-  {
-    const std::string filename = "/tmp/Klimt_ViSP.png";
-
-    BENCHMARK("vpImageIo::write()") {
-      vpImageIo::write(I, filename);
-      return I;
-    };
-  }
-
-  {
-    const std::string filename = "/tmp/Klimt_Simd.png";
-
-    BENCHMARK("vpImageIo::writeSimdlib()") {
-      vpImageIo::writePNG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND);
-      return I;
-    };
-  }
-
-  {
-    const std::string filename = "/tmp/Klimt_stb.png";
-
-    BENCHMARK("vpImageIo::writeStb()") {
-      vpImageIo::writePNG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND);
-      return I;
-    };
-  }
-}
-
-TEST_CASE("Benchmark big Png image saving", "[benchmark]") {
-  vpImage<vpRGBa> I;
-  vpImageIo::read(I, imagePathPngBig);
-  {
-    const std::string filename = "/tmp/Big_images_ViSP.png";
-
-    BENCHMARK("vpImageIo::write()") {
-      vpImageIo::write(I, filename);
-      return I;
-    };
-  }
-
-  {
-    const std::string filename = "/tmp/Big_images_Simd.png";
-
-    BENCHMARK("vpImageIo::writeSimdlib()") {
-      vpImageIo::writePNG(I, filename, vpImageIo::IO_SIMDLIB_BACKEND);
-      return I;
-    };
-  }
-
-  {
-    const std::string filename = "/tmp/Big_images_stb.png";
-
-    BENCHMARK("vpImageIo::writeStb()") {
-      vpImageIo::writePNG(I, filename, vpImageIo::IO_STB_IMAGE_BACKEND);
-      return I;
-    };
+  SECTION("vpRGBa") {
+    for (size_t i = 0; i < paths.size(); i++) {
+      vpImage<vpRGBa> I;
+      vpImageIo::read(I, paths[i] + ".png");
+
+      SECTION(names[i]) {
+        for (size_t j = 0; j < backends.size(); j++) {
+          BENCHMARK(backendNamesPng[j] + " backend") {
+            vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.png", backends[j]);
+            return I;
+          };
+        }
+      }
+    }
   }
 }
+#endif
 
 int main(int argc, char *argv[])
 {
@@ -272,11 +230,6 @@ int main(int argc, char *argv[])
     | Opt(runBenchmark)    // bind variable to a new option, with a hint string
     ["--benchmark"]        // the option names it will respond to
     ("run benchmark?")     // description string for the help output
-    | Opt(imagePathJpeg, "imagePathColor")
-    ["--imagePathColor"]
-    ("Path to color image")
-    | Opt(imagePathPng, "imagePathColor")
-    ["--imagePathGray"]
     ("Path to gray image")
     | Opt(nThreads, "nThreads")
     ["--nThreads"]
@@ -289,13 +242,6 @@ int main(int argc, char *argv[])
   session.applyCommandLine(argc, argv);
 
   if (runBenchmark) {
-//    vpImage<vpRGBa> I_color;
-//    vpImageIo::read(I_color, imagePathColor);
-//    std::cout << "imagePathColor:\n\t" << imagePathColor << "\n\t" << I_color.getWidth() << "x" << I_color.getHeight() << std::endl;
-
-//    vpImage<unsigned char> I_gray;
-//    vpImageIo::read(I_gray, imagePathGray);
-//    std::cout << "imagePathGray:\n\t" << imagePathGray << "\n\t" << I_gray.getWidth() << "x" << I_gray.getHeight() << std::endl;
     std::cout << "nThreads: " << nThreads << " / available threads: " << std::thread::hardware_concurrency() << std::endl;
 
     int numFailed = session.run();

From 66b12c526f1991740867f56d4eefc395f38a8aee Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Sun, 21 Nov 2021 21:40:24 +0100
Subject: [PATCH 14/18] Update stb_image.h to 2.27 version.

---
 3rdparty/stb_image/CMakeLists.txt |   2 +-
 3rdparty/stb_image/README.md      |   2 +-
 3rdparty/stb_image/stb_image.h    | 576 ++++++++++++++++++++++++------
 3 files changed, 459 insertions(+), 121 deletions(-)

diff --git a/3rdparty/stb_image/CMakeLists.txt b/3rdparty/stb_image/CMakeLists.txt
index 84ded2f220..f344e7f27d 100644
--- a/3rdparty/stb_image/CMakeLists.txt
+++ b/3rdparty/stb_image/CMakeLists.txt
@@ -1,5 +1,5 @@
 project(${STBIMAGE_LIBRARY})
 
 set(STBIMAGE_MAJOR_VERSION 2  PARENT_SCOPE)
-set(STBIMAGE_MINOR_VERSION 22 PARENT_SCOPE)
+set(STBIMAGE_MINOR_VERSION 27 PARENT_SCOPE)
 set(STBIMAGE_PATCH_VERSION 0  PARENT_SCOPE)
diff --git a/3rdparty/stb_image/README.md b/3rdparty/stb_image/README.md
index 80019a1405..efa37458eb 100644
--- a/3rdparty/stb_image/README.md
+++ b/3rdparty/stb_image/README.md
@@ -12,7 +12,7 @@ by Jorge L. "VinoBS" Rodriguez, and stb_sprintf by Jeff Roberts.
 library    | lastest version | category | LoC | description
 --------------------- | ---- | -------- | --- | --------------------------------
 **[stb_vorbis.c](stb_vorbis.c)** | 1.16 | audio | 5486 | decode ogg vorbis files from file/memory to float/16-bit signed output
-**[stb_image.h](stb_image.h)** | 2.22 | graphics | 7547 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
+**[stb_image.h](stb_image.h)** | 2.27 | graphics | 7897 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
 **[stb_truetype.h](stb_truetype.h)** | 1.21 | graphics | 4882 | parse, decode, and rasterize characters from truetype fonts
 **[stb_image_write.h](stb_image_write.h)** | 1.13 | graphics | 1617 | image writing to disk: PNG, TGA, BMP
 **[stb_image_resize.h](stb_image_resize.h)** | 0.96 | graphics | 2630 | resize images larger/smaller with good quality
diff --git a/3rdparty/stb_image/stb_image.h b/3rdparty/stb_image/stb_image.h
index eb8d215b40..d60371b95f 100644
--- a/3rdparty/stb_image/stb_image.h
+++ b/3rdparty/stb_image/stb_image.h
@@ -1,4 +1,4 @@
-/* stb_image - v2.23 - public domain image loader - http://nothings.org/stb
+/* stb_image - v2.27 - public domain image loader - http://nothings.org/stb
                                   no warranty implied; use at your own risk
 
    Do this:
@@ -48,6 +48,10 @@ LICENSE
 
 RECENT REVISION HISTORY:
 
+      2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
+      2.26  (2020-07-13) many minor fixes
+      2.25  (2020-02-02) fix warnings
+      2.24  (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically
       2.23  (2019-08-11) fix clang static analysis warning
       2.22  (2019-03-04) gif fixes, fix warnings
       2.21  (2019-02-25) fix typo in comment
@@ -86,26 +90,37 @@ RECENT REVISION HISTORY:
                                            Jeremy Sawicki (handle all ImageNet JPGs)
  Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
     Fabian "ryg" Giesen                    Anael Seghezzi (is-16-bit query)
-    Arseny Kapoulkine
+    Arseny Kapoulkine                      Simon Breuss (16-bit PNM)
     John-Mark Allen
     Carmelo J Fdez-Aguera
 
  Bug & warning fixes
-    Marc LeBlanc            David Woo          Guillaume George   Martins Mozeiko
-    Christpher Lloyd        Jerry Jansson      Joseph Thomson     Phil Jordan
-    Dave Moore              Roy Eltham         Hayaki Saito       Nathan Reed
-    Won Chun                Luke Graham        Johan Duparc       Nick Verigakis
-    the Horde3D community   Thomas Ruf         Ronny Chevalier    github:rlyeh
-    Janez Zemva             John Bartholomew   Michal Cichon      github:romigrou
-    Jonathan Blow           Ken Hamada         Tero Hanninen      github:svdijk
-    Laurent Gomila          Cort Stratton      Sergio Gonzalez    github:snagar
-    Aruelien Pocheville     Thibault Reuille   Cass Everitt       github:Zelex
-    Ryamond Barbiero        Paul Du Bois       Engin Manap        github:grim210
-    Aldo Culquicondor       Philipp Wiesemann  Dale Weiler        github:sammyhw
-    Oriol Ferrer Mesia      Josh Tobin         Matthew Gregan     github:phprus
-    Julian Raschke          Gregory Mullen     Baldur Karlsson    github:poppolopoppo
-    Christian Floisand      Kevin Schmidt      JR Smith           github:darealshinji
-    Blazej Dariusz Roszkowski                                     github:Michaelangel007
+    Marc LeBlanc            David Woo          Guillaume George     Martins Mozeiko
+    Christpher Lloyd        Jerry Jansson      Joseph Thomson       Blazej Dariusz Roszkowski
+    Phil Jordan                                Dave Moore           Roy Eltham
+    Hayaki Saito            Nathan Reed        Won Chun
+    Luke Graham             Johan Duparc       Nick Verigakis       the Horde3D community
+    Thomas Ruf              Ronny Chevalier                         github:rlyeh
+    Janez Zemva             John Bartholomew   Michal Cichon        github:romigrou
+    Jonathan Blow           Ken Hamada         Tero Hanninen        github:svdijk
+    Eugene Golushkov        Laurent Gomila     Cort Stratton        github:snagar
+    Aruelien Pocheville     Sergio Gonzalez    Thibault Reuille     github:Zelex
+    Cass Everitt            Ryamond Barbiero                        github:grim210
+    Paul Du Bois            Engin Manap        Aldo Culquicondor    github:sammyhw
+    Philipp Wiesemann       Dale Weiler        Oriol Ferrer Mesia   github:phprus
+    Josh Tobin                                 Matthew Gregan       github:poppolopoppo
+    Julian Raschke          Gregory Mullen     Christian Floisand   github:darealshinji
+    Baldur Karlsson         Kevin Schmidt      JR Smith             github:Michaelangel007
+                            Brad Weinberger    Matvey Cherevko      github:mosra
+    Luca Sas                Alexander Veselov  Zack Middleton       [reserved]
+    Ryan C. Gordon          [reserved]                              [reserved]
+                     DO NOT ADD YOUR NAME HERE
+
+                     Jacko Dirks
+
+  To add your name to the credits, pick a random blank space in the middle and fill it.
+  80% of merge conflicts on stb PRs are due to people adding their name at the end
+  of the credits.
 */
 
 #ifndef STBI_INCLUDE_STB_IMAGE_H
@@ -164,6 +179,32 @@ RECENT REVISION HISTORY:
 //
 // Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
 //
+// To query the width, height and component count of an image without having to
+// decode the full file, you can use the stbi_info family of functions:
+//
+//   int x,y,n,ok;
+//   ok = stbi_info(filename, &x, &y, &n);
+//   // returns ok=1 and sets x, y, n if image is a supported format,
+//   // 0 otherwise.
+//
+// Note that stb_image pervasively uses ints in its public API for sizes,
+// including sizes of memory buffers. This is now part of the API and thus
+// hard to change without causing breakage. As a result, the various image
+// loaders all have certain limits on image size; these differ somewhat
+// by format but generally boil down to either just under 2GB or just under
+// 1GB. When the decoded image would be larger than this, stb_image decoding
+// will fail.
+//
+// Additionally, stb_image will reject image files that have any of their
+// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS,
+// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit,
+// the only way to have an image with such dimensions load correctly
+// is for it to have a rather extreme aspect ratio. Either way, the
+// assumption here is that such larger images are likely to be malformed
+// or malicious. If you do need to load an image with individual dimensions
+// larger than that, and it still fits in the overall size limit, you can
+// #define STBI_MAX_DIMENSIONS on your own to be something larger.
+//
 // ===========================================================================
 //
 // UNICODE:
@@ -269,11 +310,10 @@ RECENT REVISION HISTORY:
 //
 // iPhone PNG support:
 //
-// By default we convert iphone-formatted PNGs back to RGB, even though
-// they are internally encoded differently. You can disable this conversion
-// by calling stbi_convert_iphone_png_to_rgb(0), in which case
-// you will always just get the native iphone "format" through (which
-// is BGR stored in RGB).
+// We optionally support converting iPhone-formatted PNGs (which store
+// premultiplied BGRA) back to RGB, even though they're internally encoded
+// differently. To enable this conversion, call
+// stbi_convert_iphone_png_to_rgb(1).
 //
 // Call stbi_set_unpremultiply_on_load(1) as well to force a divide per
 // pixel to remove any premultiplied alpha *only* if the image file explicitly
@@ -315,7 +355,14 @@ RECENT REVISION HISTORY:
 //   - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
 //     want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
 //
-
+//  - If you define STBI_MAX_DIMENSIONS, stb_image will reject images greater
+//    than that size (in either width or height) without further processing.
+//    This is to let programs in the wild set an upper bound to prevent
+//    denial-of-service attacks on untrusted data, as one could generate a
+//    valid image of gigantic dimensions and force stb_image to allocate a
+//    huge block of memory and spend disproportionate time decoding it. By
+//    default this is set to (1 << 24), which is 16777216, but that's still
+//    very big.
 
 #ifndef STBI_NO_STDIO
 #include <stdio.h>
@@ -434,7 +481,7 @@ STBIDEF int      stbi_is_hdr_from_file(FILE *f);
 
 
 // get a VERY brief reason for failure
-// NOT THREADSAFE
+// on most compilers (and ALL modern mainstream compilers) this is threadsafe
 STBIDEF const char *stbi_failure_reason  (void);
 
 // free the loaded image -- this is just free()
@@ -467,6 +514,13 @@ STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert);
 // flip the image vertically, so the first pixel in the output array is the bottom left
 STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip);
 
+// as above, but only applies to images loaded on the thread that calls the function
+// this function is only available if your compiler supports thread-local variables;
+// calling it will fail to link if your compiler doesn't
+STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply);
+STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert);
+STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip);
+
 // ZLIB client - used by PNG, available for other purposes
 
 STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen);
@@ -563,6 +617,23 @@ STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
    #define stbi_inline __forceinline
 #endif
 
+#ifndef STBI_NO_THREAD_LOCALS
+   #if defined(__cplusplus) &&  __cplusplus >= 201103L
+      #define STBI_THREAD_LOCAL       thread_local
+   #elif defined(__GNUC__) && __GNUC__ < 5
+      #define STBI_THREAD_LOCAL       __thread
+   #elif defined(_MSC_VER)
+      #define STBI_THREAD_LOCAL       __declspec(thread)
+   #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
+      #define STBI_THREAD_LOCAL       _Thread_local
+   #endif
+
+   #ifndef STBI_THREAD_LOCAL
+      #if defined(__GNUC__)
+        #define STBI_THREAD_LOCAL       __thread
+      #endif
+   #endif
+#endif
 
 #ifdef _MSC_VER
 typedef unsigned short stbi__uint16;
@@ -593,7 +664,7 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #ifdef STBI_HAS_LROTL
    #define stbi_lrot(x,y)  _lrotl(x,y)
 #else
-   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
+   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (-(y) & 31)))
 #endif
 
 #if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
@@ -707,14 +778,21 @@ static int stbi__sse2_available(void)
 
 #ifdef STBI_NEON
 #include <arm_neon.h>
-// assume GCC or Clang on ARM targets
+#ifdef _MSC_VER
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+#else
 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
 #endif
+#endif
 
 #ifndef STBI_SIMD_ALIGN
 #define STBI_SIMD_ALIGN(type, name) type name
 #endif
 
+#ifndef STBI_MAX_DIMENSIONS
+#define STBI_MAX_DIMENSIONS (1 << 24)
+#endif
+
 ///////////////////////////////////////////////
 //
 //  stbi__context struct and start_xxx functions
@@ -732,6 +810,7 @@ typedef struct
    int read_from_callbacks;
    int buflen;
    stbi_uc buffer_start[128];
+   int callback_already_read;
 
    stbi_uc *img_buffer, *img_buffer_end;
    stbi_uc *img_buffer_original, *img_buffer_original_end;
@@ -745,6 +824,7 @@ static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
 {
    s->io.read = NULL;
    s->read_from_callbacks = 0;
+   s->callback_already_read = 0;
    s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer;
    s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len;
 }
@@ -756,7 +836,8 @@ static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *
    s->io_user_data = user;
    s->buflen = sizeof(s->buffer_start);
    s->read_from_callbacks = 1;
-   s->img_buffer_original = s->buffer_start;
+   s->callback_already_read = 0;
+   s->img_buffer = s->img_buffer_original = s->buffer_start;
    stbi__refill_buffer(s);
    s->img_buffer_original_end = s->img_buffer_end;
 }
@@ -770,12 +851,17 @@ static int stbi__stdio_read(void *user, char *data, int size)
 
 static void stbi__stdio_skip(void *user, int n)
 {
+   int ch;
    fseek((FILE*) user, n, SEEK_CUR);
+   ch = fgetc((FILE*) user);  /* have to read a byte to reset feof()'s flag */
+   if (ch != EOF) {
+      ungetc(ch, (FILE *) user);  /* push byte back onto stream if valid. */
+   }
 }
 
 static int stbi__stdio_eof(void *user)
 {
-   return feof((FILE*) user);
+   return feof((FILE*) user) || ferror((FILE *) user);
 }
 
 static stbi_io_callbacks stbi__stdio_callbacks =
@@ -871,21 +957,27 @@ static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
 static int      stbi__pnm_test(stbi__context *s);
 static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__pnm_is16(stbi__context *s);
 #endif
 
-// this is not threadsafe
-static const char *stbi__g_failure_reason;
+static
+#ifdef STBI_THREAD_LOCAL
+STBI_THREAD_LOCAL
+#endif
+const char *stbi__g_failure_reason;
 
 STBIDEF const char *stbi_failure_reason(void)
 {
    return stbi__g_failure_reason;
 }
 
+#ifndef STBI_NO_FAILURE_STRINGS
 static int stbi__err(const char *str)
 {
    stbi__g_failure_reason = str;
    return 0;
 }
+#endif
 
 static void *stbi__malloc(size_t size)
 {
@@ -924,11 +1016,13 @@ static int stbi__mul2sizes_valid(int a, int b)
    return a <= INT_MAX/b;
 }
 
+#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
 // returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
 static int stbi__mad2sizes_valid(int a, int b, int add)
 {
    return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add);
 }
+#endif
 
 // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
 static int stbi__mad3sizes_valid(int a, int b, int c, int add)
@@ -938,7 +1032,7 @@ static int stbi__mad3sizes_valid(int a, int b, int c, int add)
 }
 
 // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
 static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
 {
    return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
@@ -946,12 +1040,14 @@ static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
 }
 #endif
 
+#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
 // mallocs with size overflow checking
 static void *stbi__malloc_mad2(int a, int b, int add)
 {
    if (!stbi__mad2sizes_valid(a, b, add)) return NULL;
    return stbi__malloc(a*b + add);
 }
+#endif
 
 static void *stbi__malloc_mad3(int a, int b, int c, int add)
 {
@@ -959,7 +1055,7 @@ static void *stbi__malloc_mad3(int a, int b, int c, int add)
    return stbi__malloc(a*b*c + add);
 }
 
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
 static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
 {
    if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;
@@ -995,13 +1091,29 @@ static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
 static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp);
 #endif
 
-static int stbi__vertically_flip_on_load = 0;
+static int stbi__vertically_flip_on_load_global = 0;
 
 STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
 {
-    stbi__vertically_flip_on_load = flag_true_if_should_flip;
+   stbi__vertically_flip_on_load_global = flag_true_if_should_flip;
+}
+
+#ifndef STBI_THREAD_LOCAL
+#define stbi__vertically_flip_on_load  stbi__vertically_flip_on_load_global
+#else
+static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set;
+
+STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip)
+{
+   stbi__vertically_flip_on_load_local = flag_true_if_should_flip;
+   stbi__vertically_flip_on_load_set = 1;
 }
 
+#define stbi__vertically_flip_on_load  (stbi__vertically_flip_on_load_set       \
+                                         ? stbi__vertically_flip_on_load_local  \
+                                         : stbi__vertically_flip_on_load_global)
+#endif // STBI_THREAD_LOCAL
+
 static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
 {
    memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
@@ -1009,9 +1121,8 @@ static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int re
    ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
    ri->num_channels = 0;
 
-   #ifndef STBI_NO_JPEG
-   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
-   #endif
+   // test the formats with a very explicit header first (at least a FOURCC
+   // or distinctive magic number first)
    #ifndef STBI_NO_PNG
    if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
    #endif
@@ -1023,10 +1134,19 @@ static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int re
    #endif
    #ifndef STBI_NO_PSD
    if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc);
+   #else
+   STBI_NOTUSED(bpc);
    #endif
    #ifndef STBI_NO_PIC
    if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
    #endif
+
+   // then the formats that can end up attempting to load with just 1 or 2
+   // bytes matching expectations; these are prone to false positives, so
+   // try them later
+   #ifndef STBI_NO_JPEG
+   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
+   #endif
    #ifndef STBI_NO_PNM
    if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
    #endif
@@ -1125,8 +1245,10 @@ static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x,
    if (result == NULL)
       return NULL;
 
+   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
+   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
+
    if (ri.bits_per_channel != 8) {
-      STBI_ASSERT(ri.bits_per_channel == 16);
       result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
       ri.bits_per_channel = 8;
    }
@@ -1149,8 +1271,10 @@ static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x,
    if (result == NULL)
       return NULL;
 
+   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
+   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
+
    if (ri.bits_per_channel != 16) {
-      STBI_ASSERT(ri.bits_per_channel == 8);
       result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
       ri.bits_per_channel = 16;
    }
@@ -1178,33 +1302,33 @@ static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, in
 
 #ifndef STBI_NO_STDIO
 
-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
 STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
 STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
 #endif
 
-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
 STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
 {
-  return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
+	return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
 }
 #endif
 
 static FILE *stbi__fopen(char const *filename, char const *mode)
 {
    FILE *f;
-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
    wchar_t wMode[64];
    wchar_t wFilename[1024];
-  if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)))
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
       return 0;
 
-  if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)))
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
       return 0;
 
-#if _MSC_VER >= 1400
-  if (0 != _wfopen_s(&f, wFilename, wMode))
-    f = 0;
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+	if (0 != _wfopen_s(&f, wFilename, wMode))
+		f = 0;
 #else
    f = _wfopen(wFilename, wMode);
 #endif
@@ -1453,6 +1577,7 @@ enum
 static void stbi__refill_buffer(stbi__context *s)
 {
    int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen);
+   s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original);
    if (n == 0) {
       // at end of file, treat same as if from memory, but need to handle case
       // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
@@ -1477,6 +1602,9 @@ stbi_inline static stbi_uc stbi__get8(stbi__context *s)
    return 0;
 }
 
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
+// nothing
+#else
 stbi_inline static int stbi__at_eof(stbi__context *s)
 {
    if (s->io.read) {
@@ -1488,9 +1616,14 @@ stbi_inline static int stbi__at_eof(stbi__context *s)
 
    return s->img_buffer >= s->img_buffer_end;
 }
+#endif
 
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC)
+// nothing
+#else
 static void stbi__skip(stbi__context *s, int n)
 {
+   if (n == 0) return;  // already there!
    if (n < 0) {
       s->img_buffer = s->img_buffer_end;
       return;
@@ -1505,7 +1638,11 @@ static void stbi__skip(stbi__context *s, int n)
    }
    s->img_buffer += n;
 }
+#endif
 
+#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM)
+// nothing
+#else
 static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
 {
    if (s->io.read) {
@@ -1529,18 +1666,27 @@ static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
    } else
       return 0;
 }
+#endif
 
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
+// nothing
+#else
 static int stbi__get16be(stbi__context *s)
 {
    int z = stbi__get8(s);
    return (z << 8) + stbi__get8(s);
 }
+#endif
 
+#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
+// nothing
+#else
 static stbi__uint32 stbi__get32be(stbi__context *s)
 {
    stbi__uint32 z = stbi__get16be(s);
    return (z << 16) + stbi__get16be(s);
 }
+#endif
 
 #if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
 // nothing
@@ -1556,13 +1702,16 @@ static int stbi__get16le(stbi__context *s)
 static stbi__uint32 stbi__get32le(stbi__context *s)
 {
    stbi__uint32 z = stbi__get16le(s);
-   return z + (stbi__get16le(s) << 16);
+   z += (stbi__uint32)stbi__get16le(s) << 16;
+   return z;
 }
 #endif
 
 #define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
 
-
+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
+// nothing
+#else
 //////////////////////////////////////////////////////////////////////////////
 //
 //  generic converter from built-in img_n to req_comp
@@ -1578,7 +1727,11 @@ static stbi_uc stbi__compute_y(int r, int g, int b)
 {
    return (stbi_uc) (((r*77) + (g*150) +  (29*b)) >> 8);
 }
+#endif
 
+#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
+// nothing
+#else
 static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y)
 {
    int i,j;
@@ -1614,7 +1767,7 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r
          STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
          STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break;
          STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                    } break;
-         default: STBI_ASSERT(0);
+         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion");
       }
       #undef STBI__CASE
    }
@@ -1622,12 +1775,20 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r
    STBI_FREE(data);
    return good;
 }
+#endif
 
+#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
+// nothing
+#else
 static stbi__uint16 stbi__compute_y_16(int r, int g, int b)
 {
    return (stbi__uint16) (((r*77) + (g*150) +  (29*b)) >> 8);
 }
+#endif
 
+#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
+// nothing
+#else
 static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y)
 {
    int i,j;
@@ -1663,7 +1824,7 @@ static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int r
          STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
          STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break;
          STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                       } break;
-         default: STBI_ASSERT(0);
+         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion");
       }
       #undef STBI__CASE
    }
@@ -1671,6 +1832,7 @@ static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int r
    STBI_FREE(data);
    return good;
 }
+#endif
 
 #ifndef STBI_NO_LINEAR
 static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
@@ -1969,13 +2131,12 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
    int sgn;
    if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
 
-   sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB
+   sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
    k = stbi_lrot(j->code_buffer, n);
-   STBI_ASSERT(n >= 0 && n < (int) (sizeof(stbi__bmask)/sizeof(*stbi__bmask)));
    j->code_buffer = k & ~stbi__bmask[n];
    k &= stbi__bmask[n];
    j->code_bits -= n;
-   return k + (stbi__jbias[n] & ~sgn);
+   return k + (stbi__jbias[n] & (sgn - 1));
 }
 
 // get some unsigned bits
@@ -2025,7 +2186,7 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman
 
    if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
    t = stbi__jpeg_huff_decode(j, hdc);
-   if (t < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
 
    // 0 all the ac values now so we can do it 32-bits at a time
    memset(data,0,64*sizeof(data[0]));
@@ -2082,11 +2243,12 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__
       // first scan for DC coefficient, must be first
       memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
       t = stbi__jpeg_huff_decode(j, hdc);
+      if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
       diff = t ? stbi__extend_receive(j, t) : 0;
 
       dc = j->img_comp[b].dc_pred + diff;
       j->img_comp[b].dc_pred = dc;
-      data[0] = (short) (dc << j->succ_low);
+      data[0] = (short) (dc * (1 << j->succ_low));
    } else {
       // refinement scan for DC coefficient
       if (stbi__jpeg_get_bit(j))
@@ -2123,7 +2285,7 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__
             j->code_buffer <<= s;
             j->code_bits -= s;
             zig = stbi__jpeg_dezigzag[k++];
-            data[zig] = (short) ((r >> 8) << shift);
+            data[zig] = (short) ((r >> 8) * (1 << shift));
          } else {
             int rs = stbi__jpeg_huff_decode(j, hac);
             if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
@@ -2141,7 +2303,7 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__
             } else {
                k += r;
                zig = stbi__jpeg_dezigzag[k++];
-               data[zig] = (short) (stbi__extend_receive(j,s) << shift);
+               data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift));
             }
          }
       } while (k <= j->spec_end);
@@ -3072,6 +3234,8 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
    p  = stbi__get8(s);            if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
    s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
    s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
+   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
    c = stbi__get8(s);
    if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG");
    s->img_n = c;
@@ -3103,6 +3267,13 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
       if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
    }
 
+   // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios
+   // and I've never seen a non-corrupted JPEG file actually use them
+   for (i=0; i < s->img_n; ++i) {
+      if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG");
+      if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG");
+   }
+
    // compute interleaved mcu info
    z->img_h_max = h_max;
    z->img_v_max = v_max;
@@ -3658,6 +3829,10 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
    else
       decode_n = z->s->img_n;
 
+   // nothing to do if no components requested; check this now to avoid
+   // accessing uninitialized coutput[0] later
+   if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; }
+
    // resample and color-convert
    {
       int k;
@@ -3800,6 +3975,7 @@ static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int re
 {
    unsigned char* result;
    stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
+   if (!j) return stbi__errpuc("outofmem", "Out of memory");
    STBI_NOTUSED(ri);
    j->s = s;
    stbi__setup_jpeg(j);
@@ -3812,6 +3988,7 @@ static int stbi__jpeg_test(stbi__context *s)
 {
    int r;
    stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
+   if (!j) return stbi__err("outofmem", "Out of memory");
    j->s = s;
    stbi__setup_jpeg(j);
    r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
@@ -3836,6 +4013,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 {
    int result;
    stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
+   if (!j) return stbi__err("outofmem", "Out of memory");
    j->s = s;
    result = stbi__jpeg_info_raw(j, x, y, comp);
    STBI_FREE(j);
@@ -3855,6 +4033,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 // fast-way is faster to check than jpeg huffman, but slow way is slower
 #define STBI__ZFAST_BITS  9 // accelerate all cases in default tables
 #define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1)
+#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet
 
 // zlib-style huffman encoding
 // (jpegs packs from left, zlib from right, so can't share code)
@@ -3864,8 +4043,8 @@ typedef struct
    stbi__uint16 firstcode[16];
    int maxcode[17];
    stbi__uint16 firstsymbol[16];
-   stbi_uc  size[288];
-   stbi__uint16 value[288];
+   stbi_uc  size[STBI__ZNSYMS];
+   stbi__uint16 value[STBI__ZNSYMS];
 } stbi__zhuffman;
 
 stbi_inline static int stbi__bitreverse16(int n)
@@ -3952,16 +4131,23 @@ typedef struct
    stbi__zhuffman z_length, z_distance;
 } stbi__zbuf;
 
+stbi_inline static int stbi__zeof(stbi__zbuf *z)
+{
+   return (z->zbuffer >= z->zbuffer_end);
+}
+
 stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z)
 {
-   if (z->zbuffer >= z->zbuffer_end) return 0;
-   return *z->zbuffer++;
+   return stbi__zeof(z) ? 0 : *z->zbuffer++;
 }
 
 static void stbi__fill_bits(stbi__zbuf *z)
 {
    do {
-      STBI_ASSERT(z->code_buffer < (1U << z->num_bits));
+      if (z->code_buffer >= (1U << z->num_bits)) {
+        z->zbuffer = z->zbuffer_end;  /* treat this as EOF so we fail. */
+        return;
+      }
       z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits;
       z->num_bits += 8;
    } while (z->num_bits <= 24);
@@ -3986,10 +4172,11 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
    for (s=STBI__ZFAST_BITS+1; ; ++s)
       if (k < z->maxcode[s])
          break;
-   if (s == 16) return -1; // invalid code!
+   if (s >= 16) return -1; // invalid code!
    // code size is s, so:
    b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s];
-   STBI_ASSERT(z->size[b] == s);
+   if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere!
+   if (z->size[b] != s) return -1;  // was originally an assert, but report failure instead.
    a->code_buffer >>= s;
    a->num_bits -= s;
    return z->value[b];
@@ -3998,7 +4185,12 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
 stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
 {
    int b,s;
-   if (a->num_bits < 16) stbi__fill_bits(a);
+   if (a->num_bits < 16) {
+      if (stbi__zeof(a)) {
+         return -1;   /* report error for unexpected end of data. */
+      }
+      stbi__fill_bits(a);
+   }
    b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
    if (b) {
       s = b >> 9;
@@ -4012,13 +4204,16 @@ stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
 static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes
 {
    char *q;
-   int cur, limit, old_limit;
+   unsigned int cur, limit, old_limit;
    z->zout = zout;
    if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
-   cur   = (int) (z->zout     - z->zout_start);
-   limit = old_limit = (int) (z->zout_end - z->zout_start);
-   while (cur + n > limit)
+   cur   = (unsigned int) (z->zout - z->zout_start);
+   limit = old_limit = (unsigned) (z->zout_end - z->zout_start);
+   if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory");
+   while (cur + n > limit) {
+      if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory");
       limit *= 2;
+   }
    q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
    STBI_NOTUSED(old_limit);
    if (q == NULL) return stbi__err("outofmem", "Out of memory");
@@ -4116,11 +4311,12 @@ static int stbi__compute_huffman_codes(stbi__zbuf *a)
             c = stbi__zreceive(a,2)+3;
             if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
             fill = lencodes[n-1];
-         } else if (c == 17)
+         } else if (c == 17) {
             c = stbi__zreceive(a,3)+3;
-         else {
-            STBI_ASSERT(c == 18);
+         } else if (c == 18) {
             c = stbi__zreceive(a,7)+11;
+         } else {
+            return stbi__err("bad codelengths", "Corrupt PNG");
          }
          if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG");
          memset(lencodes+n, fill, c);
@@ -4146,7 +4342,7 @@ static int stbi__parse_uncompressed_block(stbi__zbuf *a)
       a->code_buffer >>= 8;
       a->num_bits -= 8;
    }
-   STBI_ASSERT(a->num_bits == 0);
+   if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG");
    // now fill header the normal way
    while (k < 4)
       header[k++] = stbi__zget8(a);
@@ -4168,6 +4364,7 @@ static int stbi__parse_zlib_header(stbi__zbuf *a)
    int cm    = cmf & 15;
    /* int cinfo = cmf >> 4; */
    int flg   = stbi__zget8(a);
+   if (stbi__zeof(a)) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
    if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
    if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png
    if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png
@@ -4175,7 +4372,7 @@ static int stbi__parse_zlib_header(stbi__zbuf *a)
    return 1;
 }
 
-static const stbi_uc stbi__zdefault_length[288] =
+static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] =
 {
    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
@@ -4221,7 +4418,7 @@ static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
       } else {
          if (type == 1) {
             // use fixed code lengths
-            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , 288)) return 0;
+            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , STBI__ZNSYMS)) return 0;
             if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
          } else {
             if (!stbi__compute_huffman_codes(a)) return 0;
@@ -4429,7 +4626,7 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
          return stbi__err("invalid filter","Corrupt PNG");
 
       if (depth < 8) {
-         STBI_ASSERT(img_width_bytes <= x);
+         if (img_width_bytes > x) return stbi__err("invalid width","Corrupt PNG");
          cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
          filter_bytes = 1;
          width = img_width_bytes;
@@ -4617,6 +4814,7 @@ static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint3
 
    // de-interlacing
    final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+   if (!final) return stbi__err("outofmem", "Out of memory");
    for (p=0; p < 7; ++p) {
       int xorig[] = { 0,4,0,2,0,1,0 };
       int yorig[] = { 0,0,4,0,2,0,1 };
@@ -4737,19 +4935,46 @@ static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int
    return 1;
 }
 
-static int stbi__unpremultiply_on_load = 0;
-static int stbi__de_iphone_flag = 0;
+static int stbi__unpremultiply_on_load_global = 0;
+static int stbi__de_iphone_flag_global = 0;
 
 STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
 {
-   stbi__unpremultiply_on_load = flag_true_if_should_unpremultiply;
+   stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply;
 }
 
 STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
 {
-   stbi__de_iphone_flag = flag_true_if_should_convert;
+   stbi__de_iphone_flag_global = flag_true_if_should_convert;
 }
 
+#ifndef STBI_THREAD_LOCAL
+#define stbi__unpremultiply_on_load  stbi__unpremultiply_on_load_global
+#define stbi__de_iphone_flag  stbi__de_iphone_flag_global
+#else
+static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;
+static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;
+
+STBIDEF void stbi__unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
+{
+   stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
+   stbi__unpremultiply_on_load_set = 1;
+}
+
+STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert)
+{
+   stbi__de_iphone_flag_local = flag_true_if_should_convert;
+   stbi__de_iphone_flag_set = 1;
+}
+
+#define stbi__unpremultiply_on_load  (stbi__unpremultiply_on_load_set           \
+                                       ? stbi__unpremultiply_on_load_local      \
+                                       : stbi__unpremultiply_on_load_global)
+#define stbi__de_iphone_flag  (stbi__de_iphone_flag_set                         \
+                                ? stbi__de_iphone_flag_local                    \
+                                : stbi__de_iphone_flag_global)
+#endif // STBI_THREAD_LOCAL
+
 static void stbi__de_iphone(stbi__png *z)
 {
    stbi__context *s = z->s;
@@ -4824,8 +5049,10 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             if (!first) return stbi__err("multiple IHDR","Corrupt PNG");
             first = 0;
             if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
-            s->img_x = stbi__get32be(s); if (s->img_x > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
-            s->img_y = stbi__get32be(s); if (s->img_y > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
+            s->img_x = stbi__get32be(s);
+            s->img_y = stbi__get32be(s);
+            if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+            if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
             z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
             color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
             if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
@@ -4942,6 +5169,8 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                ++s->img_n;
             }
             STBI_FREE(z->expanded); z->expanded = NULL;
+            // end of PNG chunk, read and skip CRC
+            stbi__get32be(s);
             return 1;
          }
 
@@ -4972,10 +5201,12 @@ static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, st
    void *result=NULL;
    if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
    if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
-      if (p->depth < 8)
+      if (p->depth <= 8)
          ri->bits_per_channel = 8;
+      else if (p->depth == 16)
+         ri->bits_per_channel = 16;
       else
-         ri->bits_per_channel = p->depth;
+         return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth");
       result = p->out;
       p->out = NULL;
       if (req_comp && req_comp != p->s->img_out_n) {
@@ -5036,7 +5267,7 @@ static int stbi__png_is16(stbi__context *s)
    stbi__png p;
    p.s = s;
    if (!stbi__png_info_raw(&p, NULL, NULL, NULL))
-     return 0;
+	   return 0;
    if (p.depth != 16) {
       stbi__rewind(p.s);
       return 0;
@@ -5111,7 +5342,7 @@ static int stbi__shiftsigned(unsigned int v, int shift, int bits)
       v <<= -shift;
    else
       v >>= shift;
-   STBI_ASSERT(/* v >= 0 && */ v < 256);
+   STBI_ASSERT(v < 256);
    v >>= (8-bits);
    STBI_ASSERT(bits >= 0 && bits <= 8);
    return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits];
@@ -5121,8 +5352,35 @@ typedef struct
 {
    int bpp, offset, hsz;
    unsigned int mr,mg,mb,ma, all_a;
+   int extra_read;
 } stbi__bmp_data;
 
+static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress)
+{
+   // BI_BITFIELDS specifies masks explicitly, don't override
+   if (compress == 3)
+      return 1;
+
+   if (compress == 0) {
+      if (info->bpp == 16) {
+         info->mr = 31u << 10;
+         info->mg = 31u <<  5;
+         info->mb = 31u <<  0;
+      } else if (info->bpp == 32) {
+         info->mr = 0xffu << 16;
+         info->mg = 0xffu <<  8;
+         info->mb = 0xffu <<  0;
+         info->ma = 0xffu << 24;
+         info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
+      } else {
+         // otherwise, use defaults, which is all-0
+         info->mr = info->mg = info->mb = info->ma = 0;
+      }
+      return 1;
+   }
+   return 0; // error
+}
+
 static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
 {
    int hsz;
@@ -5133,6 +5391,9 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
    info->offset = stbi__get32le(s);
    info->hsz = hsz = stbi__get32le(s);
    info->mr = info->mg = info->mb = info->ma = 0;
+   info->extra_read = 14;
+
+   if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP");
 
    if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
    if (hsz == 12) {
@@ -5147,6 +5408,8 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
    if (hsz != 12) {
       int compress = stbi__get32le(s);
       if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
+      if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes
+      if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel
       stbi__get32le(s); // discard sizeof
       stbi__get32le(s); // discard hres
       stbi__get32le(s); // discard vres
@@ -5161,21 +5424,12 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
          }
          if (info->bpp == 16 || info->bpp == 32) {
             if (compress == 0) {
-               if (info->bpp == 32) {
-                  info->mr = 0xffu << 16;
-                  info->mg = 0xffu <<  8;
-                  info->mb = 0xffu <<  0;
-                  info->ma = 0xffu << 24;
-                  info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
-               } else {
-                  info->mr = 31u << 10;
-                  info->mg = 31u <<  5;
-                  info->mb = 31u <<  0;
-               }
+               stbi__bmp_set_mask_defaults(info, compress);
             } else if (compress == 3) {
                info->mr = stbi__get32le(s);
                info->mg = stbi__get32le(s);
                info->mb = stbi__get32le(s);
+               info->extra_read += 12;
                // not documented, but generated by photoshop and handled by mspaint
                if (info->mr == info->mg && info->mg == info->mb) {
                   // ?!?!?
@@ -5185,6 +5439,7 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
                return stbi__errpuc("bad BMP", "bad BMP");
          }
       } else {
+         // V4/V5 header
          int i;
          if (hsz != 108 && hsz != 124)
             return stbi__errpuc("bad BMP", "bad BMP");
@@ -5192,6 +5447,8 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
          info->mg = stbi__get32le(s);
          info->mb = stbi__get32le(s);
          info->ma = stbi__get32le(s);
+         if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs
+            stbi__bmp_set_mask_defaults(info, compress);
          stbi__get32le(s); // discard color space
          for (i=0; i < 12; ++i)
             stbi__get32le(s); // discard color space parameters
@@ -5224,6 +5481,9 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req
    flip_vertically = ((int) s->img_y) > 0;
    s->img_y = abs((int) s->img_y);
 
+   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
    mr = info.mr;
    mg = info.mg;
    mb = info.mb;
@@ -5232,10 +5492,15 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req
 
    if (info.hsz == 12) {
       if (info.bpp < 24)
-         psize = (info.offset - 14 - 24) / 3;
+         psize = (info.offset - info.extra_read - 24) / 3;
    } else {
       if (info.bpp < 16)
-         psize = (info.offset - 14 - info.hsz) >> 2;
+         psize = (info.offset - info.extra_read - info.hsz) >> 2;
+   }
+   if (psize == 0) {
+      if (info.offset != s->callback_already_read + (s->img_buffer - s->img_buffer_original)) {
+        return stbi__errpuc("bad offset", "Corrupt BMP");
+      }
    }
 
    if (info.bpp == 24 && ma == 0xff000000)
@@ -5263,7 +5528,7 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req
          if (info.hsz != 12) stbi__get8(s);
          pal[i][3] = 255;
       }
-      stbi__skip(s, info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
+      stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
       if (info.bpp == 1) width = (s->img_x + 7) >> 3;
       else if (info.bpp == 4) width = (s->img_x + 1) >> 1;
       else if (info.bpp == 8) width = s->img_x;
@@ -5312,7 +5577,7 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req
       int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
       int z = 0;
       int easy=0;
-      stbi__skip(s, info.offset - 14 - info.hsz);
+      stbi__skip(s, info.offset - info.extra_read - info.hsz);
       if (info.bpp == 24) width = 3 * s->img_x;
       else if (info.bpp == 16) width = 2*s->img_x;
       else /* bpp = 32 and pad = 0 */ width=0;
@@ -5330,6 +5595,7 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req
          gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg);
          bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb);
          ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma);
+         if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
       }
       for (j=0; j < (int) s->img_y; ++j) {
          if (easy) {
@@ -5554,6 +5820,9 @@ static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req
    STBI_NOTUSED(tga_x_origin); // @TODO
    STBI_NOTUSED(tga_y_origin); // @TODO
 
+   if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
    //   do a tiny bit of precessing
    if ( tga_image_type >= 8 )
    {
@@ -5593,6 +5862,11 @@ static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req
       //   do I need to load a palette?
       if ( tga_indexed)
       {
+         if (tga_palette_len == 0) {  /* you have to have at least one entry! */
+            STBI_FREE(tga_data);
+            return stbi__errpuc("bad palette", "Corrupt TGA");
+         }
+
          //   any data to skip? (offset usually = 0)
          stbi__skip(s, tga_palette_start );
          //   load the palette
@@ -5801,6 +6075,9 @@ static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req
    h = stbi__get32be(s);
    w = stbi__get32be(s);
 
+   if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
    // Make sure the depth is 8 bits.
    bitdepth = stbi__get16be(s);
    if (bitdepth != 8 && bitdepth != 16)
@@ -6155,6 +6432,10 @@ static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_c
 
    x = stbi__get16be(s);
    y = stbi__get16be(s);
+
+   if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
    if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
    if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
 
@@ -6164,6 +6445,7 @@ static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_c
 
    // intermediate buffer is RGBA
    result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
+   if (!result) return stbi__errpuc("outofmem", "Out of memory");
    memset(result, 0xff, x*y*4);
 
    if (!stbi__pic_load_core(s,x,y,comp, result)) {
@@ -6263,6 +6545,9 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_in
    g->ratio = stbi__get8(s);
    g->transparent = -1;
 
+   if (g->w > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+   if (g->h > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
+
    if (comp != 0) *comp = 4;  // can't actually tell whether it's 3 or 4 until we parse the comments
 
    if (is_info) return 1;
@@ -6276,6 +6561,7 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_in
 static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
 {
    stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
+   if (!g) return stbi__err("outofmem", "Out of memory");
    if (!stbi__gif_header(s, g, comp, 1)) {
       STBI_FREE(g);
       stbi__rewind( s );
@@ -6440,7 +6726,7 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
       memset(g->history, 0x00, pcount);        // pixels that were affected previous frame
       first_frame = 1;
    } else {
-      // second frame - how do we dispoase of the previous one?
+      // second frame - how do we dispose of the previous one?
       dispose = (g->eflags & 0x1C) >> 2;
       pcount = g->w * g->h;
 
@@ -6585,6 +6871,17 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
    }
 }
 
+static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays)
+{
+   STBI_FREE(g->out);
+   STBI_FREE(g->history);
+   STBI_FREE(g->background);
+
+   if (out) STBI_FREE(out);
+   if (delays && *delays) STBI_FREE(*delays);
+   return stbi__errpuc("outofmem", "Out of memory");
+}
+
 static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
 {
    if (stbi__gif_test(s)) {
@@ -6594,6 +6891,12 @@ static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y,
       stbi_uc *two_back = 0;
       stbi__gif g;
       int stride;
+      int out_size = 0;
+      int delays_size = 0;
+
+      STBI_NOTUSED(out_size);
+      STBI_NOTUSED(delays_size);
+
       memset(&g, 0, sizeof(g));
       if (delays) {
          *delays = 0;
@@ -6610,14 +6913,31 @@ static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y,
             stride = g.w * g.h * 4;
 
             if (out) {
-               out = (stbi_uc*) STBI_REALLOC( out, layers * stride );
+               void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride );
+               if (!tmp)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
+               else {
+                   out = (stbi_uc*) tmp;
+                   out_size = layers * stride;
+               }
+
                if (delays) {
-                  *delays = (int*) STBI_REALLOC( *delays, sizeof(int) * layers );
+                  int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers );
+                  if (!new_delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
+                  *delays = new_delays;
+                  delays_size = layers * sizeof(int);
                }
             } else {
                out = (stbi_uc*)stbi__malloc( layers * stride );
+               if (!out)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
+               out_size = layers * stride;
                if (delays) {
                   *delays = (int*) stbi__malloc( layers * sizeof(int) );
+                  if (!*delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
+                  delays_size = layers * sizeof(int);
                }
             }
             memcpy( out + ((layers - 1) * stride), u, stride );
@@ -6796,6 +7116,9 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
    token += 3;
    width = (int) strtol(token, NULL, 10);
 
+   if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
+   if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
+
    *x = width;
    *y = height;
 
@@ -6938,9 +7261,10 @@ static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
 
    info.all_a = 255;
    p = stbi__bmp_parse_header(s, &info);
-   stbi__rewind( s );
-   if (p == NULL)
+   if (p == NULL) {
+      stbi__rewind( s );
       return 0;
+   }
    if (x) *x = s->img_x;
    if (y) *y = s->img_y;
    if (comp) {
@@ -7006,8 +7330,8 @@ static int stbi__psd_is16(stbi__context *s)
        stbi__rewind( s );
        return 0;
    }
-   (void) stbi__get32be(s);
-   (void) stbi__get32be(s);
+   STBI_NOTUSED(stbi__get32be(s));
+   STBI_NOTUSED(stbi__get32be(s));
    depth = stbi__get16be(s);
    if (depth != 16) {
        stbi__rewind( s );
@@ -7086,7 +7410,6 @@ static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
 // Known limitations:
 //    Does not support comments in the header section
 //    Does not support ASCII image data (formats P2 and P3)
-//    Does not support 16-bit-per-channel
 
 #ifndef STBI_NO_PNM
 
@@ -7107,19 +7430,23 @@ static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req
    stbi_uc *out;
    STBI_NOTUSED(ri);
 
-   if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n))
+   ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n);
+   if (ri->bits_per_channel == 0)
       return 0;
 
+   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
    *x = s->img_x;
    *y = s->img_y;
    if (comp) *comp = s->img_n;
 
-   if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0))
+   if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0))
       return stbi__errpuc("too large", "PNM too large");
 
-   out = (stbi_uc *) stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
+   out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0);
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
+   stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8));
 
    if (req_comp && req_comp != s->img_n) {
       out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
@@ -7195,11 +7522,19 @@ static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
    stbi__pnm_skip_whitespace(s, &c);
 
    maxv = stbi__pnm_getinteger(s, &c);  // read max value
-
-   if (maxv > 255)
-      return stbi__err("max value > 255", "PPM image not 8-bit");
+   if (maxv > 65535)
+      return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images");
+   else if (maxv > 255)
+      return 16;
    else
-      return 1;
+      return 8;
+}
+
+static int stbi__pnm_is16(stbi__context *s)
+{
+   if (stbi__pnm_info(s, NULL, NULL, NULL) == 16)
+	   return 1;
+   return 0;
 }
 #endif
 
@@ -7255,6 +7590,9 @@ static int stbi__is_16_main(stbi__context *s)
    if (stbi__psd_is16(s))  return 1;
    #endif
 
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_is16(s))  return 1;
+   #endif
    return 0;
 }
 

From fd312faaa2fd37de36d104ab779cc9463abe5c67 Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Sun, 21 Nov 2021 21:44:34 +0100
Subject: [PATCH 15/18] Update stb_image_write.h to 1.16 version. Do we need to
 introduce another STBIMAGE_LIBRARY and STBIMAGE_*_VERSION CMake variables?

---
 3rdparty/stb_image/README.md         |   2 +-
 3rdparty/stb_image/stb_image_write.h | 251 +++++++++++++++++++--------
 2 files changed, 179 insertions(+), 74 deletions(-)

diff --git a/3rdparty/stb_image/README.md b/3rdparty/stb_image/README.md
index efa37458eb..736bef4a74 100644
--- a/3rdparty/stb_image/README.md
+++ b/3rdparty/stb_image/README.md
@@ -14,7 +14,7 @@ library    | lastest version | category | LoC | description
 **[stb_vorbis.c](stb_vorbis.c)** | 1.16 | audio | 5486 | decode ogg vorbis files from file/memory to float/16-bit signed output
 **[stb_image.h](stb_image.h)** | 2.27 | graphics | 7897 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
 **[stb_truetype.h](stb_truetype.h)** | 1.21 | graphics | 4882 | parse, decode, and rasterize characters from truetype fonts
-**[stb_image_write.h](stb_image_write.h)** | 1.13 | graphics | 1617 | image writing to disk: PNG, TGA, BMP
+**[stb_image_write.h](stb_image_write.h)** | 1.16 | graphics | 1724 | image writing to disk: PNG, TGA, BMP
 **[stb_image_resize.h](stb_image_resize.h)** | 0.96 | graphics | 2630 | resize images larger/smaller with good quality
 **[stb_rect_pack.h](stb_rect_pack.h)** | 1.00 | graphics | 628 | simple 2D rectangle packer with decent quality
 **[stb_ds.h](stb_ds.h)** | 0.5 | utility | 1691 | typesafe dynamic array and hash tables for C, will compile in C++
diff --git a/3rdparty/stb_image/stb_image_write.h b/3rdparty/stb_image/stb_image_write.h
index c989bc1418..e4b32ed1bc 100644
--- a/3rdparty/stb_image/stb_image_write.h
+++ b/3rdparty/stb_image/stb_image_write.h
@@ -1,4 +1,4 @@
-/* stb_image_write - v1.13 - public domain - http://nothings.org/stb
+/* stb_image_write - v1.16 - public domain - http://nothings.org/stb
    writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
                                      no warranty implied; use at your own risk
 
@@ -140,6 +140,7 @@
       Ivan Tikhonov
       github:ignotion
       Adam Schackart
+      Andrew Kensler
 
 LICENSE
 
@@ -166,9 +167,9 @@ LICENSE
 #endif
 
 #ifndef STB_IMAGE_WRITE_STATIC  // C++ forbids static forward declarations
-extern int stbi_write_tga_with_rle;
-extern int stbi_write_png_compression_level;
-extern int stbi_write_force_png_filter;
+STBIWDEF int stbi_write_tga_with_rle;
+STBIWDEF int stbi_write_png_compression_level;
+STBIWDEF int stbi_write_force_png_filter;
 #endif
 
 #ifndef STBI_WRITE_NO_STDIO
@@ -178,7 +179,7 @@ STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const
 STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
 STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void  *data, int quality);
 
-#ifdef STBI_WINDOWS_UTF8
+#ifdef STBIW_WINDOWS_UTF8
 STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
 #endif
 #endif
@@ -247,17 +248,17 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 #define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
 
 #ifdef STB_IMAGE_WRITE_STATIC
-static int stbi__flip_vertically_on_write=0;
 static int stbi_write_png_compression_level = 8;
 static int stbi_write_tga_with_rle = 1;
 static int stbi_write_force_png_filter = -1;
 #else
 int stbi_write_png_compression_level = 8;
-int stbi__flip_vertically_on_write=0;
 int stbi_write_tga_with_rle = 1;
 int stbi_write_force_png_filter = -1;
 #endif
 
+static int stbi__flip_vertically_on_write = 0;
+
 STBIWDEF void stbi_flip_vertically_on_write(int flag)
 {
    stbi__flip_vertically_on_write = flag;
@@ -267,6 +268,8 @@ typedef struct
 {
    stbi_write_func *func;
    void *context;
+   unsigned char buffer[64];
+   int buf_used;
 } stbi__write_context;
 
 // initialize a callback-based context
@@ -283,7 +286,7 @@ static void stbi__stdio_write(void *context, void *data, int size)
    fwrite(data,1,size,(FILE*) context);
 }
 
-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
 #ifdef __cplusplus
 #define STBIW_EXTERN extern "C"
 #else
@@ -294,25 +297,25 @@ STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned in
 
 STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
 {
-  return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
+   return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
 }
 #endif
 
 static FILE *stbiw__fopen(char const *filename, char const *mode)
 {
    FILE *f;
-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
    wchar_t wMode[64];
    wchar_t wFilename[1024];
-  if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)))
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
       return 0;
 
-  if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)))
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
       return 0;
 
-#if _MSC_VER >= 1400
-  if (0 != _wfopen_s(&f, wFilename, wMode))
-    f = 0;
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+   if (0 != _wfopen_s(&f, wFilename, wMode))
+      f = 0;
 #else
    f = _wfopen(wFilename, wMode);
 #endif
@@ -380,16 +383,36 @@ static void stbiw__writef(stbi__write_context *s, const char *fmt, ...)
    va_end(v);
 }
 
+static void stbiw__write_flush(stbi__write_context *s)
+{
+   if (s->buf_used) {
+      s->func(s->context, &s->buffer, s->buf_used);
+      s->buf_used = 0;
+   }
+}
+
 static void stbiw__putc(stbi__write_context *s, unsigned char c)
 {
    s->func(s->context, &c, 1);
 }
 
+static void stbiw__write1(stbi__write_context *s, unsigned char a)
+{
+   if ((size_t)s->buf_used + 1 > sizeof(s->buffer))
+      stbiw__write_flush(s);
+   s->buffer[s->buf_used++] = a;
+}
+
 static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
 {
-   unsigned char arr[3];
-   arr[0] = a; arr[1] = b; arr[2] = c;
-   s->func(s->context, arr, 3);
+   int n;
+   if ((size_t)s->buf_used + 3 > sizeof(s->buffer))
+      stbiw__write_flush(s);
+   n = s->buf_used;
+   s->buf_used = n+3;
+   s->buffer[n+0] = a;
+   s->buffer[n+1] = b;
+   s->buffer[n+2] = c;
 }
 
 static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d)
@@ -398,7 +421,7 @@ static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, in
    int k;
 
    if (write_alpha < 0)
-      s->func(s->context, &d[comp - 1], 1);
+      stbiw__write1(s, d[comp - 1]);
 
    switch (comp) {
       case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
@@ -406,7 +429,7 @@ static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, in
          if (expand_mono)
             stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
          else
-            s->func(s->context, d, 1);  // monochrome TGA
+            stbiw__write1(s, d[0]);  // monochrome TGA
          break;
       case 4:
          if (!write_alpha) {
@@ -422,7 +445,7 @@ static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, in
          break;
    }
    if (write_alpha > 0)
-      s->func(s->context, &d[comp - 1], 1);
+      stbiw__write1(s, d[comp - 1]);
 }
 
 static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
@@ -447,6 +470,7 @@ static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, i
          unsigned char *d = (unsigned char *) data + (j*x+i)*comp;
          stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
       }
+      stbiw__write_flush(s);
       s->func(s->context, &zero, scanline_pad);
    }
 }
@@ -467,16 +491,27 @@ static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x,
 
 static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
 {
-   int pad = (-x*3) & 3;
-   return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
-           "11 4 22 4" "4 44 22 444444",
-           'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
-            40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
+   if (comp != 4) {
+      // write RGB bitmap
+      int pad = (-x*3) & 3;
+      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
+              "11 4 22 4" "4 44 22 444444",
+              'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
+               40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
+   } else {
+      // RGBA bitmaps need a v4 header
+      // use BI_BITFIELDS mode with 32bpp and alpha mask
+      // (straight BI_RGB with alpha mask doesn't work in most readers)
+      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0,
+         "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444",
+         'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header
+         108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header
+   }
 }
 
 STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
 {
-   stbi__write_context s;
+   stbi__write_context s = { 0 };
    stbi__start_write_callbacks(&s, func, context);
    return stbi_write_bmp_core(&s, x, y, comp, data);
 }
@@ -484,7 +519,7 @@ STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x,
 #ifndef STBI_WRITE_NO_STDIO
 STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
 {
-   stbi__write_context s;
+   stbi__write_context s = { 0 };
    if (stbi__start_write_file(&s,filename)) {
       int r = stbi_write_bmp_core(&s, x, y, comp, data);
       stbi__end_write_file(&s);
@@ -557,24 +592,25 @@ static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, v
 
             if (diff) {
                unsigned char header = STBIW_UCHAR(len - 1);
-               s->func(s->context, &header, 1);
+               stbiw__write1(s, header);
                for (k = 0; k < len; ++k) {
                   stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
                }
             } else {
                unsigned char header = STBIW_UCHAR(len - 129);
-               s->func(s->context, &header, 1);
+               stbiw__write1(s, header);
                stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
             }
          }
       }
+      stbiw__write_flush(s);
    }
    return 1;
 }
 
 STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
 {
-   stbi__write_context s;
+   stbi__write_context s = { 0 };
    stbi__start_write_callbacks(&s, func, context);
    return stbi_write_tga_core(&s, x, y, comp, (void *) data);
 }
@@ -582,7 +618,7 @@ STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x,
 #ifndef STBI_WRITE_NO_STDIO
 STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
 {
-   stbi__write_context s;
+   stbi__write_context s = { 0 };
    if (stbi__start_write_file(&s,filename)) {
       int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
       stbi__end_write_file(&s);
@@ -598,6 +634,8 @@ STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const
 
 #define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))
 
+#ifndef STBI_WRITE_NO_STDIO
+
 static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
 {
    int exponent;
@@ -732,7 +770,7 @@ static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, f
       char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
       s->func(s->context, header, sizeof(header)-1);
 
-#ifdef __STDC_WANT_SECURE_LIB__
+#ifdef __STDC_LIB_EXT1__
       len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
 #else
       len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
@@ -748,15 +786,14 @@ static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, f
 
 STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
 {
-   stbi__write_context s;
+   stbi__write_context s = { 0 };
    stbi__start_write_callbacks(&s, func, context);
    return stbi_write_hdr_core(&s, x, y, comp, (float *) data);
 }
 
-#ifndef STBI_WRITE_NO_STDIO
 STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
 {
-   stbi__write_context s;
+   stbi__write_context s = { 0 };
    if (stbi__start_write_file(&s,filename)) {
       int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data);
       stbi__end_write_file(&s);
@@ -774,7 +811,7 @@ STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const
 
 #ifndef STBIW_ZLIB_COMPRESS
 // stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
-#define stbiw__sbraw(a) ((int *) (a) - 2)
+#define stbiw__sbraw(a) ((int *) (void *) (a) - 2)
 #define stbiw__sbm(a)   stbiw__sbraw(a)[0]
 #define stbiw__sbn(a)   stbiw__sbraw(a)[1]
 
@@ -944,6 +981,23 @@ STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, i
       (void) stbiw__sbfree(hash_table[i]);
    STBIW_FREE(hash_table);
 
+   // store uncompressed instead if compression was worse
+   if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) {
+      stbiw__sbn(out) = 2;  // truncate to DEFLATE 32K window and FLEVEL = 1
+      for (j = 0; j < data_len;) {
+         int blocklen = data_len - j;
+         if (blocklen > 32767) blocklen = 32767;
+         stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression
+         stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN
+         stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8));
+         stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN
+         stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8));
+         memcpy(out+stbiw__sbn(out), data+j, blocklen);
+         stbiw__sbn(out) += blocklen;
+         j += blocklen;
+      }
+   }
+
    {
       // compute adler32 on input
       unsigned int s1=1, s2=0;
@@ -1271,26 +1325,31 @@ static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
    bits[0] = val & ((1<<bits[1])-1);
 }
 
-static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {
+static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, int du_stride, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {
    const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] };
    const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] };
-   int dataOff, i, diff, end0pos;
+   int dataOff, i, j, n, diff, end0pos, x, y;
    int DU[64];
 
    // DCT rows
-   for(dataOff=0; dataOff<64; dataOff+=8) {
+   for(dataOff=0, n=du_stride*8; dataOff<n; dataOff+=du_stride) {
       stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+1], &CDU[dataOff+2], &CDU[dataOff+3], &CDU[dataOff+4], &CDU[dataOff+5], &CDU[dataOff+6], &CDU[dataOff+7]);
    }
    // DCT columns
    for(dataOff=0; dataOff<8; ++dataOff) {
-      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+8], &CDU[dataOff+16], &CDU[dataOff+24], &CDU[dataOff+32], &CDU[dataOff+40], &CDU[dataOff+48], &CDU[dataOff+56]);
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+du_stride], &CDU[dataOff+du_stride*2], &CDU[dataOff+du_stride*3], &CDU[dataOff+du_stride*4],
+                     &CDU[dataOff+du_stride*5], &CDU[dataOff+du_stride*6], &CDU[dataOff+du_stride*7]);
    }
    // Quantize/descale/zigzag the coefficients
-   for(i=0; i<64; ++i) {
-      float v = CDU[i]*fdtbl[i];
-      // DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
-      // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
-      DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
+   for(y = 0, j=0; y < 8; ++y) {
+      for(x = 0; x < 8; ++x,++j) {
+         float v;
+         i = y*du_stride+x;
+         v = CDU[i]*fdtbl[j];
+         // DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
+         // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
+         DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
+      }
    }
 
    // Encode DC
@@ -1408,7 +1467,7 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, in
    static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f,
                                  1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
 
-   int row, col, i, k;
+   int row, col, i, k, subsample;
    float fdtbl_Y[64], fdtbl_UV[64];
    unsigned char YTable[64], UVTable[64];
 
@@ -1417,6 +1476,7 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, in
    }
 
    quality = quality ? quality : 90;
+   subsample = quality <= 90 ? 1 : 0;
    quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
    quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
 
@@ -1439,7 +1499,7 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, in
       static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 };
       static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 };
       const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width),
-                                      3,1,0x11,0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 };
+                                      3,1,(unsigned char)(subsample?0x22:0x11),0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 };
       s->func(s->context, (void*)head0, sizeof(head0));
       s->func(s->context, (void*)YTable, sizeof(YTable));
       stbiw__putc(s, 1);
@@ -1462,36 +1522,74 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, in
    // Encode 8x8 macroblocks
    {
       static const unsigned short fillBits[] = {0x7F, 7};
-      const unsigned char *imageData = (const unsigned char *)data;
       int DCY=0, DCU=0, DCV=0;
       int bitBuf=0, bitCnt=0;
       // comp == 2 is grey+alpha (alpha is ignored)
       int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
+      const unsigned char *dataR = (const unsigned char *)data;
+      const unsigned char *dataG = dataR + ofsG;
+      const unsigned char *dataB = dataR + ofsB;
       int x, y, pos;
-      for(y = 0; y < height; y += 8) {
-         for(x = 0; x < width; x += 8) {
-            float YDU[64], UDU[64], VDU[64];
-            for(row = y, pos = 0; row < y+8; ++row) {
-               // row >= height => use last input row
-               int clamped_row = (row < height) ? row : height - 1;
-               int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
-               for(col = x; col < x+8; ++col, ++pos) {
-                  float r, g, b;
-                  // if col >= width => use pixel from last input column
-                  int p = base_p + ((col < width) ? col : (width-1))*comp;
-
-                  r = imageData[p+0];
-                  g = imageData[p+ofsG];
-                  b = imageData[p+ofsB];
-                  YDU[pos]=+0.29900f*r+0.58700f*g+0.11400f*b-128;
-                  UDU[pos]=-0.16874f*r-0.33126f*g+0.50000f*b;
-                  VDU[pos]=+0.50000f*r-0.41869f*g-0.08131f*b;
+      if(subsample) {
+         for(y = 0; y < height; y += 16) {
+            for(x = 0; x < width; x += 16) {
+               float Y[256], U[256], V[256];
+               for(row = y, pos = 0; row < y+16; ++row) {
+                  // row >= height => use last input row
+                  int clamped_row = (row < height) ? row : height - 1;
+                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
+                  for(col = x; col < x+16; ++col, ++pos) {
+                     // if col >= width => use pixel from last input column
+                     int p = base_p + ((col < width) ? col : (width-1))*comp;
+                     float r = dataR[p], g = dataG[p], b = dataB[p];
+                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
+                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
+                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
+                  }
+               }
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+0,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+8,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+128, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+136, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+
+               // subsample U,V
+               {
+                  float subU[64], subV[64];
+                  int yy, xx;
+                  for(yy = 0, pos = 0; yy < 8; ++yy) {
+                     for(xx = 0; xx < 8; ++xx, ++pos) {
+                        int j = yy*32+xx*2;
+                        subU[pos] = (U[j+0] + U[j+1] + U[j+16] + U[j+17]) * 0.25f;
+                        subV[pos] = (V[j+0] + V[j+1] + V[j+16] + V[j+17]) * 0.25f;
+                     }
+                  }
+                  DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
+                  DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
                }
             }
+         }
+      } else {
+         for(y = 0; y < height; y += 8) {
+            for(x = 0; x < width; x += 8) {
+               float Y[64], U[64], V[64];
+               for(row = y, pos = 0; row < y+8; ++row) {
+                  // row >= height => use last input row
+                  int clamped_row = (row < height) ? row : height - 1;
+                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
+                  for(col = x; col < x+8; ++col, ++pos) {
+                     // if col >= width => use pixel from last input column
+                     int p = base_p + ((col < width) ? col : (width-1))*comp;
+                     float r = dataR[p], g = dataG[p], b = dataB[p];
+                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
+                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
+                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
+                  }
+               }
 
-            DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, YDU, fdtbl_Y, DCY, YDC_HT, YAC_HT);
-            DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, UDU, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
-            DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, VDU, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y,  DCY, YDC_HT, YAC_HT);
+               DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
+               DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
+            }
          }
       }
 
@@ -1508,7 +1606,7 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, in
 
 STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
 {
-   stbi__write_context s;
+   stbi__write_context s = { 0 };
    stbi__start_write_callbacks(&s, func, context);
    return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality);
 }
@@ -1517,7 +1615,7 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x,
 #ifndef STBI_WRITE_NO_STDIO
 STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
 {
-   stbi__write_context s;
+   stbi__write_context s = { 0 };
    if (stbi__start_write_file(&s,filename)) {
       int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
       stbi__end_write_file(&s);
@@ -1530,6 +1628,13 @@ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const
 #endif // STB_IMAGE_WRITE_IMPLEMENTATION
 
 /* Revision history
+      1.16  (2021-07-11)
+             make Deflate code emit uncompressed blocks when it would otherwise expand
+             support writing BMPs with alpha channel
+      1.15  (2020-07-13) unknown
+      1.14  (2020-02-02) updated JPEG writer to downsample chroma channels
+      1.13
+      1.12
       1.11  (2019-08-11)
 
       1.10  (2019-02-07)
@@ -1564,7 +1669,7 @@ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const
              add HDR output
              fix monochrome BMP
       0.95 (2014-08-17)
-           add monochrome TGA output
+             add monochrome TGA output
       0.94 (2014-05-31)
              rename private functions to avoid conflicts with stb_image.h
       0.93 (2014-05-27)

From 02895d0ec39d3d2fbeb039fecc0789445aec8e05 Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Sun, 21 Nov 2021 23:05:58 +0100
Subject: [PATCH 16/18] Add missing file. Update Simd CMakeLists.txt. Add
 SimdFree().

---
 3rdparty/simdlib/CMakeLists.txt               |  10 +-
 .../simdlib/Simd/SimdNeonImageSavePng.cpp     | 409 ++++++++++++++++++
 .../io/src/image/private/vpImageIoSimd.cpp    |   8 +-
 3 files changed, 423 insertions(+), 4 deletions(-)
 create mode 100644 3rdparty/simdlib/Simd/SimdNeonImageSavePng.cpp

diff --git a/3rdparty/simdlib/CMakeLists.txt b/3rdparty/simdlib/CMakeLists.txt
index 95b3358ad2..f737f8ea89 100644
--- a/3rdparty/simdlib/CMakeLists.txt
+++ b/3rdparty/simdlib/CMakeLists.txt
@@ -89,11 +89,19 @@ if(X86 OR X86_64)
     set_source_files_properties(${SIMD_SSE41_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${SSE4_2_FLAG}")
 
     file(GLOB_RECURSE SIMD_AVX1_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx1*.cpp)
-    set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}")
+    if(MSVC)
+        set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG} -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store")
+    elseif((CMAKE_CXX_COMPILER MATCHES "clang") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
+        set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG}")
+    else()
+        set_source_files_properties(${SIMD_AVX1_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX_FLAG} -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store")
+    endif()
 
     file(GLOB_RECURSE SIMD_AVX2_SRC ${CMAKE_CURRENT_SOURCE_DIR}/Simd/SimdAvx2*.cpp)
     if(MSVC)
         set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store")
+    elseif((CMAKE_CXX_COMPILER MATCHES "clang") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
+        set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mfma -mbmi -mbmi2 -mlzcnt")
     else()
         set_source_files_properties(${SIMD_AVX2_SRC} PROPERTIES COMPILE_FLAGS "${COMMON_CXX_FLAGS} ${AVX2_FLAG} -mfma -mbmi -mbmi2 -mlzcnt -fabi-version=4 -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store")
     endif()
diff --git a/3rdparty/simdlib/Simd/SimdNeonImageSavePng.cpp b/3rdparty/simdlib/Simd/SimdNeonImageSavePng.cpp
new file mode 100644
index 0000000000..330a64374d
--- /dev/null
+++ b/3rdparty/simdlib/Simd/SimdNeonImageSavePng.cpp
@@ -0,0 +1,409 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2021 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdMemory.h"
+#include "Simd/SimdImageSave.h"
+#include "Simd/SimdImageSavePng.h"
+#include "Simd/SimdBase.h"
+#include "Simd/SimdNeon.h"
+#include "Simd/SimdSet.h"
+#include "Simd/SimdExtract.h"
+#include "Simd/SimdStore.h"
+
+namespace Simd
+{        
+#ifdef SIMD_NEON_ENABLE    
+    namespace Neon
+    {
+        uint32_t ZlibAdler32(uint8_t* data, int size)
+        {
+            int32x4_t _i0 = SetI32(0, -1, -2, -3), _4 = vdupq_n_s32(4);
+            uint32_t lo = 1, hi = 0;
+            for (int b = 0, n = (int)(size % 5552); b < size;)
+            {
+                int n8 = n & (~7), i = 0;
+                int32x4_t _i = vaddq_s32(_i0, vdupq_n_s32(n));
+                int32x4_t _l = vdupq_n_s32(0), _h = vdupq_n_s32(0);
+                for (; i < n8; i += 8)
+                {
+                    uint8x8_t d8 = LoadHalf<false>(data + b + i);
+                    int16x8_t d16 = (int16x8_t)vmovl_u8(d8);
+                    int32x4_t d0 = vmovl_s16(Half<0>(d16));
+                    _l = vaddq_s32(_l, d0);
+                    _h = vmlaq_s32(_h, d0, _i);
+                    _i = vsubq_s32(_i, _4);
+                    int32x4_t d1 = vmovl_s16((int16x4_t)Half<1>(d16));
+                    _l = vaddq_s32(_l, d1);
+                    _h = vmlaq_s32(_h, d1, _i);
+                    _i = vsubq_s32(_i, _4);
+                }
+                int l = ExtractSum32s(_l), h = ExtractSum32s(_h);
+                for (; i < n; ++i)
+                {
+                    l += data[b + i];
+                    h += data[b + i]*(n - i);
+                }
+                hi = (hi + h + lo*n) % 65521;
+                lo = (lo + l) % 65521;
+                b += n;
+                n = 5552;
+            }
+            return (hi << 16) | lo;
+        }
+
+        void ZlibCompress(uint8_t* data, int size, int quality, OutputMemoryStream& stream)
+        {
+            const int ZHASH = 16384;
+            if (quality < 5)
+                quality = 5;
+            const int basket = quality * 2;
+            Array32i hashTable(ZHASH * basket);
+            memset(hashTable.data, -1, hashTable.RawSize());
+
+            stream.Write(uint8_t(0x78));
+            stream.Write(uint8_t(0x5e));
+            stream.WriteBits(1, 1);
+            stream.WriteBits(1, 2);
+
+            int i = 0, j;
+            while (i < size - 3)
+            {
+                int h = Base::ZlibHash(data + i) & (ZHASH - 1), best = 3;
+                uint8_t* bestLoc = 0;
+                int* hList = hashTable.data + h * basket;
+                for (j = 0; hList[j] != -1 && j < basket; ++j)
+                {
+                    if (hList[j] > i - 32768)
+                    {
+                        int d = Base::ZlibCount(data + hList[j], data + i, size - i);
+                        if (d >= best)
+                        {
+                            best = d;
+                            bestLoc = data + hList[j];
+                        }
+                    }
+                }
+                if (j == basket)
+                {
+                    memcpy(hList, hList + quality, quality * sizeof(int));
+                    memset(hList + quality, -1, quality * sizeof(int));
+                    j = quality;
+                }
+                hList[j] = i;
+
+                if (bestLoc)
+                {
+                    h = Base::ZlibHash(data + i + 1) & (ZHASH - 1);
+                    int* hList = hashTable.data + h * basket;
+                    for (j = 0; hList[j] != -1 && j < basket; ++j)
+                    {
+                        if (hList[j] > i - 32767)
+                        {
+                            int e = Base::ZlibCount(data + hList[j], data + i + 1, size - i - 1);
+                            if (e > best)
+                            {
+                                bestLoc = NULL;
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                if (bestLoc)
+                {
+                    int d = (int)(data + i - bestLoc);
+                    assert(d <= 32767 && best <= 258);
+                    for (j = 0; best > Base::ZlibLenC[j + 1] - 1; ++j);
+                    Base::ZlibHuff(j + 257, stream);
+                    if (Base::ZlibLenEb[j])
+                        stream.WriteBits(best - Base::ZlibLenC[j], Base::ZlibLenEb[j]);
+                    for (j = 0; d > Base::ZlibDistC[j + 1] - 1; ++j);
+                    stream.WriteBits(Base::ZlibBitRev(j, 5), 5);
+                    if (Base::ZlibDistEb[j])
+                        stream.WriteBits(d - Base::ZlibDistC[j], Base::ZlibDistEb[j]);
+                    i += best;
+                }
+                else
+                {
+                    Base::ZlibHuffB(data[i], stream);
+                    ++i;
+                }
+            }
+            for (; i < size; ++i)
+                Base::ZlibHuffB(data[i], stream);
+            Base::ZlibHuff(256, stream);
+            stream.FlushBits();
+            stream.WriteBe32u(ZlibAdler32(data, size));
+        }
+
+        uint32_t EncodeLine0(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size, A), bS = A << 7, bC = (sizeA >> 7) + 1;
+            uint32x4_t _sum = vdupq_n_u32(0);
+            for (size_t b = 0; b < bC; ++b)
+            {
+                uint16x8_t bSum = vdupq_n_u16(0);
+                for (size_t end = Min(i + bS, sizeA); i < end; i += A)
+                {
+                    int8x16_t _src = (int8x16_t)Load<false>(src + i);
+                    Store<false>(dst + i, _src);
+                    bSum = vaddq_u16(bSum, vpaddlq_u8((uint8x16_t)vabsq_s8(_src)));
+                }
+                _sum = vaddq_u32(_sum, vpaddlq_u16(bSum));
+            }
+            uint32_t sum = ExtractSum32u(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine1(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n, bS = A << 7, bC = (sizeA >> 7) + 1;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            uint32x4_t _sum = vdupq_n_u32(0);
+            for (size_t b = 0; b < bC; ++b)
+            {
+                uint16x8_t bSum = vdupq_n_u16(0);
+                for (size_t end = Min(i + bS, sizeA); i < end; i += A)
+                {
+                    int8x16_t _src0 = (int8x16_t)Load<false>(src + i);
+                    int8x16_t _src1 = (int8x16_t)Load<false>(src + i - n);
+                    int8x16_t _dst = vsubq_s8(_src0, _src1);
+                    Store<false>(dst + i, _dst);
+                    bSum = vaddq_u16(bSum, vpaddlq_u8((uint8x16_t)vabsq_s8(_dst)));
+                }
+                _sum = vaddq_u32(_sum, vpaddlq_u16(bSum));
+            }
+            sum += ExtractSum32u(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine2(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n, bS = A << 7, bC = (sizeA >> 7) + 1;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            uint32x4_t _sum = vdupq_n_u32(0);
+            for (size_t b = 0; b < bC; ++b)
+            {
+                uint16x8_t bSum = vdupq_n_u16(0);
+                for (size_t end = Min(i + bS, sizeA); i < end; i += A)
+                {
+                    int8x16_t _src0 = (int8x16_t)Load<false>(src + i);
+                    int8x16_t _src1 = (int8x16_t)Load<false>(src + i - stride);
+                    int8x16_t _dst = vsubq_s8(_src0, _src1);
+                    Store<false>(dst + i, _dst);
+                    bSum = vaddq_u16(bSum, vpaddlq_u8((uint8x16_t)vabsq_s8(_dst)));
+                }
+                _sum = vaddq_u32(_sum, vpaddlq_u16(bSum));
+            }
+            sum += ExtractSum32u(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - stride];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine3(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n, bS = A << 7, bC = (sizeA >> 7) + 1;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i] - (src[i - stride] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            uint32x4_t _sum = vdupq_n_u32(0);
+            for (size_t b = 0; b < bC; ++b)
+            {
+                uint16x8_t bSum = vdupq_n_u16(0);
+                for (size_t end = Min(i + bS, sizeA); i < end; i += A)
+                {
+                    uint8x16_t _src0 = Load<false>(src + i);
+                    uint8x16_t _src1 = Load<false>(src + i - n);
+                    uint8x16_t _src2 = Load<false>(src + i - stride);
+                    int8x16_t _dst = (int8x16_t)vsubq_u8(_src0, vhaddq_u8(_src1, _src2));
+                    Store<false>(dst + i, _dst);
+                    bSum = vaddq_u16(bSum, vpaddlq_u8((uint8x16_t)vabsq_s8(_dst)));
+                }
+                _sum = vaddq_u32(_sum, vpaddlq_u16(bSum));
+            }
+            sum += ExtractSum32u(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - ((src[i - n] + src[i - stride]) >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        SIMD_INLINE uint16x8_t Paeth(uint16x8_t a, uint16x8_t b, uint16x8_t c)
+        {
+            int16x8_t p = (int16x8_t)vsubq_u16(vaddq_u16(a, b), c);
+            int16x8_t pa = vabsq_s16(vsubq_s16(p, (int16x8_t)a));
+            int16x8_t pb = vabsq_s16(vsubq_s16(p, (int16x8_t)b));
+            int16x8_t pc = vabsq_s16(vsubq_s16(p, (int16x8_t)c));
+            uint16x8_t mbc = vorrq_u16(vcgtq_s16(pa, pb), vcgtq_s16(pa, pc));
+            uint16x8_t mc = vcgtq_s16(pb, pc);
+            return (uint16x8_t)vbslq_u16(mbc, vbslq_u16(mc, c, b), a);
+        }
+
+        uint32_t EncodeLine4(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n, bS = A << 7, bC = (sizeA >> 7) + 1;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = (int8_t)(src[i] - src[i - stride]);
+                sum += ::abs(dst[i]);
+            }
+            uint32x4_t _sum = vdupq_n_u32(0);
+            for (size_t b = 0; b < bC; ++b)
+            {
+                uint16x8_t bSum = vdupq_n_u16(0);
+                for (size_t end = Min(i + bS, sizeA); i < end; i += A)
+                {
+                    uint8x16_t _src0 = Load<false>(src + i);
+                    uint8x16_t _src1 = Load<false>(src + i - n);
+                    uint8x16_t _src2 = Load<false>(src + i - stride);
+                    uint8x16_t _src3 = Load<false>(src + i - stride - n);
+                    uint16x8_t lo = Paeth(UnpackU8<0>(_src1), UnpackU8<0>(_src2), UnpackU8<0>(_src3));
+                    uint16x8_t hi = Paeth(UnpackU8<1>(_src1), UnpackU8<1>(_src2), UnpackU8<1>(_src3));
+                    int8x16_t _dst = (int8x16_t)vsubq_u8(_src0, PackU16(lo, hi));
+                    Store<false>(dst + i, _dst);
+                    bSum = vaddq_u16(bSum, vpaddlq_u8((uint8x16_t)vabsq_s8(_dst)));
+                }
+                _sum = vaddq_u32(_sum, vpaddlq_u16(bSum));
+            }
+            sum += ExtractSum32u(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - Base::Paeth(src[i - n], src[i - stride], src[i - stride - n]);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine5(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n, bS = A << 7, bC = (sizeA >> 7) + 1;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            uint32x4_t _sum = vdupq_n_u32(0);
+            for (size_t b = 0; b < bC; ++b)
+            {
+                uint16x8_t bSum = vdupq_n_u16(0);
+                for (size_t end = Min(i + bS, sizeA); i < end; i += A)
+                {
+                    uint8x16_t _src0 = Load<false>(src + i);
+                    uint8x16_t _src1 = Load<false>(src + i - n);
+                    int8x16_t _dst = (int8x16_t)vsubq_u8(_src0, vshrq_n_u8(_src1, 1));
+                    Store<false>(dst + i, _dst);
+                    bSum = vaddq_u16(bSum, vpaddlq_u8((uint8x16_t)vabsq_s8(_dst)));
+                }
+                _sum = vaddq_u32(_sum, vpaddlq_u16(bSum));
+            }
+            sum += ExtractSum32u(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - (src[i - n] >> 1);
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        uint32_t EncodeLine6(const uint8_t* src, size_t stride, size_t n, size_t size, int8_t* dst)
+        {
+            size_t i = 0, sizeA = AlignLo(size - n, A) + n, bS = A << 7, bC = (sizeA >> 7) + 1;
+            uint32_t sum = 0;
+            for (; i < n; ++i)
+            {
+                dst[i] = src[i];
+                sum += ::abs(dst[i]);
+            }
+            uint32x4_t _sum = vdupq_n_u32(0);
+            for (size_t b = 0; b < bC; ++b)
+            {
+                uint16x8_t bSum = vdupq_n_u16(0);
+                for (size_t end = Min(i + bS, sizeA); i < end; i += A)
+                {
+                    int8x16_t _src0 = (int8x16_t)Load<false>(src + i);
+                    int8x16_t _src1 = (int8x16_t)Load<false>(src + i - n);
+                    int8x16_t _dst = vsubq_s8(_src0, _src1);
+                    Store<false>(dst + i, _dst);
+                    bSum = vaddq_u16(bSum, vpaddlq_u8((uint8x16_t)vabsq_s8(_dst)));
+                }
+                _sum = vaddq_u32(_sum, vpaddlq_u16(bSum));
+            }
+            sum += ExtractSum32u(_sum);
+            for (; i < size; ++i)
+            {
+                dst[i] = src[i] - src[i - n];
+                sum += ::abs(dst[i]);
+            }
+            return sum;
+        }
+
+        ImagePngSaver::ImagePngSaver(const ImageSaverParam& param)
+            : Base::ImagePngSaver(param)
+        {
+            if (_param.format == SimdPixelFormatBgr24)
+                _convert = Neon::BgrToRgb;
+            else if (_param.format == SimdPixelFormatBgra32)
+                _convert = Neon::BgraToRgba;
+            _encode[0] = Neon::EncodeLine0;
+            _encode[1] = Neon::EncodeLine1;
+            _encode[2] = Neon::EncodeLine2;
+            _encode[3] = Neon::EncodeLine3;
+            _encode[4] = Neon::EncodeLine4;
+            _encode[5] = Neon::EncodeLine5;
+            _encode[6] = Neon::EncodeLine6;
+            _compress = Neon::ZlibCompress;
+        }
+    }
+#endif// SIMD_NEON_ENABLE
+}
diff --git a/modules/io/src/image/private/vpImageIoSimd.cpp b/modules/io/src/image/private/vpImageIoSimd.cpp
index 4612aa5f7f..424286dc70 100644
--- a/modules/io/src/image/private/vpImageIoSimd.cpp
+++ b/modules/io/src/image/private/vpImageIoSimd.cpp
@@ -39,7 +39,7 @@
 */
 
 #include "vpImageIoBackend.h"
-#include <Simd/SimdLib.hpp>
+#include <Simd/SimdLib.h>
 
 
 //TODO:
@@ -48,8 +48,9 @@ void readSimdlib(vpImage<unsigned char> &I, const std::string &filename)
   size_t stride = 0, width = 0, height = 0;
   SimdPixelFormatType format = SimdPixelFormatGray8;
   uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format);
-  const bool copyData = false;
+  const bool copyData = true;
   I.init(data, (unsigned int)height, (unsigned int)width, copyData);
+  SimdFree(data);
 }
 
 void readSimdlib(vpImage<vpRGBa> &I, const std::string &filename)
@@ -57,8 +58,9 @@ void readSimdlib(vpImage<vpRGBa> &I, const std::string &filename)
   size_t stride = 0, width = 0, height = 0;
   SimdPixelFormatType format = SimdPixelFormatRgba32;
   uint8_t* data = SimdImageLoadFromFile(filename.c_str(), &stride, &width, &height, &format);
-  const bool copyData = false;
+  const bool copyData = true;
   I.init((vpRGBa *)data, (unsigned int)height, (unsigned int)width, copyData);
+  SimdFree(data);
 }
 
 void writeJPEGSimdlib(const vpImage<unsigned char> &I, const std::string &filename, int quality)

From 557f1beda01f36ca886ec039d0a1a80a7446ca59 Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Sun, 21 Nov 2021 23:58:41 +0100
Subject: [PATCH 17/18] Fix write with libjpeg. Try to fix ARM build.

---
 3rdparty/simdlib/Simd/SimdStore.h             | 35 +++++++++++++++++++
 .../io/src/image/private/vpImageIoLibjpeg.cpp |  4 +--
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/3rdparty/simdlib/Simd/SimdStore.h b/3rdparty/simdlib/Simd/SimdStore.h
index 2b22a9616d..465c757dc4 100755
--- a/3rdparty/simdlib/Simd/SimdStore.h
+++ b/3rdparty/simdlib/Simd/SimdStore.h
@@ -62,6 +62,14 @@ namespace Simd
         {
             __m128 old = Load<align>(p);
             Store<align>(p, Combine(mask, value, old));
+        } 
+
+        SIMD_INLINE void Store(float* ptr, __m128 val, size_t size)
+        {
+            SIMD_ALIGNED(16) float buf[F];
+            _mm_store_ps(buf, val);
+            for (size_t i = 0; i < size; ++i)
+                ptr[i] = buf[i];
         }
 
         template <bool align> SIMD_INLINE void Store(__m128i * p, __m128i a);
@@ -113,6 +121,14 @@ namespace Simd
             _mm256_store_ps(p, a);
         }
 
+        SIMD_INLINE void Store(float* ptr, __m256 val, size_t size)
+        {
+            SIMD_ALIGNED(32) float buf[F];
+            _mm256_store_ps(buf, val);
+            for (size_t i = 0; i < size; ++i)
+                ptr[i] = buf[i];
+        }
+
         template <bool align> SIMD_INLINE void Store(float * p0, float * p1, __m256 a)
         {
             Sse2::Store<align>(p0, _mm256_extractf128_ps(a, 0));
@@ -144,6 +160,12 @@ namespace Simd
             _mm256_store_si256(p, a);
         }
 
+        template <bool align> SIMD_INLINE void Store(__m128i* p0, __m128i* p1, __m256i a)
+        {
+            Sse2::Store<align>(p0, _mm256_extractf128_si256(a, 0));
+            Sse2::Store<align>(p1, _mm256_extractf128_si256(a, 1));
+        }
+
         template <bool align> SIMD_INLINE void StoreMasked(__m256i * p, __m256i value, __m256i mask)
         {
             __m256i old = Load<align>(p);
@@ -207,6 +229,11 @@ namespace Simd
 #endif
         }
 
+        template <bool align> SIMD_INLINE void Store(int8_t* p, int8x16_t a)
+        {
+            Store<align>((uint8_t*)p, vreinterpretq_u8_s8(a));
+        }
+
         template <bool align> SIMD_INLINE void Store(uint8_t * p, uint8x8_t a);
 
         template <> SIMD_INLINE void Store<false>(uint8_t * p, uint8x8_t a)
@@ -403,6 +430,14 @@ namespace Simd
 #endif
         }
 
+        SIMD_INLINE void Store(float* ptr, float32x4_t val, size_t size)
+        {
+            SIMD_ALIGNED(16) float buf[F];
+            Store<true>(buf, val);
+            for (size_t i = 0; i < size; ++i)
+                ptr[i] = buf[i];
+        }
+
         template <bool align> SIMD_INLINE void Store(float * p, float32x2_t a);
 
         template <> SIMD_INLINE void Store<false>(float * p, float32x2_t a)
diff --git a/modules/io/src/image/private/vpImageIoLibjpeg.cpp b/modules/io/src/image/private/vpImageIoLibjpeg.cpp
index 8f5b021c8c..77777b0814 100644
--- a/modules/io/src/image/private/vpImageIoLibjpeg.cpp
+++ b/modules/io/src/image/private/vpImageIoLibjpeg.cpp
@@ -94,11 +94,11 @@ void writeJPEGLibjpeg(const vpImage<unsigned char> &I, const std::string &filena
 
   jpeg_stdio_dest(&cinfo, file);
 
-  jpeg_set_defaults(&cinfo);
   cinfo.image_width = width;
   cinfo.image_height = height;
   cinfo.input_components = 1;
   cinfo.in_color_space = JCS_GRAYSCALE;
+  jpeg_set_defaults(&cinfo);
   //TODO:
   jpeg_set_quality(&cinfo, quality, TRUE);
 
@@ -154,11 +154,11 @@ void writeJPEGLibjpeg(const vpImage<vpRGBa> &I, const std::string &filename, int
 
   jpeg_stdio_dest(&cinfo, file);
 
-  jpeg_set_defaults(&cinfo);
   cinfo.image_width = width;
   cinfo.image_height = height;
   cinfo.input_components = 3;
   cinfo.in_color_space = JCS_RGB;
+  jpeg_set_defaults(&cinfo);
   //TODO:
   jpeg_set_quality(&cinfo, quality, TRUE);
 

From ca1cca3e4ea431410964cbd62bb878d6ce37f80b Mon Sep 17 00:00:00 2001
From: Souriya Trinh <souriya.trinh+github@gmail.com>
Date: Wed, 24 Nov 2021 10:06:16 +0100
Subject: [PATCH 18/18] This should allow running the benchmarks on Windows and
 Unix.

---
 modules/io/test/perfImageLoadSave.cpp | 49 ++++++++++++++++-----------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/modules/io/test/perfImageLoadSave.cpp b/modules/io/test/perfImageLoadSave.cpp
index 6182df06e4..b4a1bd97dd 100644
--- a/modules/io/test/perfImageLoadSave.cpp
+++ b/modules/io/test/perfImageLoadSave.cpp
@@ -55,7 +55,14 @@ static std::vector<std::string> names {
   "Solvay (640x440)", "Solvay (1024x705)", "Solvay (1280x881)", "Solvay (2126x1463)"
 };
 static std::vector<vpImageIo::vpImageIoBackendType> backends {
-  vpImageIo::IO_LIB_BACKEND, vpImageIo::IO_OPENCV_BACKEND, vpImageIo::IO_SIMDLIB_BACKEND, vpImageIo::IO_STB_IMAGE_BACKEND
+#if defined(VISP_HAVE_JPEG) && defined(VISP_HAVE_PNG)
+  vpImageIo::IO_LIB_BACKEND,
+#endif
+#if defined(VISP_HAVE_OPENCV)
+  vpImageIo::IO_OPENCV_BACKEND,
+#endif
+  vpImageIo::IO_SIMDLIB_BACKEND,
+  vpImageIo::IO_STB_IMAGE_BACKEND
 };
 static std::vector<std::string> backendNamesJpeg {
   "libjpeg", "OpenCV", "simd", "stb"
@@ -129,18 +136,20 @@ TEST_CASE("Benchmark PNG image loading", "[benchmark]") {
   }
 }
 
-#if !defined(_WIN32) && (defined(__unix__) || defined(__unix) || (defined(__APPLE__) && defined(__MACH__))) // UNIX
-// makeTempDirectory is only implemented for Unix platform
-
 std::string username, directory_filename_tmp;
 
+#if !defined(_WIN32) && (defined(__unix__) || defined(__unix) || (defined(__APPLE__) && defined(__MACH__))) // UNIX
+std::string tmp_dir = "/tmp/";
+#else
+std::string tmp_dir = "C:/Temp/";
+#endif
+
 TEST_CASE("Benchmark JPEG image saving", "[benchmark]") {
   vpIoTools::getUserName(username);
-  std::string tmp_dir = "/tmp/" + username;
-  vpIoTools::makeDirectory(tmp_dir);
-  directory_filename_tmp = tmp_dir + "/" + "vpIoTools_perfImageLoadSave_XXXXXX";
-  std::string converted_dirname_tmp = vpIoTools::makeTempDirectory(directory_filename_tmp);
-  REQUIRE(vpIoTools::checkDirectory(converted_dirname_tmp));
+  vpIoTools::makeDirectory(tmp_dir + username);
+  directory_filename_tmp = tmp_dir + username + "/vpIoTools_perfImageLoadSave_" + vpTime::getDateTime("%Y-%m-%d_%H.%M.%S");
+  vpIoTools::makeDirectory(directory_filename_tmp);
+  REQUIRE(vpIoTools::checkDirectory(directory_filename_tmp));
 
   SECTION("Grayscale") {
     for (size_t i = 0; i < paths.size(); i++) {
@@ -150,7 +159,7 @@ TEST_CASE("Benchmark JPEG image saving", "[benchmark]") {
       SECTION(names[i]) {
         for (size_t j = 0; j < backends.size(); j++) {
           BENCHMARK(backendNamesJpeg[j] + " backend") {
-            vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.jpg", backends[j]);
+            vpImageIo::write(I, directory_filename_tmp + "/ViSP_tmp_perf_write.jpg", backends[j]);
             return I;
           };
         }
@@ -166,7 +175,7 @@ TEST_CASE("Benchmark JPEG image saving", "[benchmark]") {
       SECTION(names[i]) {
         for (size_t j = 0; j < backends.size(); j++) {
           BENCHMARK(backendNamesJpeg[j] + " backend") {
-            vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.jpg", backends[j]);
+            vpImageIo::write(I, directory_filename_tmp + "/ViSP_tmp_perf_write.jpg", backends[j]);
             return I;
           };
         }
@@ -174,16 +183,15 @@ TEST_CASE("Benchmark JPEG image saving", "[benchmark]") {
     }
   }
 
-  REQUIRE(vpIoTools::remove(converted_dirname_tmp));
+  REQUIRE(vpIoTools::remove(directory_filename_tmp));
 }
 
 TEST_CASE("Benchmark PNG image saving", "[benchmark]") {
   vpIoTools::getUserName(username);
-  std::string tmp_dir = "/tmp/" + username;
-  vpIoTools::makeDirectory(tmp_dir);
-  directory_filename_tmp = tmp_dir + "/" + "vpIoTools_perfImageLoadSave_XXXXXX";
-  std::string converted_dirname_tmp = vpIoTools::makeTempDirectory(directory_filename_tmp);
-  REQUIRE(vpIoTools::checkDirectory(converted_dirname_tmp));
+  vpIoTools::makeDirectory(tmp_dir + username);
+  directory_filename_tmp = tmp_dir + username + "/vpIoTools_perfImageLoadSave_" + vpTime::getDateTime("%Y-%m-%d_%H.%M.%S");
+  vpIoTools::makeDirectory(directory_filename_tmp);
+  REQUIRE(vpIoTools::checkDirectory(directory_filename_tmp));
 
   SECTION("Grayscale") {
     for (size_t i = 0; i < paths.size(); i++) {
@@ -193,7 +201,7 @@ TEST_CASE("Benchmark PNG image saving", "[benchmark]") {
       SECTION(names[i]) {
         for (size_t j = 0; j < backends.size(); j++) {
           BENCHMARK(backendNamesPng[j] + " backend") {
-            vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.png", backends[j]);
+            vpImageIo::write(I, directory_filename_tmp + "/ViSP_tmp_perf_write.png", backends[j]);
             return I;
           };
         }
@@ -209,15 +217,16 @@ TEST_CASE("Benchmark PNG image saving", "[benchmark]") {
       SECTION(names[i]) {
         for (size_t j = 0; j < backends.size(); j++) {
           BENCHMARK(backendNamesPng[j] + " backend") {
-            vpImageIo::write(I, converted_dirname_tmp + "/ViSP_tmp_perf_write.png", backends[j]);
+            vpImageIo::write(I, directory_filename_tmp + "/ViSP_tmp_perf_write.png", backends[j]);
             return I;
           };
         }
       }
     }
   }
+
+  REQUIRE(vpIoTools::remove(directory_filename_tmp));
 }
-#endif
 
 int main(int argc, char *argv[])
 {