Added CUDA opOutputToCvMat for images as well
gineshidalgo99 committed May 14, 2019
1 parent 56bc772 commit acc4fb2
Showing 8 changed files with 215 additions and 178 deletions.
199 changes: 113 additions & 86 deletions doc/faq.md

Large diffs are not rendered by default.

16 changes: 9 additions & 7 deletions doc/installation.md
@@ -21,8 +21,8 @@ OpenPose - Installation
7. [3D Reconstruction Module](#3d-reconstruction-module)
8. [Calibration Module](#calibration-module)
9. [Compiling without cuDNN](#compiling-without-cudnn)
10. [Custom Caffe (Ubuntu Only)](#custom-caffe-ubuntu-only)
11. [Custom OpenCV (Ubuntu Only)](#custom-opencv-ubuntu-only)
10. [Custom Caffe](#custom-caffe)
11. [Custom OpenCV](#custom-opencv)
12. [Doxygen Documentation Autogeneration (Ubuntu Only)](#doxygen-documentation-autogeneration-ubuntu-only)
13. [CMake Command Line Configuration (Ubuntu Only)](#cmake-command-line-configuration-ubuntu-only)

@@ -353,18 +353,20 @@ Then, you would have to reduce the `--net_resolution` flag to fit the model into



#### Custom Caffe (Ubuntu Only)
Note that OpenPose uses a [custom fork of Caffe](https://github.com/CMU-Perceptual-Computing-Lab/caffe) (rather than the official Caffe master). Our custom fork is only updated if it works on our machines, but we try to keep it updated with the latest Caffe version. This version works on a newly formatted machine (Ubuntu 16.04 LTS) and in all our machines (CUDA 8 and 10 tested). The default GPU version is the master branch, which it is also compatible with CUDA 10 without changes (official Caffe version might require some changes for it). We also use the OpenCL and CPU tags if their CMake flags are selected.
#### Custom Caffe
OpenPose uses a [custom fork of Caffe](https://github.com/CMU-Perceptual-Computing-Lab/caffe) (rather than the official Caffe master). Our custom fork is only updated once it works on our machines, but we try to keep it in sync with the latest Caffe version. This version works on a newly formatted machine (Ubuntu 16.04 LTS) and on all our machines (CUDA 8 and 10 tested). The default GPU version is the master branch, which is also compatible with CUDA 10 without changes (the official Caffe version might require some changes for it). We also use the OpenCL and CPU tags if their CMake flags are selected. We only modified some Caffe compilation flags and minor details.

We only modified some Caffe compilation flags and minor details. You can use your own Caffe distribution, simply specify the Caffe include path and the library as shown below. You will also need to turn off the `BUILD_CAFFE` variable. Note that cuDNN is required in order to get the maximum possible accuracy in OpenPose.
Alternatively, you can use your own Caffe distribution on Ubuntu/Mac by 1) disabling `BUILD_CAFFE`, 2) setting `Caffe_INCLUDE_DIRS` to `{CAFFE_PATH}/include/caffe`, and 3) setting `Caffe_LIBS` to `{CAFFE_PATH}/build/lib/libcaffe.so`, as shown in the image below. Note that a cuDNN-compatible Caffe version is required in order to get the maximum possible accuracy in OpenPose.
<p align="center">
<img src="media/cmake_installation/im_5.png" width="480">
</p>
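The three settings above can also be passed on the CMake command line instead of through the GUI. A hypothetical configure invocation (the `{CAFFE_PATH}` placeholder stands for your own Caffe checkout and is kept as-is; the exact generator and remaining flags depend on your setup):

```shell
# Sketch only: configure OpenPose against a custom, prebuilt Caffe.
cmake .. \
    -DBUILD_CAFFE=OFF \
    -DCaffe_INCLUDE_DIRS={CAFFE_PATH}/include/caffe \
    -DCaffe_LIBS={CAFFE_PATH}/build/lib/libcaffe.so
```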

For Windows, simply replace the OpenCV DLLs and include folder for your custom one.


#### Custom OpenCV (Ubuntu Only)
If you have built OpenCV from source and OpenPose cannot find it automatically, you can set the `OPENCV_DIR` variable to the directory where you build OpenCV.

#### Custom OpenCV
If you have built OpenCV from source and OpenPose cannot find it automatically, you can set the `OPENCV_DIR` variable to the directory where you built OpenCV (Ubuntu and Mac). For Windows, simply replace the OpenCV DLLs and include folder with your custom ones.
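As with the Caffe variables, `OPENCV_DIR` can be supplied at configure time; a minimal sketch, assuming a hypothetical build directory path:

```shell
# Sketch only: point OpenPose's CMake configuration at a from-source OpenCV build.
cmake .. -DOPENCV_DIR=/home/user/opencv/build
```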



3 changes: 3 additions & 0 deletions src/openpose/core/cvMatToOpInput.cpp
@@ -87,6 +87,9 @@ namespace op
// CUDA version (if #Gpus > n)
else
{
// Note: This version reduces the global accuracy by about 0.1%, so it is disabled for now
error("This version reduces the global accuracy by about 0.1%, so it is disabled for now.",
__LINE__, __FUNCTION__, __FILE__);
#ifdef USE_CUDA
// (Re)Allocate temporary memory
const unsigned int inputImageSize = 3 * cvInputData.rows * cvInputData.cols;
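The "(Re)Allocate temporary memory" step above follows a grow-only buffer pattern: the device buffer is enlarged only when the incoming image is larger than anything seen so far, so steady-state frames of the same size trigger no allocation at all. A minimal CPU-side sketch of the idea (the struct and member names are hypothetical, and `new[]`/`delete[]` stand in for `cudaMalloc`/`cudaFree`):

```cpp
#include <cstddef>

// Grow-only scratch buffer: reallocates only when capacity is exceeded.
struct ScratchBuffer
{
    float* data = nullptr;
    std::size_t capacity = 0;
    std::size_t reallocations = 0;

    // Ensure room for `size` elements; keep the old allocation if it is big enough.
    void reserve(const std::size_t size)
    {
        if (capacity < size)
        {
            delete[] data;            // cudaFree in the GPU version
            data = new float[size];   // cudaMalloc in the GPU version
            capacity = size;
            ++reallocations;
        }
    }

    ~ScratchBuffer() { delete[] data; }
};
```

Avoiding per-frame reallocation matters most when the buffer lives on the GPU, where `cudaMalloc`/`cudaFree` calls are comparatively expensive and can serialize the stream.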
3 changes: 2 additions & 1 deletion src/openpose/core/cvMatToOpOutput.cpp
@@ -90,7 +90,8 @@ namespace op
else
{
#ifdef USE_CUDA
// Input image can be shared between this one and cvMatToOpInput.hpp
// Input image can be shared between this one and cvMatToOpInput.hpp
// However, that version reduces the global accuracy a bit
// (Free and re-)Allocate temporary memory
const unsigned int inputImageSize = 3 * cvInputData.rows * cvInputData.cols;
if (pInputMaxSize < inputImageSize)
2 changes: 1 addition & 1 deletion src/openpose/core/opOutputToCvMat.cpp
@@ -96,7 +96,7 @@
cvMat = cv::Mat(outputData.getSize(0), outputData.getSize(1), CV_8UC3);
// CUDA --> CPU: Copy output image back to CPU
cudaMemcpy(
cvMat.data, pOutputImageUCharCuda, sizeof(unsigned char) * mOutputMaxSizeUChar,
cvMat.data, pOutputImageUCharCuda, sizeof(unsigned char) * volume,
cudaMemcpyDeviceToHost);
// Indicate memory was copied out
*spGpuMemoryAllocated = false;
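The one-word change above (`mOutputMaxSizeUChar` to `volume`) matters: the destination `cv::Mat` is sized for the current frame, while the max-size member tracks the largest frame the grow-only device buffer has ever held, so copying the maximum size would overrun a smaller `cv::Mat`. A hypothetical helper makes the rule explicit (the name and signature are illustrative, not from OpenPose):

```cpp
#include <cstddef>
#include <stdexcept>

// Bytes to copy from a grow-only device buffer into a per-frame host image:
// always the current frame's volume, never the buffer's high-water mark.
std::size_t bytesToCopy(const std::size_t currentVolume, const std::size_t bufferCapacity)
{
    if (currentVolume > bufferCapacity)
        throw std::runtime_error("Device buffer smaller than current frame.");
    return currentVolume * sizeof(unsigned char);
}
```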
15 changes: 10 additions & 5 deletions src/openpose/gpu/cuda.cpp
@@ -8,11 +8,16 @@
namespace op
{
#ifdef USE_CUDA
const dim3 THREADS_PER_BLOCK_TINY{32, 32, 1};
const dim3 THREADS_PER_BLOCK_SMALL{64, 64, 1};
const dim3 THREADS_PER_BLOCK_MEDIUM{128, 128, 1};
const dim3 THREADS_PER_BLOCK_BIG{256, 256, 1};
const dim3 THREADS_PER_BLOCK_HUGE{512, 512, 1};
#ifdef NDEBUG
#define base 32
#else
#define base 64
#endif
const dim3 THREADS_PER_BLOCK_TINY{base, base, 1}; // 32 |64
const dim3 THREADS_PER_BLOCK_SMALL{2*base, 2*base, 1}; // 64 |128
const dim3 THREADS_PER_BLOCK_MEDIUM{4*base, 4*base, 1}; // 128|256
const dim3 THREADS_PER_BLOCK_BIG{8*base, 8*base, 1}; // 256|512
const dim3 THREADS_PER_BLOCK_HUGE{16*base, 16*base, 1}; // 512|1024
#endif

void cudaCheck(const int line, const std::string& function, const std::string& file)
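These `THREADS_PER_BLOCK_*` constants feed the usual CUDA grid-size computation: the number of blocks per dimension is the ceiling of the problem size divided by the block size, so the grid covers every element with at most one partial block. A plain C++ sketch of that rounding (the function name is illustrative; OpenPose's own helper may differ):

```cpp
// Ceiling division used when sizing a CUDA grid: enough blocks of
// `threadsPerBlock` threads to cover `totalRequired` elements.
unsigned int numberCudaBlocks(const unsigned int totalRequired, const unsigned int threadsPerBlock)
{
    return (totalRequired + threadsPerBlock - 1u) / threadsPerBlock;
}
```

Out-of-range threads in the final partial block are masked off inside the kernels by bounds checks such as `if (x < widthTarget && y < heightTarget)`, visible in the resize kernels below.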
148 changes: 74 additions & 74 deletions src/openpose/net/resizeAndMergeBase.cu
@@ -151,48 +151,6 @@ namespace op
}
}

template <typename T>
__global__ void resizeAndAddKernel(
T* targetPtr, const T* const sourcePtr, const T scaleWidth, const T scaleHeight, const int widthSource,
const int heightSource, const int widthTarget, const int heightTarget)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z;
if (x < widthTarget && y < heightTarget)
{
const auto sourceArea = widthSource * heightSource;
const auto targetArea = widthTarget * heightTarget;
const T xSource = (x + T(0.5f)) * widthSource / T(widthTarget) - T(0.5f);
const T ySource = (y + T(0.5f)) * heightSource / T(heightTarget) - T(0.5f);
const T* const sourcePtrChannel = sourcePtr + channel * sourceArea;
targetPtr[channel * targetArea + y*widthTarget+x] += bicubicInterpolate(
sourcePtrChannel, xSource, ySource, widthSource, heightSource, widthSource);
}
}

template <typename T>
__global__ void resizeAndAverageKernel(
T* targetPtr, const T* const sourcePtr, const T scaleWidth, const T scaleHeight, const int widthSource,
const int heightSource, const int widthTarget, const int heightTarget, const int counter)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z;
if (x < widthTarget && y < heightTarget)
{
const auto sourceArea = widthSource * heightSource;
const auto targetArea = widthTarget * heightTarget;
const T xSource = (x + T(0.5f)) / scaleWidth - T(0.5f);
const T ySource = (y + T(0.5f)) / scaleHeight - T(0.5f);
const T* const sourcePtrChannel = sourcePtr + channel * sourceArea;
const auto interpolated = bicubicInterpolate(
sourcePtrChannel, xSource, ySource, widthSource, heightSource, widthSource);
auto& targetPixel = targetPtr[channel * targetArea + y*widthTarget+x];
targetPixel = (targetPixel + interpolated) / T(counter);
}
}

template <typename T>
__global__ void resizeAndAddAndAverageKernel(
T* targetPtr, const int counter, const T* const scaleWidths, const T* const scaleHeights,
@@ -227,39 +185,81 @@
}
}

template <typename T>
__global__ void resizeAndAddKernelOld(
T* targetPtr, const T* const sourcePtr, const T scaleWidth, const T scaleHeight, const int widthSource,
const int heightSource, const int widthTarget, const int heightTarget)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x < widthTarget && y < heightTarget)
{
const T xSource = (x + T(0.5f)) / scaleWidth - T(0.5f);
const T ySource = (y + T(0.5f)) / scaleHeight - T(0.5f);
targetPtr[y*widthTarget+x] += bicubicInterpolate(
sourcePtr, xSource, ySource, widthSource, heightSource, widthSource);
}
}
// template <typename T>
// __global__ void resizeAndAddKernel(
// T* targetPtr, const T* const sourcePtr, const T scaleWidth, const T scaleHeight, const int widthSource,
// const int heightSource, const int widthTarget, const int heightTarget)
// {
// const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
// const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
// const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z;
// if (x < widthTarget && y < heightTarget)
// {
// const auto sourceArea = widthSource * heightSource;
// const auto targetArea = widthTarget * heightTarget;
// const T xSource = (x + T(0.5f)) * widthSource / T(widthTarget) - T(0.5f);
// const T ySource = (y + T(0.5f)) * heightSource / T(heightTarget) - T(0.5f);
// const T* const sourcePtrChannel = sourcePtr + channel * sourceArea;
// targetPtr[channel * targetArea + y*widthTarget+x] += bicubicInterpolate(
// sourcePtrChannel, xSource, ySource, widthSource, heightSource, widthSource);
// }
// }

template <typename T>
__global__ void resizeAndAverageKernelOld(
T* targetPtr, const T* const sourcePtr, const T scaleWidth, const T scaleHeight, const int widthSource,
const int heightSource, const int widthTarget, const int heightTarget, const int counter)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x < widthTarget && y < heightTarget)
{
const T xSource = (x + T(0.5f)) / scaleWidth - T(0.5f);
const T ySource = (y + T(0.5f)) / scaleHeight - T(0.5f);
const auto interpolated = bicubicInterpolate(
sourcePtr, xSource, ySource, widthSource, heightSource, widthSource);
auto& targetPixel = targetPtr[y*widthTarget+x];
targetPixel = (targetPixel + interpolated) / T(counter);
}
}
// template <typename T>
// __global__ void resizeAndAverageKernel(
// T* targetPtr, const T* const sourcePtr, const T scaleWidth, const T scaleHeight, const int widthSource,
// const int heightSource, const int widthTarget, const int heightTarget, const int counter)
// {
// const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
// const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
// const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z;
// if (x < widthTarget && y < heightTarget)
// {
// const auto sourceArea = widthSource * heightSource;
// const auto targetArea = widthTarget * heightTarget;
// const T xSource = (x + T(0.5f)) / scaleWidth - T(0.5f);
// const T ySource = (y + T(0.5f)) / scaleHeight - T(0.5f);
// const T* const sourcePtrChannel = sourcePtr + channel * sourceArea;
// const auto interpolated = bicubicInterpolate(
// sourcePtrChannel, xSource, ySource, widthSource, heightSource, widthSource);
// auto& targetPixel = targetPtr[channel * targetArea + y*widthTarget+x];
// targetPixel = (targetPixel + interpolated) / T(counter);
// }
// }

// template <typename T>
// __global__ void resizeAndAddKernelOld(
// T* targetPtr, const T* const sourcePtr, const T scaleWidth, const T scaleHeight, const int widthSource,
// const int heightSource, const int widthTarget, const int heightTarget)
// {
// const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
// const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
// if (x < widthTarget && y < heightTarget)
// {
// const T xSource = (x + T(0.5f)) / scaleWidth - T(0.5f);
// const T ySource = (y + T(0.5f)) / scaleHeight - T(0.5f);
// targetPtr[y*widthTarget+x] += bicubicInterpolate(
// sourcePtr, xSource, ySource, widthSource, heightSource, widthSource);
// }
// }

// template <typename T>
// __global__ void resizeAndAverageKernelOld(
// T* targetPtr, const T* const sourcePtr, const T scaleWidth, const T scaleHeight, const int widthSource,
// const int heightSource, const int widthTarget, const int heightTarget, const int counter)
// {
// const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
// const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
// if (x < widthTarget && y < heightTarget)
// {
// const T xSource = (x + T(0.5f)) / scaleWidth - T(0.5f);
// const T ySource = (y + T(0.5f)) / scaleHeight - T(0.5f);
// const auto interpolated = bicubicInterpolate(
// sourcePtr, xSource, ySource, widthSource, heightSource, widthSource);
// auto& targetPixel = targetPtr[y*widthTarget+x];
// targetPixel = (targetPixel + interpolated) / T(counter);
// }
// }

template <typename T>
void resizeAndMergeGpu(
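All the resize kernels in this file map a target pixel back to source coordinates with the half-pixel-center convention, `xSource = (x + 0.5) * widthSource / widthTarget - 0.5` (equivalently `(x + 0.5) / scale - 0.5`), which keeps pixel centers aligned across scales before the bicubic interpolation. A scalar sketch of that mapping, outside any kernel (the function name is illustrative):

```cpp
// Half-pixel-center mapping from a target pixel index to fractional source
// coordinates, as used by the bicubic resize kernels: pixel centers stay
// aligned regardless of the scale factor, and the result may fall slightly
// outside [0, widthSource - 1] at the borders.
double mapToSource(const int xTarget, const int widthSource, const int widthTarget)
{
    return (xTarget + 0.5) * widthSource / widthTarget - 0.5;
}
```

For an identity resize the mapping is exact (`mapToSource(3, 4, 4)` is `3.0`), while upscaling pushes the first target pixel slightly before the first source pixel, which is why the interpolator must clamp its sample indices.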
7 changes: 3 additions & 4 deletions src/openpose/pose/poseGpuRenderer.cpp
@@ -103,10 +103,9 @@ namespace op
scaleKeypoints(poseKeypointsRescaled, scaleInputToOutput);
// Render keypoints
if (!poseKeypoints.empty())
cudaMemcpy(pGpuPose,
poseKeypointsRescaled.getConstPtr(),
numberPeople * numberBodyParts * 3 * sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpy(
pGpuPose, poseKeypointsRescaled.getConstPtr(),
numberPeople * numberBodyParts * 3 * sizeof(float), cudaMemcpyHostToDevice);
renderPoseKeypointsGpu(
*spGpuMemory, pMaxPtr, pMinPtr, pScalePtr, mPoseModel, numberPeople, frameSize, pGpuPose,
mRenderThreshold, mShowGooglyEyes, mBlendOriginalFrame, getAlphaKeypoint());
