-
Notifications
You must be signed in to change notification settings - Fork 6
/
real_dnn.cpp
214 lines (185 loc) · 7.42 KB
/
real_dnn.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
// This example is derived from the ssd_mobilenet_object_detection opencv demo
// and adapted to be used with Intel RealSense Cameras
// Please see https://github.com/opencv/opencv/blob/master/LICENSE
#include <opencv2/dnn.hpp>
#include <librealsense2/rs.hpp>
#include "cv-helpers.hpp"
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <sstream>
// #define YOLO
// Fixed network input size expected by the MobileNet-SSD model (300x300).
const size_t inWidth = 300;
const size_t inHeight = 300;
// Width-to-height aspect ratio of the network input; used below to
// center-crop the camera frame so it matches the model's aspect ratio.
const float WHRatio = inWidth / (float)inHeight;
// Input normalization used by blobFromImage: (pixel - meanVal) * inScaleFactor.
// 0.007843 ~= 1/127.5, so [0,255] pixels map roughly to [-1,1].
const float inScaleFactor = 0.007843f;
const float meanVal = 127.5;
// PASCAL VOC class labels in the index order emitted by the Caffe model;
// index 0 is the background class.
const char* classNames[] = {"background",
"aeroplane", "bicycle", "bird", "boat",
"bottle", "bus", "car", "cat", "chair",
"cow", "diningtable", "dog", "horse",
"motorbike", "person", "pottedplant",
"sheep", "sofa", "train", "tvmonitor"};
/// Returns the smallest element value of a matrix.
///
/// @param mat  Input matrix (single-channel expected by cv::minMaxIdx).
/// @return     The minimum value, narrowed to float.
///
/// NOTE: cv::minMaxIdx reports results as double; the narrowing to the
/// float return type is made explicit here (the previous implicit
/// conversion triggered compiler warnings). The old hand-rolled scan
/// (which was dead, commented-out code, and incorrectly initialized
/// min to 0) has been removed in favor of the library call.
float cvMin(cv::Mat mat)
{
    double minVal = 0.0, maxVal = 0.0;
    cv::minMaxIdx(mat, &minVal, &maxVal);
    return static_cast<float>(minVal);
}
/// Returns the largest element value of a matrix.
///
/// @param mat  Input matrix (single-channel expected by cv::minMaxIdx).
/// @return     The maximum value as reported by cv::minMaxIdx.
///
/// The old hand-rolled scan (dead, commented-out code that incorrectly
/// initialized max to 0, which would be wrong for all-negative input)
/// and the stray "return min" comment have been removed; the library
/// call handles all cases.
double cvMax(cv::Mat mat)
{
    double minVal = 0.0, maxVal = 0.0;
    cv::minMaxIdx(mat, &minVal, &maxVal);
    return maxVal;
}
// Entry point: streams aligned color+depth frames from an Intel RealSense
// camera, runs a DNN object detector (MobileNet-SSD by default, YOLO when
// the YOLO macro is defined) on each color frame, and overlays each
// detection's class label plus the mean depth (meters) of its bounding box.
// Uses a function-try-block so RealSense/OpenCV failures are reported by
// the catch clauses at the bottom.
int main(int argc, char** argv) try
{
using namespace cv;
using namespace cv::dnn;
using namespace rs2;
// Model files are loaded from the current working directory.
String caffeCfg = "MobileNetSSD_deploy.prototxt.txt";
String caffeModel = "MobileNetSSD_deploy.caffemodel";
String yoloCfg = "yolov3-tiny.cfg";
String yoloModel = "yolov3-tiny.weights";
#ifndef YOLO
Net net = readNetFromCaffe(caffeCfg, caffeModel);
std::cout<<"Using Caffe model..."<<std::endl;
#else
Net net = readNetFromDarknet(yoloCfg, yoloModel);
std::cout<<"Using YOLO model..."<<std::endl;
#endif
// Start streaming from Intel RealSense Camera
pipeline pipe;
auto config = pipe.start();
auto profile = config.get_stream(RS2_STREAM_COLOR)
.as<video_stream_profile>();
// Align depth frames to the color stream so pixel (x,y) corresponds in both.
rs2::align align_to(RS2_STREAM_COLOR);
// Compute the largest centered crop of the camera frame that matches the
// network's aspect ratio (WHRatio), so resizing to 300x300 does not distort.
Size cropSize;
if (profile.width() / (float)profile.height() > WHRatio)
{
// Frame is wider than the network input: crop width, keep full height.
cropSize = Size(static_cast<int>(profile.height() * WHRatio),
profile.height());
}
else
{
// Frame is taller (or equal): crop height, keep full width.
cropSize = Size(profile.width(),
static_cast<int>(profile.width() / WHRatio));
}
Rect crop(Point((profile.width() - cropSize.width) / 2,
(profile.height() - cropSize.height) / 2),
cropSize);
const auto window_name = "Display Image";
namedWindow(window_name, WINDOW_AUTOSIZE);
// Frame counter and start time for the FPS report printed on exit.
int frame = 0;
auto start = std::chrono::high_resolution_clock::now();
// Loop until the display window is closed by the user.
while (cvGetWindowHandle(window_name))
{
//Increment frame count
frame += 1;
// Wait for the next set of frames
auto data = pipe.wait_for_frames();
// Make sure the frames are spatially aligned
data = align_to.process(data);
auto color_frame = data.get_color_frame();
auto depth_frame = data.get_depth_frame();
// If we only received new depth frame,
// but the color did not update, continue
static int last_frame_number = 0;
if (color_frame.get_frame_number() == last_frame_number) continue;
last_frame_number = color_frame.get_frame_number();
// Convert RealSense frame to OpenCV matrix:
auto color_mat = frame_to_mat(color_frame);
auto depth_mat = depth_frame_to_meters(pipe, depth_frame);
// Normalize and resize the full (uncropped) image into the network input
// blob; 'false' keeps the BGR channel order.
Mat inputBlob = blobFromImage(color_mat, inScaleFactor,
Size(inWidth, inHeight), meanVal, false); //Convert Mat to batch of images
net.setInput(inputBlob, "data"); //set the network input
Mat detection = net.forward("detection_out"); //compute output
// Reinterpret the 4-D output as a 2-D matrix: one row per detection,
// columns are [image_id, class_id, confidence, x1, y1, x2, y2].
Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr<float>());
// Crop both color and depth frames
color_mat = color_mat(crop);
depth_mat = depth_mat(crop);
// Detections below this confidence are discarded.
float confidenceThreshold = 0.8f;
for(int i = 0; i < detectionMat.rows; i++)
{
float confidence = detectionMat.at<float>(i, 2);
if(confidence > confidenceThreshold)
{
size_t objectClass = (size_t)(detectionMat.at<float>(i, 1));
// Box coordinates are normalized [0,1]; scale to cropped-image pixels.
int xLeftBottom = static_cast<int>(detectionMat.at<float>(i, 3) * color_mat.cols);
int yLeftBottom = static_cast<int>(detectionMat.at<float>(i, 4) * color_mat.rows);
int xRightTop = static_cast<int>(detectionMat.at<float>(i, 5) * color_mat.cols);
int yRightTop = static_cast<int>(detectionMat.at<float>(i, 6) * color_mat.rows);
Rect object((int)xLeftBottom, (int)yLeftBottom,
(int)(xRightTop - xLeftBottom),
(int)(yRightTop - yLeftBottom));
// Clip the box to the image bounds before indexing the depth matrix.
object = object & Rect(0, 0, depth_mat.cols, depth_mat.rows);
// Calculate mean depth inside the detection region
// This is a very naive way to estimate objects depth
// but it is intended to demonstrate how one might
// use depth data in general
Scalar m = mean(depth_mat(object));
// float m = cvMin(depth_mat(object));
// double m = cvMax(depth_mat(object));
std::ostringstream ss;
ss << classNames[objectClass] << " ";
// ss << std::setprecision(2) << m[0] << " meters away";
ss << std::setprecision(2) << m[0] << " meters away";
String conf(ss.str());
// Green box around the detection.
rectangle(color_mat, object, Scalar(0, 255, 0));
int baseLine = 0;
Size labelSize = getTextSize(ss.str(), FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
// Center the label horizontally inside the detection box.
auto center = (object.br() + object.tl())*0.5;
center.x = center.x - labelSize.width / 2;
// White filled background so the black label text stays readable.
rectangle(color_mat, Rect(Point(center.x, center.y - labelSize.height),
Size(labelSize.width, labelSize.height + baseLine)),
Scalar(255, 255, 255), CV_FILLED);
putText(color_mat, ss.str(), center,
FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0,0,0));
}
}
imshow(window_name, color_mat);
// Any key press: report average FPS since startup and exit the loop.
if (waitKey(1) >= 0)
{
auto finish = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = finish - start;
std::cout<<"FPS "<<frame / elapsed.count() <<std::endl;
break;
}
}
return EXIT_SUCCESS;
}
// RealSense-specific failures carry the failing API call and its arguments.
catch (const rs2::error & e)
{
std::cerr << "RealSense error calling " << e.get_failed_function() << "(" << e.get_failed_args() << "):\n " << e.what() << std::endl;
return EXIT_FAILURE;
}
// Anything else (e.g. OpenCV exceptions, missing model files).
catch (const std::exception& e)
{
std::cerr << e.what() << std::endl;
return EXIT_FAILURE;
}