CUDA performance with computer vision algorithm

I am promoting my first step in the world of CUDA C programming!

As a first test, I am writing a simple algorithm for gray-scale conversion and threshold value on images (I am a fan of Computer Vision and OpenCV!). I decided to compare the results of CUDA with a similar algorithm on the processor and with the corresponding OpenCV (cpu) functions. Here is the result in full hd video:

Frame Count: 4754
Frame Resolution: 1920x1080
Total time CPU: 67418.6 ms
Frame Avg CPU:  14.1814 ms

Frame Count: 4754
Frame Resolution: 1920x1080
Total time OpenCV: 23805.3 ms
Frame Avg OpenCV:  5.00742 ms

Frame Count: 4754
Frame Resolution: 1920x1080
==6149== NVPROF is profiling process 6149, command: ./OpenCV_test
Total time CUDA: 28018.2 ms
Frame Avg CUDA:  5.89361 ms

==6149== Profiling application: ./OpenCV_test
==6149== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
55.45%  4.05731s      4754  853.45us  849.54us  1.1141ms doThreshold(unsigned char const *, unsigned char*, unsigned int, unsigned int, unsigned int)
34.03%  2.49028s      4754  523.83us  513.67us  1.3338ms  [CUDA memcpy HtoD]
10.52%  769.46ms      4754  161.85us  161.15us  301.06us  [CUDA memcpy DtoH]

==6149== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 80.11%  8.19501s      9508  861.91us  490.81us  2.7719ms  cudaMemcpy
 12.82%  1.31106s      9508  137.89us  66.639us  218.56ms  cudaMalloc
  5.74%  587.05ms      9508  61.742us  39.566us  2.0234ms  cudaFree
  1.21%  124.16ms      4754  26.116us  16.990us  365.86us  cudaLaunch
  0.06%  5.7645ms     23770     242ns      97ns  106.27us  cudaSetupArgument
  0.05%  5.4291ms      4754  1.1410us     602ns  10.150us  cudaConfigureCall
  0.01%  594.89us        83  7.1670us     249ns  282.44us  cuDeviceGetAttribute
  0.00%  45.536us         1  45.536us  45.536us  45.536us  cuDeviceTotalMem
  0.00%  35.649us         1  35.649us  35.649us  35.649us  cuDeviceGetName
  0.00%  1.8960us         2     948ns     345ns  1.5510us  cuDeviceGetCount
  0.00%     892ns         2     446ns     255ns     637ns  cuDeviceGet

As you can see, OpenCV is much better than my processor implementation, and better than my Cuda algorithm! Where is the trick? My suspect is that OpenCV uses a set of special hardware processor instructions. I expected something more with CUDA: people talk about 20x-30x acceleration in primitive image processing! Did I miss something?

:

  • Intel Core i7 5820k @4ghz
  • GeForce GTX 970
  • Linux Mint 17.2 Mate 64
  • nVidia 352.55
  • Cuda 7.5.18

OpenCV 3.0:

  • Cuda
  • OpenCL
  • TBB ( )
  • Intel IPP

, :

#include <iostream>
#include <numeric>
#include <string>
#include <stdlib.h>
#include <chrono>

#include <opencv2/opencv.hpp>

using namespace cv;
using namespace std;
using namespace std::chrono;

const char* file = "PATH TO A VIDEO FILE";

__global__ void doThreshold(const uchar* bgrInput, uchar* output, uint inputSize, uint soglia, uint maxVal)
{
    uint i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < inputSize)
    {
        output[i] = 0.5f + ((bgrInput[3 * i] + bgrInput[3 * i + 1] + bgrInput[3 * i + 2]) / 3.0f); // gray conversion
        output[i] = output[i] > soglia ? maxVal : 0; // thresholding
    }

}


void cudaCvtThreshold(const Mat& mat, Mat& result, uint soglia, uint maxVal)
{
    if (mat.type() == CV_8UC3)
    {
        uint size = mat.rows * mat.cols;
        uint blockSize = 128; // no significant result varying this variable
        uint gridSize = ceil(size/(float)blockSize);
        uchar* d_bgrInput, *d_output;
        cudaMalloc((void**)&d_bgrInput, mat.channels() * size);
        cudaMalloc((void**)&d_output, size);
        cudaMemcpy(d_bgrInput, mat.data, mat.channels() * size, cudaMemcpyHostToDevice);

        doThreshold<<<gridSize, blockSize>>>(d_bgrInput, d_output, size, soglia, maxVal);

        result = Mat(mat.rows, mat.cols, CV_8UC1);
        cudaMemcpy(result.data, d_output, size, cudaMemcpyDeviceToHost);
        cudaFree(d_bgrInput);
        cudaFree(d_output);
    }
    else
        cerr << "Only CV_8UC3 matrix supported" << endl;

}

void cpuCvtThreshold(const Mat& mat, Mat& result, uint soglia, uint maxVal)
{
    if (mat.type() == CV_8UC3)
    {
        uint size = mat.rows * mat.cols;
        result = Mat(mat.rows, mat.cols, CV_8UC1);
        uchar* input = mat.data;
        uchar* output = result.data;
        for (uint i = 0; i < size; ++i)
        {
            output[i] = 0.5f + ((input[3 * i] + input[3 * i + 1] + input[3 * i + 2]) / 3.0f); // gray conversion
            output[i] = output[i] > soglia ? maxVal : 0; // thresholding
        }
    }
    else
        cerr << "Only CV_8UC3 matrix supported" << endl;
}

void cudaTest(const string src)
{
    VideoCapture cap(src);
    Mat frame, result;
    uint frameCount = cap.get(CAP_PROP_FRAME_COUNT);
    cout << "Frame Count: " << frameCount << endl;
    auto startTs = system_clock::now();
    cap >> frame;
    cout << "Frame Resolution: " << frame.cols << "x" << frame.rows << endl;

    while (not frame.empty()) {
        cudaCvtThreshold(frame, result, 127, 255);
        cap >> frame;
    }

    auto stopTs = system_clock::now();
    auto diff = stopTs - startTs;
    auto elapsed = chrono::duration_cast<chrono::microseconds>(diff).count() / (double)1e3;
    cout << "Total time CUDA: " << elapsed << " ms" << endl;
    cout << "Frame Avg CUDA:  " << elapsed / frameCount << " ms" << endl << endl;
}

void naiveCpu(const string src)
{
    VideoCapture cap(src);
    Mat frame, result;
    uint frameCount = cap.get(CAP_PROP_FRAME_COUNT);
    cout << "Frame Count: " << frameCount << endl;
    auto startTs = system_clock::now();
    cap >> frame;
    cout << "Frame Resolution: " << frame.cols << "x" << frame.rows << endl;

    while (not frame.empty()) {
        cpuCvtThreshold(frame, result, 127, 255);
        cap >> frame;
    }
    auto stopTs = system_clock::now();
    auto diff = stopTs - startTs;
    auto elapsed = chrono::duration_cast<chrono::microseconds>(diff).count() / (double)1e3;
    cout << "Total time CPU: " << elapsed << " ms" << endl;
    cout << "Frame Avg CPU:  " << elapsed / frameCount << " ms" << endl << endl;
}


void opencv(const string src)
{
    VideoCapture cap(src);
    Mat frame, result;
    uint frameCount = cap.get(CAP_PROP_FRAME_COUNT);
    cout << "Frame Count: " << frameCount << endl;
    auto startTs = system_clock::now();
    cap >> frame;
    cout << "Frame Resolution: " << frame.cols << "x" << frame.rows << endl;

    while (not frame.empty()) {
        cv::cvtColor(frame, result, COLOR_BGR2GRAY);
        threshold(result, result, 127, 255, THRESH_BINARY);
        cap >> frame;
    }
    auto stopTs = system_clock::now();
    auto diff = stopTs - startTs;
    auto elapsed = chrono::duration_cast<chrono::microseconds>(diff).count() / (double)1e3;
    cout << "Total time OpenCV: " << elapsed << " ms" << endl;
    cout << "Frame Avg OpenCV:  " << elapsed / frameCount << " ms" << endl << endl;
}

int main(void)
{
    naiveCpu(file);
    opencv(file);
    cudaTest(file);
    return 0;
}

EDIT:

/

__global__ void doThreshold(const uchar* bgrInput, uchar* output, uint inputSize, uint soglia, uint maxVal)
{
    uint i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < inputSize)
    {
        uchar grayPix = 0.5f + ((bgrInput[3 * i] + bgrInput[3 * i + 1] + bgrInput[3 * i + 2]) / 3.0f); // gray conversion
        output[i] = grayPix > soglia ? maxVal : 0; // thresholding
    }

}

void cudaCvtThreshold(const Mat& mat, Mat& result, uint soglia, uint maxVal, uchar* d_bgrInput, uchar* d_output)
{
    uint size = mat.rows * mat.cols;
    uint blockSize = 128; // no significant result varying this variable
    uint gridSize = ceil(size/(float)blockSize);
    doThreshold<<<gridSize, blockSize>>>(d_bgrInput, d_output, size, soglia, maxVal);
}

void cudaTestOutMallocFree(const string src)
{
    VideoCapture cap(src);
    Mat frame;
    uint frameCount = cap.get(CAP_PROP_FRAME_COUNT);
    cout << "Frame Count: " << frameCount << endl;
    auto startTs = system_clock::now();
    cap >> frame;
    cout << "Frame Resolution: " << frame.cols << "x" << frame.rows << endl;
    uint size = frame.rows * frame.cols;

    Mat result(frame.rows, frame.cols, CV_8UC1);
    uchar* d_bgrInput, *d_output;
    cudaMalloc((void**)&d_bgrInput, frame.channels() * size);
    cudaMalloc((void**)&d_output, size);

    while (not frame.empty())
    {
        cudaMemcpy(d_bgrInput, frame.data, frame.channels() * size, cudaMemcpyHostToDevice);
        cudaCvtThreshold(frame, result, 127, 255, d_bgrInput, d_output);
        cudaMemcpy(result.data, d_output, size, cudaMemcpyDeviceToHost);
        cap >> frame;
    }

    cudaFree(d_bgrInput);
    cudaFree(d_output);

    auto stopTs = system_clock::now();
    auto diff = stopTs - startTs;
    auto elapsed = chrono::duration_cast<chrono::microseconds>(diff).count() / (double)1e3;
    cout << "Total time CUDA (out malloc-free): " << elapsed << " ms" << endl;
    cout << "Frame Avg CUDA (out malloc-free):  " << elapsed / frameCount << " ms" << endl << endl;
}

int main(void)
{
    naiveCpu(file);
    opencv(file);
    cudaTest(file);
    cudaTestOutMallocFree(file);
    return 0;
}

:

Frame Count: 4754
Frame Resolution: 1920x1080
Total time CPU: 70972.6 ms
Frame Avg CPU:  14.929 ms

Frame Count: 4754
Frame Resolution: 1920x1080
Total time OpenCV: 23475.4 ms
Frame Avg OpenCV:  4.93804 ms

Frame Count: 4754
Frame Resolution: 1920x1080
==4493== NVPROF is profiling process 4493, command: ./OpenCV_test
Total time CUDA: 27451.3 ms
Frame Avg CUDA:  5.77435 ms

Frame Count: 4754
Frame Resolution: 1920x1080
Total time CUDA (out malloc-free): 26137.3 ms
Frame Avg CUDA (out malloc-free):  5.49796 ms

==4493== Profiling application: ./OpenCV_test
==4493== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 53.74%  7.53280s      9508  792.26us  789.61us  896.17us  doThreshold(unsigned char const *, unsigned char*, unsigned int, unsigned int, unsigned int)
 35.57%  4.98604s      9508  524.40us  513.54us  979.37us  [CUDA memcpy HtoD]
 10.69%  1.49876s      9508  157.63us  157.09us  206.24us  [CUDA memcpy DtoH]

==4493== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 88.22%  15.7392s     19016  827.68us  482.18us  1.7570ms  cudaMemcpy
  7.07%  1.26081s      9510  132.58us  65.458us  198.86ms  cudaMalloc
  3.26%  582.24ms      9510  61.223us  39.675us  304.16us  cudaFree
  1.33%  236.64ms      9508  24.888us  13.497us  277.21us  cudaLaunch
  0.06%  10.667ms     47540     224ns      96ns  347.09us  cudaSetupArgument
  0.06%  9.9587ms      9508  1.0470us     504ns  9.4800us  cudaConfigureCall
  0.00%  428.88us        83  5.1670us     225ns  228.70us  cuDeviceGetAttribute
  0.00%  43.388us         1  43.388us  43.388us  43.388us  cuDeviceTotalMem
  0.00%  34.389us         1  34.389us  34.389us  34.389us  cuDeviceGetName
  0.00%  1.7010us         2     850ns     409ns  1.2920us  cuDeviceGetCount
  0.00%     821ns         2     410ns     225ns     596ns  cuDeviceGet

malloc , ...

EDIT2:

Jez I, ​​Cuda (8 ) :

:

__global__ void doThreshold(const uchar* bgrInput, uchar* output, uint inputSize, uint soglia, uint maxVal, uint pixelPerThread)
{
    uint i = pixelPerThread * (blockIdx.x * blockDim.x + threadIdx.x);

    if (i < inputSize)
    {
        for (uint j = 0; j < pixelPerThread; j++) {
            uchar grayPix = 0.5f + ( (bgrInput[3 * (i + j)] + bgrInput[3 * (i + j) + 1] + bgrInput[3 * (i + j) + 2]) / 3.0f ); // gray conversion
            output[i + j] = grayPix > soglia ? maxVal : 0; // thresholding
        }
    }
}

void cudaCvtThreshold(const Mat& mat, Mat& result, uint soglia, uint maxVal, uchar* d_bgrInput, uchar* d_output)
{
    uint size = mat.rows * mat.cols;
    uint pixelPerThread = 8;
    uint blockSize = 128; // no significant result varying this variable
    uint gridSize = ceil(size/(float)(blockSize * pixelPerThread));
    doThreshold<<<gridSize, blockSize>>>(d_bgrInput, d_output, size, soglia, maxVal, pixelPerThread);
}

:

Frame Count: 4754
Frame Resolution: 1920x1080
Total time OpenCV: 23628.8 ms
Frame Avg OpenCV:  4.97031 ms

Frame Count: 4754
Frame Resolution: 1920x1080
==13441== NVPROF is profiling process 13441, command: ./OpenCV_test
Total time CUDA (out malloc-free): 25655.5 ms
Frame Avg CUDA (out malloc-free):  5.39662 ms

==13441== Profiling application: ./OpenCV_test
==13441== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 49.30%  3.15853s      4754  664.39us  658.24us  779.04us  doThreshold(unsigned char const *, unsigned char*, unsigned int, unsigned int, unsigned int, unsigned int)
 38.69%  2.47838s      4754  521.32us  513.35us  870.69us  [CUDA memcpy HtoD]
 12.01%  769.53ms      4754  161.87us  161.31us  200.58us  [CUDA memcpy DtoH]

==13441== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 95.78%  7.26387s      9508  763.97us  491.11us  1.6589ms  cudaMemcpy
  2.51%  190.70ms         2  95.350ms  82.529us  190.62ms  cudaMalloc
  1.53%  116.31ms      4754  24.465us  16.844us  286.56us  cudaLaunch
  0.09%  6.7052ms     28524     235ns      98ns  233.19us  cudaSetupArgument
  0.08%  5.9538ms      4754  1.2520us     642ns  12.039us  cudaConfigureCall
  0.00%  263.87us        83  3.1790us     225ns  111.03us  cuDeviceGetAttribute
  0.00%  174.45us         2  87.227us  52.521us  121.93us  cudaFree
  0.00%  34.612us         1  34.612us  34.612us  34.612us  cuDeviceTotalMem
  0.00%  29.376us         1  29.376us  29.376us  29.376us  cuDeviceGetName
  0.00%  1.6950us         2     847ns     343ns  1.3520us  cuDeviceGetCount
  0.00%     745ns         2     372ns     217ns     528ns  cuDeviceGet

, 664,39 us 792,26 us !:-) OpenCV ( Intel IPP) !

EDIT3: OpenCV WITHOUT IPP SSE. OpenCV !

Frame Count: 4754
Frame Resolution: 1920x1080
Total time OpenCV: 23541.7 ms
Frame Avg OpenCV:  4.95198 ms
+4
1

.

GPU, GPU . CPU-GPU , , . , , , , , copy- > execute- > copy .

Kernel

, ​​ . () 4 /, ~ 2 () 853us 10 /. GTX 970 224 /. .

, 8- . . (, int4s, ) , , 32 + . , , .

. NPP, , , , , .


, ​​ > 10 . - Amdahl , , , , 2 .

+3

Source: https://habr.com/ru/post/1616348/


All Articles