My application uses the gpu class of Opencv gpu gpu::FarnebackOpticalFlowto calculate the optical flow between a pair of consecutive frames of the input video. To speed up the process, I used TBC OpenCV support to run the method in multithreading. However, multi-threaded performance does not behave as single-threaded. Just to give you an idea of the different behaviors, here are two snapshots of a single-threaded and multi-threaded implementation, respectively.

A multi-threaded implementation involves dividing the image into 8 different bands (the number of cores on my computer), and the gpu method for implementing optical flow in Farneback is applied to each of them. Here are the relevant lines of code for both methods:
Single threaded implementation
...
GpuMat gpuImg8U(img);
GpuMat gpuPrevImg8U(prevImg);
GpuMat u_flow, v_flow;
gpu::FarnebackOpticalFlow farneback_flow;
farneback_flow.numLevels = maxLayer;
farneback_flow.pyrScale = 0.5;
farneback_flow.winSize = windows_size;
farneback_flow.numIters = of_iterations;
farneback_flow(gpuPrevImg8U,gpuImg8U,u_flow,v_flow);
getFlowField(Mat(u_flow),Mat(v_flow),optical_flow);
...
}
void getFlowField(const Mat& u, const Mat& v, Mat& flowField){
for (int i = 0; i < flowField.rows; ++i){
const float* ptr_u = u.ptr<float>(i);
const float* ptr_v = v.ptr<float>(i);
Point2f* row = flowField.ptr<Point2f>(i);
for (int j = 0; j < flowField.cols; ++j){
row[j].y = ptr_v[j];
row[j].x = ptr_u[j];
}
}
}
class ParallelOpticalFlow : public cv::ParallelLoopBody {
private:
int coreNum;
cv::gpu::GpuMat img, img2;
cv::gpu::FarnebackOpticalFlow& farneback_flow;
const cv::gpu::GpuMat u_flow, v_flow;
cv::Mat& optical_flow;
public:
ParallelOpticalFlow(int cores, cv::gpu::FarnebackOpticalFlow& flowHandler, cv::gpu::GpuMat img_, cv::gpu::GpuMat img2_, const cv::gpu::GpuMat u, const cv::gpu::GpuMat v, cv::Mat& of)
: coreNum(cores), farneback_flow(flowHandler), img(img_), img2(img2_), u_flow(u), v_flow(v), optical_flow(of){}
virtual void operator()(const cv::Range& range) const;
};
void ParallelOpticalFlow::operator()(const cv::Range& range) const {
for (int k = range.start ; k < range.end ; k ++){
cv::gpu::GpuMat img_rect(img,cv::Rect(0,img.rows/coreNum*k,img.cols,img.rows/coreNum));
cv::gpu::GpuMat img2_rect(img2,cv::Rect(0,img2.rows/coreNum*k,img2.cols,img2.rows/coreNum));
cv::gpu::GpuMat u_rect(u_flow,cv::Rect(0,u_flow.rows/coreNum*k,u_flow.cols,u_flow.rows/coreNum));
cv::gpu::GpuMat v_rect(v_flow,cv::Rect(0,v_flow.rows/coreNum*k,v_flow.cols,v_flow.rows/coreNum));
cv::Mat of_rect(optical_flow,cv::Rect(0,optical_flow.rows/coreNum*k,optical_flow.cols,optical_flow.rows/coreNum));
farneback_flow(img_rect,img2_rect,u_rect,v_rect);
getFlowField(Mat(u_rect),Mat(v_rect),of_rect);
}
}
parallel_for_(Range(0,cores_num),ParallelOpticalFlow(cores_num,farneback_flow,gpuPrevImg8U,gpuImg8U,u_flow,v_flow,optical_flow));
. - , ? ?