Why is cublas on GTX Titan slower than single-threaded processor code?

I am testing the NVIDIA cuBLAS library on my GTX Titan with the following code:

#include "cublas.h"
#include <stdlib.h>
#include <conio.h>
#include <Windows.h>
#include <iostream>
#include <iomanip>

/* Vector size */
#define N (1024 * 1024 * 32)

/* Main */
int main(int argc, char** argv)
{
  LARGE_INTEGER frequency;
  LARGE_INTEGER t1, t2;

  float* h_A;
  float* h_B;
  float* d_A = 0;
  float* d_B = 0;

  /* Initialize CUBLAS */
  cublasInit();

  /* Allocate host memory for the vectors */
  h_A = (float*)malloc(N * sizeof(h_A[0]));
  h_B = (float*)malloc(N * sizeof(h_B[0]));

  /* Fill the vectors with test data */
  for (int i = 0; i < N; i++)
  {
    h_A[i] = rand() / (float)RAND_MAX;
    h_B[i] = rand() / (float)RAND_MAX;
  }

  QueryPerformanceFrequency(&frequency);
  QueryPerformanceCounter(&t1);
  /* Allocate device memory for the vectors */
  cublasAlloc(N, sizeof(d_A[0]), (void**)&d_A);
  cublasAlloc(N, sizeof(d_B[0]), (void**)&d_B);

  /* Initialize the device matrices with the host vectors */
  cublasSetVector(N, sizeof(h_A[0]), h_A, 1, d_A, 1);
  cublasSetVector(N, sizeof(h_B[0]), h_B, 1, d_B, 1);

  /* Performs operation using cublas */
  float res = cublasSdot(N, d_A, 1, d_B, 1);  

  /* Memory clean up */
  cublasFree(d_A);
  cublasFree(d_B);

  QueryPerformanceCounter(&t2);
  double elapsedTime = (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart;
  std::cout << "GPU time = " << std::setprecision(16) << elapsedTime << std::endl;
  std::cout << "GPU result = " << res << std::endl;

  QueryPerformanceFrequency(&frequency);
  QueryPerformanceCounter(&t1);
  float sum = 0.;
  for (int i = 0; i < N; i++) {
      sum += h_A[i] * h_B[i];
  }
  QueryPerformanceCounter(&t2);
  elapsedTime = (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart;
  std::cout << "CPU time = " << std::setprecision(16) << elapsedTime << std::endl;
  std::cout << "CPU result = " << sum << std::endl;

  free(h_A);
  free(h_B);

  /* Shutdown */
  cublasShutdown();

  getch();

  return EXIT_SUCCESS;
}

When I run the code, I get the following result:

GPU time = 164.7487009845991
GPU result = 8388851
CPU time = 45.22368030957917
CPU result = 7780599.5

Why is using the cuBLAS library on the GTX Titan about 3 times slower than computing on a single Ivy Bridge core of a 2.4 GHz Xeon? When I increase or decrease the vector size, I get the same outcome: the GPU is slower than the CPU. Switching to double precision does not change this.

Answer:

A dot product is an operation with very low arithmetic intensity: one multiply-add per pair of elements read, so it is limited by memory bandwidth rather than by compute. More importantly, your GPU timing includes the device allocation and the copy of both vectors to the card. Moving 2 × 128 MB over PCIExpress already costs tens of milliseconds (at a typical effective rate of ~6 GB/s, roughly 40 ms) before the GPU has done any arithmetic, while the CPU reads the same data straight out of its own memory. The GPU only pays off when the data already lives on the device, or when the computation does enough work per byte to amortize the PCIExpress transfer.
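
One way to verify this is to time each stage separately instead of wrapping everything in one timer. Below is a minimal sketch of that split, reusing the setup from the question; the ms helper and the three-stage breakdown are my own additions, and the exact numbers will depend on your PCIExpress link and driver:

#include "cublas.h"
#include <stdlib.h>
#include <Windows.h>
#include <iostream>

#define N (1024 * 1024 * 32)

/* Helper (not part of cuBLAS): milliseconds between two counter samples */
static double ms(LARGE_INTEGER a, LARGE_INTEGER b, LARGE_INTEGER f)
{
  return (b.QuadPart - a.QuadPart) * 1000.0 / f.QuadPart;
}

int main()
{
  LARGE_INTEGER f, t0, t1, t2, t3;
  float* h_A = (float*)malloc(N * sizeof(float));
  float* h_B = (float*)malloc(N * sizeof(float));
  for (int i = 0; i < N; i++) { h_A[i] = 1.0f; h_B[i] = 0.5f; }

  /* Create the CUDA context before timing so its cost is excluded */
  cublasInit();
  float* d_A = 0;
  float* d_B = 0;

  QueryPerformanceFrequency(&f);

  /* Stage 1: device allocation */
  QueryPerformanceCounter(&t0);
  cublasAlloc(N, sizeof(float), (void**)&d_A);
  cublasAlloc(N, sizeof(float), (void**)&d_B);
  QueryPerformanceCounter(&t1);

  /* Stage 2: host-to-device copy of 2 x 128 MB over PCIExpress */
  cublasSetVector(N, sizeof(float), h_A, 1, d_A, 1);
  cublasSetVector(N, sizeof(float), h_B, 1, d_B, 1);
  QueryPerformanceCounter(&t2);

  /* Stage 3: the dot product itself (blocks until the result is back) */
  float res = cublasSdot(N, d_A, 1, d_B, 1);
  QueryPerformanceCounter(&t3);

  std::cout << "alloc    = " << ms(t0, t1, f) << " ms\n";
  std::cout << "transfer = " << ms(t1, t2, f) << " ms\n";
  std::cout << "dot      = " << ms(t2, t3, f) << " ms, result = " << res << "\n";

  cublasFree(d_A);
  cublasFree(d_B);
  free(h_A);
  free(h_B);
  cublasShutdown();
  return 0;
}

With a split like this, you should see the allocation and transfer stages account for nearly all of the measured time, while the cublasSdot call itself finishes in a few milliseconds.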
