I went through this site. From it I understood that pinned host memory allocated with cudaMallocHost is supposed to give better transfer performance than pageable memory allocated with malloc. So I wrote two simple programs and compared their runtimes.
Using cudaMallocHost:
#include <stdio.h>
#include <time.h>
#include <cuda.h>

__global__ void square_array(float *a, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) a[idx] = a[idx] * a[idx];
}

int main(void)
{
    clock_t start = clock();
    clock_t finish;
    float *a_h, *a_d;
    const int N = 100000;
    size_t size = N * sizeof(float);

    cudaMallocHost((void **) &a_h, size);   // pinned host memory
    cudaMalloc((void **) &a_d, size);       // device memory
    for (int i = 0; i < N; i++) a_h[i] = (float)i;

    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
    int block_size = 4;
    int n_blocks = N / block_size + (N % block_size == 0 ? 0 : 1);
    square_array <<< n_blocks, block_size >>> (a_d, N);
    cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost);

    for (int i = 0; i < N; i++) printf("%d %f\n", i, a_h[i]);

    cudaFreeHost(a_h);
    cudaFree(a_d);

    finish = clock() - start;
    double interval = finish / (double)CLOCKS_PER_SEC;
    printf("%f seconds elapsed\n", interval);
    return 0;
}
Using malloc:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>

__global__ void square_array(float *a, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) a[idx] = a[idx] * a[idx];
}

int main(void)
{
    clock_t start = clock();
    clock_t finish;
    float *a_h, *a_d;
    const int N = 100000;
    size_t size = N * sizeof(float);

    a_h = (float *)malloc(size);            // pageable host memory
    cudaMalloc((void **) &a_d, size);       // device memory
    for (int i = 0; i < N; i++) a_h[i] = (float)i;

    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
    int block_size = 4;
    int n_blocks = N / block_size + (N % block_size == 0 ? 0 : 1);
    square_array <<< n_blocks, block_size >>> (a_d, N);
    cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost);

    for (int i = 0; i < N; i++) printf("%d %f\n", i, a_h[i]);

    free(a_h);
    cudaFree(a_d);

    finish = clock() - start;
    double interval = finish / (double)CLOCKS_PER_SEC;
    printf("%f seconds elapsed\n", interval);
    return 0;
}
When I run both programs, the measured execution time is almost the same. Is there something wrong with my implementation? What exactly is the performance difference between allocating the host buffer with malloc and with cudaMallocHost? Also, the execution time decreases a little on each subsequent run.
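My guess is that my clock() measurement covers the whole program, including all the printf output of the results, so any difference in copy speed might be getting lost. Here is a rough sketch of how I think just the host-to-device copy could be timed with CUDA events; the event-based timing is my own assumption of how to isolate the transfer, not something taken from the site. Swapping cudaMallocHost for malloc in this sketch should show whether pinned memory really copies faster.

#include <stdio.h>
#include <cuda.h>

int main(void)
{
    const int N = 100000;
    size_t size = N * sizeof(float);
    float *a_h, *a_d;

    // pinned host allocation; replace with malloc() to compare pageable memory
    cudaMallocHost((void **) &a_h, size);
    cudaMalloc((void **) &a_d, size);
    for (int i = 0; i < N; i++) a_h[i] = (float)i;

    // time only the host-to-device copy with CUDA events
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("H2D copy took %f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFreeHost(a_h);
    cudaFree(a_d);
    return 0;
}

Is this the right way to see the difference, or is there something else I should be measuring?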