I have two CUDA kernels that compute similar things. One uses global memory (myfun is a device function that reads heavily from global memory and performs calculations). The second kernel first stages that data from global memory into shared memory, so it can be shared among the threads of a block. The kernel that uses global memory is much faster than the shared-memory kernel. What are the possible reasons?
loadArray, used in the second kernel below, just copies a small part of d_x into m (a hypothetical sketch of it follows that kernel).

Using global memory:
__global__ void mykernel(float *d_x, float *d_y, int *d_z, float *d_u, int N, int K, int D)
{
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    int index = 0;
    float max_s = 1e+37F;  // initialized to a huge value, so the loop tracks the *smallest* score
    if (tid < N) {
        for (int i = 0; i < K; i++) {
            // myfun reads the i-th chunk of d_x directly from global memory
            float s = myfun(&d_x[i*D], d_y, tid);
            if (s < max_s) {  // was "s > max_s", which can never fire with this initialization
                max_s = s;
                index = i;
            }
        }
        d_z[tid] = index;
        d_u[tid] = max_s;
    }
}
Using shared memory:
__global__ void mykernel(float *d_x, float *d_y, int *d_z, float *d_u, int N, int K, int D)
{
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    int index = 0;
    float max_s = 1e+37F;  // same minimum-tracking pattern as above
    extern __shared__ float m[];
    // a single thread per block copies the data from global to shared memory
    if (threadIdx.x == 0)
        loadArray(m, d_x);
    __syncthreads();
    if (tid < N) {
        for (int i = 0; i < K; i++) {
            // index the i-th chunk of the shared copy, mirroring &d_x[i*D] above
            // (the D parameter and chunk offset were missing in the posted version)
            float s = myfun(&m[i*D], d_y, tid);
            if (s < max_s) {
                max_s = s;
                index = i;
            }
        }
        d_z[tid] = index;
        d_u[tid] = max_s;
    }
}
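For completeness, here is a minimal sketch of what loadArray might look like, since its implementation isn't shown. It assumes a plain serial copy of a fixed number of floats from d_x into shared memory; ARRAY_LEN is an assumed compile-time constant standing in for the copied size (e.g. K*D):

// Hypothetical sketch of loadArray; the real implementation isn't shown.
#define ARRAY_LEN 1024  // assumption: number of floats copied, e.g. K*D

__device__ void loadArray(float *m, const float *d_x)
{
    // As called above, this runs in a single thread per block,
    // so the global-to-shared copy is fully serial.
    for (int j = 0; j < ARRAY_LEN; j++)
        m[j] = d_x[j];
}

The sketch deliberately preserves the single-thread copy pattern visible at the call site (if (threadIdx.x == 0) loadArray(m, d_x);).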