talonmies has already answered your question, so I would just like to share some code inspired by the first part of V. Volkov's presentation mentioned in that answer.
This is the code:
```cpp
#include <stdio.h>

#define N_ITERATIONS 8192

//#define DEBUG

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/********************************************************/
/* KERNEL0 - NO INSTRUCTION LEVEL PARALLELISM (ILP = 0) */
/********************************************************/
__global__ void kernel0(int *d_a, int *d_b, int *d_c, unsigned int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {

        int a = d_a[tid];
        int b = d_b[tid];
        int c = d_c[tid];

        for (unsigned int i = 0; i < N_ITERATIONS; i++) {
            a = a * b + c;
        }

        d_a[tid] = a;
    }
}

/*****************************************************/
/* KERNEL1 - INSTRUCTION LEVEL PARALLELISM (ILP = 2) */
/*****************************************************/
__global__ void kernel1(int *d_a, int *d_b, int *d_c, unsigned int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N/2) {

        int a1 = d_a[tid];
        int b1 = d_b[tid];
        int c1 = d_c[tid];

        int a2 = d_a[tid+N/2];
        int b2 = d_b[tid+N/2];
        int c2 = d_c[tid+N/2];

        for (unsigned int i = 0; i < N_ITERATIONS; i++) {
            a1 = a1 * b1 + c1;
            a2 = a2 * b2 + c2;
        }

        d_a[tid]     = a1;
        d_a[tid+N/2] = a2;
    }
}

/*****************************************************/
/* KERNEL2 - INSTRUCTION LEVEL PARALLELISM (ILP = 4) */
/*****************************************************/
__global__ void kernel2(int *d_a, int *d_b, int *d_c, unsigned int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N/4) {

        int a1 = d_a[tid];
        int b1 = d_b[tid];
        int c1 = d_c[tid];

        int a2 = d_a[tid+N/4];
        int b2 = d_b[tid+N/4];
        int c2 = d_c[tid+N/4];

        int a3 = d_a[tid+N/2];
        int b3 = d_b[tid+N/2];
        int c3 = d_c[tid+N/2];

        int a4 = d_a[tid+3*N/4];
        int b4 = d_b[tid+3*N/4];
        int c4 = d_c[tid+3*N/4];

        for (unsigned int i = 0; i < N_ITERATIONS; i++) {
            a1 = a1 * b1 + c1;
            a2 = a2 * b2 + c2;
            a3 = a3 * b3 + c3;
            a4 = a4 * b4 + c4;
        }

        d_a[tid]       = a1;
        d_a[tid+N/4]   = a2;
        d_a[tid+N/2]   = a3;
        d_a[tid+3*N/4] = a4;
    }
}

/********/
/* MAIN */
/********/
int main() {

    const int N = 1024;

    int *h_a               = (int*)malloc(N*sizeof(int));
    int *h_a_result_host   = (int*)malloc(N*sizeof(int));
    int *h_a_result_device = (int*)malloc(N*sizeof(int));
    int *h_b               = (int*)malloc(N*sizeof(int));
    int *h_c               = (int*)malloc(N*sizeof(int));

    // --- Compute the reference result on the host
    for (int i = 0; i < N; i++) {
        h_a[i] = 2;
        h_b[i] = 1;
        h_c[i] = 2;
        h_a_result_host[i] = h_a[i];
        for (unsigned int k = 0; k < N_ITERATIONS; k++) {
            h_a_result_host[i] = h_a_result_host[i] * h_b[i] + h_c[i];
        }
    }

    int *d_a; gpuErrchk(cudaMalloc((void**)&d_a, N*sizeof(int)));
    int *d_b; gpuErrchk(cudaMalloc((void**)&d_b, N*sizeof(int)));
    int *d_c; gpuErrchk(cudaMalloc((void**)&d_c, N*sizeof(int)));

    gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_c, h_c, N*sizeof(int), cudaMemcpyHostToDevice));

    // --- Creating events for timing
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    /***********/
    /* KERNEL0 */
    /***********/
    cudaEventRecord(start, 0);
    kernel0<<<1, N>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
    gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
    for (int i = 0; i < N; i++)
        if (h_a_result_device[i] != h_a_result_host[i]) {
            printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]);
            return 1;
        }

    /***********/
    /* KERNEL1 */
    /***********/
    gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
    cudaEventRecord(start, 0);
    kernel1<<<1, N/2>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
    gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
    for (int i = 0; i < N; i++)
        if (h_a_result_device[i] != h_a_result_host[i]) {
            printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]);
            return 1;
        }

    /***********/
    /* KERNEL2 */
    /***********/
    gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
    cudaEventRecord(start, 0);
    kernel2<<<1, N/4>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
    gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
    for (int i = 0; i < N; i++)
        if (h_a_result_device[i] != h_a_result_host[i]) {
            printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]);
            return 1;
        }

    cudaDeviceReset();

    return 0;
}
```
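As an aside, `kernel1` and `kernel2` differ only in how many independent multiply-add chains each thread carries, so the same idea could be written once with a compile-time ILP factor. The sketch below is my own generalization under that assumption, not code from Volkov's slides; `kernelILP<2>` corresponds to `kernel1` above and `kernelILP<4>` to `kernel2`:

```cpp
// Sketch: one kernel parameterized by the ILP factor.
template <int ILP>
__global__ void kernelILP(int *d_a, int *d_b, int *d_c, unsigned int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N / ILP) {

        int a[ILP], b[ILP], c[ILP];

        // Each thread loads ILP independent work items.
        #pragma unroll
        for (int j = 0; j < ILP; j++) {
            a[j] = d_a[tid + j * N / ILP];
            b[j] = d_b[tid + j * N / ILP];
            c[j] = d_c[tid + j * N / ILP];
        }

        for (unsigned int i = 0; i < N_ITERATIONS; i++) {
            // The ILP multiply-adds are independent and can be issued back-to-back.
            #pragma unroll
            for (int j = 0; j < ILP; j++) a[j] = a[j] * b[j] + c[j];
        }

        #pragma unroll
        for (int j = 0; j < ILP; j++) d_a[tid + j * N / ILP] = a[j];
    }
}

// Launch, e.g.:  kernelILP<4><<<1, N / 4>>>(d_a, d_b, d_c, N);
```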
On my GeForce GT540M, the results are:
kernel0   GFlops = 21.069281   Occupancy = 66%
kernel1   GFlops = 21.183354   Occupancy = 33%
kernel2   GFlops = 21.224517   Occupancy = 16.7%
This means that kernels running at lower occupancy can still deliver high performance, provided Instruction Level Parallelism (ILP) is exploited.
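For completeness: I read the occupancy figures from the profiler, but they are consistent with the fact that each kernel launches a single block on a Fermi-class GPU like the GT540M, which allows at most 1536 resident threads per multiprocessor (1024/1536 ≈ 66.7%, 512/1536 ≈ 33.3%, 256/1536 ≈ 16.7%). A minimal sketch to reproduce them programmatically; the helper `printSingleBlockOccupancy` is my own, only the device-property query is the CUDA runtime API:

```cpp
// Sketch: occupancy of a single-block launch = blockSize / maxThreadsPerMultiProcessor.
// This matches the figures above only because each kernel here runs exactly one block.
void printSingleBlockOccupancy(const char *name, int blockSize) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("%s occupancy = %.1f%%\n", name,
           100.f * blockSize / prop.maxThreadsPerMultiProcessor);
}

// Usage, e.g. at the end of main():
//   printSingleBlockOccupancy("kernel0", N);      // 1024/1536 -> 66.7%
//   printSingleBlockOccupancy("kernel1", N / 2);  //  512/1536 -> 33.3%
//   printSingleBlockOccupancy("kernel2", N / 4);  //  256/1536 -> 16.7%
```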