The problem with large array sizes in CUDA

I met CUDA by writing a point product calculator. I wanted to test it with large array sizes in order to do a time study, to test two different ways of collecting vector sums. However, when the size of the array exceeds 1024, I get errors. I am not sure where this problem comes from. The card is a GTX460M with 1.5 GB of RAM. I use a map to display (this is a laptop). In addition, I do not know where this problem came from.

Here is the nvcc compilation line:

nvcc D:\Research\CUDA\TestCode\test_dotProduct_1.cu --use_fast_math --gpu-architecture sm_13 --compiler-bindir="D:\Programming\VisualStudio\2010express\VC\bin" --machine 32 -o multi_dot.exe 

I am also having problems compiling in the 64-bit version, but this is another problem.

Here is the result for an array of size 1024:
CALCULATION HOST: 357389824.000000
DEV PARA CALCULATION: 357389824.000000
DEV SERI CALCULATION: 357389824.000000

Here is the result for an array of size 2048:
CALCULATION HOST: 2861214720.000000
DEV PARA CALCULATION: -1. # INF00
DEV SERI CALCULATION: -1. # INF00

Here is my code:

  /*Code for a CUDA test project doing a basic dot product with doubles * * * */ #include <stdio.h> #include <cuda.h> __global__ void GPU_parallelDotProduct(double *array_a, double *array_b, double *array_c){ array_c[threadIdx.x] = array_a[threadIdx.x] * array_b[threadIdx.x]; } __global__ void GPU_parallelSumVector(double *vector, double *sum, int base){ sum[threadIdx.x + blockIdx.x] = vector[blockIdx.x + threadIdx.x * base] + vector[blockIdx.x + threadIdx.x * base + 1]; } __global__ void GPU_serialSumVector(double *vector, double *sum, int dim){ for(int i = 0; i < dim; ++i){ sum[0] += vector[i]; } } __host__ void CPU_serialDot(double *first, double *second, double *dot, int dim){ for(int i=0; i<dim; ++i){ dot[0] += first[i] * second[i]; } } __host__ void CPU_serialSetupVector(double *vector, int dim, int incrSize, int start){ for(int i=0; i<dim; ++i){ vector[i] = start + i * incrSize; } } int main(){ //define array size to be used //int i,j; const int VECTOR_LENGTH = 2048; int SUM_BASE = 2; int SUM_ROUNDS = VECTOR_LENGTH / SUM_BASE; int ELEMENT_SIZE = sizeof(double); // int currentSize = VECTOR_LENGTH; //arrays for dot product //host double *array_a = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE); double *array_b = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE); double *dev_dot_product_parallel = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE); double *dev_dot_product_serial = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE); double host_dot_product = 0.0; //fill with values CPU_serialSetupVector(array_a, VECTOR_LENGTH, 1, 0); CPU_serialSetupVector(array_b, VECTOR_LENGTH, 1, 0); CPU_serialDot(array_a, array_b, &host_dot_product, VECTOR_LENGTH); //device double *dev_array_a; double *dev_array_b; double *dev_array_c; double *dev_dot_serial; double *dev_dot_parallel; //allocate cuda memory cudaMalloc((void**)&dev_array_a, ELEMENT_SIZE * VECTOR_LENGTH); cudaMalloc((void**)&dev_array_b, ELEMENT_SIZE * VECTOR_LENGTH); cudaMalloc((void**)&dev_array_c, ELEMENT_SIZE * VECTOR_LENGTH); cudaMalloc((void**)&dev_dot_parallel, ELEMENT_SIZE * VECTOR_LENGTH); cudaMalloc((void**)&dev_dot_serial, ELEMENT_SIZE * VECTOR_LENGTH); //copy to from host to device cudaMemcpy(dev_array_a, array_a, ELEMENT_SIZE * VECTOR_LENGTH, cudaMemcpyHostToDevice); cudaMemcpy(dev_array_b, array_b, ELEMENT_SIZE * VECTOR_LENGTH, cudaMemcpyHostToDevice); cudaMemcpy(dev_dot_parallel, &dev_dot_product_parallel, ELEMENT_SIZE, cudaMemcpyHostToDevice); cudaMemcpy(dev_dot_serial, &dev_dot_product_serial, ELEMENT_SIZE, cudaMemcpyHostToDevice); //perform CUDA dot product GPU_parallelDotProduct<<<1, VECTOR_LENGTH>>>(dev_array_a, dev_array_b, dev_array_c); //condense a second vector in serial to compare speed up of tree condensing GPU_serialSumVector<<<1,1>>>(dev_array_c, dev_dot_serial, VECTOR_LENGTH); //condense vector (parallel) for(int i=SUM_ROUNDS; i>1; i/=SUM_BASE){ GPU_parallelSumVector<<<1,i>>>(dev_array_c, dev_array_c, SUM_BASE); } GPU_parallelSumVector<<<1,1>>>(dev_array_c, dev_array_c, SUM_BASE); //get computed product back to the machine cudaMemcpy(dev_dot_product_parallel, dev_array_c, VECTOR_LENGTH * ELEMENT_SIZE, cudaMemcpyDeviceToHost); cudaMemcpy(dev_dot_product_serial, dev_dot_serial, VECTOR_LENGTH * ELEMENT_SIZE, cudaMemcpyDeviceToHost); FILE *output = fopen("test_dotProduct_1.txt", "w"); fprintf(output, "HOST CALCULATION: %f \n", host_dot_product); fprintf(output, "DEV PARA CALCULATION: %f \n", dev_dot_product_parallel[0]); fprintf(output, "DEV SERI CALCULATION: %f \n", dev_dot_product_serial[0]); /* fprintf(output, "VALUES OF DEV_ARRAY_C VEC: \n"); for(int i=0; i<VECTOR_LENGTH; ++i){ fprintf(output, "value %i is: %f \n", i, dev_dot_product_parallel[i]); } */ free(array_a); free(array_b); //free(host_dot_product); cudaFree(dev_array_a); cudaFree(dev_array_b); cudaFree(dev_array_c); cudaFree(dev_dot_parallel); cudaFree(dev_dot_serial); return(0); } 
+4
source share
1 answer

The maximum number of threads per block for your card is 1024, so you get an error message (512 for some old cards). You either need to separate the blocks in order to use several dimensions (again limited to 1024 in the direction for x, y, z on your map) or use more than one block in your grid.

+6
source

Source: https://habr.com/ru/post/1395834/


All Articles