CUDA Programming

I am new to CUDA and have a question about a simple program; I hope someone will spot my mistake.

// Kernel exactly as posted in the question (kept verbatim; the answers below discuss its flaws).
// Each thread derives a 2D index (ix, iy) from its block and thread coordinates and, when both
// are below 16, adds A[ix+iy*16] + B[ix+iy*16] onto every one of the 256 elements of C.
// A, B: input arrays, read at index ix+iy*16. C: read and written at indices 0..255.
__global__ void ADD(float* A, float* B, float* C)
{
   const int ix = blockDim.x * blockIdx.x + threadIdx.x;
   const int iy = blockDim.y * blockIdx.y + threadIdx.y;

   // Guard: only threads whose (ix, iy) lies inside the 16x16 range do any work.
   if(ix < 16 && iy < 16)
   {
      // NOTE(review): every thread that passes the guard read-modify-writes all 256 C[i]
      // with no atomics or synchronization in sight, so concurrently running threads can
      // overwrite each other's updates and the final contents of C are not well defined.
      for(int i = 0; i<256; i++)
      C[i] = A[ix+iy*16] + B[ix+iy*16] + C[i]; // << I wish to store all in C
   }
}

// Host-side wrapper exactly as posted in the question (kept verbatim).
// Allocates three device buffers of 256 floats, copies A, B and C to the device, launches the
// kernel, copies all three buffers back into the host arrays, and frees the device memory.
// A, B, C: host pointers; each is read and then overwritten as 256 floats by the cudaMemcpy calls.
// NOTE(review): none of the CUDA runtime return codes are checked in this function.
extern "C" void cuda_p(float* A, float* B, float* C)
{
    float* dev_A;
    float* dev_B;
    float* dev_C;
    // One 256-float device buffer per host array.
    cudaMalloc((void**) &dev_A,  sizeof(float) * 256);
    cudaMalloc((void**) &dev_B,  sizeof(float) * 256);
    cudaMalloc((void**) &dev_C,  sizeof(float) * 256);
    // Upload the inputs; C is uploaded too because the kernel reads C before writing it.
    cudaMemcpy(dev_A, A, sizeof(float) * 256, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_B, B, sizeof(float) * 256, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_C, C, sizeof(float) * 256, cudaMemcpyHostToDevice);
    // NOTE(review): the kernel defined above is named ADD, not ADDD.
    // <<<16,16>>> requests 16 blocks of 16 threads along x only, so blockIdx.y and
    // threadIdx.y are always 0 inside the kernel.
    ADDD<<<16,16>>>(dev_A,dev_B,dev_C);
    // Download all three buffers (A and B are copied back as well as C).
    cudaMemcpy(A, dev_A, sizeof(float) * 256, cudaMemcpyDeviceToHost);
    cudaMemcpy(B, dev_B, sizeof(float) * 256, cudaMemcpyDeviceToHost);
    cudaMemcpy(C, dev_C, sizeof(float) * 256, cudaMemcpyDeviceToHost);
 cudaFree(dev_A);
 cudaFree(dev_B);
 cudaFree(dev_C);
}
+3
source share
2 answers
  • Are you sure about the kernel launch configuration? Your code tries to launch an undefined function, ADDD. Your execution configuration is gridDim = (16, 1, 1) and blockDim = (16, 1, 1), so inside the kernel blockIdx.x = [0..16) and threadIdx.x = [0..16), while blockIdx.y and threadIdx.y are always 0. If I understood you correctly, then

    ix = threadIdx.x; iy = blockIdx.x;

    Read about this in the CUDA Programming Guide (Appendix B.15).

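  • Another point: every thread writes to C[i], which is a race. 16 threads (1 warp) update the same C[i] at the same time, each adding (A[ix+iy*16] + B[ix+iy*16]) to C[i]. Use atomic operations (CUDA Programming Guide, B.11.1.1) and check the global-memory access guidance (CUDA C Best Practices Guide 3.2.1), or restructure the kernel so that each thread writes its own element ...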

+4

Your primary issue is that the core of your kernel doesn't make sense. What you have is:

for(int i = 0; i<256; i++)
      C[i] = A[ix+iy*16] + B[ix+iy*16] + C[i]; // << I wish to store all in C

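This makes every thread walk through every entry in C, add its own part of A and B to it, and write it back. Since all threads do this at the same time, they step on each other's updates. If you really want every entry in C to be the sum of all entries in A and all entries in B, make each thread responsible for one entry in C: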

for(int i = 0; i<256; i++)
      C[ix+iy*16] += A[i] + B[i];

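If instead you want every entry in C to be the sum of the corresponding entries in A and B, which seems more likely, then get rid of the loop, and your kernel would look like this: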

// Element-wise addition over a 16x16 row-major layout: C[k] = A[k] + B[k].
// Each thread handles the single element addressed by its global (x, y) index; threads whose
// x or y index is 16 or more do nothing. The launch must span x and y in [0, 16) for every
// element of C to be written (e.g. a 4x4 grid of 4x4-thread blocks).
// A, B: inputs, 256 floats each. C: output, 256 floats.
__global__ void ADD(float* A, float* B, float* C)
{
   const int col = threadIdx.x + blockIdx.x * blockDim.x;
   const int row = threadIdx.y + blockIdx.y * blockDim.y;

   // Threads outside the 16x16 range have nothing to do.
   if(col >= 16 || row >= 16)
      return;

   // Flattened row-major position shared by all three arrays.
   const int offset = row * 16 + col;
   C[offset] = A[offset] + B[offset];
}

Each thread reads one entry from A and one from B, and writes one entry to C.

Your secondary issue is that you're launching the kernel incorrectly. What you have is:

ADDD<<<16,16>>>(dev_A,dev_B,dev_C);

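This launches a 1x16 grid of blocks with 1x16 threads each (of the typo'd kernel name). If you want your threads laid out in two dimensions (using both the x and y indices), you need to use dim3 as the size specifier type. Something like: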

// Launch geometry: a 4x4 grid of blocks, each holding 4x4 threads
// (the z component is left at dim3's default of 1).
const dim3 gridSize(4, 4);
const dim3 blockSize(4, 4);

// 4x4 blocks of 4x4 threads form a 16x16 arrangement of threads,
// which matches the 16x16 data layout: one thread per array element.
ADD<<<gridSize, blockSize>>>(dev_A,dev_B,dev_C);
+4

Source: https://habr.com/ru/post/1757186/


All Articles