CUDA __syncthreads() does not work in my code

So here is the situation.

I have a thread block that executes a while loop, and I need the loop to continue if and only if some condition is met by any of these threads. To do this, I use a shared variable as a continuation flag: the flag is cleared by thread #0 at the beginning of each iteration, followed by a __syncthreads(), and it can then be set by any thread during the iteration if that thread meets the continuation condition. Another __syncthreads() call is placed before the loop-condition check that leads into the next iteration, to make sure the threads are in sync. The kernel is basically like this:

// Skeleton of a block-wide "iterate again while any thread asks to" loop.
// `some predicate` and the "data manipulations" comments are placeholders.
// NOTE(review): as written the loop has a shared-memory race between the
// loop-condition read and the reset at the top of the next iteration; see the
// comment just before the closing `while`.
__global__ void foo(void* data) {
    __shared__ int blockContinueFlag; // one flag shared by all threads of the block
    do {
        // NOTE(review): this condition is true for every thread EXCEPT (0,0,0), so
        // all the other threads perform the reset; the accompanying text says
        // thread #0 does it. In a one-thread block nobody would clear the flag.
        if (threadIdx.x || threadIdx.y || threadIdx.z) {
            blockContinueFlag = 0;
        }
        __syncthreads(); //synch1: the reset is finished before any thread may set the flag
        //some data manipulations...
        if(some predicate) {
            blockContinueFlag = true; // any thread may request another iteration
        }
        //some data manipulations...
        __syncthreads(); //synch2: every request is stored before the flag is tested
        // No barrier separates the test below from the reset above: a thread that
        // has already read the flag can start the next iteration and zero it while
        // threads of another warp have not read it yet, so threads of one block can
        // disagree on whether to loop again.
    } while (blockContinueFlag);
}

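The problem is that the barrier synch2 does not seem to work in my code: sometimes the kernel terminates even though some threads still meet the continuation condition (I know this from checking the returned data on the host side). To examine this further, I set a breakpoint right after the do-while loop, as in the following code, and there blockContinueFlag is sometimes reported as true (I can only assume that the block left the loop before some threads had a chance to set blockContinueFlag).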

// Same loop skeleton, with the spot marked where the debugger breakpoint was
// placed to inspect blockContinueFlag after the loop has been left.
// `some predicate` and the "data manipulations" comments are placeholders.
// NOTE(review): the loop has a shared-memory race between the loop-condition
// read and the reset at the top of the next iteration; see the comment just
// before the closing `while`.
__global__ void foo(void* data) {
    __shared__ int blockContinueFlag; // one flag shared by all threads of the block
    do {
        // NOTE(review): this condition is true for every thread EXCEPT (0,0,0), so
        // all the other threads perform the reset; the accompanying text says
        // thread #0 does it. In a one-thread block nobody would clear the flag.
        if (threadIdx.x || threadIdx.y || threadIdx.z) {
            blockContinueFlag = 0;
        }
        __syncthreads(); //synch1: the reset is finished before any thread may set the flag
        //some data manipulations...
        if(some predicate) {
            blockContinueFlag = true; // any thread may request another iteration
        }
        //some data manipulations...
        __syncthreads(); //synch2: every request is stored before the flag is tested
        // No barrier separates the test below from the reset above: a thread that
        // has already read the flag can start the next iteration and zero it while
        // threads of another warp have not read it yet. A thread that reads the
        // zeroed flag leaves the loop even though a request had been stored.
    } while (blockContinueFlag);
    //a break point is set here
}

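I remember reading in the CUDA manual that __syncthreads() is allowed in conditional code as long as the condition evaluates identically for all threads of the block, which should be the case here.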

Here is another, simplified version of the code, just as an illustration.

// Simplified reproducer. Each thread with threadIdx.x < kernelSize decrements
// its own element of `data` once per iteration; the block keeps iterating while
// any of those elements is still greater than `threshold`.
//   data       - device array, indexed by threadIdx.x only
//   kernelSize - number of leading threads that do work
//   threshold  - a thread requests another iteration while its element exceeds this
// Only threadIdx.x is used for indexing and for picking the resetting thread.
// NOTE(review): the loop has a shared-memory race between the loop-condition
// read and the reset at the top of the next iteration; see the comment just
// before the closing `while`.
__global__ void foo(int* data, int kernelSize, int threshold) {
    __shared__ int blockContinueFlag; // one flag shared by all threads of the block
    do {
        if (threadIdx.x == 0) {
            blockContinueFlag = 0; // thread 0 clears the flag for this iteration
        }
        __syncthreads(); // the reset is finished before any thread may set the flag
        if (threadIdx.x < kernelSize)  {
            data[threadIdx.x]--;
            // Empty-bodied loop (note the trailing ';'). Presumably meant to delay
            // each thread by a thread-dependent amount; it has no observable effect
            // and a compiler is free to remove it.
            for (int i = 0; i < threadIdx.x; i++);
            if (data[threadIdx.x] > threshold)
                blockContinueFlag = true; // request another iteration
        }
        __syncthreads(); // every request is stored before the flag is tested
        // No barrier separates the test below from thread 0's reset above: thread 0
        // can read the flag, start the next iteration and zero it while threads of
        // another warp have not read it yet. Those threads then see 0 and leave
        // the loop although other threads continue.
    } while (blockContinueFlag);
}

// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    // Fixed format string: the runtime's message is passed as data, never as
    // the format itself.
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver for the reproducer kernel `foo`.
// Fills hostData with 0..1023, copies it to the device, runs foo on one block of
// 1024 threads (the first 512 do work, threshold 0) and copies the array back.
// Nothing is printed on success; hostData is left for inspection (e.g. in a
// debugger). Returns 0 when every CUDA call succeeded, 1 otherwise.
int main()
{
    const int kElements = 1024;       // array length and threads in the single block
    const int kActiveThreads = 512;   // kernelSize argument of foo
    const int kThreshold = 0;         // threshold argument of foo
    const size_t bytes = kElements * sizeof(int);

    int hostData[kElements];
    for (int i = 0; i < kElements; i++)
        hostData[i] = i;

    int* deviceData = 0;
    if (!cudaOk(cudaMalloc(&deviceData, bytes), "cudaMalloc"))
        return 1;

    bool ok = cudaOk(cudaMemcpy(deviceData, hostData, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)");
    if (ok) {
        foo<<<1, kElements>>>(deviceData, kActiveThreads, kThreshold);
        // A launch does not return a status: configuration errors are picked up
        // with cudaGetLastError(), execution errors by the synchronizing call.
        ok = cudaOk(cudaGetLastError(), "foo launch")
          && cudaOk(cudaDeviceSynchronize(), "foo execution")
          && cudaOk(cudaMemcpy(hostData, deviceData, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");
    }

    // Release the device buffer on every path once it has been allocated.
    if (!cudaOk(cudaFree(deviceData), "cudaFree"))
        ok = false;

    return ok ? 0 : 1;
}

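The expected content of hostData[] at the end of main() is {-511, -510, -509, ..., 0, 512, 513, 514,..., 1023}, and sometimes that is what I get. But in some runs under VS 2013 it contains the following values instead: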

[0]: -95
[1]: -94
...
[29]: -66
[30]: -65
[31]: -64
[32]: 31
[33]: 32
[34]: 33
...
[61]: 60
[62]: 61
[63]: 62
[64]: -31
[65]: -30
[66]: -29
...
[92]: -3
[93]: -2
[94]: -1
[95]: 0
[96]: 95
[97]: 96
[98]: 97
...

which suggests that the warps are not really kept in step by the barrier.

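Does anyone know the reason for this and/or whether there is a way to make the thread barrier work correctly?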

. .

+4
2

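The original code is missing a syncthreads. There is a race on the flag between testing it and resetting it. Restructured with the exit check at the top of the loop, the code looks like this: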

// Block-wide "iterate again while any thread asks to" loop, written with the exit
// check at the top and with a barrier between that check and the reset.
// `some predicate` and the "data manipulations" comments are placeholders.
//
// Barriers per iteration:
//   synch0 - every thread has tested the flag before anyone may reset it
//   synch1 - the reset is finished before any thread may set the flag
//   synch2 - every request is stored before the flag is tested again
// The flag cannot change between synch2 and synch0, so all threads of the block
// take the same decision at the exit check and every barrier is reached by all
// of them or by none.
__global__ void foo(void* data) {
  __shared__ int blockContinueFlag;
  // Every thread stores the same value. No thread can reset the flag before
  // synch0, which is only passed once all threads have executed this store.
  blockContinueFlag = true;
  while (true) {
    if (!blockContinueFlag)
        break;
    __syncthreads(); //synch0
    // Only thread (0,0,0) resets the flag, so a one-thread block clears it too.
    if (!(threadIdx.x || threadIdx.y || threadIdx.z)) {
        blockContinueFlag = 0;
    }
    __syncthreads(); //synch1
    //some data manipulations...
    if(some predicate) {
      blockContinueFlag = true;
    }
    //some data manipulations...
    __syncthreads(); //synch2
  }
}

, ( ).

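In other words, without a barrier between the exit check and the reset, thread 0's warp can go around the loop and clear the flag before thread 33 (which is in a different warp) has performed its exit check. The threads of the block then disagree on whether to continue, and the barriers no longer match up.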

The fix is one more __syncthreads(), placed between the exit check and the reset of the flag.

+1

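Alternatively, a single __syncthreads_or() can replace the __syncthreads() calls: it is a block-wide barrier that also returns non-zero if the value passed by any thread of the block is non-zero.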

// Same loop built on __syncthreads_or(): each thread keeps a private vote, and
// the value returned by __syncthreads_or() is used as the loop condition.
// `some predicate` and the "data manipulations" comments are placeholders.
__global__ void foo(void* data) {
    for (;;) {
        // Per-thread vote for this pass; starts out as "no more work".
        int wantsAnotherPass = 0;
        //some data manipulations...
        if(some predicate) {
            wantsAnotherPass = true;
        }
        //some data manipulations...
        // Stop once the combined result over the block is zero.
        if (!__syncthreads_or(wantsAnotherPass)) {
            break;
        }
    }
}

, syncthreads.

.

+3

Source: https://habr.com/ru/post/1609264/


All Articles