How can I implement a custom atomic function with multiple variables?

I would like to implement this atomic function in CUDA:

__device__ float lowest; // global var __device__ int lowIdx; // global var float realNum; // thread reg var int index; // thread reg var if(realNum < lowest) { lowest= realNum; // the new lowest lowIdx= index; // update the 'low' index } 

I do not believe that I can do this using any of the atomic functions. I need to lock a pair of global memory folders for a couple of instructions. Can I implement this using PTXAS (assembly) code?

+4
source share
2 answers

As I said in my second comment above, you can combine your two 32-bit numbers into one 64-bit atom-controlled number and solve the problem this way. Then we process the 64-bit quantity atomically, using an arbitrary atomic example as an approximate guide. Obviously, you cannot extend this idea beyond two 32-bit values. Here is an example:

 #include <stdio.h> #define DSIZE 5000 #define nTPB 256 #define cudaCheckErrors(msg) \ do { \ cudaError_t __err = cudaGetLastError(); \ if (__err != cudaSuccess) { \ fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ msg, cudaGetErrorString(__err), \ __FILE__, __LINE__); \ fprintf(stderr, "*** FAILED - ABORTING\n"); \ exit(1); \ } \ } while (0) typedef union { float floats[2]; // floats[0] = lowest int ints[2]; // ints[1] = lowIdx unsigned long long int ulong; // for atomic update } my_atomics; __device__ my_atomics test; __device__ unsigned long long int my_atomicMin(unsigned long long int* address, float val1, int val2) { my_atomics loc, loctest; loc.floats[0] = val1; loc.ints[1] = val2; loctest.ulong = *address; while (loctest.floats[0] > val1) loctest.ulong = atomicCAS(address, loctest.ulong, loc.ulong); return loctest.ulong; } __global__ void min_test(const float* data) { int idx = (blockDim.x * blockIdx.x) + threadIdx.x; if (idx < DSIZE) my_atomicMin(&(test.ulong), data[idx],idx); } int main() { float *d_data, *h_data; my_atomics my_init; my_init.floats[0] = 10.0f; my_init.ints[1] = DSIZE; h_data = (float *)malloc(DSIZE * sizeof(float)); if (h_data == 0) {printf("malloc fail\n"); return 1;} cudaMalloc((void **)&d_data, DSIZE * sizeof(float)); cudaCheckErrors("cm1 fail"); // create random floats between 0 and 1 for (int i = 0; i < DSIZE; i++) h_data[i] = rand()/(float)RAND_MAX; cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice); cudaCheckErrors("cmcp1 fail"); cudaMemcpyToSymbol(test, &(my_init.ulong), sizeof(unsigned long long int)); cudaCheckErrors("cmcp2 fail"); min_test<<<(DSIZE+nTPB-1)/nTPB, nTPB>>>(d_data); cudaDeviceSynchronize(); cudaCheckErrors("kernel fail"); cudaMemcpyFromSymbol(&(my_init.ulong), test, sizeof(unsigned long long int)); cudaCheckErrors("cmcp3 fail"); printf("device min result = %f\n", my_init.floats[0]); printf("device idx result = %d\n", my_init.ints[1]); float host_val = 10.0f; int host_idx = DSIZE; for (int i=0; i<DSIZE; i++) if (h_data[i] < host_val){ host_val = h_data[i]; host_idx = i; } printf("host min result = %f\n", host_val); printf("host idx result = %d\n", host_idx); return 0; } 
+9
source

@Robert Crovella: Great idea, but I think this function needs to be slightly modified as follows:

 __device__ unsigned long long int my_atomicMin(unsigned long long int* address, float val1, int val2) { my_atomics loc, loctest, old; loc.floats[0] = val1; loc.ints[1] = val2; loctest.ulong = *address; old.ulong = loctest.ulong; while (loctest.floats[0] > val1){ old.ulong = loctest.ulong; loctest.ulong = atomicCAS(address, loctest.ulong, loc.ulong); } return old.ulong; } 
0
source

Source: https://habr.com/ru/post/1489145/


All Articles