I assume that you want to stop the running kernel (not one thread).
The easiest way (and the one I suggest) is to set the global memory flag, which has been checked by the kernel. You can set the flag using cudaMemcpy () (or without using shared memory).
As below:
if (gm_flag) {
__threadfence();
asm("trap;");
}
ams ( "trap;" )
, cuda 2.0 assert() !
( !)
__device__ bool go(int val){
return true;
}
__global__ void stopme(bool* flag, int* val, int size){
int idx= blockIdx.x *blockDim.x + threadIdx.x;
if(idx < size){
bool canContinue = true;
while(canContinue && (flag[0])){
printf("HELLO from %i\n",idx);
if(!(*flag)){
return;
}
else{
val[idx]++;
val[idx]%=100;
}
canContinue = go(val[idx]);
}
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main(void)
{
int size = 128;
int* h_val = (int*)malloc(sizeof(int)*size);
bool * h_flag = new bool;
*h_flag=true;
bool* d_flag;
cudaMalloc(&d_flag,sizeof(bool));
cudaMemcpy(d_flag,h_flag,1,cudaMemcpyHostToDevice);
int* d_val;
cudaMalloc(&d_val,sizeof(int)*size );
for(int i=0;i<size;i++){
h_val[i] = i;
}
cudaMemcpy(d_val,h_val,size,cudaMemcpyHostToDevice);
int BSIZE=32;
int nblocks =size/BSIZE;
printf("%i,%i",nblocks,BSIZE);
stopme<<<nblocks,BSIZE>>>(d_flag,d_val,size);
*h_flag=false;
cudaMemcpy(d_flag,h_flag,1,cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
gpuErrchk( cudaPeekAtLastError() );
printf("END\n");
}
stopMe , - false. , , , return , ( ). , .