I'm currently trying to make a piece of CUDA code with a class that will be used exclusively on the device side (i.e. the host should not know about its existence). However, I cannot work out the correct qualifiers for the class ( deviceclass below):
__device__ float devicefunction (float *x) {return x[0]+x[1];} class deviceclass { private: float _a; public: deviceclass(float *x) {_a = devicefunction(x);} float getvalue () {return _a;} }; // Device code __global__ void VecInit(float* A, int N) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < N) { deviceclass *test; test = new deviceclass(1.0, 2.0); A[i] = test->getvalue(); } } // Standard CUDA guff below: Variables float *h_A, *d_A; // Host code int main(int argc, char** argv) { printf("Vector initialization...\n"); int N = 10000; size_t size = N * sizeof(float); // Allocate h_A = (float*)malloc(size); cudaMalloc(&d_A, size); printf("Computing...\n"); // Invoke kernel int threadsPerBlock = 256; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; VecInit<<<blocksPerGrid, threadsPerBlock>>>(d_A, N); // Copy result from device memory to host memory cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost); //...etc }
Configuring deviceclass as soon as a __device__ throws an error __device__ from a global function, however setting it as __device__ __host__ or __global__ seems unnecessary. Can someone point me in the right direction?
source share