I use 63 registers per thread, so (with a maximum of 32768 registers) I can use at most about 520 threads. In this example I am using 512 threads.
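The arithmetic behind that estimate, as a minimal sketch (32768 is the register count I am assuming for this device):

registers_per_thread = 63        # reported for the kernel
registers_available  = 32768     # register file I am assuming per multiprocessor
print(registers_available // registers_per_thread)   # 520, so I round down to 512 threads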
(The parallel work is the computeEvec device function, which is called from the __global__ computeEHfields kernel.) Problems:
1) The memory-check errors shown below.
2) When I use numPointsRp > 2000, I get an "out of memory" error, but (unless I am mistaken) I have calculated the global memory usage and it should fit (a rough sketch of that calculation is just below).
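For reference, this is roughly what I mean by calculating global memory: a minimal sketch that counts only the gpuarrays I allocate explicitly in the code further down (it ignores whatever the kernel itself uses per thread):

def explicit_global_mem_bytes(numPointsRs, numPointsRp):
    f32, c64 = 4, 8                        # bytes per float32 / complex64 element
    total  = numPointsRs * 3 * f32         # Rs
    total += numPointsRp * 3 * f32         # Rp
    total += numPointsRs * 3 * c64 * 2     # J and M
    total += numPointsRp * 3 * c64 * 2     # Evec and Hvec
    total += numPointsRp * c64             # All
    return total

print(explicit_global_mem_bytes(2000, 2000) / 1024.0**2, "MB")  # ~0.24 MB, far below 1 GB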
------------------------------- UPDATE -------------------------------
I ran cuda-memcheck and it gives me the following (only when numPointsRs > numPointsRp):
========= Invalid __global__ read of size 4
=========     at 0x00000428 in computeEHfields
=========     by thread (2,0,0) in block (0,0,0)
=========     Address 0x4001076e0 is out of bounds
========= Invalid __global__ read of size 4
=========     at 0x00000428 in computeEHfields
=========     by thread (1,0,0) in block (0,0,0)
=========     Address 0x4001076e0 is out of bounds
========= Invalid __global__ read of size 4
=========     at 0x00000428 in computeEHfields
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x4001076e0 is out of bounds
========= ERROR SUMMARY: 160 errors
----------- EDIT ----------------------------
Also, several times (when I use only threads, not blocks; I have not tested this with blocks), if for example I have numPointsRs = 1000 and numPointsRp = 100, then change to numPointsRp = 200, and then change back to numPointsRp = 100, I do not get the same results as the first run!
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import cmath
import pycuda.driver as drv

# numPointsRs, numPointsRp, kp and eta are defined earlier (not shown in this snippet)

Rs = np.zeros((numPointsRs, 3)).astype(np.float32)
for k in range(numPointsRs):
    Rs[k] = [0, k, 0]

Rp = np.zeros((numPointsRp, 3)).astype(np.float32)
for k in range(numPointsRp):
    Rp[k] = [1 + k, 0, 0]

# ---- Initialization and passing (allocate memory and transfer data) to GPU ----
Rs_gpu = gpuarray.to_gpu(Rs)
Rp_gpu = gpuarray.to_gpu(Rp)

J_gpu = gpuarray.to_gpu(np.ones((numPointsRs, 3)).astype(np.complex64))
M_gpu = gpuarray.to_gpu(np.ones((numPointsRs, 3)).astype(np.complex64))

Evec_gpu = gpuarray.to_gpu(np.zeros((numPointsRp, 3)).astype(np.complex64))
Hvec_gpu = gpuarray.to_gpu(np.zeros((numPointsRp, 3)).astype(np.complex64))
All_gpu = gpuarray.to_gpu(np.ones(numPointsRp).astype(np.complex64))

mod = SourceModule("""
#include <pycuda-complex.hpp>
#include <cmath>
#include <vector>

#define RowRsSize %(numrs)d
#define RowRpSize %(numrp)d

typedef pycuda::complex<float> cmplx;

extern "C"{

__device__ void computeEvec(float Rs_mat[][3], int numPointsRs,
                            cmplx J[][3], cmplx M[][3],
                            float *Rp, cmplx kp, cmplx eta,
                            cmplx *Evec, cmplx *Hvec, cmplx *All)
{
    int c = 0;
    while (c < numPointsRs) {
        ...
        c++;
    }
}

__global__ void computeEHfields(float *Rs_mat_, int numPointsRs,
                                float *Rp_mat_, int numPointsRp,
                                cmplx *J_, cmplx *M_,
                                cmplx kp, cmplx eta,
                                cmplx E[][3], cmplx H[][3], cmplx *All)
{
    // per-thread local arrays, sized at compile time from RowRsSize / RowRpSize
    float Rs_mat[RowRsSize][3];
    float Rp_mat[RowRpSize][3];
    cmplx J[RowRsSize][3];
    cmplx M[RowRsSize][3];

    // grid-stride loop over the observation points
    int k = threadIdx.x + blockIdx.x * blockDim.x;
    while (k < numPointsRp) {
        computeEvec(Rs_mat, numPointsRs, J, M, Rp_mat[k], kp, eta, E[k], H[k], All);
        k += blockDim.x * gridDim.x;
    }
}

}
""" % {"numrs": numPointsRs, "numrp": numPointsRp}, no_extern_c=1)

func = mod.get_function("computeEHfields")

func(Rs_gpu, np.int32(numPointsRs), Rp_gpu, np.int32(numPointsRp),
     J_gpu, M_gpu, np.complex64(kp), np.complex64(eta),
     Evec_gpu, Hvec_gpu, All_gpu,
     block=(128, 1, 1), grid=(200, 1))

print(" \n")

# ----- get data back from GPU -----
Rs = Rs_gpu.get()
Rp = Rp_gpu.get()
J = J_gpu.get()
M = M_gpu.get()
Evec = Evec_gpu.get()
Hvec = Hvec_gpu.get()
All = All_gpu.get()
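For completeness, this is the minimal check I can put right after the func(...) call above to make sure the launch really finished before I copy anything back with .get() (just a sketch using pycuda.driver, which is already imported as drv):

# right after the func(...) call above
try:
    drv.Context.synchronize()          # any asynchronous launch error surfaces here
except drv.LaunchError as e:
    print("kernel launch failed:", e)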
-------------------------------- GPU model --------------------------------
Device 0: "GeForce GTX 560"
  CUDA Driver Version / Runtime Version:          4.2 / 4.1
  CUDA Capability Major/Minor version number:     2.1
  Total amount of global memory:                  1024 MBytes (1073283072 bytes)
  ( 0) Multiprocessors x (48) CUDA Cores/MP:      0 CUDA Cores    // really 336 CUDA cores => 7 MPs and 48 cores/MP
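The same numbers can also be read directly from PyCUDA, in case that is easier to reproduce (sketch; the attribute names come from pycuda.driver.device_attribute):

import pycuda.autoinit
import pycuda.driver as drv

dev = pycuda.autoinit.device
attrs = dev.get_attributes()
print(dev.name(), "compute capability", dev.compute_capability())
print("Multiprocessors:", attrs[drv.device_attribute.MULTIPROCESSOR_COUNT])
print("Registers per block:", attrs[drv.device_attribute.MAX_REGISTERS_PER_BLOCK])
print("Global memory (MB):", dev.total_memory() // (1024 * 1024))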