Here's how I managed to solve my problem, based on the suggestions in the comments to the answer. The example compiles provided you put helper_cuda.h and helper_string.h in the same directory, or add a `-I` directive pointing to the CUDA samples' include path that ships with the CUDA toolkit. The relevant changes are only in the kernel; there is also a minor change in `main()`, since I was launching the kernel with redundant resources: the `.y` axis of the grid of thread blocks was not used at all, so it did not generate any errors.
```cpp
#include <stdio.h>
#include <stdlib.h>
#include <string.h>   // for memcpy (was missing)
#include <iostream>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include "helper_string.h"
#include <fstream>

#ifndef MAX
#define MAX(a,b) ((a > b) ? a : b)
#endif

#define REAL sizeof(float)
#define N 128
#define BLOCK_SIZE 128
#define NUM_BLOCKS ((N*N + BLOCK_SIZE - 1)/ BLOCK_SIZE)

// host_heat_matrix() and matvec_dev() are defined elsewhere in the project
// (a sketch follows the listing).
template <typename T>
inline void printmatrix( T mat, int rows, int cols);

template <typename T>
__global__ void jacobian_kernel ( const T * A, T * J, const T t0, const T tn, const T h, const T * u0, const T * un, const T * un_old);

template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h = 1);

int main ()
{
    float t0 = 0.; //float tn = 0.;
    float h  = 0.1;

    float* u0     = (float*)malloc(REAL*N); for(int i = 0; i < N; ++i){u0[i] = i+1;}
    float* un     = (float*)malloc(REAL*N); memcpy(un, u0, REAL*N);
    float* un_old = (float*)malloc(REAL*N); memcpy(un_old, u0, REAL*N);
    float* J      = (float*)malloc(REAL*N*N);
    float* A      = (float*)malloc(REAL*N*N); host_heat_matrix(A);

    float *d_u0;
    float *d_un;
    float *d_un_old;
    float *d_J;
    float *d_A;

    checkCudaErrors(cudaMalloc((void**)&d_u0,     REAL*N));   //printf("1: %p\n", d_u0);
    checkCudaErrors(cudaMalloc((void**)&d_un,     REAL*N));   //printf("2: %p\n", d_un);
    checkCudaErrors(cudaMalloc((void**)&d_un_old, REAL*N));   //printf("3: %p\n", d_un_old);
    checkCudaErrors(cudaMalloc((void**)&d_J,      REAL*N*N)); //printf("4: %p\n", d_J);
    checkCudaErrors(cudaMalloc((void**)&d_A,      REAL*N*N)); //printf("4: %p\n", d_J);

    checkCudaErrors(cudaMemcpy(d_u0, u0,         REAL*N,   cudaMemcpyHostToDevice)); assert(d_u0 != NULL);
    checkCudaErrors(cudaMemcpy(d_un, un,         REAL*N,   cudaMemcpyHostToDevice)); assert(d_un != NULL);
    checkCudaErrors(cudaMemcpy(d_un_old, un_old, REAL*N,   cudaMemcpyHostToDevice)); assert(d_un_old != NULL);
    checkCudaErrors(cudaMemcpy(d_J, J,           REAL*N*N, cudaMemcpyHostToDevice)); assert(d_J != NULL);
    checkCudaErrors(cudaMemcpy(d_A, A,           REAL*N*N, cudaMemcpyHostToDevice)); assert(d_A != NULL);

    dim3 dimGrid(NUM_BLOCKS);  std::cout << "NUM_BLOCKS \t" << dimGrid.x  << "\n";
    dim3 dimBlock(BLOCK_SIZE); std::cout << "BLOCK_SIZE \t" << dimBlock.x << "\n";
    size_t shm_size = N*REAL;  //std::cout << shm_size << "\n";

    //HERE IS A RELEVANT CHANGE OF THE MAIN, SINCE I WAS CALLING
    //THE KERNEL WITH A 2D GRID BUT WITHOUT USING THE .y AXIS,
    //WHILE NOW THE GRID IS 1D
    jacobian_kernel <<< dimGrid, dimBlock, shm_size >>> (d_A, d_J, t0, t0, h, d_u0, d_un, d_un_old);

    checkCudaErrors(cudaMemcpy(J, d_J, REAL*N*N, cudaMemcpyDeviceToHost)); //printf("4: %p\n", d_J);
    printmatrix( J, N, N);

    checkCudaErrors(cudaDeviceReset());
    free(u0);
    free(un);
    free(un_old);
    free(J);
}

template <typename T>
__global__ void jacobian_kernel (
                            const T * A,
                            T * J,
                            const T t0,
                            const T tn,
                            const T h,
                            const T * u0,
                            const T * un,
                            const T * un_old)
{
    T cgamma = 2 - sqrtf(2);
    const unsigned int t   = threadIdx.x;
    const unsigned int b   = blockIdx.x;
    const unsigned int tid = t + b * blockDim.x;
    /*__shared__*/ T temp_sx[BLOCK_SIZE][BLOCK_SIZE];
    /*__shared__*/ T temp_dx[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ T sm_temp_du;
    T* temp_du = &sm_temp_du;

    //HERE IS A RELEVANT CHANGE (*)
    if ( t < BLOCK_SIZE && b < NUM_BLOCKS )
    {
        temp_sx[b][t] = un[t]; //printf("temp_sx[%d] = %f\n", t,(temp_sx[b][t]));
        temp_dx[b][t] = un[t];
        //printf("t = %d, b = %d, t + b * blockDim.x = %d \n",t, b, tid);

        //HERE IS A NOTE (**)
        if ( t == b )
        {
            //printf("t = %d, b = %d \n",t, b);
            if ( tn == t0 )
            {
                *temp_du = u0[t]*0.001;

                temp_sx[b][t] += *temp_du;
                temp_dx[b][t] -= *temp_du;

                temp_sx[b][t] += ( abs( temp_sx[b][t] ) < 10e-6 ? 0.1 : 0 );
                temp_dx[b][t] += ( abs( temp_dx[b][t] ) < 10e-6 ? 0.1 : 0 );

                temp_sx[b][t] = ( temp_sx[b][t] == 0 ? 0.1 : temp_sx[b][t] );
                temp_dx[b][t] = ( temp_dx[b][t] == 0 ? 0.1 : temp_dx[b][t] );
            }
            else
            {
                *temp_du = MAX( un[t] - un_old[t], 10e-6 );
                temp_sx[b][t] += *temp_du;
                temp_dx[b][t] -= *temp_du;
            }
        }
        //printf("du[%d] %f\n", tid, (*temp_du));
        __syncthreads();
        //printf("temp_sx[%d][%d] = %f\n", b, t, temp_sx[b][t]);
        //printf("temp_dx[%d][%d] = %f\n", b, t, temp_dx[b][t]);

        //d_func(tn, (temp_sx[b]), (temp_sx[b]), 1.f);
        //d_func(tn, (temp_dx[b]), (temp_dx[b]), 1.f);
        matvec_dev( tn, A, (temp_sx[b]), (temp_sx[b]), N, N, 1.f );
        matvec_dev( tn, A, (temp_dx[b]), (temp_dx[b]), N, N, 1.f );
        __syncthreads();
        //printf("temp_sx_later[%d][%d] = %f\n", b, t, (temp_sx[b][t]));
        //printf("temp_sx_later[%d][%d] - temp_dx_later[%d][%d] = %f\n", b,t,b,t, (temp_sx[b][t] - temp_dx[b][t]) / 2 * *temp_du);
        //if (t == b ) printf( "2du[%d]^-1 = %f\n",t, powf((2 * *temp_du), -1));

        J[tid] = (temp_sx[b][t] - temp_dx[b][t]) / (2 * *temp_du);
    }
}

template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h )
{
    __shared__ float temp_u;
    temp_u = u[threadIdx.x];
    res[threadIdx.x] = h*powf( (temp_u), 2);
}

template <typename T>
inline void printmatrix( T mat, int rows, int cols)
{
    std::ofstream matrix_out;
    matrix_out.open( "heat_matrix.txt", std::ofstream::out);
    for( int i = 0; i < rows; i++)
    {
        for( int j = 0; j < cols; j++)
        {
            double next = mat[i + N*j];
            matrix_out << ( (next >= 0) ? " " : "") << next << " ";
        }
        matrix_out << "\n";
    }
}
```
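The listing calls two helpers that live elsewhere in my project and are not shown: `host_heat_matrix()` (fills `A` on the host) and `matvec_dev()` (applies `A` to a vector on the device). Purely as a sketch, and only so the example can be compiled stand-alone, they are assumed to look roughly like this; the real signatures and contents may differ:

```cpp
// ASSUMED implementations, only to make the listing above self-contained;
// the real host_heat_matrix()/matvec_dev() may differ.

// Host side: fill A (N x N, column-major as read back by printmatrix())
// with the classic 1D heat-equation / Laplacian stencil.
void host_heat_matrix(float* A)
{
    for (int j = 0; j < N; ++j)
        for (int i = 0; i < N; ++i)
            A[i + N*j] = (i == j) ? -2.f : ( (i - j == 1 || j - i == 1) ? 1.f : 0.f );
}

// Device side: res = h * (A * x), one row per thread of the block.
// x and res may alias (they do at the call site above), hence the
// accumulation into a register and the barrier before writing.
template <typename T>
__device__ void matvec_dev(const T t, const T* A, const T* x, T* res,
                           const int rows, const int cols, const T h)
{
    const unsigned int i = threadIdx.x;   // row handled by this thread
    T acc = 0;                            // t (time) is unused in this sketch
    if (i < rows)
        for (int j = 0; j < cols; ++j)
            acc += A[i + rows*j] * x[j];  // column-major access
    __syncthreads();                      // everyone is done reading x
    if (i < rows)
        res[i] = h * acc;
}
```

If you paste these in, they have to appear (or be forward-declared) before `main()` and the kernel.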
The relevant change is at (*). Before, I used `if (tid < N)`, which has two drawbacks:

- First, it is wrong: it should be `tid < N*N`, since the data is 2D while `tid` is the global index that runs over all of it.
- Even writing `tid < N*N`, since I am splitting the function evaluations across blocks, `t < BLOCK_SIZE && b < NUM_BLOCKS` expresses more clearly how the indexing is organized in the code (a small sanity check follows this list).
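As a quick sanity check that, for this configuration, the two guards admit exactly the same threads, here is a sketch that reuses the `N`, `BLOCK_SIZE` and `NUM_BLOCKS` macros from the listing above; the kernel name and the mismatch counter are mine, for illustration only:

```cpp
// Illustration only: launched as
//   guard_equivalence_check<<<NUM_BLOCKS, BLOCK_SIZE>>>(d_mismatch);
// With N == BLOCK_SIZE == 128 and NUM_BLOCKS == N*N/BLOCK_SIZE, every (b, t)
// pair maps to a unique tid in [0, N*N), so the two guards agree and the
// counter stays at zero.
__global__ void guard_equivalence_check(int* mismatch)
{
    const unsigned int t   = threadIdx.x;
    const unsigned int b   = blockIdx.x;
    const unsigned int tid = t + b * blockDim.x;

    const bool per_block_guard = (t < BLOCK_SIZE && b < NUM_BLOCKS);
    const bool global_guard    = (tid < N * N);

    if (per_block_guard != global_guard)
        atomicAdd(mismatch, 1);
}
```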
Moreover, the check `t == b` at (**) is indeed the right one for addressing the diagonal elements of the matrix. The fact that it evaluated to true only for 0 was due to my error right above it.
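A small stand-alone host-side check of this point (illustration only; the constants mirror `N` and `BLOCK_SIZE` from the code above):

```cpp
#include <cstdio>

// Counts how often the diagonal branch (t == b) can fire under the old
// guard (tid < N) versus the corrected one (tid < N*N).
int main()
{
    const int n = 128, block = 128;
    const int blocks = (n * n + block - 1) / block;
    int old_hits = 0, new_hits = 0;

    for (int b = 0; b < blocks; ++b)
        for (int t = 0; t < block; ++t) {
            const int tid = t + b * block;
            if (tid < n     && t == b) ++old_hits;  // old guard: only b == 0 survives
            if (tid < n * n && t == b) ++new_hits;  // corrected guard
        }

    std::printf("old guard: %d diagonal hit(s)\n", old_hits);  // prints 1
    std::printf("new guard: %d diagonal hit(s)\n", new_hits);  // prints 128, one per block
}
```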
Thanks for the suggestions!