Reduce matrix columns with CUDA

I have a matrix and I would like to use CUDA to compute the column-wise averages (which boil down to column-wise sums) in the fastest possible way, i.e. to return a row vector containing the average of every column of the matrix. My summation implementation for computing the sum of a single column vector looks like this:

    template<typename T>
    __global__ void kernelSum(const T* __restrict__ input,
                              T* __restrict__ per_block_results,
                              const size_t n)
    {
        extern __shared__ T sdata[];

        size_t tid = blockIdx.x * blockDim.x + threadIdx.x;

        // load input into __shared__ memory
        T x = 0.0;
        if (tid < n) {
            x = input[tid];
        }
        sdata[threadIdx.x] = x;
        __syncthreads();

        // contiguous range pattern
        for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
            if (threadIdx.x < offset) {
                // add a partial sum upstream to our own
                sdata[threadIdx.x] += sdata[threadIdx.x + offset];
            }
            // wait until all threads in the block have
            // updated their partial sums
            __syncthreads();
        }

        // thread 0 writes the final result
        if (threadIdx.x == 0) {
            per_block_results[blockIdx.x] = sdata[0];
        }
    }

and this is called as:

    int n = ...                 // vector size
    const int BLOCK_SIZE = 1024;
    int number_of_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;

    double* per_block_results = NULL;
    cudaMalloc((void**) &per_block_results, sizeof(double) * (number_of_blocks + 1));

    // launch one kernel to compute, per-block, a partial sum
    kernelSum<double><<<number_of_blocks, BLOCK_SIZE, BLOCK_SIZE * sizeof(double)>>>(a, per_block_results, n);

    // launch a single block to compute the sum of the partial sums
    kernelSum<double><<<1, number_of_blocks, number_of_blocks * sizeof(double)>>>(per_block_results, per_block_results + number_of_blocks, number_of_blocks);
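To get the average I then only need to read the grand total back and divide by n. A minimal host-side sketch, assuming the two launches above have completed and the grand total sits at per_block_results[number_of_blocks]:

    // copy back the grand total (written by the single-block launch)
    // and divide by the element count to obtain the average
    double total = 0.0;
    cudaMemcpy(&total, per_block_results + number_of_blocks, sizeof(double),
               cudaMemcpyDeviceToHost);
    double average = total / (double)n;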

I could generalize this kernel to matrices with any number of columns, but I am limited by shared memory. My GPU has compute capability 3.5, so it has 48 KB of shared memory and a maximum block size of 1024 threads. Since I am interested in double precision, I can hold at most 48*1024/8 = 6144 doubles in shared memory. Because the reduction is done per block, I can handle at most 6144 (doubles in shared memory) / 1024 (block size) = 6 columns for which I can compute the sum simultaneously. Reducing the block size would allow more columns to be computed at once, e.g. 6144 (doubles in shared memory) / 512 (block size) = 12.
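The same budget, written out as a quick sketch of the arithmetic (48 KB of shared memory per block on compute capability 3.5):

    // shared-memory budget for simultaneous column reductions in double precision
    const size_t smem_bytes      = 48 * 1024;                    // 48 KB per block
    const size_t doubles_in_smem = smem_bytes / sizeof(double);  // 6144
    const size_t block_size      = 1024;                         // threads per block
    const size_t max_columns     = doubles_in_smem / block_size; // 6 (or 12 with block_size = 512)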

Will this more complicated strategy beat a simple loop over the columns of the matrix, each iteration invoking the summation reduction? Is there another, better way to do this?

2 answers

What stops you from doing something like this:

    template<typename T>
    __global__ void kernelSum(const T* __restrict__ input,
                              T* __restrict__ per_block_results,
                              const size_t lda, const size_t n)
    {
        extern __shared__ T sdata[];

        // Accumulate per thread partial sum
        T x = 0.0;
        const T * p = &input[blockIdx.x * lda];
        for (int i = threadIdx.x; i < n; i += blockDim.x) {
            x += p[i];
        }

        // load partial sum into __shared__ memory
        sdata[threadIdx.x] = x;
        __syncthreads();

        // contiguous range pattern
        for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
            if (threadIdx.x < offset) {
                // add a partial sum upstream to our own
                sdata[threadIdx.x] += sdata[threadIdx.x + offset];
            }
            // wait until all threads in the block have
            // updated their partial sums
            __syncthreads();
        }

        // thread 0 writes the final result
        if (threadIdx.x == 0) {
            per_block_results[blockIdx.x] = sdata[0];
        }
    }

[standard disclaimer: written in a browser, never compiled or tested, use at your own risk]

i.e. you only need one entry in sdata per thread in the block for the shared memory reduction. Each thread sums as many values as necessary to cover the full length of a column. Then there is no shared memory restriction, and you can sum columns of any length with the same block size.
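For example, assuming the matrix is stored with each column contiguous and consecutive columns lda words apart, a launch could look like this (the variable names d_matrix, d_column_sums, lda, num_rows and num_columns here are placeholders):

    // one block per column; the block size no longer depends on the column length
    const int BLOCK_SIZE = 256;
    kernelSum<double><<<num_columns, BLOCK_SIZE, BLOCK_SIZE * sizeof(double)>>>(
        d_matrix,       // device pointer to the column-major matrix
        d_column_sums,  // one output element per column
        lda,            // leading dimension (>= number of rows)
        num_rows);      // number of elements summed per column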


EDIT: Apparently the idea of per-thread partial sums is new to you, so here is a complete example to study:

    #include <iostream>

    template<typename T>
    __global__ void kernelSum(const T* __restrict__ input,
                              const size_t lda, // pitch of input in words of sizeof(T)
                              T* __restrict__ per_block_results,
                              const size_t n)
    {
        extern __shared__ T sdata[];

        T x = 0.0;
        const T * p = &input[blockIdx.x * lda];
        // Accumulate per thread partial sum
        for (int i = threadIdx.x; i < n; i += blockDim.x) {
            x += p[i];
        }

        // load thread partial sum into shared memory
        sdata[threadIdx.x] = x;
        __syncthreads();

        for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
            if (threadIdx.x < offset) {
                sdata[threadIdx.x] += sdata[threadIdx.x + offset];
            }
            __syncthreads();
        }

        // thread 0 writes the final result
        if (threadIdx.x == 0) {
            per_block_results[blockIdx.x] = sdata[0];
        }
    }

    int main(void)
    {
        const int m = 10000, n = 16;
        float * a = new float[m*n];
        for (int i = 0; i < (m*n); i++) { a[i] = (float)(i%10); }

        float *a_;
        size_t size_a = m * n * sizeof(float);
        cudaMalloc((void **)&a_, size_a);
        cudaMemcpy(a_, a, size_a, cudaMemcpyHostToDevice);

        float *b_;
        size_t size_b = n * sizeof(float);
        cudaMalloc((void **)&b_, size_b);

        // select number of warps per block according to size of the
        // column and launch one block per column. Probably makes sense
        // to have at least 4:1 column size to block size
        dim3 blocksize(256);
        dim3 gridsize(n);
        size_t shmsize = sizeof(float) * (size_t)blocksize.x;

        kernelSum<float><<<gridsize, blocksize, shmsize>>>(a_, m, b_, m);

        float * b = new float[n];
        cudaMemcpy(b, b_, size_b, cudaMemcpyDeviceToHost);

        for (int i = 0; i < n; i++) {
            std::cout << i << " " << b[i] << std::endl;
        }

        cudaDeviceReset();
        return 0;
    }

You should experiment with the block size relative to your matrix size for optimal performance, but in general the more work each thread does in the kernel, the better the overall performance (because the shared memory reduction is quite expensive). You can see one approach to block and grid size heuristics for a similarly memory-bandwidth-bound problem in this answer.
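And since the question actually asks for column averages rather than sums, dividing the host result b from the example above by the number of rows finishes the job:

    // turn the per-column sums into per-column averages (m rows per column)
    for (int i = 0; i < n; i++) {
        b[i] /= (float)m;
    }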


As an alternative to the answer already provided by talonmies, I report here 4 other approaches to column reduction, 3 of them based on CUDA Thrust and 1 based on cublas<t>gemv() with a column of 1's, as suggested in my comment above.

The CUDA Thrust approaches are analogous to those in Reduce matrix rows with CUDA, with the implicit transposition obtained by

    thrust::make_permutation_iterator(
        d_matrix.begin(),
        thrust::make_transform_iterator(thrust::make_counting_iterator(0),
                                        (_1 % Nrows) * Ncols + _1 / Nrows))
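That is, element k of the permuted sequence is d_matrix[(k % Nrows) * Ncols + k / Nrows], so the row-major Nrows x Ncols matrix is visited column by column. A small host-side sketch of the same index mapping, for illustration only:

    // walk a row-major Nrows x Ncols matrix in column-major order
    for (int k = 0; k < Nrows * Ncols; k++) {
        int row    = k % Nrows;            // advances fastest
        int col    = k / Nrows;            // advances every Nrows elements
        int linear = row * Ncols + col;    // index into the row-major storage
        // h_matrix[linear] is the k-th element of the transposed traversal
    }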

Here is the complete code:

    #include <cublas_v2.h>

    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/generate.h>
    #include <thrust/reduce.h>
    #include <thrust/functional.h>
    #include <thrust/random.h>
    #include <thrust/sequence.h>

    #include <stdio.h>
    #include <iostream>

    #include "Utilities.cuh"
    #include "TimingGPU.cuh"

    using namespace thrust::placeholders;

    // --- Required for approach #2
    __device__ float *vals;

    /**************************************************************/
    /* CONVERT LINEAR INDEX TO ROW INDEX - NEEDED FOR APPROACH #1 */
    /**************************************************************/
    template <typename T>
    struct linear_index_to_row_index : public thrust::unary_function<T,T> {

        T Ncols; // --- Number of columns

        __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}

        __host__ __device__ T operator()(T i) { return i / Ncols; }
    };

    /******************************************/
    /* ROW_REDUCTION - NEEDED FOR APPROACH #2 */
    /******************************************/
    struct col_reduction {

        const int Nrows;    // --- Number of rows
        const int Ncols;    // --- Number of cols

        col_reduction(int _Nrows, int _Ncols) : Nrows(_Nrows), Ncols(_Ncols) {}

        __device__ float operator()(float& x, int& y) {
            float temp = 0.f;
            for (int i = 0; i < Nrows; i++) {
                temp += vals[y + (i * Ncols)];
            }
            return temp;
        }
    };

    /**************************/
    /* NEEDED FOR APPROACH #3 */
    /**************************/
    template<typename T>
    struct MulC: public thrust::unary_function<T, T> {
        T C;
        __host__ __device__ MulC(T c) : C(c) { }
        __host__ __device__ T operator()(T x) { return x * C; }
    };

    /********/
    /* MAIN */
    /********/
    int main()
    {
        const int Nrows = 5;     // --- Number of rows
        const int Ncols = 8;     // --- Number of columns

        // --- Random uniform integer distribution between 10 and 99
        thrust::default_random_engine rng;
        thrust::uniform_int_distribution<int> dist(10, 99);

        // --- Matrix allocation and initialization
        thrust::device_vector<float> d_matrix(Nrows * Ncols);
        for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist(rng);

        TimingGPU timerGPU;

        /***************/
        /* APPROACH #1 */
        /***************/
        timerGPU.StartCounter();
        // --- Allocate space for row sums and indices
        thrust::device_vector<float> d_col_sums(Ncols);
        thrust::device_vector<int>   d_col_indices(Ncols);

        // --- Compute row sums by summing values with equal row indices
        thrust::reduce_by_key(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Nrows)),
                              thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
                              thrust::make_permutation_iterator(
                                    d_matrix.begin(),
                                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), (_1 % Nrows) * Ncols + _1 / Nrows)),
                              d_col_indices.begin(),
                              d_col_sums.begin(),
                              thrust::equal_to<int>(),
                              thrust::plus<float>());

        //thrust::reduce_by_key(
        //            thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)),
        //            thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
        //            thrust::make_permutation_iterator(
        //                d_matrix.begin(),
        //                thrust::make_transform_iterator(thrust::make_counting_iterator(0), (_1 % Nrows) * Ncols + _1 / Nrows)),
        //            thrust::make_discard_iterator(),
        //            d_col_sums.begin());

        printf("Timing for approach #1 = %f\n", timerGPU.GetCounter());

        // --- Print result
        for (int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for (int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums[j] << "\n";
        }

        /***************/
        /* APPROACH #2 */
        /***************/
        timerGPU.StartCounter();
        thrust::device_vector<float> d_col_sums_2(Ncols, 0);
        float *s_vals = thrust::raw_pointer_cast(&d_matrix[0]);
        gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *)));
        thrust::transform(d_col_sums_2.begin(), d_col_sums_2.end(), thrust::counting_iterator<int>(0), d_col_sums_2.begin(), col_reduction(Nrows, Ncols));

        printf("Timing for approach #2 = %f\n", timerGPU.GetCounter());

        for (int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for (int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums_2[j] << "\n";
        }

        /***************/
        /* APPROACH #3 */
        /***************/
        timerGPU.StartCounter();
        thrust::device_vector<float> d_col_sums_3(Ncols, 0);
        thrust::device_vector<float> d_temp(Nrows * Ncols);
        thrust::inclusive_scan_by_key(
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)),
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
                    thrust::make_permutation_iterator(
                            d_matrix.begin(),
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0), (_1 % Nrows) * Ncols + _1 / Nrows)),
                    d_temp.begin());
        thrust::copy(
                    thrust::make_permutation_iterator(
                            d_temp.begin() + Nrows - 1,
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Nrows))),
                    thrust::make_permutation_iterator(
                            d_temp.begin() + Nrows - 1,
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Nrows))) + Ncols,
                    d_col_sums_3.begin());

        printf("Timing for approach #3 = %f\n", timerGPU.GetCounter());

        for (int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for (int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums_3[j] << "\n";
        }

        /***************/
        /* APPROACH #4 */
        /***************/
        cublasHandle_t handle;

        timerGPU.StartCounter();
        cublasSafeCall(cublasCreate(&handle));

        thrust::device_vector<float> d_col_sums_4(Ncols);
        thrust::device_vector<float> d_ones(Nrows, 1.f);

        float alpha = 1.f;
        float beta  = 0.f;
        cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_N, Ncols, Nrows, &alpha, thrust::raw_pointer_cast(d_matrix.data()), Ncols,
                                   thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_col_sums_4.data()), 1));

        printf("Timing for approach #4 = %f\n", timerGPU.GetCounter());

        for (int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for (int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums_4[j] << "\n";
        }

        return 0;
    }
