Sorting (small) arrays by key in CUDA

I am trying to write a function that takes a block of unsorted key / value pairs, e.g.

<7, 4> <2, 8> <3, 1> <2, 2> <1, 5> <7, 1> <3, 8> <7, 2> 

and sorts them by key, while reducing (summing) the values of pairs that share the same key:

 <1, 5> <2, 10> <3, 9> <7, 7> 
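In other words, a plain host-side C++ reference of what I'm after (just to pin down the expected result with a std::map, not a GPU solution) would be:

#include <cstdio>
#include <map>
#include <utility>
#include <vector>

int main() {
    // The unsorted input pairs from above
    std::vector<std::pair<int, int>> pairs = {
        {7, 4}, {2, 8}, {3, 1}, {2, 2}, {1, 5}, {7, 1}, {3, 8}, {7, 2}};

    // std::map keeps keys sorted; operator[] default-initializes to 0,
    // so this both sorts by key and sums the values of equal keys.
    std::map<int, int> reduced;
    for (const auto &p : pairs) reduced[p.first] += p.second;

    for (const auto &kv : reduced) printf("<%d, %d> ", kv.first, kv.second);
    printf("\n");   // prints: <1, 5> <2, 10> <3, 9> <7, 7>
    return 0;
}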

I'm currently using a __device__ function similar to the one below, which is essentially a bitonic sort. It merges the values of pairs with the same key and sets the stale entries to an "infinitely large" key (just 99 at the moment), so that the next bitonic pass sifts them to the bottom of the array; the number of entries to cut from the array is accumulated in the int *removed counter.

__device__ void interBitonicSortReduce(int2 *sdata, int tid, int recordNum, int *removed) {
    int n = MIN(DEFAULT_DIMBLOCK, recordNum);
    for (int k = 2; k <= n; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            int ixj = tid ^ j;
            if (ixj > tid) {
                // merge values of equal keys and mark the duplicate entry as removed
                if (sdata[tid].x == sdata[ixj].x && sdata[tid].x < 99) {
                    atomicAdd(&sdata[tid].y, sdata[ixj].y);
                    sdata[ixj].x = 99;
                    sdata[ixj].y = 99;
                    atomicAdd(removed, 1);
                }
                if ((tid & k) == 0 && sdata[tid].x > sdata[ixj].x) swapData2(sdata[tid], sdata[ixj]);
                if ((tid & k) != 0 && sdata[tid].x < sdata[ixj].x) swapData2(sdata[tid], sdata[ixj]);
                __syncthreads();
            }
        }
    }
}

This works fine for small data sets, but for larger ones (although still within a single block) a single call is simply not enough.

Is it wise to try to combine the sort and the reduction in the same function? Obviously the function would have to be called more than once, but can the exact number of calls needed to exhaust all the data be determined from its size?

Or should I perform the reduction separately, with something like this:

__device__ int interReduce(int2 *sdata, int tid) {
    int index = tid;
    while (sdata[index].x == sdata[tid].x) {
        index--;
        if (index < 0) break;
    }
    if (index + 1 != tid) {
        atomicAdd(&sdata[index + 1].y, sdata[tid].y);
        sdata[tid].x = 99;
        sdata[tid].y = 99;
        return 1;
    }
    return 0;
}

I am trying to find the most efficient solution, but my experience with CUDA and parallel algorithms is limited.

4 answers

You can use thrust to do this.

Use thrust::sort_by_key and then thrust::reduce_by_key.

Here is an example:

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>

#define N 12

typedef thrust::device_vector<int>::iterator dintiter;

int main() {
    thrust::device_vector<int> keys(N);
    thrust::device_vector<int> values(N);
    thrust::device_vector<int> new_keys(N);
    thrust::device_vector<int> new_values(N);

    thrust::sequence(keys.begin(), keys.end());
    thrust::sequence(values.begin(), values.end());

    keys[3] = 1;
    keys[9] = 1;
    keys[8] = 2;
    keys[7] = 4;

    thrust::sort_by_key(keys.begin(), keys.end(), values.begin());

    thrust::pair<dintiter, dintiter> new_end;
    new_end = thrust::reduce_by_key(keys.begin(), keys.end(), values.begin(), new_keys.begin(), new_values.begin());

    std::cout << "results values:" << std::endl;
    thrust::copy(new_values.begin(), new_end.second, std::ostream_iterator<int>(std::cout, " "));
    std::cout << std::endl << "results keys:" << std::endl;
    thrust::copy(new_keys.begin(), new_end.first, std::ostream_iterator<int>(std::cout, " "));
    std::cout << std::endl;
    return 0;
}
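Applied to the exact pairs in your question, a minimal sketch of the same two calls might look like this (the initialization from host arrays is my own addition, not part of the example above):

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <cstdio>

int main() {
    // keys/values of the question's pairs: <7,4> <2,8> <3,1> <2,2> <1,5> <7,1> <3,8> <7,2>
    int h_keys[]   = {7, 2, 3, 2, 1, 7, 3, 7};
    int h_values[] = {4, 8, 1, 2, 5, 1, 8, 2};
    const int n = 8;

    thrust::device_vector<int> keys(h_keys, h_keys + n);
    thrust::device_vector<int> values(h_values, h_values + n);
    thrust::device_vector<int> out_keys(n), out_values(n);

    // Sort by key, then sum the values of equal keys
    thrust::sort_by_key(keys.begin(), keys.end(), values.begin());
    auto new_end = thrust::reduce_by_key(keys.begin(), keys.end(), values.begin(),
                                         out_keys.begin(), out_values.begin());

    int m = new_end.first - out_keys.begin();
    for (int i = 0; i < m; i++)
        printf("<%d, %d> ", (int)out_keys[i], (int)out_values[i]);  // <1, 5> <2, 10> <3, 9> <7, 7>
    printf("\n");
    return 0;
}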

From your post, it seems that you need to sort many small arrays. Quoting you:

This works fine for small data sets, but for larger ones (although still within a single block) a single call is simply not enough.

Below you will find a fully worked example, built around my answer to Sorting many small arrays in CUDA and using cub::BlockRadixSort.

#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>
#include "Utilities.cuh"

using namespace cub;

/**********************************/
/* CUB BLOCKSORT KERNEL NO SHARED */
/**********************************/
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void BlockSortKernel(float *d_values, int *d_keys, float *d_values_result, int *d_keys_result)
{
    // --- Specialize BlockLoad, BlockStore, and BlockRadixSort collective types
    typedef cub::BlockLoad      <int*,   BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE>  BlockLoadIntT;
    typedef cub::BlockLoad      <float*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE>  BlockLoadFloatT;
    typedef cub::BlockStore     <int*,   BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_TRANSPOSE> BlockStoreIntT;
    typedef cub::BlockStore     <float*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_TRANSPOSE> BlockStoreFloatT;
    typedef cub::BlockRadixSort <int,    BLOCK_THREADS, ITEMS_PER_THREAD, float>                 BlockRadixSortT;

    // --- Allocate type-safe, repurposable shared memory for collectives
    __shared__ union {
        typename BlockLoadIntT   ::TempStorage loadInt;
        typename BlockLoadFloatT ::TempStorage loadFloat;
        typename BlockStoreIntT  ::TempStorage storeInt;
        typename BlockStoreFloatT::TempStorage storeFloat;
        typename BlockRadixSortT ::TempStorage sort;
    } temp_storage;

    // --- Obtain this block segment of consecutive keys (blocked across threads)
    int   thread_keys[ITEMS_PER_THREAD];
    float thread_values[ITEMS_PER_THREAD];
    int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);

    BlockLoadIntT  (temp_storage.loadInt  ).Load(d_keys   + block_offset, thread_keys);
    BlockLoadFloatT(temp_storage.loadFloat).Load(d_values + block_offset, thread_values);
    __syncthreads();

    // --- Collectively sort the keys
    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(thread_keys, thread_values);
    __syncthreads();

    // --- Store the sorted segment
    BlockStoreIntT  (temp_storage.storeInt  ).Store(d_keys_result   + block_offset, thread_keys);
    BlockStoreFloatT(temp_storage.storeFloat).Store(d_values_result + block_offset, thread_values);
}

/*******************************/
/* CUB BLOCKSORT KERNEL SHARED */
/*******************************/
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(float *d_values, int *d_keys, float *d_values_result, int *d_keys_result)
{
    // --- Shared memory allocation
    __shared__ float sharedMemoryArrayValues[BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryArrayKeys  [BLOCK_THREADS * ITEMS_PER_THREAD];

    // --- Specialize BlockStore and BlockRadixSort collective types
    typedef cub::BlockRadixSort <int, BLOCK_THREADS, ITEMS_PER_THREAD, float> BlockRadixSortT;

    // --- Allocate type-safe, repurposable shared memory for collectives
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);

    // --- Load data to shared memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        sharedMemoryArrayValues[threadIdx.x * ITEMS_PER_THREAD + k] = d_values[block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
        sharedMemoryArrayKeys  [threadIdx.x * ITEMS_PER_THREAD + k] = d_keys  [block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
    }
    __syncthreads();

    // --- Collectively sort the keys
    BlockRadixSortT(temp_storage).SortBlockedToStriped(*static_cast<int(*)  [ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayKeys   + (threadIdx.x * ITEMS_PER_THREAD))),
                                                       *static_cast<float(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayValues + (threadIdx.x * ITEMS_PER_THREAD))));
    __syncthreads();

    // --- Write data back to global memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        d_values_result[block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArrayValues[threadIdx.x * ITEMS_PER_THREAD + k];
        d_keys_result  [block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArrayKeys  [threadIdx.x * ITEMS_PER_THREAD + k];
    }
}

/********/
/* MAIN */
/********/
int main() {

    const int numElemsPerArray  = 8;
    const int numArrays         = 4;
    const int N                 = numArrays * numElemsPerArray;
    const int numElemsPerThread = 4;
    const int RANGE             = N * numElemsPerThread;

    // --- Allocating and initializing the data on the host
    float *h_values = (float *)malloc(N * sizeof(float));
    int   *h_keys   = (int *)  malloc(N * sizeof(int));
    for (int i = 0; i < N; i++) {
        h_values[i] = rand() % RANGE;
        h_keys[i]   = rand() % RANGE;
    }

    printf("Original\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f\n", k, i, h_keys[k * numElemsPerArray + i], h_values[k * numElemsPerArray + i]);

    // --- Allocating the results on the host
    float *h_values_result1 = (float *)malloc(N * sizeof(float));
    float *h_values_result2 = (float *)malloc(N * sizeof(float));
    int   *h_keys_result1   = (int *)  malloc(N * sizeof(int));
    int   *h_keys_result2   = (int *)  malloc(N * sizeof(int));

    // --- Allocating space for data and results on device
    float *d_values;         gpuErrchk(cudaMalloc((void **)&d_values,         N * sizeof(float)));
    int   *d_keys;           gpuErrchk(cudaMalloc((void **)&d_keys,           N * sizeof(int)));
    float *d_values_result1; gpuErrchk(cudaMalloc((void **)&d_values_result1, N * sizeof(float)));
    float *d_values_result2; gpuErrchk(cudaMalloc((void **)&d_values_result2, N * sizeof(float)));
    int   *d_keys_result1;   gpuErrchk(cudaMalloc((void **)&d_keys_result1,   N * sizeof(int)));
    int   *d_keys_result2;   gpuErrchk(cudaMalloc((void **)&d_keys_result2,   N * sizeof(int)));

    // --- BlockSortKernel no shared
    gpuErrchk(cudaMemcpy(d_values, h_values, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,   h_keys,   N * sizeof(int),   cudaMemcpyHostToDevice));
    BlockSortKernel<N / numArrays / numElemsPerThread, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_values, d_keys, d_values_result1, d_keys_result1);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(h_values_result1, d_values_result1, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result1,   d_keys_result1,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel no shared\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f\n", k, i, h_keys_result1[k * numElemsPerArray + i], h_values_result1[k * numElemsPerArray + i]);

    // --- BlockSortKernel with shared
    gpuErrchk(cudaMemcpy(d_values, h_values, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,   h_keys,   N * sizeof(int),   cudaMemcpyHostToDevice));
    shared_BlockSortKernel<N / numArrays / numElemsPerThread, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_values, d_keys, d_values_result2, d_keys_result2);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(h_values_result2, d_values_result2, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result2,   d_keys_result2,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel shared\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f\n", k, i, h_keys_result2[k * numElemsPerArray + i], h_values_result2[k * numElemsPerArray + i]);

    return 0;
}
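The example includes "Utilities.cuh", which is not shown here; I assume it only needs to provide the gpuErrchk error-checking macro. A minimal stand-in, along the lines of the commonly used CUDA error-check wrapper, could look like this:

// Hypothetical minimal replacement for the gpuErrchk macro expected from "Utilities.cuh"
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}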

I recently ran into the problem of extending the approach above to the case where several arrays must be ordered according to the same key.

It seems that, because of its prototype, cub::BlockRadixSort cannot be made to operate on "zipped" arrays through iterators and tuples (see the linked question on making CUB work on "zipped" arrays). Accordingly, I used the auxiliary-index approach suggested there: the keys are sorted together with an index array, and the sorted indices are then used to permute the value arrays.
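The core of the auxiliary-index trick, stripped of the kernel details that follow, is simply "sort (key, index) pairs, then gather the values through the sorted indices". A minimal Thrust sketch of the idea (my illustration, not part of the CUB example below):

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/sequence.h>

// Order two value arrays by one key array via an auxiliary index array.
void sort_two_by_key(thrust::device_vector<int>   &keys,
                     thrust::device_vector<float> &valuesA,
                     thrust::device_vector<float> &valuesB)
{
    const int n = keys.size();
    thrust::device_vector<int> indices(n);
    thrust::sequence(indices.begin(), indices.end());           // 0, 1, 2, ...

    // Sort the indices by key; the keys end up sorted, the indices record the permutation.
    thrust::sort_by_key(keys.begin(), keys.end(), indices.begin());

    // Gather both value arrays through the sorted indices.
    thrust::device_vector<float> tmpA(n), tmpB(n);
    thrust::gather(indices.begin(), indices.end(), valuesA.begin(), tmpA.begin());
    thrust::gather(indices.begin(), indices.end(), valuesB.begin(), tmpB.begin());
    valuesA.swap(tmpA);
    valuesB.swap(tmpB);
}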

Here is an example that I developed:

#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>
#include "Utilities.cuh"

using namespace cub;

/*******************************/
/* CUB BLOCKSORT KERNEL SHARED */
/*******************************/
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(float *d_valuesA, float *d_valuesB, int *d_keys, float *d_values_resultA, float *d_values_resultB, int *d_keys_result)
{
    // --- Shared memory allocation
    __shared__ float sharedMemoryArrayValuesA [BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ float sharedMemoryArrayValuesB [BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryArrayKeys    [BLOCK_THREADS * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryHelperIndices[BLOCK_THREADS * ITEMS_PER_THREAD];

    // --- Specialize BlockStore and BlockRadixSort collective types
    typedef cub::BlockRadixSort <int, BLOCK_THREADS, ITEMS_PER_THREAD, int> BlockRadixSortT;

    // --- Allocate type-safe, repurposable shared memory for collectives
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);

    // --- Load data to shared memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        sharedMemoryArrayValuesA [threadIdx.x * ITEMS_PER_THREAD + k] = d_valuesA[block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
        sharedMemoryArrayValuesB [threadIdx.x * ITEMS_PER_THREAD + k] = d_valuesB[block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
        sharedMemoryArrayKeys    [threadIdx.x * ITEMS_PER_THREAD + k] = d_keys   [block_offset + threadIdx.x * ITEMS_PER_THREAD + k];
        sharedMemoryHelperIndices[threadIdx.x * ITEMS_PER_THREAD + k] = threadIdx.x * ITEMS_PER_THREAD + k;
    }
    __syncthreads();

    // --- Collectively sort the keys along with the helper indices
    BlockRadixSortT(temp_storage).SortBlockedToStriped(*static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayKeys      + (threadIdx.x * ITEMS_PER_THREAD))),
                                                       *static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryHelperIndices + (threadIdx.x * ITEMS_PER_THREAD))));
    __syncthreads();

    // --- Write data back to global memory, permuting the values through the sorted indices
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        d_values_resultA[block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArrayValuesA[sharedMemoryHelperIndices[threadIdx.x * ITEMS_PER_THREAD + k]];
        d_values_resultB[block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArrayValuesB[sharedMemoryHelperIndices[threadIdx.x * ITEMS_PER_THREAD + k]];
        d_keys_result   [block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArrayKeys   [threadIdx.x * ITEMS_PER_THREAD + k];
    }
}

/********/
/* MAIN */
/********/
int main() {

    const int numElemsPerArray  = 8;
    const int numArrays         = 4;
    const int N                 = numArrays * numElemsPerArray;
    const int numElemsPerThread = 4;
    const int RANGE             = N * numElemsPerThread;

    // --- Allocating and initializing the data on the host
    float *h_valuesA = (float *)malloc(N * sizeof(float));
    float *h_valuesB = (float *)malloc(N * sizeof(float));
    int   *h_keys    = (int *)  malloc(N * sizeof(int));
    for (int i = 0; i < N; i++) {
        h_valuesA[i] = rand() % RANGE;
        h_valuesB[i] = rand() % RANGE;
        h_keys[i]    = rand() % RANGE;
    }

    printf("Original\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value A %f; Value B %f\n", k, i, h_keys[k * numElemsPerArray + i], h_valuesA[k * numElemsPerArray + i], h_valuesB[k * numElemsPerArray + i]);

    // --- Allocating the results on the host
    float *h_values_resultA = (float *)malloc(N * sizeof(float));
    float *h_values_resultB = (float *)malloc(N * sizeof(float));
    float *h_values_result2 = (float *)malloc(N * sizeof(float));
    int   *h_keys_result1   = (int *)  malloc(N * sizeof(int));
    int   *h_keys_result2   = (int *)  malloc(N * sizeof(int));

    // --- Allocating space for data and results on device
    float *d_valuesA;        gpuErrchk(cudaMalloc((void **)&d_valuesA,        N * sizeof(float)));
    float *d_valuesB;        gpuErrchk(cudaMalloc((void **)&d_valuesB,        N * sizeof(float)));
    int   *d_keys;           gpuErrchk(cudaMalloc((void **)&d_keys,           N * sizeof(int)));
    float *d_values_resultA; gpuErrchk(cudaMalloc((void **)&d_values_resultA, N * sizeof(float)));
    float *d_values_resultB; gpuErrchk(cudaMalloc((void **)&d_values_resultB, N * sizeof(float)));
    float *d_values_result2; gpuErrchk(cudaMalloc((void **)&d_values_result2, N * sizeof(float)));
    int   *d_keys_result1;   gpuErrchk(cudaMalloc((void **)&d_keys_result1,   N * sizeof(int)));
    int   *d_keys_result2;   gpuErrchk(cudaMalloc((void **)&d_keys_result2,   N * sizeof(int)));

    // --- BlockSortKernel with shared
    gpuErrchk(cudaMemcpy(d_valuesA, h_valuesA, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_valuesB, h_valuesB, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,    h_keys,    N * sizeof(int),   cudaMemcpyHostToDevice));
    shared_BlockSortKernel<N / numArrays / numElemsPerThread, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_valuesA, d_valuesB, d_keys, d_values_resultA, d_values_resultB, d_keys_result1);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(h_values_resultA, d_values_resultA, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_values_resultB, d_values_resultB, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result1,   d_keys_result1,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel using shared memory\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f; Value %f\n", k, i, h_keys_result1[k * numElemsPerArray + i], h_values_resultA[k * numElemsPerArray + i], h_values_resultB[k * numElemsPerArray + i]);

    return 0;
}

Following up on my second answer, I want to provide a further extension to the case where CUB is used to sort items stored in a linear shared-memory array that is populated by a 2D thread block. Accordingly, cub::BlockRadixSort is specialized for a 2D thread block instead of the 1D thread block of the previous answer. Here is a complete example:
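The relevant change is only in the specialization of the sort type; if I read the cub::BlockRadixSort prototype correctly, the block's y dimension is passed as a trailing template parameter, after the radix bits, memoization flag, scan algorithm, and shared-memory bank configuration. For reference, the specialization used in the kernel below, with my interpretation of each parameter as a comment:

// Specialization used in the kernel below; parameter meanings as I understand
// the cub::BlockRadixSort prototype of this CUB version:
typedef cub::BlockRadixSort<
    int,                             // key type
    BLOCKSIZE_X,                     // block dimension along x
    ITEMS_PER_THREAD,                // items handled per thread
    int,                             // value type (here: the helper indices)
    4,                               // radix bits per digit pass
    false,                           // memoize outer scan
    BLOCK_SCAN_WARP_SCANS,           // inner scan algorithm
    cudaSharedMemBankSizeFourByte,   // shared memory bank configuration
    BLOCKSIZE_Y                      // block dimension along y
> BlockRadixSortT;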

#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>
#include "Utilities.cuh"

using namespace cub;

/*******************************/
/* CUB BLOCKSORT KERNEL SHARED */
/*******************************/
template <int BLOCKSIZE_X, int BLOCKSIZE_Y, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(float *d_valuesA, float *d_valuesB, int *d_keys, float *d_values_resultA, float *d_values_resultB, int *d_keys_result)
{
    // --- Shared memory allocation
    __shared__ float sharedMemoryArrayValuesA [BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];
    __shared__ float sharedMemoryArrayValuesB [BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryArrayKeys    [BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];
    __shared__ int   sharedMemoryHelperIndices[BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD];

    // --- Specialize BlockStore and BlockRadixSort collective types
    typedef cub::BlockRadixSort <int, BLOCKSIZE_X, ITEMS_PER_THREAD, int, 4, false, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, BLOCKSIZE_Y> BlockRadixSortT;

    // --- Allocate type-safe, repurposable shared memory for collectives
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    int block_offset = blockIdx.x * (BLOCKSIZE_X * BLOCKSIZE_Y * ITEMS_PER_THREAD);

    // --- Load data to shared memory
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        sharedMemoryArrayValuesA [(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = d_valuesA[block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k];
        sharedMemoryArrayValuesB [(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = d_valuesB[block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k];
        sharedMemoryArrayKeys    [(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = d_keys   [block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k];
        sharedMemoryHelperIndices[(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k;
    }
    __syncthreads();

    // --- Collectively sort the keys along with the helper indices
    BlockRadixSortT(temp_storage).SortBlockedToStriped(*static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArrayKeys      + ((threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD))),
                                                       *static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryHelperIndices + ((threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD))));
    __syncthreads();

    // --- Write data back to global memory, permuting the values through the sorted indices
    for (int k = 0; k < ITEMS_PER_THREAD; k++) {
        d_values_resultA[block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = sharedMemoryArrayValuesA[sharedMemoryHelperIndices[(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k]];
        d_values_resultB[block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = sharedMemoryArrayValuesB[sharedMemoryHelperIndices[(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k]];
        d_keys_result   [block_offset + (threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k] = sharedMemoryArrayKeys   [(threadIdx.y * BLOCKSIZE_X + threadIdx.x) * ITEMS_PER_THREAD + k];
    }
}

/********/
/* MAIN */
/********/
int main() {

    const int blockSize_x       = 2;
    const int blockSize_y       = 4;
    const int numElemsPerArray  = blockSize_x * blockSize_y;
    const int numArrays         = 4;
    const int N                 = numArrays * numElemsPerArray;
    const int numElemsPerThread = numElemsPerArray / (blockSize_x * blockSize_y);
    const int RANGE             = N * numElemsPerThread;

    // --- Allocating and initializing the data on the host
    float *h_valuesA = (float *)malloc(N * sizeof(float));
    float *h_valuesB = (float *)malloc(N * sizeof(float));
    int   *h_keys    = (int *)  malloc(N * sizeof(int));
    for (int i = 0; i < N; i++) {
        h_valuesA[i] = rand() % RANGE;
        h_valuesB[i] = rand() % RANGE;
        h_keys[i]    = rand() % RANGE;
    }

    printf("Original\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value A %f; Value B %f\n", k, i, h_keys[k * numElemsPerArray + i], h_valuesA[k * numElemsPerArray + i], h_valuesB[k * numElemsPerArray + i]);

    // --- Allocating the results on the host
    float *h_values_resultA = (float *)malloc(N * sizeof(float));
    float *h_values_resultB = (float *)malloc(N * sizeof(float));
    float *h_values_result2 = (float *)malloc(N * sizeof(float));
    int   *h_keys_result1   = (int *)  malloc(N * sizeof(int));
    int   *h_keys_result2   = (int *)  malloc(N * sizeof(int));

    // --- Allocating space for data and results on device
    float *d_valuesA;        gpuErrchk(cudaMalloc((void **)&d_valuesA,        N * sizeof(float)));
    float *d_valuesB;        gpuErrchk(cudaMalloc((void **)&d_valuesB,        N * sizeof(float)));
    int   *d_keys;           gpuErrchk(cudaMalloc((void **)&d_keys,           N * sizeof(int)));
    float *d_values_resultA; gpuErrchk(cudaMalloc((void **)&d_values_resultA, N * sizeof(float)));
    float *d_values_resultB; gpuErrchk(cudaMalloc((void **)&d_values_resultB, N * sizeof(float)));
    float *d_values_result2; gpuErrchk(cudaMalloc((void **)&d_values_result2, N * sizeof(float)));
    int   *d_keys_result1;   gpuErrchk(cudaMalloc((void **)&d_keys_result1,   N * sizeof(int)));
    int   *d_keys_result2;   gpuErrchk(cudaMalloc((void **)&d_keys_result2,   N * sizeof(int)));

    // --- BlockSortKernel with shared
    gpuErrchk(cudaMemcpy(d_valuesA, h_valuesA, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_valuesB, h_valuesB, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys,    h_keys,    N * sizeof(int),   cudaMemcpyHostToDevice));
    shared_BlockSortKernel<blockSize_x, blockSize_y, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_valuesA, d_valuesB, d_keys, d_values_resultA, d_values_resultB, d_keys_result1);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(h_values_resultA, d_values_resultA, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_values_resultB, d_values_resultB, N * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_keys_result1,   d_keys_result1,   N * sizeof(int),   cudaMemcpyDeviceToHost));

    printf("\n\nBlockSortKernel using shared memory\n\n");
    for (int k = 0; k < numArrays; k++)
        for (int i = 0; i < numElemsPerArray; i++)
            printf("Array nr. %i; Element nr. %i; Key %i; Value %f; Value %f\n", k, i, h_keys_result1[k * numElemsPerArray + i], h_values_resultA[k * numElemsPerArray + i], h_values_resultB[k * numElemsPerArray + i]);

    return 0;
}
