Sorting many small arrays in CUDA

I am implementing a median filter in CUDA. For a specific pixel, I extract its neighbors corresponding to the window around the pixel, say, an N x N ( 3 x 3 ) window and now have an array of N x N elements. I do not intend to use more than 10 x 10 windows for my application.

This array is now locally present in the kernel and is already loaded into the device’s memory. From the previous SO posts I read, the most common sorting algorithms are implemented by Thrust. But Thrust can only be called from the host. Topic - Lose inside user-written files

Is there a quick and efficient way to sort a small array of N x N elements inside the kernel?

+3
source share
3 answers

If the number of elements is fixed and small, you can use sorting networks ( http://pages.ripco.net/~jgamble/nw.html ). It provides a fixed number of compare / swap operations for a fixed number of elements (e.g. 19 comparisons / replace iterations for 8 elements).

+4
source

Your problem is sorting many small arrays in CUDA.

Following Robert's suggestion in his comment, CUB offers a possible solution to solve this problem. Below I will tell you an example that was built around Robert’s code in cub BlockRadixSort: how to deal with a large tile size or sort multiple fragments? .

The idea is to sort small arrays for different stream blocks, and then use cub :: BlockRadixSort to sort each array. There are two versions: one load and one load of small arrays into shared memory.

Let me finally note that your claim that CUDA Thrust cannot be called from kernels is no longer true. The Thrust message inside the user-written files you linked to has been updated with other answers.

 #include <cub/cub.cuh> #include <stdio.h> #include <stdlib.h> #include "Utilities.cuh" using namespace cub; /**********************************/ /* CUB BLOCKSORT KERNEL NO SHARED */ /**********************************/ template <int BLOCK_THREADS, int ITEMS_PER_THREAD> __global__ void BlockSortKernel(int *d_in, int *d_out) { // --- Specialize BlockLoad, BlockStore, and BlockRadixSort collective types typedef cub::BlockLoad <int*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE> BlockLoadT; typedef cub::BlockStore <int*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_TRANSPOSE> BlockStoreT; typedef cub::BlockRadixSort <int , BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT; // --- Allocate type-safe, repurposable shared memory for collectives __shared__ union { typename BlockLoadT ::TempStorage load; typename BlockStoreT ::TempStorage store; typename BlockRadixSortT::TempStorage sort; } temp_storage; // --- Obtain this block segment of consecutive keys (blocked across threads) int thread_keys[ITEMS_PER_THREAD]; int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD); BlockLoadT(temp_storage.load).Load(d_in + block_offset, thread_keys); __syncthreads(); // --- Collectively sort the keys BlockRadixSortT(temp_storage.sort).Sort(thread_keys); __syncthreads(); // --- Store the sorted segment BlockStoreT(temp_storage.store).Store(d_out + block_offset, thread_keys); } /*******************************/ /* CUB BLOCKSORT KERNEL SHARED */ /*******************************/ template <int BLOCK_THREADS, int ITEMS_PER_THREAD> __global__ void shared_BlockSortKernel(int *d_in, int *d_out) { // --- Shared memory allocation __shared__ int sharedMemoryArray[BLOCK_THREADS * ITEMS_PER_THREAD]; // --- Specialize BlockStore and BlockRadixSort collective types typedef cub::BlockRadixSort <int , BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT; // --- Allocate type-safe, repurposable shared memory for collectives __shared__ typename BlockRadixSortT::TempStorage temp_storage; int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD); // --- Load data to shared memory for (int k = 0; k < ITEMS_PER_THREAD; k++) sharedMemoryArray[threadIdx.x * ITEMS_PER_THREAD + k] = d_in[block_offset + threadIdx.x * ITEMS_PER_THREAD + k]; __syncthreads(); // --- Collectively sort the keys BlockRadixSortT(temp_storage).Sort(*static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(sharedMemoryArray + (threadIdx.x * ITEMS_PER_THREAD)))); __syncthreads(); // --- Write data to shared memory for (int k = 0; k < ITEMS_PER_THREAD; k++) d_out[block_offset + threadIdx.x * ITEMS_PER_THREAD + k] = sharedMemoryArray[threadIdx.x * ITEMS_PER_THREAD + k]; } /********/ /* MAIN */ /********/ int main() { const int numElemsPerArray = 8; const int numArrays = 4; const int N = numArrays * numElemsPerArray; const int numElemsPerThread = 4; const int RANGE = N * numElemsPerThread; // --- Allocating and initializing the data on the host int *h_data = (int *)malloc(N * sizeof(int)); for (int i = 0 ; i < N; i++) h_data[i] = rand() % RANGE; // --- Allocating the results on the host int *h_result1 = (int *)malloc(N * sizeof(int)); int *h_result2 = (int *)malloc(N * sizeof(int)); // --- Allocating space for data and results on device int *d_in; gpuErrchk(cudaMalloc((void **)&d_in, N * sizeof(int))); int *d_out1; gpuErrchk(cudaMalloc((void **)&d_out1, N * sizeof(int))); int *d_out2; gpuErrchk(cudaMalloc((void **)&d_out2, N * sizeof(int))); // --- BlockSortKernel no shared gpuErrchk(cudaMemcpy(d_in, h_data, N*sizeof(int), cudaMemcpyHostToDevice)); BlockSortKernel<N / numArrays / numElemsPerThread, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_in, d_out1); gpuErrchk(cudaMemcpy(h_result1, d_out1, N*sizeof(int), cudaMemcpyDeviceToHost)); printf("BlockSortKernel no shared\n\n"); for (int k = 0; k < numArrays; k++) for (int i = 0; i < numElemsPerArray; i++) printf("Array nr. %i; Element nr. %i; Value %i\n", k, i, h_result1[k * numElemsPerArray + i]); // --- BlockSortKernel with shared gpuErrchk(cudaMemcpy(d_in, h_data, N*sizeof(int), cudaMemcpyHostToDevice)); shared_BlockSortKernel<N / numArrays / numElemsPerThread, numElemsPerThread><<<numArrays, numElemsPerArray / numElemsPerThread>>>(d_in, d_out2); gpuErrchk(cudaMemcpy(h_result2, d_out2, N*sizeof(int), cudaMemcpyDeviceToHost)); printf("\n\nBlockSortKernel with shared\n\n"); for (int k = 0; k < numArrays; k++) for (int i = 0; i < numElemsPerArray; i++) printf("Array nr. %i; Element nr. %i; Value %i\n", k, i, h_result2[k * numElemsPerArray + i]); return 0; } 
+2
source

If you are using CUDA 5.X, you can use dynamic parallelism. You can make some child core in the filter core to complete the sort job. How to sort by CUDA, you can use some induction skills.

0
source

Source: https://habr.com/ru/post/949245/


All Articles