I am evaluating CUDA and am currently using the Thrust library to sort numbers.
I would like to create my own comparator for thrust :: sort, but it slows down dramatically! I created my own less implementation by simply copying the code from functional.h . However, it seems that it compiles in some other way and works very slowly.
- default comper: thrust :: less () - 94 ms
- my own comparator: less () - 906 ms
I am using Visual Studio 2010. What should I do to get the same performance as in option 1?
Full code:
#include <stdio.h> #include <cuda.h> #include <thrust/host_vector.h> #include <thrust/device_vector.h> #include <thrust/generate.h> #include <thrust/sort.h> int myRand() { static int counter = 0; if ( counter++ % 10000 == 0 ) srand(time(NULL)+counter); return (rand()<<16) | rand(); } template<typename T> struct less : public thrust::binary_function<T,T,bool> { __host__ __device__ bool operator()(const T &lhs, const T &rhs) const { return lhs < rhs; } }; int main() { thrust::host_vector<int> h_vec(10 * 1000 * 1000); thrust::generate(h_vec.begin(), h_vec.end(), myRand); thrust::device_vector<int> d_vec = h_vec; int clc = clock(); thrust::sort(d_vec.begin(), d_vec.end(), less<int>()); printf("%dms\n", (clock()-clc) * 1000 / CLOCKS_PER_SEC); return 0; }
source share