I just started playing with Boost.Compute. To find out how much of a speed-up it can give us, I wrote a simple program:
#include <iostream> #include <vector> #include <algorithm> #include <boost/foreach.hpp> #include <boost/compute/core.hpp> #include <boost/compute/platform.hpp> #include <boost/compute/algorithm.hpp> #include <boost/compute/container/vector.hpp> #include <boost/compute/functional/math.hpp> #include <boost/compute/types/builtin.hpp> #include <boost/compute/function.hpp> #include <boost/chrono/include.hpp> namespace compute = boost::compute; int main() { // generate random data on the host std::vector<float> host_vector(16000); std::generate(host_vector.begin(), host_vector.end(), rand); BOOST_FOREACH (auto const& platform, compute::system::platforms()) { std::cout << "====================" << platform.name() << "====================\n"; BOOST_FOREACH (auto const& device, platform.devices()) { std::cout << "device: " << device.name() << std::endl; compute::context context(device); compute::command_queue queue(context, device); compute::vector<float> device_vector(host_vector.size(), context); // copy data from the host to the device compute::copy( host_vector.begin(), host_vector.end(), device_vector.begin(), queue ); auto start = boost::chrono::high_resolution_clock::now(); compute::transform(device_vector.begin(), device_vector.end(), device_vector.begin(), compute::sqrt<float>(), queue); auto ans = compute::accumulate(device_vector.begin(), device_vector.end(), 0, queue); auto duration = boost::chrono::duration_cast<boost::chrono::milliseconds>(boost::chrono::high_resolution_clock::now() - start); std::cout << "ans: " << ans << std::endl; std::cout << "time: " << duration.count() << " ms" << std::endl; std::cout << "-------------------\n"; } } std::cout << "====================plain====================\n"; auto start = boost::chrono::high_resolution_clock::now(); std::transform(host_vector.begin(), host_vector.end(), host_vector.begin(), [](float v){ return std::sqrt(v); }); auto ans = std::accumulate(host_vector.begin(), host_vector.end(), 0); auto duration = 
boost::chrono::duration_cast<boost::chrono::milliseconds>(boost::chrono::high_resolution_clock::now() - start); std::cout << "ans: " << ans << std::endl; std::cout << "time: " << duration.count() << " ms" << std::endl; return 0; }
And here is a sample output on my computer (Windows 7, 64-bit):
====================Intel(R) OpenCL==================== device: Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz ans: 1931421 time: 64 ms ------------------- device: Intel(R) HD Graphics 4600 ans: 1931421 time: 64 ms ------------------- ====================NVIDIA CUDA==================== device: Quadro K600 ans: 1931421 time: 4 ms ------------------- ====================plain==================== ans: 1931421 time: 0 ms
My question is: why is the plain (non-OpenCL) version faster?
source share