I ran into the following problem while optimizing my application with C++ AMP: data transfer. Copying data from the CPU to the GPU is not a problem for me, because I can do it once while the application is initializing. The real problem is that I need fast access to the results computed by the C++ AMP kernels, so the transfer from the GPU back to the CPU is the bottleneck. I read that this is faster on Windows 8.1, but I am using Windows 7 and do not plan to change it. I also read about staging arrays, but I do not understand how they could help in my case. I only need to return a single float value to the host, and that appears to be the most time-consuming operation.
/// Sums the first `element_count` floats of `a` on the accelerator and returns
/// the total to the host.
///
/// The work is split over `_tile_count` tiles of `_tile_size` threads. Each
/// thread accumulates a strided slice of the input, each tile folds its
/// threads' sums into one value, and the `_tile_count` per-tile values are
/// copied back and added up on the CPU.
///
/// @param element_count Number of leading elements of `a` to sum. Must be
///                      non-zero and no larger than the number of elements in
///                      `a`. It does NOT have to be a multiple of
///                      `_tile_size * _tile_count * 2`; out-of-range reads are
///                      skipped inside the kernel.
/// @param a             Accelerator-side input data. Only read, never written.
/// @return Sum of `a[0] .. a[element_count - 1]`.
float Subset::reduction_cascade(unsigned element_count, concurrency::array<float, 1>& a)
{
    static_assert(_tile_count > 0, "Tile count must be positive!");
    // The halving loop in the kernel drops elements unless the tile size is a power of two.
    static_assert(_tile_size > 0 && (_tile_size & (_tile_size - 1)) == 0,
                  "Tile size must be a positive integer power of two!");
    assert(element_count != 0); // Cannot reduce an empty sequence.
    // Validate the data this function actually receives; the previous check
    // referred to `source`, which is not a parameter or local of this function.
    assert(element_count <= a.extent.size());

    // One pass of the whole grid consumes two elements per thread.
    const unsigned stride = _tile_size * _tile_count * 2;

    // One partial sum per tile, written by the kernel and read back below.
    concurrency::array<float, 1> a_partial_result(_tile_count);

    concurrency::parallel_for_each(
        concurrency::extent<1>(_tile_count * _tile_size).tile<_tile_size>(),
        [=, &a, &a_partial_result](concurrency::tiled_index<_tile_size> tidx) restrict(amp)
    {
        // Per-tile scratch buffer: one slot per thread of the tile.
        tile_static float tile_data[_tile_size];

        const unsigned local_idx = tidx.local[0];

        // Each thread sums pairs (idx, idx + _tile_size), advancing by `stride`.
        // Both reads are bounds-checked so that an element_count that is not a
        // multiple of `stride` neither reads past the requested range nor
        // skips the trailing elements.
        float thread_sum = 0.f;
        for (unsigned idx = (tidx.tile[0] * 2 * _tile_size) + local_idx;
             idx < element_count;
             idx += stride)
        {
            float pair_sum = a[idx];
            const unsigned partner_idx = idx + _tile_size;
            if (partner_idx < element_count)
            {
                pair_sum += a[partner_idx];
            }
            thread_sum += pair_sum;
        }
        tile_data[local_idx] = thread_sum;

        // Every slot must be written before any thread reads a neighbour's slot.
        tidx.barrier.wait();

        // Fold the tile's slots in halves until slot 0 holds the tile total.
        for (unsigned half = _tile_size / 2; half > 0; half /= 2)
        {
            if (local_idx < half)
            {
                tile_data[local_idx] += tile_data[local_idx + half];
            }

            tidx.barrier.wait();
        }

        // A single thread per tile publishes the tile's result.
        if (local_idx == 0)
        {
            a_partial_result[tidx.tile[0]] = tile_data[0];
        }
    });

    // Bring the per-tile sums to the host and finish the reduction on the CPU.
    // NOTE(review): parallel_for_each is presumably queued asynchronously, so
    // the time attributed to this copy likely also includes waiting for the
    // kernel to finish, not just the transfer of _tile_count floats - confirm
    // with a profiler before optimizing the copy itself.
    std::vector<float> v_partial_result(_tile_count);
    copy(a_partial_result, v_partial_result.begin());
    return std::accumulate(v_partial_result.begin(), v_partial_result.end(), 0.f);
}
The most time-consuming line is `copy(a_partial_result, v_partial_result.begin());`.