As a kind of side project, I am working on a multi-threaded sum algorithm, which should be superior to std::accumulate when working with a fairly large array. First, I will describe the thinking process that led me here; if you want to go straight to the problem, feel free to scroll down to the question at the end.
I found many parallel sum algorithms online, most of which take the following approach:
// Splits [_begin, _end) into one contiguous chunk per hardware thread and
// sums the chunks concurrently, folding the partial results into _init.
// Falls back to a plain serial accumulate for small ranges (< 10000
// elements) or single-core machines, where thread startup cost dominates.
//   _begin/_end : range to sum (random access assumed for cheap next()).
//   _init       : starting value of the sum.
// Returns _init plus the sum of every element in the range.
template <typename T, typename IT>
T parallel_sum(IT _begin, IT _end, T _init) {
const auto size = distance(_begin, _end);
static const auto n = thread::hardware_concurrency();
// Too small to parallelize, or nothing to parallelize with.
if (size < 10000 || n == 1) return accumulate(_begin, _end, _init);
vector<future<T>> partials;
partials.reserve(n);
const auto chunkSize = size / n;
for (unsigned worker{ 0 }; worker < n; ++worker) {
auto first = next(_begin, worker * chunkSize);
// The last worker absorbs the remainder left by the integer division.
auto last = (worker + 1 == n) ? _end : next(first, chunkSize);
partials.push_back(async(launch::async,
[](IT b, IT e) { return accumulate(b, e, T{ 0 }); },
first, last));
}
// Fold the per-thread partial sums; get() also propagates any exception.
for (auto& part : partials) _init += part.get();
return _init;
}
Assuming two threads (as reported by thread::hardware_concurrency()), this function will access the elements in memory as follows:

Each thread walks its own contiguous half of the array, with 8-byte elements packed 8 per 64-byte cache line (16 per pair of lines). The two threads therefore read from regions far apart in memory, each relying on its own stream of the CPU prefetcher (which should be fine, but I could not find hard data on this). That made me wonder whether an interleaved access pattern, where all threads walk the array front-to-back together, would be friendlier to the cache and prefetcher:
// Interleaved ("strided") parallel sum: worker i reads elements
// i, i+n, i+2n, ... so all workers sweep the range front-to-back together.
// Interface and fallback behaviour match parallel_sum.
//
// Fix over the original: the stride loop advanced the iterator by n
// unconditionally, so the final step could move it past _end — undefined
// behaviour even for pointers/random-access iterators. We now check the
// remaining distance and stop before overshooting; the sequence of
// additions performed by each worker is unchanged.
template <typename T, typename IT>
T parallel_sum2(IT _begin, IT _end, T _init) {
const auto size = distance(_begin, _end);
static const auto n = thread::hardware_concurrency();
// Too small to parallelize, or nothing to parallelize with.
if (size < 10000 || n == 1) return accumulate(_begin, _end, _init);
vector<future<T>> partials;
partials.reserve(n);
for (unsigned i{ 0 }; i < n; i++) {
// Worker i starts at offset i and strides by n. i < n <= size here,
// so the starting iterator is always dereferenceable.
partials.push_back(async(launch::async, [](IT _b, IT _e, unsigned _s){
T _ret{ 0 };
while (_b != _e) {
_ret += *_b;
const auto rem = distance(_b, _e);
// Stop before advancing past _e: stepping beyond the end of
// the range is UB even for random-access iterators.
if (rem <= static_cast<decltype(rem)>(_s)) break;
advance(_b, _s);
}
return _ret;
}, next(_begin, i), _end, n));
}
// Fold the per-thread partial sums; get() also propagates any exception.
for (auto& f : partials) _init += f.get();
return _init;
}
-, :

, prefetcher , , , , .
, , . - . , , , . " " , (, std::accumulate , ):

Looking at the generated assembly gave me a hint as to why. The serial std::accumulate (and each contiguous chunk in the first version) gets auto-vectorized with AVX2, summing 64-bit elements several at a time with vpaddq. The strided for loop in the interleaved version, on the other hand, loads one element from a non-contiguous address per iteration, so the compiler emits purely scalar code and the SIMD advantage is lost.
I compiled with gcc and with Visual Studio 2015, and the results are similar with both compilers.
So, why exactly is the interleaved version slower, and is there any way to fix it — or, more generally, what is the right way to structure a multi-threaded sum for large arrays?
Thanks in advance for any insight.
PS: everything was tested on x86_64 CPUs (Haswell and similar).