You can split the vector into one chunk per thread and fill each chunk with std::fill:
// Zero-fill `v` in parallel: every OpenMP thread fills one contiguous chunk.
// The `parallel` construct is required; without it the block below is run by
// a single thread (omp_get_num_threads() == 1) and nothing is parallelized.
#pragma omp parallel
{
    const auto tid = omp_get_thread_num();
    const auto nthreads = omp_get_num_threads();
    // Integer division: any remainder is picked up by the last thread below.
    const auto chunksize = v.size() / nthreads;
    const auto begin = v.begin() + chunksize * tid;
    // The last thread runs up to v.end() so that the v.size() % nthreads
    // leftover elements are not skipped.
    const auto end = (tid == nthreads - 1) ? v.end() : begin + chunksize;
    std::fill(begin, end, 0);
}
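You can improve this further by rounding chunksize to a multiple of the cache-line / memory-word size (128 bytes = 32 ints), assuming that v.data() is aligned the same way. That way, you avoid false-sharing problems.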
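On a 24-core Haswell system I get a speedup of nearly 9x: 3.6 s with 1 thread versus 0.4 s with 24 threads, i.e. 4.8B ints = ~48 GB/s. The results vary a bit and this is not a rigorous analysis, but it is not far from the memory bandwidth of the system.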
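For overall performance, you should split the vector the same way not only for this operation but also for subsequent operations (reads or writes), if possible. That way, you increase the chance that the data is already in cache when you need it, or at least on the same NUMA node.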
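Oddly enough, on my system std::fill(..., 1); is faster than std::fill(..., 0) with a single thread, but slower with 24 threads. This happens with both gcc 6.1.0 and icc 17.0.1. I will probably post that as a separate question.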