Faster calculation of (approximate) variance required

I can see with the CPU profiler that compute_variances() is the bottleneck of my project.

  %   cumulative   self              self     total           
 time   seconds   seconds    calls  ms/call  ms/call  name    
 75.63      5.43     5.43       40   135.75   135.75  compute_variances(unsigned int, std::vector<Point, std::allocator<Point> > const&, float*, float*, unsigned int*)
 19.08      6.80     1.37                             readDivisionSpace(Division_Euclidean_space&, char*)
 ...

Here is the function body:

void compute_variances(size_t t, const std::vector<Point>& points, float* avg,
                       float* var, size_t* split_dims) {
  for (size_t d = 0; d < points[0].dim(); d++) {
    avg[d] = 0.0;
    var[d] = 0.0;
  }
  float delta, n;
  for (size_t i = 0; i < points.size(); ++i) {
    n = 1.0 + i;
    for (size_t d = 0; d < points[0].dim(); ++d) {
      delta = (points[i][d]) - avg[d];
      avg[d] += delta / n;
      var[d] += delta * ((points[i][d]) - avg[d]);
    }
  }

  /* Find t dimensions with largest scaled variance. */
  kthLargest(var, points[0].dim(), t, split_dims);
}

where kthLargest(), it seems, is not a problem, as I see that:

0.00 7.18 0.00 40 0.00 0.00 kthLargest(float*, int, int, unsigned int*)

compute_variances() takes a vector of float vectors (i.e. a vector of Points, where Point is the class I implemented) and calculates their variance in each dimension (using Knuth's online algorithm).

This is how I call the function:

// Caller-side output buffers for compute_variances().
// NOTE(review): these are runtime-sized stack arrays (VLAs) — a GCC
// extension, not standard C++; confirm this is acceptable here, or
// switch to std::vector.
float avg[(*points)[0].dim()];
float var[(*points)[0].dim()];
size_t split_dims[t];

compute_variances(t, *points, avg, var, split_dims);

The question is, can I do better? I would be very happy to pay a trade-off between speed and the approximate calculation of variances. Or maybe I could make the code more convenient for the cache or something else?

Compiled as follows:

g++ main_noTime.cpp -std=c++0x -p -pg -O3 -o eg

, -o3, "o". ypnos, -o3. , , -.

, compute_variances !

[EDIT]

compute_variances() is called 40 times.

10 :

points.size() = 1000   and points[0].dim = 10000
points.size() = 10000  and points[0].dim = 100
points.size() = 10000  and points[0].dim = 10000
points.size() = 100000 and points[0].dim = 100

.

Q: How is points[i][d] implemented?

A: point[i] i- std::vector, [] , Point.

/// Bounds-checked read-only coordinate access.
/// On an out-of-range index, prints an error and terminates.
const FT& operator [](const int i) const {
  if (i < (int) coords.size() && i >= 0)
     // The index is already validated above, so use unchecked
     // operator[]: the original's coords.at(i) repeated the same
     // bounds check a second time (and added exception machinery).
     return coords[i];
  else {
     std::cout << "Error at Point::[]" << std::endl;
     exit(1);
  }
  return coords[0];  // Clear -Wall warning 
}

coords - std::vector float. , , , ? ( ). , , std::vector.at() ( ref). , .at() , , , .

compute_variances() - ! , , , .

, parallelism.

[EDIT.2]

Point (, - ):

/**
 * A point in a Euclidean space: a thin wrapper around a
 * std::vector<float> of coordinates.
 */
class Point {
 public:

  typedef float FT;

  ...

  /**
   * Get dimension of point.
   */
  size_t dim() const {
    return coords.size();
  }

  /**
   * Operator that returns the coordinate at the given index.
   * Uses vector::at(), which bounds-checks on every access and throws
   * std::out_of_range on a bad index.
   * @param i - index of the coordinate
   * @return the coordinate at index i
   */
  FT& operator [](const int i) {
    return coords.at(i);
    // Behaves the same as the explicitly-checked version below.
    /*if (i < (int) coords.size() && i >= 0)
      return coords.at(i);
    else {
      std::cout << "Error at Point::[]" << std::endl;
      exit(1);
    }
    return coords[0];  // Clear -Wall warning*/
  }

  /**
   * Operator that returns the coordinate at the given index. (constant)
   * Same bounds-checked at() access as the non-const overload.
   * @param i - index of the coordinate
   * @return the coordinate at index i
   */
  const FT& operator [](const int i) const {
        return coords.at(i);
    /*if (i < (int) coords.size() && i >= 0)
      return coords.at(i);
    else {
      std::cout << "Error at Point::[]" << std::endl;
      exit(1);
    }
    return coords[0];  // Clear -Wall warning*/
  }

 private:
  // Coordinate storage; one float per dimension.
  std::vector<FT> coords;
};
+4
7

, :

  • Point::operator[] , .
  • coords[i] coords.at(i), , . at . .
  • / Point::operator[] assert. . - , .
  • .
  • .
  • , , . , , avg var. avg var, prefetch , .
  • ++ std::fill std::copy , C memset memcpy.

Point::operator[] ( ). , . , , , (a.k.a. LTO).

, Point::operator[] return coords.at(i) . return coords[i], return coords.at(i).

// Coordinate access, checked only in debug builds; returns by value.
FT Point::operator[](int i) const {
  assert(0 <= i && i < (int) coords.size());
  return coords[i];
}

/// Raw read-only pointer to the coordinate array.
/// Uses data() rather than &coords[0]: data() is well-defined even
/// when the vector is empty, where &coords[0] is not.
const FT * Point::constData() const {
  return coords.data();
}

/**
 * Streaming (Welford-style) per-dimension mean and variance over all
 * points, followed by selection of the t largest-variance dimensions.
 *
 * avg, var: caller-provided buffers of size points[0].dim().
 * split_dims: caller-provided buffer of size t, filled by kthLargest.
 *
 * The first two iterations are peeled out of the main loop:
 *  - i = 0: delta = p - 0, so avg becomes a copy of the first point
 *    and the variance contribution is zero — a copy_n suffices.
 *  - i = 1: 1/n is exactly 0.5, and var can be assigned rather than
 *    accumulated, which removes the need to zero-fill it first.
 * The main loop handles two points per trip, walking the dimensions
 * forward on the first point and backward on the second — presumably
 * so the second pass re-walks avg/var from the end the first pass
 * just touched; NOTE(review): cache-motivated, confirm with a profile.
 *
 * NOTE(review): D is an int initialized from size_t dim(), and the
 * asserts compare it against size_t values — harmless for realistic
 * sizes, but it triggers signed/unsigned comparison warnings.
 */
void compute_variances(size_t t, const std::vector<Point>& points, float* avg,
                       float* var, size_t* split_dims)
{
  assert(points.size() > 0);
  const int D = points[0].dim();

  // i = 0, i_n = 1: avg := first point; its variance contribution is 0.
  assert(D > 0);
#if __cplusplus >= 201103L
  std::copy_n(points[0].constData(), D, avg);
#else
  std::copy(points[0].constData(), points[0].constData() + D, avg);
#endif

  // i = 1, i_n = 0.5: assign var directly (no prior zero-fill needed).
  if (points.size() >= 2) {
    assert(points[1].dim() == D);
    for (int d = D - 1; d >= 0; --d) {
      float const delta = points[1][d] - avg[d];
      avg[d] += delta * 0.5f;
      var[d] = delta * (points[1][d] - avg[d]);
    }
  } else {
    std::fill_n(var, D, 0.0f);  // a single point has zero variance
  }

  // i = 2, ...: two points per trip, opposite traversal directions.
  for (size_t i = 2; i < points.size(); ) {
    {
      // One division per point, not per coordinate.
      const float i_n = 1.0f / (1.0f + i);
      assert(points[i].dim() == D);
      for (int d = 0; d < D; ++d) {  // forward pass
        float const delta = points[i][d] - avg[d];
        avg[d] += delta * i_n;
        var[d] += delta * (points[i][d] - avg[d]);
      }
    }
    ++ i;
    if (i >= points.size()) break;  // odd point count: no partner pass
    {
      const float i_n = 1.0f / (1.0f + i);
      assert(points[i].dim() == D);
      for (int d = D - 1; d >= 0; --d) {  // backward pass
        float const delta = points[i][d] - avg[d];
        avg[d] += delta * i_n;
        var[d] += delta * (points[i][d] - avg[d]);
      }
    }
    ++ i;
  }

  /* Find t dimensions with largest scaled variance. */
  kthLargest(var, D, t, split_dims);
}
+1

1. SIMD

(SIMD) . x86 SSE, AVX. , x4 . :

for (size_t d = 0; d < points[0].dim(); ++d) {
    delta = (points[i][d]) - avg[d];
    avg[d] += delta / n;
    var[d] += delta * ((points[i][d]) - avg[d]);
}

, SSE. , . 16- , 32- ( ), . AVX , .

, , .

2. Micro-parallelizm

, , . Intel TBB, OpenMP . , , . , d , i.

, , HT 25-30 .

3.

, , SO, -O3, -O3! , , delta, n , . -funroll-loops , -march. , -march core2 ( AMD) SSE ( , ).

+2

, vector<vector<float> >. float . , Point vector . , , .

, - .

( 1/n ) , . SIMD , .

, , . , , ; , , .

+2
for (size_t d = 0; d < points[0].dim(); d++) {
  avg[d] = 0.0;
  var[d] = 0.0;
}

, memset. IEEE754 0.0 32 - 0x00000000. , . - :

memset((void*)avg, 0, points[0].dim() * sizeof(float));

[0].dim(). . , ( -O3).

( POV), (, ).

avg[d] += delta / n;

: , Dim div N ( N x Dim); N < points.size()

Cuda OpenCL, avg var ( ).

0

- , .


// Hand-unrolled (x4) version of the inner dimension loop.
// NOTE(review): this assumes points[0].dim() is a multiple of 4 —
// there is no remainder loop, so the last (dim % 4) coordinates are
// otherwise read past the end; confirm, or add a tail loop.
// NOTE(review): `register` is deprecated since C++11 and removed in
// C++17; modern compilers ignore it.
for (size_t d = 0; d < points[0].dim(); d += 4)
{
  // Load all four coordinates up front so each is read only once.
  register const float p1 = points[i][d + 0];
  register const float p2 = points[i][d + 1];
  register const float p3 = points[i][d + 2];
  register const float p4 = points[i][d + 3];

  register const float delta1 = p1 - avg[d+0];
  register const float delta2 = p2 - avg[d+1];
  register const float delta3 = p3 - avg[d+2];
  register const float delta4 = p4 - avg[d+3];

  // Four independent update chains — gives the compiler latitude to
  // schedule/vectorize them.
  avg[d + 0] += delta1 / n;
  var[d + 0] += delta1 * ((p1) - avg[d + 0]);

  avg[d + 1] += delta2 / n;
  var[d + 1] += delta2 * ((p2) - avg[d + 1]);

  avg[d + 2] += delta3 / n;
  var[d + 2] += delta3 * ((p3) - avg[d + 2]);

  avg[d + 3] += delta4 / n;
  var[d + 3] += delta4 * ((p4) - avg[d + 3]);
}

, .

1:
avg var . , , . , , . .

0

.


( ). - , , . .

:
, , , , , 3,14159 . , , , . 3141,59 . , , . 3,141,590 (). .

. . Fixed Point Floating Point .

2
, 1000, 2 1000. 2 . - 2 , .

, 1024 1000. , 65536 131072.


, . , , . 2 ( ) . , , , , . (, ) , .

0

1. . ? , , , ? , , , . , .

2. , , :

// Single-pass "textbook" variance: E[x^2] - E[x]^2.
// Faster than Welford (no dependency between the two accumulators),
// but numerically unstable: when the mean is large relative to the
// spread, the final subtraction cancels catastrophically.
// Accumulating in double (as here) mitigates this for float inputs.
double sumsq = 0, sum = 0;
for (i = 0; i < n; i++){
  double xi = x[i];
  sum += xi;
  sumsq += xi * xi;
}
double avg = sum / n;
double avgsq = sumsq / n;  // fixed: missing semicolon in the original
double variance = avgsq - avg*avg;

3. . - , .

Point 4. You are using gprof or something like that. The only reasonably reliable number that comes out of it is the cumulative time per function. It will not tell you very well how time is spent inside the function. I and many others rely on the random-pausing method instead, which takes you right to the heart of what is consuming the time.

0
source

Source: https://habr.com/ru/post/1542496/


All Articles