How can I improve / speed up this frequently called function in C?

Question

How can I improve / speed up this frequently called function in C?

How can I improve / speed up this frequently called function?

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define M 10 // This is fixed
#define N 8  // This is NOT fixed

// Assumptions: 1. x, a, b and c are all arrays of 10 (M).
//              2. y and z are all matrices of 8 x 10 (N x M).
// Requirement: 1. return the value of ret;
//              2. get all elements of array c
float fnFrequentFunction(const float* x, const float* const* y, const float* const* z,
                         const float* a, const float* b, float *c, int n)
{
    register float tmp;
    register float sum;
    register float ret = 0;
    register const float* yy;
    register const float* zz;
    int i;

    for (i = 0; i < n; i++)  // M == 1, 2, 4, or 8
    {
        sum = 0;
        yy = y[i];
        zz = z[i];

        tmp = x[0] - yy[0]; sum += tmp * tmp * zz[0];
        tmp = x[1] - yy[1]; sum += tmp * tmp * zz[1];
        tmp = x[2] - yy[2]; sum += tmp * tmp * zz[2];
        tmp = x[3] - yy[3]; sum += tmp * tmp * zz[3];
        tmp = x[4] - yy[4]; sum += tmp * tmp * zz[4];
        tmp = x[5] - yy[5]; sum += tmp * tmp * zz[5];
        tmp = x[6] - yy[6]; sum += tmp * tmp * zz[6];
        tmp = x[7] - yy[7]; sum += tmp * tmp * zz[7];
        tmp = x[8] - yy[8]; sum += tmp * tmp * zz[8];
        tmp = x[9] - yy[9]; sum += tmp * tmp * zz[9];

        ret += (c[i] = log(a[i] * b[i]) + sum);
    }

    return ret;
}

// Benchmark driver: builds example inputs, times repeated calls to
// fnFrequentFunction, then prints the return value and every element of c.
// All numeric values below are example data only.
int main(void)
{
    float x[M] = {0.001251f, 0.563585f, 0.193304f, 0.808741f, 0.585009f, 0.479873f, 0.350291f, 0.895962f, 0.622840f, 0.746605f};
    float *y_rows[N];       // owning, writable row buffers
    float *z_rows[N];
    const float *y[N];      // read-only views with the exact pointee type the callee expects
    const float *z[N];
    float a[M] = {0.870205f, 0.733879f, 0.711386f, 0.588244f, 0.484176f, 0.852962f, 0.168126f, 0.684286f, 0.072573f, 0.632160f};
    float b[M] = {0.871487f, 0.998108f, 0.798608f, 0.134831f, 0.576281f, 0.410779f, 0.402936f, 0.522935f, 0.623218f, 0.193030f};
    float c[N];

    float t1[M] = {0.864406f, 0.709006f, 0.091433f, 0.995727f, 0.227180f, 0.902585f, 0.659047f, 0.865627f, 0.846767f, 0.514359f};
    float t2[M] = {0.866817f, 0.581347f, 0.175542f, 0.620197f, 0.781823f, 0.778588f, 0.938688f, 0.721610f, 0.940214f, 0.811353f};
    int i, j;

    int n = 10000000;
    int rep;
    clock_t start;

    // Initialize y, z for test example. Each row is a separate heap block.
    for (i = 0; i < N; ++i)
    {
        y_rows[i] = malloc(sizeof *y_rows[i] * M);
        z_rows[i] = malloc(sizeof *z_rows[i] * M);

        if (y_rows[i] == NULL || z_rows[i] == NULL)
        {
            fprintf(stderr, "out of memory\n");
            // Release this row (free(NULL) is a no-op) and every earlier row.
            free(y_rows[i]);
            free(z_rows[i]);
            while (i-- > 0)
            {
                free(y_rows[i]);
                free(z_rows[i]);
            }
            return EXIT_FAILURE;
        }

        for (j = 0; j < M; ++j)
        {
            y_rows[i][j] = t1[j] * j;
            z_rows[i][j] = t2[j] * j;
        }

        // float ** does not convert to const float *const * in C, so pass
        // arrays whose element type is already const float *.
        y[i] = y_rows[i];
        z[i] = z_rows[i];
    }


    // Speed test here (n - 1 timed calls, same count as the original while(--n)):
    start = clock();
    for (rep = 1; rep < n; ++rep)
        fnFrequentFunction(x, y, z, a, b, c, 8);
    // clock_t has an implementation-defined type; cast to match %ld.
    printf("Time used: %ld\n", (long)(clock() - start));


    // Output the result here:
    printf("fnFrequentFunction == %f\n", fnFrequentFunction(x, y, z, a, b, c, 8));
    for (j = 0; j < N; ++j)
        printf("  c[%d] == %f\n", j, c[j]);
    printf("\n");


    // Free memory
    for (j = 0; j < N; ++j)
    {
        free(y_rows[j]);
        free(z_rows[j]);
    }

    return 0;
}

Any suggestions are welcome :-)

I am scared that I made a big mistake in my function. The above code is new. I am rechecking it now to make sure that this is what I need.

+3

performance optimization c

Peter Lee Apr 19 '10 at 22:16

source share

9 answers

Andrey · Answer 1 · 2010-04-19T22:20:23+0000

Move this part outside the loop:

sum = 0;

tmp = x[0] - y[0]; sum += tmp * tmp * z[0];
tmp = x[1] - y[1]; sum += tmp * tmp * z[1];
tmp = x[2] - y[2]; sum += tmp * tmp * z[2];
tmp = x[3] - y[3]; sum += tmp * tmp * z[3];
tmp = x[4] - y[4]; sum += tmp * tmp * z[4];
tmp = x[5] - y[5]; sum += tmp * tmp * z[5];
tmp = x[6] - y[6]; sum += tmp * tmp * z[6];
tmp = x[7] - y[7]; sum += tmp * tmp * z[7];
tmp = x[8] - y[8]; sum += tmp * tmp * z[8];
tmp = x[9] - y[9]; sum += tmp * tmp * z[9];

slacker · Answer 2 · 2010-04-19T22:24:10+0000

This function is well suited to SIMD processing. Look in your compiler documentation for the intrinsic functions that correspond to SSE instructions.
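Break the dependency chain on sum. Instead of a single sum accumulator, use two accumulators (sum1 and sum2) alternately, and add them together at the end.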
log(). , . - , Intel , , log(). .
You are operating on float, while log() operates on double. Use logf() instead.
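If your compiler supports C99, add the restrict qualifier to the pointer parameters so the compiler knows they do not alias.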
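Change how the matrices are stored: instead of an array of row pointers, use a single contiguous array of M * N elements.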

, , . C99. SIMD- , WAAAAY .

UPDATE: , . .

// Variant that takes y and z as flat arrays of n rows, M floats per row,
// stored back to back (row i starts at offset i * M).
//   x        - vector of M floats.
//   a, b     - read at indices 0 .. n-1.
//   c        - output; indices 0 .. n-1 are written.
// All pointers are restrict-qualified: the caller must not pass overlapping
// arrays.
// Returns the sum of all c[i] written, where
//   c[i] = logf(a[i] * b[i]) + SUM_j (x[j] - y[i*M + j])^2 * z[i*M + j].
float fnFrequentFunction(const float *restrict x, const float *restrict y,
                         const float *restrict z, const float *restrict a,
                         const float *restrict b, float *restrict c, int n)
{
    float ret = 0;
    const float *restrict yy = y; // start of the current row of y
    const float *restrict zz = z; // start of the current row of z

    for (int i = 0; i < n; i++, yy += M, zz += M)  // n == 1, 2, 4, or 8
    {
        // Two independent accumulators: even indices go to sum, odd to sum2,
        // so consecutive additions do not depend on each other.
        float sum = 0;
        float sum2 = 0;
        int j = 0;

        // The bound is tied to M (the same constant used for the row stride)
        // rather than a literal 10, so the two cannot drift apart.
        for (; j + 1 < M; j += 2)
        {
            float tmp  = x[j]   - yy[j];   sum  += tmp  * tmp  * zz[j];
            float tmp2 = x[j+1] - yy[j+1]; sum2 += tmp2 * tmp2 * zz[j+1];
        }
        // Leftover element when M is odd; never runs for M == 10.
        for (; j < M; j++)
        {
            float tmp = x[j] - yy[j];
            sum += tmp * tmp * zz[j];
        }
        sum += sum2;

        ret += (c[i] = logf(a[i] * b[i]) + sum);
    }
    return ret;
}

Duncan · Answer 3 · 2010-04-19T22:37:01+0000

memoization . /.

Perl memoize , , . C .

, , . , . , .

, .

Michael Dorgan · Answer 4 · 2010-04-19T22:42:46+0000

, tmp , . , 10 for, .

, , . SIMD, FPU, .. , , . vars . .

, , M * . 2 log muls .

M 8 - , , .

, , log(). ? , , , . , -, .

.

vladv · Answer 5 · 2010-04-19T23:07:48+0000

?

, . , . , gcc :

Time used: 8720000

:

Time used: 8710000

, .

, , for, , . . .

, , , , , , .

pm100 · Answer 6 · 2010-04-19T23:21:10+0000

, ints, float, . , ,

Thomas Matthews · Answer 7 · 2010-04-19T23:49:49+0000

Andrey :

float fnFrequentFunction(const float* x, const float* y, const float* z,
                         const float *a, const float *b, float *c, int M)
{
    register float tmp;
    register float sum;
    register float ret = 0;
    int i;
    sum = 0;

    tmp = x[0] - y[0]; sum += tmp * tmp * z[0];
    tmp = x[1] - y[1]; sum += tmp * tmp * z[1];
    tmp = x[2] - y[2]; sum += tmp * tmp * z[2];
    tmp = x[3] - y[3]; sum += tmp * tmp * z[3];
    tmp = x[4] - y[4]; sum += tmp * tmp * z[4];
    tmp = x[5] - y[5]; sum += tmp * tmp * z[5];
    tmp = x[6] - y[6]; sum += tmp * tmp * z[6];
    tmp = x[7] - y[7]; sum += tmp * tmp * z[7];
    tmp = x[8] - y[8]; sum += tmp * tmp * z[8];
    tmp = x[9] - y[9]; sum += tmp * tmp * z[9];

    for (i = 0; i < M; i++)  // M == 1, 2, 4, or 8
    {
        //----------------------------------------
        // Prefetch data into the processor cache
        //----------------------------------------
        float a_value = a[i];
        float b_value = b[i];
        float c_value = 0.0;

        //----------------------------------------
        // Calculate using prefetched data.
        //----------------------------------------
        c_value = log(a_value * b_value) + sum;
        c[i] = c_value;
        ret += c_value;
    }

    return ret;
}

:

// Hand-unrolled replacement for the per-element loop.
// Relies on sum, ret, a, b, c and M already being in scope.
float a_value = 0.0;
float b_value = 0.0;
float c_value = 0.0;
// Turn the element count into the index of the last element.
--M;
// Jump to the case for the highest index. No case except the last ends in
// break, so control falls through and handles every lower index down to 0.
// NOTE(review): there is no default label; if M is outside 0..7 here,
// no case runs and nothing is written.
switch (M)
{
    case 7:
        a_value = a[M];
        b_value = b[M];
        c_value = log(a_value * b_value) + sum;
        c[M] = c_value;
    ret += c_value;
    --M;
    /* fallthrough */
    case 6:
        a_value = a[M];
        b_value = b[M];
        c_value = log(a_value * b_value) + sum;
        c[M] = c_value;
    ret += c_value;
    --M;
    /* fallthrough */
    case 5:
        a_value = a[M];
        b_value = b[M];
        c_value = log(a_value * b_value) + sum;
        c[M] = c_value;
    ret += c_value;
    --M;
    /* fallthrough */
    case 4:
        a_value = a[M];
        b_value = b[M];
        c_value = log(a_value * b_value) + sum;
        c[M] = c_value;
    ret += c_value;
    --M;
    /* fallthrough */
    case 3:
        a_value = a[M];
        b_value = b[M];
        c_value = log(a_value * b_value) + sum;
        c[M] = c_value;
    ret += c_value;
    --M;
    /* fallthrough */
    case 2:
        a_value = a[M];
        b_value = b[M];
        c_value = log(a_value * b_value) + sum;
        c[M] = c_value;
    ret += c_value;
    --M;
    /* fallthrough */
    case 1:
        a_value = a[M];
        b_value = b[M];
        c_value = log(a_value * b_value) + sum;
        c[M] = c_value;
    ret += c_value;
    --M;
    /* fallthrough */
    case 0:
        // Last element: M is left at 0 and the chain ends here.
        a_value = a[M];
        b_value = b[M];
        c_value = log(a_value * b_value) + sum;
        c[M] = c_value;
    ret += c_value;
    break;
}

, "+ sum" "" : ret += (M + 1) * sum; sum .

, - , log, :

// Two-pass form: compute every product first, then take the logarithms.
// Relies on i, M, a, b, c, ret and sum already being in scope.
// NOTE(review): product has 8 slots, so this assumes M <= 8 - confirm against the caller.
float product[8];
for (i = 0; i < M; ++i)
{
  product[i] = a[i] * b[i];
}
for (i = 0; i < M; ++i)
{
  // log() takes a single value, so pass the i-th element rather than the array.
  c[i] = log(product[i]);
  ret += c[i];
}
// sum is the same for every element, so it is added to ret once, scaled by M.
// NOTE(review): c[i] holds only the log term here, without sum - confirm that is intended.
ret += M * sum;
drawnonward · Answer 8 · 2010-04-20T00:26:56+0000

If a and b do not change between calls, precompute a table logab from a and b, where logab[i] = log(a[i] * b[i]), and recompute it only when a or b changes.

sizzzzlerz · Answer 9 · 2010-04-20T01:20:51+0000

, -, . , . , , , C, , . GMM SIMD. , , ( ) Nvidia. , , .

, , , , , .

How can I improve / speed up this frequently called function in C?

More articles: