Optimizing nested loops in cython: is there a faster way to customize this sample code?

Question

Optimizing nested loops in cython: is there a faster way to customize this sample code?

As part of a large piece of code, I need to call the (simplified) function example(inserted below) several (hundreds of thousands) times, with different arguments. As such, I need this module to work quickly.

The main problem with the module is a few nested loops. However, I'm not sure if the really unnecessary overhead is associated with these loops (as written), or if the code is really that fast, it can get it.

In general, when working with several nested loops in a zyton, are there any loop optimization methods that can be used to reduce overhead and speed up the code? Are any of these methods used in the code sample inserted below?

I am also compiling cython with extra_compile_args=["-ffast-math",'-O3'], although this does not seem to make much difference.

If this code really cannot be faster in cython (in my opinion, it is not), will there be any advantage in writing all or part of this module in C or Fortran?

import numpy as np
import math
cimport numpy as np
cimport cython

DTYPE = np.float
ctypedef np.float_t DTYPE_t

cdef extern from "math.h":
    double log(double x) nogil
    double exp(double x) nogil
    double pow(double x, double y) nogil


def example(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc = 1000.0):
    return example_int(xbg_PSF_compressed,theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data, Sc)

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ):

    cdef int k_max = np.max(data) + 1

    cdef double A = np.float(theta[0])
    cdef double n1 = np.float(theta[1])
    cdef double n2 = np.float(theta[2])
    cdef double Sb = np.float(theta[3])

    cdef int npixROI = len(xbg_PSF_compressed)

    cdef double f2 = 0.0
    cdef double df_rho_div_f2 = 0.0


    cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=DTYPE)
    cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=DTYPE)
    cdef double[:,::1] x_m_ary_f = np.zeros((k_max + 1, npixROI), dtype=DTYPE)
    cdef double[::1] x_m_sum_f = np.zeros(npixROI, dtype=DTYPE)

    cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
    cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=DTYPE)

    cdef Py_ssize_t f_index, p, k, n


    #calculations for PS

    cdef int do_half = 0

    cdef double term1 = 0.0
    cdef double term2 = 0.0
    cdef double second_2_a = 0.0
    cdef double second_2_b = 0.0
    cdef double second_2_c = 0.0
    cdef double second_2_d = 0.0
    cdef double second_1_a = 0.0
    cdef double second_1_b = 0.0
    cdef double second_1_c = 0.0
    cdef double second_1_d = 0.0



    for f_index in range(len(f_ary)):
        f2 = f_ary[f_index]
        df_rho_div_f2 = df_rho_div_f_ary[f_index]
        g1_ary_f = np.random.random(k_max+1)
        g2_ary_f = np.random.random(k_max+1)
        term1 = (A * Sb * f2) \
                             * (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \
                                - (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0]))
        second_1_a =  A  * pow(Sb * f2, n1)
        second_1_b = A * pow(Sb * f2, n2)

        for p in range(npixROI):
            x_m_sum_f[p] = term1 * PS_dist_compressed[p]
            x_m_sum[p] += df_rho_div_f2*x_m_sum_f[p]

            second_1_c = second_1_a * PS_dist_compressed[p]
            second_1_d = second_1_b * PS_dist_compressed[p]
            for k in range(data[p]+1):            
                x_m_ary_f[k,p] = second_1_c  * g1_ary_f[k] + second_1_d * g2_ary_f[k] 
                x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f[k,p]

    cdef double[::1] nu_ary = np.zeros(k_max + 1, dtype=DTYPE)

    cdef double[::1] f0_ary = np.zeros(npixROI, dtype=DTYPE) 
    cdef double[::1] f1_ary = np.zeros(npixROI, dtype=DTYPE) 

    cdef double[:,::1] nu_mat = np.zeros((k_max+1, npixROI), dtype=DTYPE)


    cdef double ll = 0.

    for p in range(npixROI):
        f0_ary[p] = -(xbg_PSF_compressed[p] + x_m_sum[p])
        f1_ary[p] = (xbg_PSF_compressed[p] + x_m_ary[1,p])
        nu_mat[0,p] = exp(f0_ary[p])
        nu_mat[1,p] = nu_mat[0,p] * f1_ary[p]

        for k in range(2,data[p]+1):
            for n in range(0, k - 1):
                nu_mat[k,p] += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n,p]
            nu_mat[k,p] += f1_ary[p] * nu_mat[k-1,p] / float(k)
        ll+=log( nu_mat[data[p],p])

    if math.isnan(ll) ==True or math.isinf(ll) ==True:
        ll = -10.1**10.

    return ll

For reference, when trying to run this code, the example arguments

f_ary=np.array([ 0.05,  0.15,  0.25 , 0.35 , 0.45  ,0.55 , 0.65 , 0.75,  0.85 , 0.95])
df_rho_div_f_ary = np.array([ 24.27277928,   2.83852471 ,  1.14224844 ,  0.61687863  , 0.39948536,
   0.30138642 ,  0.24715899 ,  0.22077999 ,  0.21594814 ,  0.19035121])
theta=[.002, 3.01,0.01, 10.013]
n_p=1000
data= np.random.randint(1,400,n_p).astype(dtype='int32')
k_max=int(np.max(data))+1
xbg_PSF_compressed= np.ones(n_p)*20 
PS_dist_compressed= np.ones(n_p)

and this example can then be called example(k_max,xbg_PSF_compressed,theta,f_ary,df_rho_div_f_ary, PS_dist_compressed). For synchronization, I find that this example evaluates to ~ 10 loops, best of 3: 147 ms per loop. Since the complete code takes several hours to run, any reduction in this runtime will have a large overall difference in performance.

+4

performance c python fortran cython

Ben S. 10 sept. '15 at 7:21

source share

1 answer

serge-sans-paille · Accepted Answer · 2015-09-10T08:58:40+0000

cython -a , C, .

, , . , 1D- . , :

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ):

    cdef int k_max = np.max(data) + 1

    cdef double A = np.float(theta[0])
    cdef double n1 = np.float(theta[1])
    cdef double n2 = np.float(theta[2])
    cdef double Sb = np.float(theta[3])

    cdef int npixROI = len(xbg_PSF_compressed)

    cdef double f2 = 0.0
    cdef double df_rho_div_f2 = 0.0


    cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=DTYPE)
    cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=DTYPE)
    cdef double x_m_ary_f
    cdef double x_m_sum_f

    cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
    cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=DTYPE)

    cdef Py_ssize_t f_index, p, k, n


    #calculations for PS

    cdef int do_half = 0

    cdef double term1 = 0.0
    cdef double term2 = 0.0
    cdef double second_2_a = 0.0
    cdef double second_2_b = 0.0
    cdef double second_2_c = 0.0
    cdef double second_2_d = 0.0
    cdef double second_1_a = 0.0
    cdef double second_1_b = 0.0
    cdef double second_1_c = 0.0
    cdef double second_1_d = 0.0



    for f_index in range(len(f_ary)):
        f2 = f_ary[f_index]
        df_rho_div_f2 = df_rho_div_f_ary[f_index]
        g1_ary_f = np.random.random(k_max+1)
        g2_ary_f = np.random.random(k_max+1)
        term1 = (A * Sb * f2) \
                             * (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \
                                - (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0]))
        second_1_a =  A  * pow(Sb * f2, n1)
        second_1_b = A * pow(Sb * f2, n2)

        for p in range(npixROI):
            x_m_sum_f = term1 * PS_dist_compressed[p]
            x_m_sum[p] += df_rho_div_f2*x_m_sum_f

            second_1_c = second_1_a * PS_dist_compressed[p]
            second_1_d = second_1_b * PS_dist_compressed[p]
            for k in range(data[p]+1):
                x_m_ary_f = second_1_c  * g1_ary_f[k] + second_1_d * g2_ary_f[k] 
                x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f

    cdef double[::1] nu_ary = np.zeros(k_max + 1, dtype=DTYPE)

    cdef double f0_ary
    cdef double f1_ary

    cdef double[:] nu_mat = np.zeros((k_max+1), dtype=DTYPE)


    cdef double ll = 0.

    for p in range(npixROI):
        f0_ary = -(xbg_PSF_compressed[p] + x_m_sum[p])
        f1_ary = (xbg_PSF_compressed[p] + x_m_ary[1,p])
        nu_mat[0] = exp(f0_ary)
        nu_mat[1] = nu_mat[0] * f1_ary

        for k in range(2,data[p]+1):
            for n in range(0, k - 1):
                nu_mat[k] += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n]
            nu_mat[k] += f1_ary * nu_mat[k-1] / float(k)
        ll+=log( nu_mat[data[p]])

    if math.isnan(ll) or math.isinf(ll):
        ll = -10.1**10.

    return ll

:

>>> %timeit example(xbg_PSF_compressed, theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data)
10 loops, best of 3: 74.1 ms per loop

:

>>> %timeit example(xbg_PSF_compressed, theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data)
1 loops, best of 3: 146 ms per loop

Optimizing nested loops in cython: is there a faster way to customize this sample code?

More articles: