Cython parallelism and stencils

After intensive use of numba, I return to cython to parallelize some time-consuming functions. In the following, a basic example:

import numpy as np
cimport numpy as np

from cython import boundscheck, wraparound
from cython.parallel import parallel, prange

@boundscheck(False)
@wraparound(False)
def cytest1(double[:,::1] a, double[:,::1] b, int ix1, int ix2, int iz1, int iz2):

    cdef int ix
    cdef int iz

    for ix in range(ix1, ix2):
        for iz in range(iz1, iz2):
            b[ix, iz] = 0.5*(a[ix+1, iz] - a[ix-1, iz])
    return b


@boundscheck(False)
@wraparound(False)
def cytest2(double[:,::1] a, double[:,::1] b, int ix1, int ix2, int iz1, int iz2):

    cdef int ix
    cdef int iz

    with nogil, parallel():
        for ix in prange(ix1, ix2):
            for iz in range(iz1, iz2):
                b[ix, iz] = 0.5*(a[ix+1, iz] - a[ix-1, iz])

    return b

When compiling these two functions (with the openmp flag) and their calls:

nx, nz = 1024, 1024

a = np.random.rand(nx, nz)
b = np.zeros_like(a)

Nit = 1000
ti = time.time()
for i in range(Nit):
    cytest1(a, b, 5, nx-5, 0, nz)
print('cytest1 : {:.3f} s.'.format(time.time() - ti))

ti = time.time()
for i in range(Nit):
    cytest2(a, b, 5, nx-5, 0, nz)
print('cytest2 : {:.3f} s.'.format(time.time() - ti))

I get these runtimes:

cytest1 : 1.757 s.
cytest2 : 1.861 s.

When a parallel function is executed, I can see my 4 processors in action, but the execution time is almost the same as the serial function. I tried moving prangeto the inner loop, but for worse results. I also tried several different options schedule, but without success.

Something is obviously missing me, but what? Is it prangeunable to cut out the loop with code trying to access n + X / nX elements?

EDIT:

My setup:

model name      : Intel(R) Core(TM) i7-6600U CPU @ 2.60GHz
MemTotal        : 8052556 kB
Python          : 3.5.2
cython          : 0.28.2
Numpy           : 1.14.2 
Numba           : 0.37.0

. setup.py:

from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext


ext_modules = [
    Extension("stencil",
              ["stencil.pyx"],
              libraries=["m"],
              extra_compile_args=["-O3", "-ffast-math", "-march=native", "-fopenmp"],
              extra_link_args=['-fopenmp'],
              )
]

setup(
  name="stencil",
  cmdclass={"build_ext": build_ext},
  ext_modules=ext_modules
)
+4
1

, , : , , .

: ? , , , .

, :

 b[ix, iz] = (a[ix+1, iz])

, , .

Intel Xeon E5-2620 @2.1 Ghz %timeit - :

>>> %timeit cytest1(a,b,5, nx-5, 0, nz)
100 loops, best of 3: 1.99 ms per loop

>>> %timeit cytest2(a,b,5, nx-5, 0, nz)
The slowest run took 234.48 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 324 µs per loop

, . 2 , 8 , 16 , "". - 15 , , , .

, , , (), .

, :

....
>>> nx, nz = 10240, 10240 #100 times bigger
....

>>> %timeit cytest1(a,b,5, nx-5, 0, nz)
1 loop, best of 3: 238 ms per loop

>>> %timeit cytest2(a,b,5, nx-5, 0, nz)
10 loops, best of 3: 99.3 ms per loop

2 , : , .

b[ix, iz] = 0.5*(a[ix+1, iz] - a[ix-1, iz])

- , .

sin cos - , (. ):

...
b[ix, iz] = sin(a[ix+1, iz])
...
>>> %timeit cytest1(a,b,5, nx-5, 0, nz)
1 loop, best of 3: 1.6 s per loop

>>> %timeit cytest2(a,b,5, nx-5, 0, nz)
1 loop, best of 3: 217 ms per loop

8, .

, / . :

  • - , , .
  • , .

( Windows -fopenmp linux):

%%cython --compile-args=/openmp --link-args=/openmp 
from cython.parallel import parallel, prange
from cython import boundscheck, wraparound
from libc.math cimport sin

@boundscheck(False)
@wraparound(False)
def cytest1(double[:,::1] a, double[:,::1] b, int ix1, int ix2, int iz1, int iz2):

    cdef int ix
    cdef int iz

    for ix in range(ix1, ix2):
        for iz in range(iz1, iz2):
            b[ix, iz] =sin(a[ix+1, iz])
    return b


@boundscheck(False)
@wraparound(False)
def cytest2(double[:,::1] a, double[:,::1] b, int ix1, int ix2, int iz1, int iz2):

    cdef int ix
    cdef int iz

    with nogil, parallel():
        for ix in prange(ix1, ix2):
            for iz in range(iz1, iz2):
                b[ix, iz] = sin(a[ix+1, iz])

    return b
+3
source

Source: https://habr.com/ru/post/1696258/


All Articles