Nested loop acceleration

I am writing a simulation of a wireless network in Python using NumPy and Cython. Suppose there are `no_nodes` nodes randomly scattered on the 2D plane, each sending a signal, together with their respective receivers, which are also randomly scattered on the 2D plane. Each node's transmission produces a signal that I call `output` (each node can produce an output signal of a different length).

What I want to do is sum these outputs from all nodes into one large waveform per receiver, which that receiver then takes as input for demodulation, etc. There are two key points:

  • Transmitters send asynchronously, so a `start_clock` and an `end_clock` must be kept for each transmitting node in order to sum the corresponding signals properly
  • the output of transmitting node j is attenuated before it is received by node i, according to the function attenuate(i, j)

So here is the code:

# Create an empty 2-D array with one row per receiving node and one column
# per sample; the row length is the largest end_clock value.
waveforms = np.zeros((no_nodes, max(end_clock))) 

for i in range(no_nodes): # build the waveform for receiver i
    for j in range(no_nodes): # accumulate the contribution of transmitter j
        # Add transmitter j's output, scaled by attenuate(i, j), into the
        # start_clock[j]:end_clock[j] window of receiver i's row.
        # NOTE(review): presumably output[j,:] has exactly
        # end_clock[j] - start_clock[j] samples (or broadcasts) - confirm.
        waveforms[i, start_clock[j]:end_clock[j]] += output[j,:] * attenuate(i,j)
# NOTE(review): the bare `return` implies this snippet is the tail of a
# function whose header is not shown here.
return waveforms

Some comments about this:

  • output[j, :] is the output of the transmitter j
  • waveforms[i,:] is the waveform received by receiver i

, , . ( 10 ^ 6 ), cython, - (, 5-10 , ). , - , , , (, , , , ).

+4
3

3 /, 2-4 . , (numexpr ):

# Cache-blocked variant of the accumulation: each receiver row is updated in
# fixed-size chunks instead of one whole-row operation.
for i in range(no_nodes):
    for j in range(no_nodes):
        # Chunk length in samples.
        # should be chosen so all operands fit in the (next-to-)last level cache
        # first level is normally too small to be usable due to python overhead
        s  = 15000 
        a = attenuation[i,j]  # scalar gain from transmitter j to receiver i
        o = output[j]         # row of transmitter j
        w = waveforms[i]      # row of receiver i, updated in place below
        for k in range(0, w.size, s): 
            u = min(k + s, w.size)  # clamp the final chunk to the row length
            # NOTE(review): o is sliced with the same k:u range as w, so this
            # presumably assumes every output row spans the full waveform
            # length (no start/end clock offsets) - confirm.
            w[k:u] += o[k:u] * a
        # or: numexpr.evaluate("w + o * a", out=w)

float32 float64 .

, , .

+5

, , , , , . , . :

def no_buffer(output, attenuate):
    """Sum every transmitter row into every receiver row, weighted by gain.

    For each receiver ``i`` the result row is
    ``sum_j output[j, :] * attenuate[i, j]``.  Each product
    ``output[j, :] * attenuate[i, j]`` is materialised as a fresh temporary
    array before being added, which is the cost this variant is meant to
    expose when benchmarked against ``with_buffer``.

    Args:
        output: 2-D array whose rows are indexed by transmitter.
        attenuate: 2-D array indexed as ``attenuate[i, j]`` with ``i`` and
            ``j`` both ranging over ``len(output)``.

    Returns:
        A new array with the shape and dtype of ``output``.
    """
    waveforms = np.zeros_like(output)
    no_nodes = len(output)
    # `range` instead of `xrange`: the latter only exists on Python 2.
    for i in range(no_nodes):
        for j in range(no_nodes):
            waveforms[i, :] += output[j, :] * attenuate[i, j]

    return waveforms

def with_buffer(output, attenuate):
    """Sum every transmitter row into every receiver row, reusing a buffer.

    Computes the same result as the plain nested-loop version, i.e. row ``i``
    is ``sum_j output[j, :] * attenuate[i, j]``, but writes each scaled row
    into one preallocated scratch array instead of allocating a temporary
    per iteration.

    Args:
        output: 2-D array whose rows are indexed by transmitter.
        attenuate: 2-D array indexed as ``attenuate[i, j]`` with ``i`` and
            ``j`` both ranging over ``len(output)``.

    Returns:
        A new array with the shape and dtype of ``output``.
    """
    waveforms = np.zeros_like(output)
    # Scratch row reused by every multiply below.
    buffer_arr = np.empty_like(output[0])
    no_nodes = len(output)
    # `range` instead of `xrange`: the latter only exists on Python 2.
    for i in range(no_nodes):
        for j in range(no_nodes):
            # Scale transmitter j's row into the scratch buffer ...
            np.multiply(output[j, :], attenuate[i, j], out=buffer_arr)
            # ... then accumulate it into receiver i's row in place.
            np.add(waveforms[i, :], buffer_arr, out=waveforms[i, :])

    return waveforms

# Benchmark inputs: 20 rows of one million random samples, plus a 20x20 gain
# matrix.  The sample count is cast to int because current NumPy releases
# reject float dimensions such as 1e6.
o = np.random.rand(20, int(1e6))
a = np.random.rand(20, 20)

In [17]: np.allclose(no_buffer(o, a), with_buffer(o, a))
Out[17]: True

In [18]: %timeit no_buffer(o, a)
1 loops, best of 3: 2.3 s per loop

In [19]: %timeit with_buffer(o, a)
1 loops, best of 3: 1.57 s per loop

, .

, , - , BLAS . , MKL:

In [21]: np.allclose(with_buffer(o, a), np.dot(o.T, a.T).T)
Out[21]: True

In [22]: %timeit np.dot(o.T, a.T).T
10 loops, best of 3: 123 ms per loop
+4

, , . , , , for. , 3 . , :

import numpy as np
import time

def calc(no_nodes, no_samples=700000):
    """Time the fully broadcast (loop-free) computation of the waveforms.

    Random outputs and gains are generated, then every receiver row is
    computed as ``sum_j attenuate[i, j] * output[j, :]`` by building a
    ``(no_nodes, no_nodes, no_samples)`` intermediate and summing over the
    transmitter axis.  The elapsed wall-clock time of that computation
    (excluding the random data generation) is printed.

    Args:
        no_nodes: Number of nodes (transmitters and receivers).
        no_samples: Samples per node; defaults to the 7e5 used originally.
            Must be an integer, since NumPy rejects float dimensions.

    Returns:
        Array of shape ``(no_nodes, no_samples)`` with the summed waveforms.
    """
    output = np.random.rand(no_nodes, no_samples)  # some random data
    attenuate = np.random.rand(no_nodes, no_nodes)  # some random data
    start_time = time.time()
    # Replicate the outputs once per receiver.  The explicit copy is kept
    # because it is part of the work this benchmark measures.
    output_per_node = np.zeros((no_nodes, no_nodes, no_samples))
    output_per_node += output[None, :, :]
    # data[i, j, :] = attenuate[i, j] * output[j, :]
    data = attenuate[:, :, None] * output_per_node
    waveforms = np.sum(data, axis=1)
    end_time = time.time()
    # Parenthesised form prints the same value on Python 2 and Python 3.
    print(end_time - start_time)
    return waveforms

:

def calc1(no_nodes, no_samples=700000):
    """Time the nested-loop computation of the waveforms.

    Random outputs and gains are generated, then every receiver row is
    accumulated as ``sum_j output[j] * attenuation[i, j]`` with two Python
    loops over the nodes.  The elapsed wall-clock time of the loops
    (excluding data generation and allocation) is printed.

    Args:
        no_nodes: Number of nodes (transmitters and receivers).
        no_samples: Samples per node; defaults to the 7e5 used originally.
            Must be an integer, since NumPy rejects float dimensions.

    Returns:
        Array of shape ``(no_nodes, no_samples)`` with the summed waveforms.
    """
    output = np.random.rand(no_nodes, no_samples)
    attenuation = np.random.rand(no_nodes, no_nodes)
    waveforms = np.zeros((no_nodes, no_samples))
    start_time = time.time()
    for i in range(no_nodes):
        for j in range(no_nodes):
            waveforms[i] += output[j] * attenuation[i, j]
    # Parenthesised form prints the same value on Python 2 and Python 3.
    print(time.time() - start_time)
    return waveforms

? , Numpy , . , , - . cython, ( ) , , . , ? : no_nodes = 10

, , ipython , , ipynb, html :

.

+2

Source: https://habr.com/ru/post/1539920/


All Articles