Python shuffle array that has very few zeros (very sparsey)

I have a very large (length ~ 150 million) numpy array that has very few nonzero values ​​(about 99.9% of the array is 0). I want to shuffle it, but shuffling is slow (it takes about 10 seconds, which is unacceptable because I'm doing a simulation in Monte Carlo). Is there any way to shuffle it in such a way as to take into account the fact that my array consists mainly of 0?

I am going to shuffle only my positive values ​​and then insert them randomly into an array full 0, but I cannot find the numpy function for it.

+4
source share
2 answers

@Divakar, scipy.sparse:

a = scipy.sparse.coo_matrix(a)

def shuffle_sparse_coo(a):
    a.col = np.random.choice(a.shape[1], a.nnz, replace=0)
    return a

shuffle_sparse_coo(a).todense() # Using Divakar 'a' array
Out[408]: matrix([[0, 8, 0, 0, 7, 0, 0, 0, 0, 4, 0, 0, 5, 0, 3, 0, 1, 0, 0, 0]])

< > :

, , @Divakar hackish:

%timeit shuffle_sparse_arr_hackish(a)
The slowest run took 4.32 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 44.7 µs per loop

def shuffle_sparse_arr_nz(a):
    out = np.zeros_like(a)
    mask = np.nonzero(a)
    idx = np.random.choice(a.size, mask[0].size, replace=0)
    out[idx] = a[mask]
    return out

%timeit shuffle_sparse_arr_nz(a)
The slowest run took 4.68 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 41 µs per loop

EDIT2:

@Divakar :

def shuffle_sparse_coo_h(a):
    idx = np.unique((a.shape[1]*np.random.rand(2*a.nnz)).astype(int))[:a.nnz]
    while idx.size<n:
        idx = np.unique((a.shape[1]*np.random.rand(2*a.nnz)).astype(int))[:a.nnz]
    a.col = idx
    return a

#using a 15m element a:

%timeit shuffle_sparse_arr_hackish(a)
10 loops, best of 3: 52.8 ms per loop

a1 = sparse.coo_matrix(a)
%timeit shuffle_sparse_coo(a1)
1 loop, best of 3: 1.01 s per loop

%timeit shuffle_sparse_coo_h(a1)
The slowest run took 4.58 times longer than the fastest. This could mean that an intermediate result is being cached.
100 loops, best of 3: 2.02 ms per loop

EDIT2:

np.random.randint

def shuffle_sparse_coo_h2(a):
    idx = np.unique(np.random.randint(0,a.shape[1],(2*a.nnz,)))[:a.nnz]
    while idx.size < n:
        idx = np.unique(np.random.randint(0,a.shape[1],(2*a.nnz,)))[:a.nnz]
    a.col = idx
    return a  

%timeit shuffle_sparse_coo_h2(a1)
1000 loops, best of 3: 1.86 ms per loop
+2

№1: -

def shuffle_sparse_arr(a):
    out = np.zeros_like(a)
    mask = a!=0
    n = np.count_nonzero(mask)
    idx = np.random.choice(a.size, n, replace=0)
    out[idx] = a[mask]
    return out

№ 2: -

def shuffle_sparse_arr_hackish(a):
    out = np.zeros_like(a)
    mask = a!=0
    n = np.count_nonzero(mask)
    idx = np.unique((a.size*np.random.rand(2*n)).astype(int))[:n]
    while idx.size<n:
        idx = np.unique((a.size*np.random.rand(2*n)).astype(int))[:n]
    np.random.shuffle(idx)
    out[idx] = a[mask]
    return out

-

In [269]: # Setup input array
     ...: a = np.zeros((20),dtype=int)
     ...: sidx = np.random.choice(a.size, 6, replace=0)
     ...: a[sidx] = [5,8,4,1,7,3]
     ...: 

In [270]: a
Out[270]: array([4, 0, 0, 8, 0, 0, 5, 0, 0, 0, 0, 7, 0, 0, 1, 0, 0, 0, 0, 3])

In [271]: shuffle_sparse_arr(a)
Out[271]: array([0, 5, 0, 0, 0, 0, 1, 0, 4, 0, 0, 0, 0, 0, 0, 7, 3, 8, 0, 0])

In [272]: shuffle_sparse_arr_hackish(a)
Out[272]: array([3, 1, 5, 0, 4, 0, 7, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0])

-

In [288]: # Setup input array with 15 million and 99.9% zeros
     ...: a = np.zeros((15000000),dtype=int)
     ...: 
     ...: # Set 100-99.9% as random non-zeros
     ...: n = int(a.size*((100-99.9)/100)) 
     ...: 
     ...: set_idx = np.random.choice(a.size, n , replace=0)
     ...: nums = np.random.choice(a.size, n , replace=0)
     ...: a[set_idx] = nums
     ...: 

In [289]: %timeit shuffle_sparse_arr(a)
1 loops, best of 3: 647 ms per loop

In [290]: %timeit shuffle_sparse_arr_hackish(a)
10 loops, best of 3: 29.1 ms per loop

In [291]: %timeit np.random.shuffle(a)
1 loops, best of 3: 606 ms per loop
+5

Source: https://habr.com/ru/post/1674550/


All Articles