Python: filter a 2D array using a data fragment

import numpy as np

data = np.array([
    [20,  0,  5,  1],
    [20,  0,  5,  1],
    [20,  0,  5,  0],
    [20,  1,  5,  0],
    [20,  1,  5,  0],
    [20,  2,  5,  1],
    [20,  3,  5,  0],
    [20,  3,  5,  0],
    [20,  3,  5,  1],
    [20,  4,  5,  0],
    [20,  4,  5,  0],
    [20,  4,  5,  0]
])

I have the 2D array above. Let's name the columns a, b, c, d in that order, where column b acts as an identifier (id). I want to delete every row whose group (all rows sharing the same value in column b, i.e. the same identifier) contains no "1" in column d, so after filtering I should get the following result:

[[20  0  5  1]
 [20  0  5  1]
 [20  0  5  0]
 [20  2  5  1]
 [20  3  5  0]
 [20  3  5  0]
 [20  3  5  1]]

All rows with b = 1 and b = 4 were deleted from the data: those groups ("id" = 1 and "id" = 4) contain no "1" in column d.

+4
5 answers

Here's one approach with np.unique and np.bincount -

# label each row with a compact group index based on its b value
unq, tags = np.unique(data[:,1], return_inverse=1)
# a group is "good" if at least one of its rows has d == 1
goodIDs = np.flatnonzero(np.bincount(tags, data[:,3]==1) >= 1)
# keep only the rows belonging to good groups
out = data[np.in1d(tags, goodIDs)]

Sample run -

In [15]: data
Out[15]: 
array([[20, 10,  5,  1],
       [20, 73,  5,  0],
       [20, 73,  5,  1],
       [20, 31,  5,  0],
       [20, 10,  5,  1],
       [20, 10,  5,  0],
       [20, 42,  5,  1],
       [20, 54,  5,  0],
       [20, 73,  5,  0],
       [20, 54,  5,  0],
       [20, 54,  5,  0],
       [20, 31,  5,  0]])

In [16]: out
Out[16]: 
array([[20, 10,  5,  1],
       [20, 73,  5,  0],
       [20, 73,  5,  1],
       [20, 10,  5,  1],
       [20, 10,  5,  0],
       [20, 42,  5,  1],
       [20, 73,  5,  0]])

Note that if the values in column b are always non-negative integers, we can skip np.unique and bin on the b values directly, like so:

# bin directly on the raw b values (requires them to be non-negative integers)
goodIDs = np.flatnonzero(np.bincount(data[:,1], data[:,3]==1) >= 1)
out = data[np.in1d(data[:,1], goodIDs)]

Sample run -

In [44]: data
Out[44]: 
array([[20,  0,  5,  1],
       [20,  0,  5,  1],
       [20,  0,  5,  0],
       [20,  1,  5,  0],
       [20,  1,  5,  0],
       [20,  2,  5,  1],
       [20,  3,  5,  0],
       [20,  3,  5,  0],
       [20,  3,  5,  1],
       [20,  4,  5,  0],
       [20,  4,  5,  0],
       [20,  4,  5,  0]])

In [45]: out
Out[45]: 
array([[20,  0,  5,  1],
       [20,  0,  5,  1],
       [20,  0,  5,  0],
       [20,  2,  5,  1],
       [20,  3,  5,  0],
       [20,  3,  5,  0],
       [20,  3,  5,  1]])

Also, since data[:,3] contains only 0s and 1s, we can use data[:,3] directly instead of data[:,3]==1 in the solutions above.
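
For example, the simplified variant then reduces to the following (a minimal sketch of the same call with the comparison dropped; it still assumes non-negative integer b values and a d column of 0s and 1s):

# sum the raw d values per b bin; a non-zero sum means the group has a 1
goodIDs = np.flatnonzero(np.bincount(data[:,1], data[:,3]) >= 1)
out = data[np.in1d(data[:,1], goodIDs)]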


Runtime test -

In [69]: def logical_or_based(data): #@ Eric soln
    ...:     b_vals = data[:,1]
    ...:     d_vals = data[:,3]
    ...:     is_ok = np.zeros(np.max(b_vals) + 1, dtype=np.bool_)
    ...:     np.logical_or.at(is_ok, b_vals, d_vals)
    ...:     return is_ok[b_vals]
    ...: 
    ...: def in1d_based(data):
    ...:     goodIDs = np.flatnonzero(np.bincount(data[:,1],data[:,3])!=0)
    ...:     out = np.in1d(data[:,1],goodIDs)
    ...:     return out
    ...: 

In [70]: # Setup input
    ...: data = np.random.randint(0,100,(10000,4))
    ...: data[:,1] = np.sort(np.random.randint(0,100,(10000)))
    ...: data[:,3] = np.random.randint(0,2,(10000))
    ...: 

In [71]: %timeit logical_or_based(data) #@ Eric soln
1000 loops, best of 3: 1.44 ms per loop

In [72]: %timeit in1d_based(data)
1000 loops, best of 3: 528 µs per loop
+3

Here's another way to do it:

import numpy as np

my_list = [[20,0,5,1],
    [20,0,5,1],
    [20,0,5,0],
    [20,1,5,0],
    [20,1,5,0],
    [20,2,5,1],
    [20,3,5,0],
    [20,3,5,0],
    [20,3,5,1],
    [20,4,5,0],
    [20,4,5,0],
    [20,4,5,0]]

all_ids = np.array(my_list)[:,1]
unique_ids = np.unique(all_ids)
indices = [np.where(all_ids==ui)[0][0] for ui in unique_ids ]

final = []
# assumes the rows are already grouped (sorted) by the b column
for i in range(len(unique_ids)):
    start = indices[i]
    end = indices[i + 1] if i + 1 < len(indices) else len(my_list)
    tmp_group = my_list[start:end]
    if 1 in np.array(tmp_group)[:, 3]:
        final.extend(tmp_group)

print(np.array(final))

Output:

[[20  0  5  1]
 [20  0  5  1]
 [20  0  5  0]
 [20  2  5  1]
 [20  3  5  0]
 [20  3  5  0]
 [20  3  5  1]]
+1

To remove every sublist whose second element (b) is equal to 1:

[sublist for sublist in list_ if sublist[1] != 1]

To remove sublists with b equal to 1 only when their d value is not 1:

[sublist for sublist in list_ if not (sublist[1] == 1 and sublist[3] != 1) ]
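
These comprehensions filter each row on its own values. To get the group-based filter from the question in the same style, first collect the b values that have at least one d == 1 and then keep only the rows from those groups; a sketch, assuming list_ holds the question's rows as a plain nested list:

# b values ("ids") whose group contains at least one row with d == 1
good_b = {row[1] for row in list_ if row[3] == 1}
# keep only the rows belonging to those groups
filtered = [row for row in list_ if row[1] in good_b]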
+1

Assuming that:

  • b >= 0
  • b is an integer
  • b is dense, i.e. max(b) ~= len(unique(b))

you can use np.ufunc.at:

# unpack for clarity - this costs nothing in numpy
b_vals = data[:,1]
d_vals = data[:,3]

# build an array indexed by b values
is_ok = np.zeros(np.max(b_vals) + 1, dtype=np.bool_)
np.logical_or.at(is_ok, b_vals, d_vals)
# is_ok == array([ True, False,  True,  True, False], dtype=bool)

# take the rows which have a b value that was deemed OK
result = data[is_ok[b_vals]]

Here, np.logical_or.at(is_ok, b_vals, d_vals) is roughly equivalent to:

for idx, val in zip(b_vals, d_vals):
    is_ok[idx] = np.logical_or(is_ok[idx], val)
+1

Grouping problems like this can also be solved compactly with the numpy_indexed package:

import numpy_indexed as npi
g = npi.group_by(data[:, 1])      # group the rows by column b
ids, valid = g.any(data[:, 3])    # per-group flag: does any row have a non-zero d?
result = data[valid[g.inverse]]   # broadcast the flag back to rows and filter
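
The key step is valid[g.inverse], which broadcasts the per-group boolean back onto the individual rows. Roughly the same pattern can be written in plain NumPy (a sketch of the idea, essentially what the np.unique/np.bincount answer above does):

# per-group "contains a 1" flag, broadcast back to rows via the inverse index
uniq_b, inverse = np.unique(data[:, 1], return_inverse=True)
group_has_one = np.bincount(inverse, weights=data[:, 3]) > 0
result = data[group_has_one[inverse]]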
+1

Source: https://habr.com/ru/post/1658210/

