The fastest match matrix

I have two arrays and I want to compute a list/array of matches: a list of all index pairs i, j such that a[i] == b[j]. This is my code now:

b = np.array([3, 5, 6, 4])
a = np.array([1, 2, 3, 4])

np.array([[i, j] for i in range(a.size) for j in range(b.size) if a[i] == b[j]])
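
For these arrays the comprehension returns

array([[2, 0],
       [3, 3]])

since a[2] == b[0] == 3 and a[3] == b[3] == 4.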

Is there a faster way to do this (maybe a few different options)?


Approach No. 1

One approach would be with np.in1d -

m_a = np.in1d(a,b)
I = np.flatnonzero(m_a)
J = np.flatnonzero(np.in1d(b, a[m_a]))

Example input, output -

In [367]: a
Out[367]: array([1, 2, 3, 4])

In [368]: b
Out[368]: array([3, 5, 6, 4])

In [370]: I
Out[370]: array([2, 3])

In [371]: J
Out[371]: array([0, 3])

Approach No. 2

Another direct, but more memory-intensive, approach would be with broadcasting -

I,J = np.nonzero(a[:,None] == b)
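
If the Nx2 layout from the question is needed, I and J can simply be stacked; a quick sketch on the sample arrays:

import numpy as np

a = np.array([1, 2, 3, 4])
b = np.array([3, 5, 6, 4])

I, J = np.nonzero(a[:, None] == b)   # broadcasts to a (4, 4) boolean matrix
print(np.stack([I, J], axis=1))      # [[2 0]
                                     #  [3 3]]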

Approach No. 3

If there are no duplicates in the input arrays, we can use np.searchsorted. There are two variants: one for a sorted a and one for a generic a.

Variant No. 1: For a sorted a -

idx = np.searchsorted(a, b)
idx[idx==a.size] = 0
mask = a[idx] == b
I = np.searchsorted(a,b[mask])
J = np.flatnonzero(mask)

Variant No. 2: For a generic a (using argsort) -

sidx = a.argsort()
a_sort = a[sidx]
idx = np.searchsorted(a_sort, b)
idx[idx==a.size] = 0
mask = a_sort[idx] == b
I = sidx[np.searchsorted(a_sort,b[mask])]
J = np.flatnonzero(mask)
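
As a minimal check of variant No. 2, with the question's values but an unsorted a (still assuming no duplicates):

import numpy as np

a = np.array([4, 1, 3, 2])           # unsorted, no duplicates
b = np.array([3, 5, 6, 4])

sidx = a.argsort()                   # [1 3 2 0]
a_sort = a[sidx]                     # [1 2 3 4]
idx = np.searchsorted(a_sort, b)
idx[idx == a.size] = 0               # clamp out-of-range insertion points
mask = a_sort[idx] == b              # which elements of b occur in a
I = sidx[np.searchsorted(a_sort, b[mask])]
J = np.flatnonzero(mask)
print(I, J)                          # [2 0] [0 3]  ->  a[2] == b[0], a[0] == b[3]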

Another NumPy option is to use numpy.argwhere():

ax = np.tensordot(a, np.ones(len(b)), axes=0)   # ax[i, j] == a[i]
bx = np.tensordot(np.ones(len(a)), b, axes=0)   # bx[i, j] == b[j]
np.argwhere(ax - bx == 0)

ax and bx are matrices holding the values of a and b broadcast along rows and columns respectively, so ax - bx is the matrix of pairwise differences a[i] - b[j]. It is zero exactly where the elements match, and np.argwhere picks out those index pairs.
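
On the arrays from the question the difference matrix makes this visible (zeros mark the matches):

import numpy as np

a = np.array([1, 2, 3, 4])
b = np.array([3, 5, 6, 4])

ax = np.tensordot(a, np.ones(len(b)), axes=0)   # ax[i, j] == a[i]
bx = np.tensordot(np.ones(len(a)), b, axes=0)   # bx[i, j] == b[j]
print(ax - bx)
# [[-2. -4. -5. -3.]
#  [-1. -3. -4. -2.]
#  [ 0. -2. -3. -1.]
#  [ 1. -1. -2.  0.]]
print(np.argwhere(ax - bx == 0))                # [[2 0]
                                                #  [3 3]]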


A benchmark of the different approaches in Python (timings in seconds are noted in the comments).

import numpy as np
import random
import time

random.seed(12345)
b = [random.randint(0,100000) for i in range(10000)]
a = [random.randint(0,100000) for i in range(10000)]

First, the plain list-based comprehension from the question:

#List based approach
start_time = time.time()
c2 = [[i,j]  for i in range(len(a)) for j in range(len(b)) if a[i] == b[j]]
print(time.time() - start_time)  # t = 29.2776758671

a = np.array(a)
b = np.array(b)

#NumPy Array based approach    
start_time = time.time()
c1 = np.array([[i, j] for i in range(a.size) for j in range(b.size) if a[i] == b[j]])
print(time.time() - start_time)  # t = 46.374776125

Switching the same comprehension to NumPy arrays does not help (it is even slower). Building a coincidence matrix with broadcasting instead is dramatically faster:

#Coincidence Matrix (NumPy) based approach    
start_time = time.time()
c3 = (a[None,:] == b[:,None]).astype(int)
c3s = np.where(c3 == 1)
print(time.time() - start_time)  # t = 0.857568979263

Finally, applying np.nonzero directly to the boolean comparison (Approach No. 2 above), without the intermediate integer matrix, is faster still:

c4 = np.nonzero(a[:,None] == b)
# t = 0.399062156677
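
Note that c3s and c4 are tuples of index arrays rather than the Nx2 array from the question, and c3s lists the b-indices first because b was broadcast along the rows. A small sketch of the conversion:

import numpy as np

a = np.array([1, 2, 3, 4])
b = np.array([3, 5, 6, 4])

c3s = np.where(a[None, :] == b[:, None])     # (b-indices, a-indices)
print(np.column_stack((c3s[1], c3s[0])))     # recovers the question's (i, j) pairs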


If your arrays contain nonnegative integers and you have enough memory, the following is a bit faster than the approaches above. The benchmarking code is mostly borrowed from @Dominique Fuchs.

import numpy as np
import itertools as it
import time

np.random.seed(12345)
b = np.random.randint(0, 100000, (10000,))
a = np.random.randint(0, 100000, (10000,))
#a = np.cumsum(np.random.randint(0, 20, (10000,)))
#b = np.cumsum(np.random.randint(0, 20, (10000,)))
np.random.shuffle(a)
np.random.shuffle(b)


#Coincidence Matrix (NumPy) based approach (dominique fuchs)
start_time = time.time()
c3 = (a[None,:] == b[:,None]).astype(int)
c3s = np.where(c3 == 1)
t = time.time() - start_time
print('df      {: 12.6f}'.format(1000*t))


# divakar approach 1
start_time = time.time()
m_a = np.in1d(a,b)
I = np.flatnonzero(m_a)
J = np.flatnonzero(np.in1d(b, a[m_a]))
t = time.time() - start_time
print('div 1   {: 12.6f}'.format(1000*t))

# divakar approach 2
start_time = time.time()
I,J = np.nonzero(a[:,None] == b)
t = time.time() - start_time
print('div 2   {: 12.6f}'.format(1000*t))


# bm
import collections

def default(a):
    d=collections.defaultdict(list)
    for k,v in enumerate(a):
        d[v].append(k)
    return d

def coincidences(a,b):
    aa=default(a)
    bb=default(b)
    ab=set(aa.keys()) & set(bb.keys())
    l=[]
    for k in ab:
      for i in aa[k]:
          for j in bb[k]:
              l.append((i,j))
    return l


start_time = time.time()
l = coincidences(a, b)
t = time.time() - start_time
print('bm      {: 12.6f}'.format(1000*t))


# my (pp) approach
start_time = time.time()
ws = np.empty((100000,), dtype=int)
ws[b] = -1
ws[a] = np.arange(len(a))
b_ind = np.flatnonzero(ws[b] > -1)
a_ind = ws[b[b_ind]]
# find duplicates
wsa = ws[a]
# writing to ws[a] above duplicate values in a mean that the same memory
# is written multiple times and only the value written with the last
# occurrence of the duplicate survives.
# All other occurrences can therefore be found by
dup = np.flatnonzero(wsa != np.arange(len(a)))
# Next we have to separate groups of duplicates which at this point are
# all jumbled together.
# This is done by argsorting and grouping keyed by the index of the
# last occurrence.
didx = np.argsort(wsa[dup])
dups = dup[didx]
wsad = wsa[dups]
# Find indices where the key changes, in other words group boundaries.
# Append one bin for junk. (Things marked with index -1 will find their 
# way there.)
split = np.flatnonzero(np.r_[True, wsad[:-1] != wsad[1:], True, True])
split[-1] -= 1
# split duplicate indices into groups.
blocks = np.split(dups, split[1:-2])
ws[a] = -1
ws[a[dups[split[:-2]]]] = np.arange(len(split)-2)
# For each match in b grps will hold the index of the group of
# corresponding duplicates in a; non-matches are marked -1
grps = ws[b[b_ind]]
# Whereever the last occurrence of a duplicate in a was matched to an
# element of b, the corresponding group also matches.
# The index of the element of b must be repeated to keep b_ind and      
# a_ind (where the group will be inserted) in sync
b_ind = np.repeat(b_ind, 1 + np.diff(split)[grps])
# cut up a wherever the last occurrence of a group of duplicates is found
split = np.flatnonzero(grps > -1)
a_chunks = np.split(a_ind, split)
# insert the corresponding groups and rejoin
a_ind = np.concatenate([c for a in it.zip_longest(a_chunks, (blocks[g] for g in grps[split]), fillvalue=np.array([], int)) for c in a])
t = time.time() - start_time
print('pp      {: 12.6f}'.format(1000*t))


o1 = np.lexsort(c3s)
o2 = np.lexsort((b_ind, a_ind))

print(np.all(c3s[1][o1] == a_ind[o2]) and np.all(c3s[0][o1] == b_ind[o2]))

Output Example:

#df        652.901411
#div 1       3.543615
#div 2     332.098961
#bm         14.440775
#pp          1.211882     <----- my solution
#True
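
The core trick, stripped of the duplicate handling, is a workspace array indexed by the values themselves; a minimal sketch assuming nonnegative integers below a known bound and no duplicate values in a:

import numpy as np

def match_no_dups(a, b, bound):
    # assumes values in [0, bound) and no duplicate values in a
    ws = np.full(bound, -1, dtype=int)
    ws[a] = np.arange(len(a))        # for each value, remember its position in a
    j = np.flatnonzero(ws[b] > -1)   # positions in b whose value also occurs in a
    i = ws[b[j]]                     # matching positions in a
    return i, j

a = np.array([1, 2, 3, 4])
b = np.array([3, 5, 6, 4])
print(match_no_dups(a, b, 10))       # (array([2, 3]), array([0, 3]))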

A (very) fast solution based on dicts, sets and lists, which is linear in time and space regardless of the data: with or without duplicates, large keys or not.

a,b = np.random.randint(0,10**8,size=(2,10**4))

import collections

def default(a):
    d=collections.defaultdict(list)
    for k,v in enumerate(a):
        d[v].append(k)
    return d

def coincidences(a,b):
    aa=default(a)
    bb=default(b)
    ab=set(aa.keys()) & set(bb.keys())
    l=[]
    for k in ab:
      for i in aa[k]:
          for j in bb[k]:
              l.append((i,j))
    return l
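
A quick check on the question's small arrays (the pair order depends on set iteration, hence the sort):

print(sorted(coincidences([1, 2, 3, 4], [3, 5, 6, 4])))   # [(2, 0), (3, 3)]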

Timings:

In [125]: %timeit coincidences(a,b)
10.6 ms ± 402 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Numpy wins only when it can implement a linear solution.

EDIT

Pandas equivalent solution (same timings):

import pandas as pd

def coincidences_pd(a,b):
    aa=pd.DataFrame(list(range(len(a))),a)
    bb=pd.DataFrame(list(range(len(b))),b)
    return pd.merge(aa,bb,left_index=True,right_index=True)

In [219]: coincidences_pd(a,b)
Out[219]: 
           0_x   0_y
54025822  1752  8046
84735197  7301  2956

Source: https://habr.com/ru/post/1689788/