Find the minimum distances between groups of points in 2D (fast and not too much memory)

I have two sets of points in 2D Aand B, and I need to find the minimum distance for each point in Ato point in B. So far I have used SciPy cdist with the code below

import numpy as np
from scipy.spatial.distance import cdist

def ABdist(A, B):
    # Distance to all points in B, for each point in A.
    dist = cdist(A, B, 'euclidean')
    # Indexes to minimum distances.
    min_dist_idx = np.argmin(dist, axis=1)
    # Store only the minimum distances for each point in A, to a point in B.
    min_dists = [dist[i][md_idx] for i, md_idx in enumerate(min_dist_idx)]

    return min_dist_idx, min_dists

N = 10000
A = np.random.uniform(0., 5000., (N, 2))
B = np.random.uniform(0., 5000., (N, 2))

min_dist_idx, min_dists = ABdist(A, B)

which is great for small values N. But now the lengths of the sets have increased from N=10000to N=35000, and I ran into

    dm = np.zeros((mA, mB), dtype=np.double)
MemoryError

I know that I can replace it with a cdistfor loop, which only stores the minimum distance (and index) for each point in Ato each point in B, since that is all I need. I do not need a complete distance matrix AxB. But I used cdistit precisely because it is fast.

cdist , (?) , ?

+4
2

, , k-d tree. , SciPy cKDTree :

from scipy.spatial import cKDTree
min_dists, min_dist_idx = cKDTree(B).query(A, 1)

, , , .

, 1 000 000 , :

N = 1000000
A = np.random.uniform(0., 5000., (N, 2))
B = np.random.uniform(0., 5000., (N, 2))

%timeit cKDTree(B).query(A, 1)
# 3.25 s ± 17.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+4

, . N, pt A. , B pts , .

, einsum matrix-multiplication, this post, pt A , , -

for pt in A:
    d = np.einsum('ij,ij->i',B,B) + pt.dot(pt) - 2*B.dot(pt)

, , A np.einsum('ij,ij->i',B,B), :

min_idx = np.empty(N, dtype=int)
min_dist = np.empty(N)
Bsqsum = np.einsum('ij,ij->i',B,B) 
for i,pt in enumerate(A):
    d = Bsqsum + pt.dot(pt) - 2*B.dot(pt)
    min_idx[i] = d.argmin()
    min_dist[i] = d[min_idx[i]]
min_dist = np.sqrt(min_dist)

-

np.einsum('ij,ij->i',B,B)[:,None] + np.einsum('ij,ij->i',A,A) - 2*B.dot(A.T)

, , A, 3D, :

chunk_size= 100 # Edit this as per memory setup available
                # More means more memory needed
A.shape = (A.shape[0]//chunk_size, chunk_size,-1)

min_idx = np.empty((N//chunk_size, chunk_size), dtype=int)
min_dist = np.empty((N//chunk_size, chunk_size))

Bsqsum = np.einsum('ij,ij->i',B,B)[:,None]
r = np.arange(chunk_size)
for i,chnk in enumerate(A):
    d = Bsqsum + np.einsum('ij,ij->i',chnk,chnk) - 2*B.dot(chnk.T)
    idx = d.argmin(0)
    min_idx[i] = idx
    min_dist[i] = d[idx,r]
min_dist = np.sqrt(min_dist)

min_idx.shape = (N,)
min_dist.shape = (N,)
A.shape = (N,-1)
+1

Source: https://habr.com/ru/post/1690625/


All Articles