Search for a set of indexes that maps rows of one NumPy ndarray to another

I have two structured 2D arrays numpythat are basically equal, which means

A = numpy.array([[a1,b1,c1],
                 [a2,b2,c2],
                 [a3,b3,c3],
                 [a4,b4,c4]]) 

B = numpy.array([[a2,b2,c2],
                 [a4,b4,c4],
                 [a3,b3,c3],
                 [a1,b1,c1]])

Not in the sense

numpy.array_equal(A,B) # False
numpy.array_equiv(A,B) # False
numpy.equal(A,B) # ndarray of True and False

But in the sense that one array (A)is the original, and in another the (B)data is shuffled along one axis (can be along rows or columns).

What is an effective way to sort / shuffle Bto match or get equal Aor, alternatively, sort Ato become equal B? The equality check is really not important if both arrays are shuffled to each other. Aand therefore Bhave unique strings.

I tried a method viewfor sorting arrays like this:

def sort2d(A):
    A_view = np.ascontiguousarray(A).view(np.dtype((np.void,
             A.dtype.itemsize * A.shape[1])))
    A_view.sort()
    return A_view.view(A.dtype).reshape(-1,A.shape[1])   

, -, . , .

+4
2

, , , A → B. :

A = np.random.permutation(12).reshape(4, 3)
idx = np.random.permutation(4)
B = A[idx]

print(repr(A))
# array([[ 7, 11,  6],
#        [ 4, 10,  8],
#        [ 9,  2,  0],
#        [ 1,  3,  5]])

print(repr(B))
# array([[ 1,  3,  5],
#        [ 4, 10,  8],
#        [ 7, 11,  6],
#        [ 9,  2,  0]])

idx, , A[idx] == B. , A B .


* , A, , B A. A B 1D np.void dtype, :

rowtype = np.dtype((np.void, A.dtype.itemsize * A.size / A.shape[0]))
# A and B must be C-contiguous, might need to force a copy here
a = np.ascontiguousarray(A).view(rowtype).ravel()
b = np.ascontiguousarray(B).view(rowtype).ravel()

a_to_as = np.argsort(a)     # indices that sort the rows of A in lexical order

np.searchsorted , B A:

# using the `sorter=` argument rather than `a[a_to_as]` avoids making a copy of `a`
as_to_b = a.searchsorted(b, sorter=a_to_as)

A → B A → A s → B

a_to_b = a_to_as.take(as_to_b)
print(np.all(A[a_to_b] == B))
# True

A B , B → A ,

b_to_a = np.argsort(a_to_b)
print(np.all(B[b_to_a] == A))
# True

:

def find_row_mapping(A, B):
    """
    Given A and B, where B is a copy of A permuted over the first dimension, find
    a set of indices idx such that A[idx] == B.
    This is a unique mapping if and only if there are no repeated rows in A and B.

    Arguments:
        A, B:   n-dimensional arrays with same shape and dtype
    Returns:
        idx:    vector of indices into the rows of A
    """

    if not (A.shape == B.shape):
        raise ValueError('A and B must have the same shape')
    if not (A.dtype == B.dtype):
        raise TypeError('A and B must have the same dtype')

    rowtype = np.dtype((np.void, A.dtype.itemsize * A.size / A.shape[0]))
    a = np.ascontiguousarray(A).view(rowtype).ravel()
    b = np.ascontiguousarray(B).view(rowtype).ravel()
    a_to_as = np.argsort(a)
    as_to_b = a.searchsorted(b, sorter=a_to_as)

    return a_to_as.take(as_to_b)

Benchmark:

In [1]: gen = np.random.RandomState(0)
In [2]: %%timeit A = gen.rand(1000000, 100); B = A.copy(); gen.shuffle(B)
....: find_row_mapping(A, B)
1 loop, best of 3: 2.76 s per loop

* quicksort over rows, O (n log n). , .

+4

, . Jaime Answer, vstack . , , ( ).

unique2d :

def unique2d(arr,consider_sort=False,return_index=False,return_inverse=False): 
    """Get unique values along an axis for 2D arrays.

        input:
            arr:
                2D array
            consider_sort:
                Does permutation of the values within the axis matter? 
                Two rows can contain the same values but with 
                different arrangements. If consider_sort 
                is True then those rows would be considered equal
            return_index:
                Similar to numpy unique
            return_inverse:
                Similar to numpy unique
        returns:
            2D array of unique rows
            If return_index is True also returns indices
            If return_inverse is True also returns the inverse array 
            """

    if consider_sort is True:
        a = np.sort(arr,axis=1)
    else:
        a = arr
    b = np.ascontiguousarray(a).view(np.dtype((np.void, 
            a.dtype.itemsize * a.shape[1])))

    if return_inverse is False:
        _, idx = np.unique(b, return_index=True)
    else:
        _, idx, inv = np.unique(b, return_index=True, return_inverse=True)

    if return_index == False and return_inverse == False:
        return arr[idx]
    elif return_index == True and return_inverse == False:
        return arr[idx], idx
    elif return_index == False and return_inverse == True:
        return arr[idx], inv
    else:
        return arr[idx], idx, inv

:

def row_mapper(a,b,consider_sort=False):
    """Given two 2D numpy arrays returns mappers idx_a and idx_b 
        such that a[idx_a] = b[idx_b] """

    assert a.dtype == b.dtype
    assert a.shape == b.shape

    c = np.concatenate((a,b))
    _, inv = unique2d(c, consider_sort=consider_sort, return_inverse=True)
    mapper_a = inv[:b.shape[0]]
    mapper_b = inv[b.shape[0]:]

    return np.argsort(mapper_a), np.argsort(mapper_b) 

n = 100000
A = np.arange(n).reshape(n//4,4)
B = A[::-1,:]

idx_a, idx_b  = row_mapper(A,B)
print np.all(A[idx_a]==B[idx_b])
# True

Benchmark: @ali_m

%timeit find_row_mapping(A,B) # ali_m solution
%timeit row_mapper(A,B) # current solution

# n = 100
100000 loops, best of 3: 12.2 µs per loop
10000 loops, best of 3: 47.3 µs per loop

# n = 1000
10000 loops, best of 3: 49.1 µs per loop
10000 loops, best of 3: 148 µs per loop

# n = 10000
1000 loops, best of 3: 548 µs per loop
1000 loops, best of 3: 1.6 ms per loop

# n = 100000
100 loops, best of 3: 6.96 ms per loop
100 loops, best of 3: 19.3 ms per loop

# n = 1000000
10 loops, best of 3: 160 ms per loop
1 loops, best of 3: 372 ms per loop

# n = 10000000
1 loops, best of 3: 2.54 s per loop
1 loops, best of 3: 5.92 s per loop

, , , 2-3 , ali_m, , , , . , .

+1

Source: https://habr.com/ru/post/1627516/


All Articles