Grouping by the range of differences between elements

I have an array of angles that I want to group into arrays with a maximum difference of 2 degrees between them.

for example: input:

angles = np.array([[1],[2],[3],[4],[4],[5],[10]])

Output

('group', 1)
[[1]
 [2]
 [3]]
('group', 2)
[[4]
 [4]
 [5]]
('group', 3)
[[10]]

numpy.diff gets the difference in the next element from the current, I need the difference of the next elements from the first of the group

itertools.groupby groups items not within a specific range

numpy.digitize groups elements according to a predefined range, not the range indicated by the elements of the array. (Maybe I can use this by getting the unique values ​​of the angles, grouping them by their difference and using this as a predefined range?)

.

, , : ( expand_dims vstack, 1d ( ), , )

angles = np.array([[1],[2],[3],[4],[4],[5],[10]])

groupedangles = []
idx1 = 0
diffAngleMax = 2

while(idx1 < len(angles)):
    angleA = angles[idx1]
    group = np.expand_dims(angleA, axis=0)
    for idx2 in xrange(idx1+1,len(angles)):
        angleB = angles[idx2]
        diffAngle = angleB - angleA
        if abs(diffAngle) <= diffAngleMax:
            group = np.vstack((group,angleB))
        else:
            idx1 = idx2
            groupedangles.append(group)
            break
    if idx2 == len(angles) - 1:
        if idx1 == idx2:
            angleA = angles[idx1]
            group = np.expand_dims(angleA, axis=0)
        groupedangles.append(group)
        break

for idx, x in enumerate(groupedangles):
    print('group', idx+1)
    print(x)

?

+4
2

. Cython

In [1]: import cython

In [2]: %load_ext Cython

In [3]: %%cython
   ...: import numpy as np
   ...: cimport numpy as np
   ...: def cluster(np.ndarray array, np.float64_t maxdiff):
   ...:     cdef np.ndarray[np.float64_t, ndim=1] flat = np.sort(array.flatten())
   ...:     cdef list breakpoints = []
   ...:     cdef np.float64_t seed = flat[0]
   ...:     cdef np.int64_t int = 0
   ...:     for i in range(0, len(flat)):
   ...:         if (flat[i] - seed) > maxdiff:
   ...:             breakpoints.append(i)
   ...:             seed = flat[i]
   ...:     return np.split(array, breakpoints)
   ...: 

In [4]: angles = np.random.choice(np.arange(5000), 500).astype(np.float64)[:, None]

In [5]: %timeit cluster(angles, 2)
422 µs ± 12.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [6]: angles = np.random.choice(np.arange(500), 1500).astype(np.float64)[:, None]

In [7]: %timeit cluster(angles, 2)
263 µs ± 14.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

. , O (N * log (N)).

Pre-

.

def cluster(array, maxdiff):
    tmp = array.copy()
    groups = []
    while len(tmp):
        # select seed
        seed = tmp.min()
        mask = (tmp - seed) <= maxdiff
        groups.append(tmp[mask, None])
        tmp = tmp[~mask]
    return groups

:

In [27]: cluster(angles, 2)
Out[27]: 
[array([[1],
        [2],
        [3]]), array([[4],
        [4],
        [5]]), array([[10]])]

500, 1000 1500 :

In [4]: angles = np.random.choice(np.arange(500), 500)[:, None]

In [5]: %timeit cluster(angles, 2)
1.25 ms ± 60.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [6]: angles = np.random.choice(np.arange(500), 1000)[:, None]

In [7]: %timeit cluster(angles, 2)
1.46 ms ± 37 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [8]: angles = np.random.choice(np.arange(500), 1500)[:, None]

In [9]: %timeit cluster(angles, 2)
1.99 ms ± 72.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

O (N ^ 2) O (N) , , : . .

In [4]: angles = np.random.choice(np.arange(500), 500)[:, None]

In [5]: %timeit cluster(angles, 2)
1.06 ms ± 27.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [6]: angles = np.random.choice(np.arange(1000), 500)[:, None]

In [7]: %timeit cluster(angles, 2)
1.79 ms ± 117 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [8]: angles = np.random.choice(np.arange(1500), 500)[:, None]

In [9]: %timeit cluster(angles, 2)
2.16 ms ± 90.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [10]: angles = np.random.choice(np.arange(5000), 500)[:, None]

In [11]: %timeit cluster(angles, 2)
3.21 ms ± 139 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
+4

. bincount argpartition, , N = 500 .

import numpy as np

def flexibin(a):
    idx0 = np.argsort(a)
    as_ = a[idx0]
    A = np.r_[as_, as_+2]
    idx = np.argsort(A)
    uinv = np.flatnonzero(idx >= len(a))
    linv = np.empty_like(idx)
    linv[np.flatnonzero(idx < len(a))] = np.arange(len(a))
    bins = [0]
    curr = 0
    while True:
        for j in range(uinv[idx[curr]], len(idx)):
            if idx[j] < len(a) and A[idx[j]] > A[idx[curr]] + 2:
                bins.append(j)
                curr = j
                break
        else:
            return np.split(idx0, linv[bins[1:]])

a = 180 * np.random.random((500,))
bins = flexibin(a)

mn, mx = zip(*((np.min(a[b]), np.max(a[b])) for b in bins))
assert np.all(np.diff(mn) > 2)
assert np.all(np.subtract(mx, mn) <= 2)
print('all ok')
+1

Source: https://habr.com/ru/post/1692246/


All Articles