Replace 1 with 0 in sequence

I have a huge list of 1s and 0s, like this:

x = [1,0,0,0,1,1,1,1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1]

Full list here.

I want to create a new list y with the condition that the 1s should be preserved only if they occur in a run of length >= 10; otherwise those 1s should be replaced with zeros. For example, based on the x above, y should become:

y = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1]

So far I have the following:

  • Find out where the changes are, and
  • Find out what sequences occur at what frequency:

import numpy as np
import itertools

nx = np.array(x)
print(np.argwhere(np.diff(nx)).squeeze())

answer = []
for key, group in itertools.groupby(nx):
    answer.append((key, len(list(group))))
print(answer)

which gives me:

[0 3 8 14]  # A
[(1, 1), (0, 3), (1, 5), (0, 6), (1, 10)] # B

#A means that changes occur after positions 0, 3, 8 and 14.

#B means there is a single 1, followed by three 0s, then five 1s, then six 0s, and finally ten 1s.
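
As a quick sanity check on this run-length encoding (a small sketch of my own, reusing the answer list from the snippet above), the pairs in #B can be expanded back into the original x with np.repeat:

values = [key for key, count in answer]
counts = [count for key, count in answer]
assert np.repeat(values, counts).tolist() == x  # round-trips back to the original list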

How do I now construct y from this, i.e. replace the 1s in every run shorter than 10 with 0s?



A straightforward approach using itertools.groupby:

>>> from itertools import groupby
>>> x = [1,0,0,0,1,1,1,1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1]
>>> result = []
>>> for k, g in groupby(x):
...     if k:
...         g = list(g)
...         if len(g) < 10:
...             g = len(g)*[0]
...     result.extend(g)
...
>>> result
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
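
If you need this more than once, the same logic can be wrapped in a small helper with a configurable threshold (a sketch of my own, not part of the original answer; the name zero_short_runs is made up):

from itertools import groupby

def zero_short_runs(seq, min_run=10):
    # replace runs of 1s shorter than min_run with 0s; runs of 0s are left alone
    result = []
    for key, group in groupby(seq):
        run = list(group)
        if key and len(run) < min_run:
            run = [0] * len(run)
        result.extend(run)
    return result

y = zero_short_runs(x)  # same result as the loop above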

Note that this is significantly faster than the pandas solution:

In [11]: from itertools import groupby

In [12]: %%timeit
    ...: result = []
    ...: for k, g in groupby(x):
    ...:     if k:
    ...:         g = list(g)
    ...:         if len(g) < 10:
    ...:             g = len(g)*[0]
    ...:     result.extend(g)
    ...:
181 µs ± 1.72 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

In [13]: %%timeit s = pd.Series(x)
    ...: s[s.groupby(s.ne(1).cumsum()).transform('count').lt(10)] = 0
    ...:
4.03 ms ± 176 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

And if you also count the creation of the pd.Series and the conversion back to a list against pandas, it is slower still:

In [14]: %%timeit
    ...: s = pd.Series(x)
    ...: s[s.groupby(s.ne(1).cumsum()).transform('count').lt(10)] = 0
    ...: s = s.tolist()
    ...:
4.92 ms ± 119 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

A numpy solution. The script below also benchmarks it against the other answers:

import numpy as np
import pandas as pd
from itertools import groupby
import re
from timeit import timeit

def f_pp(data):
    # boundaries of runs of 1s: a switch wherever the value changes,
    # plus virtual switches at the ends if data starts/ends with a 1
    switches = np.empty((data.size + 1,), bool)
    switches[0] = data[0]
    switches[-1] = data[-1]
    switches[1:-1] = data[:-1]^data[1:]
    # pair the boundaries up into (start, stop) indices of each run of 1s
    switches = np.where(switches)[0].reshape(-1, 2)
    # keep only the runs of 1s that are at least 10 long
    switches = switches[switches[:, 1]-switches[:, 0] >= 10].ravel()
    # lengths of the alternating 0/1 segments of the output ...
    reps = np.empty((switches.size + 1,), int)
    reps[1:-1] = np.diff(switches)
    reps[0] = switches[0]
    reps[-1] = data.size - switches[-1]
    # ... filled with alternating values 0, 1, 0, 1, ...
    return np.repeat(np.arange(reps.size) & 1, reps)

def f_ja(data):
    result = []
    for k, g in groupby(data):
        if k:
            g = list(g)
            if len(g) < 10:
                g = len(g)*[0]
        result.extend(g)
    return result

def f_mu(s):
    s = s.copy()
    s[s.groupby(s.ne(1).cumsum()).transform('count').lt(10)] = 0
    return s

def vrange(starts, stops):
     stops = np.asarray(stops)
     l = stops - starts # Lengths of each range.
     return np.repeat(stops - l.cumsum(), l) + np.arange(l.sum())

def f_ka(data):
    x = data.copy()
    d = np.where(np.diff(x) != 0)[0]
    d2 = np.diff(np.concatenate(([0], d, [x.size])))
    ind = np.where(d2 >= 10)[0] - 1
    x[vrange(d[ind] + 1, d[ind + 1] + 2)] = 0
    return x

def f_ol(data):
    return list(re.sub(b'(?<!\x01)\x01{,9}(?!\x01)', lambda m: len(m.group()) * b'\x00', bytes(data)))

n = 10_000
data = np.repeat((np.arange(n) + np.random.randint(2))&1, np.random.randint(1, 20, (n,)))  # n alternating 0/1 runs of random length 1-19
datal = data.tolist()
datap = pd.Series(data)

kwds = dict(globals=globals(), number=100)

print(np.where(f_ja(datal) != f_pp(data))[0])
print(np.where(f_ol(datal) != f_pp(data))[0])
#print(np.where(f_ka(data) != f_pp(data))[0])
print(np.where(f_mu(datap).values != f_pp(data))[0])

print('itertools.groupby: {:6.3f} ms'.format(10 * timeit('f_ja(datal)', **kwds)))
print('re:                {:6.3f} ms'.format(10 * timeit('f_ol(datal)', **kwds)))
#print('numpy Kasramvd:    {:6.3f} ms'.format(10 * timeit('f_ka(data)', **kwds)))
print('pandas:            {:6.3f} ms'.format(10 * timeit('f_mu(datap)', **kwds)))
print('numpy pp:          {:6.3f} ms'.format(10 * timeit('f_pp(data)', **kwds)))

Result:

[]                                        # Delta ja, pp
[]                                        # Delta ol, pp
[  749   750   751 ... 98786 98787 98788] # Delta mu, pp
itertools.groupby:  5.415 ms
re:                28.197 ms
pandas:            14.972 ms
numpy pp:           0.788 ms

The solutions of @Olivier and @juanpa.arrivillaga agree with mine, while @MaxU's pandas solution gives different results on this data. @Kasramvd's is commented out because I could not get it to run here (quite possibly my mistake rather than a problem with @Kasramvd's answer).

Obviously, the relative timings depend on the details of the data (list length, run lengths, etc.).
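
As a quick check (mine, not part of the original benchmark), f_pp applied to the x from the question reproduces the expected y:

x = [1,0,0,0,1,1,1,1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1]
print(f_pp(np.array(x)).tolist())
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]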


You can build y directly from the run-length list B that you have already computed:

b = [(1, 1), (0, 3), (1, 5), (0, 6), (1, 10)] # B

y = sum(([num and int(rep >= 10)] * rep for num, rep in b), [])
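
Here num and int(rep >= 10) evaluates to 1 only for runs of 1s of length at least 10, so for the b above this reproduces the y from the question (a quick check, not from the original answer):

assert y == [0] * 15 + [1] * 10   # fifteen 0s followed by ten 1s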

Using re

Alternatively, as a one-liner, you can use re on the list converted to bytes:

import re

x = [1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

y = list(re.sub(b'(?<!\x01)\x01{,9}(?!\x01)', lambda m: len(m.group()) * b'\x00', bytes(x)))

Output:

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
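
The pattern is easier to read in verbose form; this is an equivalent rewrite I added for illustration, not part of the original answer:

import re

x = [1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

pattern = re.compile(rb"""
    (?<!\x01)   # not preceded by a 1-byte: we are at the start of a run
    \x01{,9}    # a run of at most nine 1-bytes
    (?!\x01)    # not followed by a 1-byte: the run really ends here
""", re.VERBOSE)

y = list(pattern.sub(lambda m: len(m.group()) * b'\x00', bytes(x)))
# same result: fifteen 0s followed by ten 1s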

A Numpy approach:

d = np.where(np.diff(x) != 0)[0]                  # indices where the value changes (x as a numpy array)
d2 = np.diff(np.concatenate(([0], d, [x.size])))  # gaps between the change points
ind = np.where(d2 >= 10)[0] - 1
x[vrange(d[ind] + 1, d[ind + 1] + 2)] = 0         # vrange is defined further down

In pure Python, you can use itertools.chain, itertools.repeat and itertools.groupby:

from itertools import chain, repeat, groupby

chain.from_iterable(repeat(0, len(i)) if len(i) >= 10 else i for i in [list(g) for _, g in groupby(x)])

Demonstrations:

# Python

In [28]: list(chain.from_iterable(repeat(0, len(i)) if len(i) >= 10 else i for i in [list(g) for _, g in groupby(x)]))
Out[28]: [1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

# Numpy

In [161]: x = np.array([1,0,0,0,1,1,1,1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1, 0, 0, 1, 1, 1, 1, 1, 1 ,1, 1, 1, 1, 0, 0])

In [162]: d = np.where(np.diff(x) != 0)[0]

In [163]: d2 = np.diff(np.concatenate(([0], d, [x.size])))

In [164]: ind = np.where(d2 >= 10)[0] - 1

In [165]: def vrange(starts, stops):
     ...:     stops = np.asarray(stops)
     ...:     l = stops - starts # Lengths of each range.
     ...:     return np.repeat(stops - l.cumsum(), l) + np.arange(l.sum())
     ...: 

In [166]: x[vrange(d[ind] + 1, d[ind + 1] + 2)] = 0

In [167]: x
Out[167]: 
array([1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

For vrange I used this answer: https://codereview.stackexchange.com/questions/83018/vectorized-numpy-version-of-arange-with-multiple-start-stop, but I think there may be more optimized approaches for this.


Try the following:

y = []
for pair in b: ## b is the list which you called #B
    add = 0
    if pair[0] == 1 and pair[1] > 9:
        add = 1
    y.extend([add] * pair[1])
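
Running this with the b from the question gives fifteen 0s followed by ten 1s, i.e. the desired y. The same idea also fits in a single comprehension (my own compact variant, not from the original answer):

b = [(1, 1), (0, 3), (1, 5), (0, 6), (1, 10)]
# 1 for runs of 1s with length >= 10, otherwise 0, repeated to the run length
y = [1 if num == 1 and rep > 9 else 0 for num, rep in b for _ in range(rep)]
assert y == [0] * 15 + [1] * 10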

using pandas:

import pandas as pd

In [130]: s = pd.Series(x)

In [131]: s
Out[131]:
0     1
1     0
2     0
3     0
4     1
     ..
20    1
21    1
22    1
23    1
24    1
Length: 25, dtype: int64

In [132]: s[s.groupby(s.ne(1).cumsum()).transform('count').lt(10)] = 0

In [133]: s.tolist()
Out[133]: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [134]: s
Out[134]:
0     0
1     0
2     0
3     0
4     0
     ..
20    1
21    1
22    1
23    1
24    1
Length: 25, dtype: int64
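
Roughly, the one-liner works like this (my reading of it, spelled out step by step; the intermediate names are mine):

import pandas as pd

s = pd.Series(x)  # x is the list from the question

group_ids = s.ne(1).cumsum()                          # every 0 starts a new group: that 0 plus the 1s after it (leading 1s form group 0)
group_sizes = s.groupby(group_ids).transform('count')  # broadcast each group's size back to its members
s[group_sizes.lt(10)] = 0                             # zero out members of small groups; long runs of 1s survive
print(s.tolist())                                     # [0]*15 + [1]*10, matching Out[133] above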

for your "huge" list, approx. 7 ms on my old laptop:

In [141]: len(x)
Out[141]: 5124

In [142]: %%timeit
     ...: s = pd.Series(x)
     ...: s[s.groupby(s.ne(1).cumsum()).transform('count').lt(10)] = 0
     ...: res = s.tolist()
     ...:
6.56 ms ± 16.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Source: https://habr.com/ru/post/1694551/

