I think you have two options: a simpler but slower solution, or a faster but more complex one.
# Pattern to search for in consecutive rows of ``row_pat``.
pat = np.asarray([1, 2, 2, 0])
N = len(pat)

df['rm0'] = (
    df['row_pat']
    # Slide a window of N rows; only complete windows are evaluated.
    .rolling(window=N, min_periods=N)
    # Truthy where the window equals ``pat`` element by element.
    .apply(lambda x: (x == pat).all())
    # Turn the zeros (non-matching windows) into NaN ...
    .mask(lambda x: x == 0)
    # ... so that each match can be back-filled over the N-1 rows before it,
    # i.e. the remaining rows of the matching window.
    .bfill(limit=N - 1)
    # Anything still NaN is not part of a match.
    .fillna(0)
    .astype(bool)
)
If performance is important, use strides. The solution below is a modified version of an existing stride-based solution (the link to the original is missing here):
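- use a rolling-window approach (`rolling_window`).
- compare each window with the pattern and return `True` where all elements match (`np.all`).
- get the indices of the first position of each match with `np.mgrid` and boolean indexing.
- create all indices covered by the matches with a list comprehension.
- compare with `numpy.in1d` and create a new column.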
def rolling_window(a, window):
    """Return every length-``window`` slice along the last axis of ``a``.

    The result is a read-only strided view of shape
    ``a.shape[:-1] + (n_windows, window)``, where ``n_windows`` is
    ``a.shape[-1] - window + 1`` (0 when ``a`` is shorter than ``window``).
    No data is copied; consecutive windows overlap in memory.
    """
    a = np.asarray(a)
    # Clamp at zero so an input shorter than the window yields "no windows"
    # instead of a negative-dimension error.
    n_windows = max(a.shape[-1] - window + 1, 0)
    shape = a.shape[:-1] + (n_windows, window)
    # Reuse the last-axis stride for the new axis: moving one step inside a
    # window advances one element of ``a``.
    strides = a.strides + (a.strides[-1],)
    # The windows alias each other, so hand out a read-only view.
    return np.lib.stride_tricks.as_strided(
        a, shape=shape, strides=strides, writeable=False
    )


arr = df['row_pat'].values
# b[i] is True when the window starting at position i equals ``pat``.
b = np.all(rolling_window(arr, N) == pat, axis=1)
# Start positions of the matches.
c = np.mgrid[0:len(b)][b]
# Every position covered by a match: N consecutive positions per start.
d = [i for x in c for i in range(x, x + N)]
# True for the rows that belong to a match.
# NOTE(review): newer NumPy releases favour np.isin over np.in1d - confirm
# against the installed version.
df['rm2'] = np.in1d(np.arange(len(arr)), d)
Another solution, thanks to @divakar:
arr = df['row_pat'].values
# m[i] is True when the window starting at position i equals ``pat``.
m = (rolling_window(arr, len(pat)) == pat).all(1)
# Pad with False so the mask has one entry per row of ``arr``.
m_ext = np.r_[m, np.zeros(len(arr) - len(m), dtype=bool)]
# Dilate each match start with a length-N structuring element; the negative
# origin shifts the element so that it extends from the start over the rows
# of the match.
df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))
Timings:
# Imports and RNG seeding for the timing setup.
import string

import numpy as np
import pandas as pd
from numpy.random import choice, randn
# binary_dilation is exposed by scipy.ndimage itself; the
# ``scipy.ndimage.morphology`` import path is deprecated in newer SciPy.
from scipy.ndimage import binary_dilation

# Seed only after NumPy has been imported, so that ``np`` is defined here.
np.random.seed(456)
def rolling_window(a, window):
    """Return every length-``window`` slice along the last axis of ``a``.

    The result is a read-only strided view of shape
    ``a.shape[:-1] + (n_windows, window)``, where ``n_windows`` is
    ``a.shape[-1] - window + 1`` (0 when ``a`` is shorter than ``window``).
    No data is copied; consecutive windows overlap in memory.
    """
    a = np.asarray(a)
    # Clamp at zero so an input shorter than the window yields "no windows"
    # instead of a negative-dimension error.
    n_windows = max(a.shape[-1] - window + 1, 0)
    shape = a.shape[:-1] + (n_windows, window)
    # Reuse the last-axis stride for the new axis: moving one step inside a
    # window advances one element of ``a``.
    strides = a.strides + (a.strides[-1],)
    # The windows alias each other, so hand out a read-only view.
    return np.lib.stride_tricks.as_strided(
        a, shape=shape, strides=strides, writeable=False
    )


# --- rm1: mask of match starts, dilated over N rows -----------------------
arr = df['row_pat'].values
# m[i] is True when the window starting at position i equals ``pat``.
m = (rolling_window(arr, len(pat)) == pat).all(1)
# Pad with False so the mask has one entry per row of ``arr``.
m_ext = np.r_[m, np.zeros(len(arr) - len(m), dtype=bool)]
# The negative origin shifts the length-N structuring element so that it
# extends from each match start over the rows of the match.
df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))

# --- rm2: explicit indices of all rows covered by a match -----------------
arr = df['row_pat'].values
# b[i] is True when the window starting at position i equals ``pat``.
b = np.all(rolling_window(arr, N) == pat, axis=1)
# Start positions of the matches.
c = np.mgrid[0:len(b)][b]
# Every position covered by a match: N consecutive positions per start.
d = [i for x in c for i in range(x, x + N)]
# True for the rows that belong to a match.
df['rm2'] = np.in1d(np.arange(len(arr)), d)
print (df.iloc[460:480]) date_time group_var row_pat values rm0 rm1 rm2 12045 2019-06-25 21:00:00 A 3 -0.081152 False False False 12094 2019-06-27 22:00:00 A 1 -0.818167 False False False 12125 2019-06-29 05:00:00 A 0 -0.051088 False False False 12143 2019-06-29 23:00:00 A 0 -0.937589 False False False 12145 2019-06-30 01:00:00 A 3 0.298460 False False False 12158 2019-06-30 14:00:00 A 1 0.647161 False False False 12164 2019-06-30 20:00:00 A 3 -0.735538 False False False 12210 2019-07-02 18:00:00 A 1 -0.881740 False False False 12341 2019-07-08 05:00:00 A 3 0.525652 False False False 12343 2019-07-08 07:00:00 A 1 0.311598 False False False 12358 2019-07-08 22:00:00 A 1 -0.710150 True True True 12360 2019-07-09 00:00:00 A 2 -0.752216 True True True 12400 2019-07-10 16:00:00 A 2 -0.205122 True True True 12404 2019-07-10 20:00:00 A 0 1.342591 True True True 12413 2019-07-11 05:00:00 A 1 1.707748 False False False 12506 2019-07-15 02:00:00 A 2 0.319227 False False False 12527 2019-07-15 23:00:00 A 3 2.130917 False False False 12600 2019-07-19 00:00:00 A 1 -1.314070 False False False 12604 2019-07-19 04:00:00 A 0 0.869059 False False False 12613 2019-07-19 13:00:00 A 2 1.342101 False False False
In [225]: %%timeit ...: df['rm0'] = (df['row_pat'].rolling(window=N , min_periods=N) ...: .apply(lambda x: (x==pat).all()) ...: .mask(lambda x: x == 0) ...: .bfill(limit=N-1) ...: .fillna(0) ...: .astype(bool) ...: ) ...: 1 loop, best of 3: 356 ms per loop In [226]: %%timeit ...: arr = df['row_pat'].values ...: b = np.all(rolling_window(arr, N) == pat, axis=1) ...: c = np.mgrid[0:len(b)][b] ...: d = [i for x in c for i in range(x, x+N)] ...: df['rm2'] = np.in1d(np.arange(len(arr)), d) ...: 100 loops, best of 3: 7.63 ms per loop In [227]: %%timeit ...: arr = df['row_pat'].values ...: b = np.all(rolling_window(arr, N) == pat, axis=1) ...: ...: m = (rolling_window(arr, len(pat)) == pat).all(1) ...: m_ext = np.r_[m,np.zeros(len(arr) - len(m), dtype=bool)] ...: df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2)) ...: 100 loops, best of 3: 7.25 ms per loop