Pandas - Search and index rows matching a row sequence pattern

I would like to find a pattern in a categorical column of a dataframe, reading down the rows. I can see how to use the Series.shift() method to look up/down and use boolean logic to find the pattern; however, I want to do this with a grouping variable, and also flag all the rows that are part of the pattern, not just the starting row.

Code:

    import pandas as pd
    from numpy.random import choice, randn
    import string

    # df constructor
    n_rows = 1000
    df = pd.DataFrame({'date_time': pd.date_range('2/9/2018', periods=n_rows, freq='H'),
                       'group_var': choice(list(string.ascii_uppercase), n_rows),
                       'row_pat': choice([0, 1, 2, 3], n_rows),
                       'values': randn(n_rows)})

    # sorting
    df.sort_values(by=['group_var', 'date_time'], inplace=True)
    df.head(10)

Which returns this: [screenshot of the first 10 rows of the sorted dataframe]

I can find the start of the pattern (without grouping):

    # the row ordinal pattern to detect
    p0, p1, p2, p3 = 1, 2, 2, 0

    # flag the row at the start of the pattern
    df['pat_flag'] = \
        df['row_pat'].eq(p0) & \
        df['row_pat'].shift(-1).eq(p1) & \
        df['row_pat'].shift(-2).eq(p2) & \
        df['row_pat'].shift(-3).eq(p3)

    df.head(10)

[screenshot: df.head(10) with the new pat_flag column]
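For illustration, here is the same logic on a toy series (hypothetical values): only the starting row of the pattern gets flagged, which is exactly what I want to change.

    import pandas as pd

    toy = pd.DataFrame({'row_pat': [3, 1, 2, 2, 0, 3]})
    p0, p1, p2, p3 = 1, 2, 2, 0
    toy['pat_flag'] = (toy['row_pat'].eq(p0)
                       & toy['row_pat'].shift(-1).eq(p1)
                       & toy['row_pat'].shift(-2).eq(p2)
                       & toy['row_pat'].shift(-3).eq(p3))
    print(toy['pat_flag'].tolist())
    # [False, True, False, False, False, False] -> rows 2-4 of the match are not flagged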

What I cannot figure out is how to do this within each "group_var" group, and, instead of returning True only for the start of the pattern, return True for all rows that are part of the pattern.

I'd appreciate any tips on how to solve this!

Thanks...

+5

5 answers

I think you have two options: a simpler but slower solution, or a faster, more complex one. The simple one uses rolling with apply:


    import numpy as np

    pat = np.asarray([1, 2, 2, 0])
    N = len(pat)

    df['rm0'] = (df['row_pat'].rolling(window=N, min_periods=N)
                              .apply(lambda x: (x == pat).all())
                              .mask(lambda x: x == 0)
                              .bfill(limit=N - 1)
                              .fillna(0)
                              .astype(bool))
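A quick illustration of why the mask / bfill steps are there, on a toy series (hypothetical values): rolling puts the match flag on the last row of the window, and bfill(limit=N-1) copies it back over the rest of the window.

    import pandas as pd
    import numpy as np

    s = pd.Series([3, 1, 2, 2, 0, 3])   # the pattern 1, 2, 2, 0 starts at index 1
    pat = np.asarray([1, 2, 2, 0])
    N = len(pat)

    end_flags = s.rolling(window=N, min_periods=N).apply(lambda x: (x == pat).all())
    print(end_flags.tolist())   # [nan, nan, nan, 0.0, 1.0, 0.0] -> flag sits at the window END

    full = (end_flags.mask(lambda x: x == 0)  # 0.0 -> NaN, so bfill cannot spread non-matches
                     .bfill(limit=N - 1)      # copy the 1.0 back over the previous N-1 rows
                     .fillna(0)
                     .astype(bool))
    print(full.tolist())        # [False, True, True, True, True, False]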

If performance is important, use strides; this solution was adapted from another answer:

  • take a rolling-window (strided) view of the array,
  • compare each window with the pattern, returning True where all values match,
  • get the indices of the first occurrences with np.mgrid and boolean indexing,
  • build the list of all indices covered by the matches,
  • compare with numpy.in1d and create the new column.

    def rolling_window(a, window):
        shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
        strides = a.strides + (a.strides[-1],)
        c = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
        return c

    arr = df['row_pat'].values
    b = np.all(rolling_window(arr, N) == pat, axis=1)
    c = np.mgrid[0:len(b)][b]
    d = [i for x in c for i in range(x, x + N)]
    df['rm2'] = np.in1d(np.arange(len(arr)), d)
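To make the steps concrete, here is the same pipeline on a toy array (illustrative values only), reusing the rolling_window helper defined above:

    import numpy as np

    arr = np.array([3, 1, 2, 2, 0, 3])
    pat = np.asarray([1, 2, 2, 0])
    N = len(pat)

    print(rolling_window(arr, N))
    # [[3 1 2 2]
    #  [1 2 2 0]
    #  [2 2 0 3]]  -> every length-N window as a row, without copying

    b = np.all(rolling_window(arr, N) == pat, axis=1)
    print(b)       # [False  True False] -> the window starting at index 1 matches

    c = np.mgrid[0:len(b)][b]
    print(c)       # [1] -> start index of each match

    d = [i for x in c for i in range(x, x + N)]
    print(d)       # [1, 2, 3, 4] -> every row covered by a match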

Another solution, thanks to @divakar:

    arr = df['row_pat'].values
    b = np.all(rolling_window(arr, N) == pat, axis=1)

    m = (rolling_window(arr, len(pat)) == pat).all(1)
    m_ext = np.r_[m, np.zeros(len(arr) - len(m), dtype=bool)]
    df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))
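To see what the dilation does, here is a minimal sketch on a toy boolean mask (values are illustrative): a structuring element of N ones, combined with the negative origin, smears each start flag forward over the following N-1 positions.

    import numpy as np
    from scipy.ndimage.morphology import binary_dilation

    N = 4
    # toy mask: True only at the start of a found pattern
    m_ext = np.array([False, True, False, False, False, False])

    out = binary_dilation(m_ext, structure=[1] * N, origin=-(N // 2))
    print(out)  # [False  True  True  True  True False] -> all N rows of the match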

Timings:

    import numpy as np
    import pandas as pd
    from numpy.random import choice, randn
    from scipy.ndimage.morphology import binary_dilation
    import string

    np.random.seed(456)

    # df constructor
    n_rows = 100000
    df = pd.DataFrame({'date_time': pd.date_range('2/9/2018', periods=n_rows, freq='H'),
                       'group_var': choice(list(string.ascii_uppercase), n_rows),
                       'row_pat': choice([0, 1, 2, 3], n_rows),
                       'values': randn(n_rows)})

    # sorting
    df.sort_values(by=['group_var', 'date_time'], inplace=True)

    pat = np.asarray([1, 2, 2, 0])
    N = len(pat)

    def rolling_window(a, window):
        shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
        strides = a.strides + (a.strides[-1],)
        c = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
        return c

    arr = df['row_pat'].values
    b = np.all(rolling_window(arr, N) == pat, axis=1)

    m = (rolling_window(arr, len(pat)) == pat).all(1)
    m_ext = np.r_[m, np.zeros(len(arr) - len(m), dtype=bool)]
    df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))

    arr = df['row_pat'].values
    b = np.all(rolling_window(arr, N) == pat, axis=1)
    c = np.mgrid[0:len(b)][b]
    d = [i for x in c for i in range(x, x + N)]
    df['rm2'] = np.in1d(np.arange(len(arr)), d)

    print(df.iloc[460:480])

                    date_time group_var  row_pat    values    rm0    rm1    rm2
    12045 2019-06-25 21:00:00         A        3 -0.081152  False  False  False
    12094 2019-06-27 22:00:00         A        1 -0.818167  False  False  False
    12125 2019-06-29 05:00:00         A        0 -0.051088  False  False  False
    12143 2019-06-29 23:00:00         A        0 -0.937589  False  False  False
    12145 2019-06-30 01:00:00         A        3  0.298460  False  False  False
    12158 2019-06-30 14:00:00         A        1  0.647161  False  False  False
    12164 2019-06-30 20:00:00         A        3 -0.735538  False  False  False
    12210 2019-07-02 18:00:00         A        1 -0.881740  False  False  False
    12341 2019-07-08 05:00:00         A        3  0.525652  False  False  False
    12343 2019-07-08 07:00:00         A        1  0.311598  False  False  False
    12358 2019-07-08 22:00:00         A        1 -0.710150   True   True   True
    12360 2019-07-09 00:00:00         A        2 -0.752216   True   True   True
    12400 2019-07-10 16:00:00         A        2 -0.205122   True   True   True
    12404 2019-07-10 20:00:00         A        0  1.342591   True   True   True
    12413 2019-07-11 05:00:00         A        1  1.707748  False  False  False
    12506 2019-07-15 02:00:00         A        2  0.319227  False  False  False
    12527 2019-07-15 23:00:00         A        3  2.130917  False  False  False
    12600 2019-07-19 00:00:00         A        1 -1.314070  False  False  False
    12604 2019-07-19 04:00:00         A        0  0.869059  False  False  False
    12613 2019-07-19 13:00:00         A        2  1.342101  False  False  False

    In [225]: %%timeit
         ...: df['rm0'] = (df['row_pat'].rolling(window=N, min_periods=N)
         ...:                           .apply(lambda x: (x == pat).all())
         ...:                           .mask(lambda x: x == 0)
         ...:                           .bfill(limit=N - 1)
         ...:                           .fillna(0)
         ...:                           .astype(bool))
         ...:
    1 loop, best of 3: 356 ms per loop

    In [226]: %%timeit
         ...: arr = df['row_pat'].values
         ...: b = np.all(rolling_window(arr, N) == pat, axis=1)
         ...: c = np.mgrid[0:len(b)][b]
         ...: d = [i for x in c for i in range(x, x + N)]
         ...: df['rm2'] = np.in1d(np.arange(len(arr)), d)
         ...:
    100 loops, best of 3: 7.63 ms per loop

    In [227]: %%timeit
         ...: arr = df['row_pat'].values
         ...: b = np.all(rolling_window(arr, N) == pat, axis=1)
         ...:
         ...: m = (rolling_window(arr, len(pat)) == pat).all(1)
         ...: m_ext = np.r_[m, np.zeros(len(arr) - len(m), dtype=bool)]
         ...: df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))
         ...:
    100 loops, best of 3: 7.25 ms per loop
+4

You can use the pd.rolling() methods and then just compare the arrays they return with the array that contains the pattern you are trying to match.

    pattern = np.asarray([1.0, 2.0, 2.0, 0.0])
    n_obs = len(pattern)

    df['rolling_match'] = (df['row_pat']
                           .rolling(window=n_obs, min_periods=n_obs)
                           .apply(lambda x: (x == pattern).all())
                           .astype(bool)              # all as bools
                           .shift(-1 * (n_obs - 1))   # shift back
                           .fillna(False))            # convert NaNs to False

It is important to specify min_periods here to make sure you only find exact matches (so the equality check cannot misfire on the shorter windows at the start). The apply call performs an element-wise comparison between the two arrays, and .all() ensures every element matches. We convert to bool, and then shift the result back so it becomes a forward-looking indicator on the starting row rather than a flag after the fact.
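For illustration, here is the same idea on a toy series (hypothetical values, with the steps separated for readability): the rolling flag lands on the last row of the matched window, and shifting it back by n_obs - 1 turns it into a flag on the starting row.

    import pandas as pd
    import numpy as np

    s = pd.Series([3, 1, 2, 2, 0, 3])
    pattern = np.asarray([1.0, 2.0, 2.0, 0.0])
    n_obs = len(pattern)

    end_flag = s.rolling(window=n_obs, min_periods=n_obs).apply(lambda x: (x == pattern).all())
    print(end_flag.tolist())    # [nan, nan, nan, 0.0, 1.0, 0.0] -> flag at the window end

    start_flag = end_flag.shift(-1 * (n_obs - 1)).fillna(0).astype(bool)
    print(start_flag.tolist())  # [False, True, False, False, False, False] -> start row only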

Help on the rolling features is available here: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.rolling.html

+3

This works as follows:

a) For each group it takes a window of size 4 and scans down the column until it finds the combination (1, 2, 2, 0) in that exact sequence. Once it finds the sequence, it fills the corresponding index positions of the new pat_flag column with 1.

b) If it does not find the combination, it fills the column with 0.

    pattern = [1, 2, 2, 0]

    def get_pattern(df):
        df = df.reset_index(drop=True)
        df['pat_flag'] = 0
        get_indexes = []
        temp = []
        for index, row in df.iterrows():
            mindex = index + 1
            # get the next 4 values
            for j in range(mindex, mindex + 4):
                if j == df.shape[0]:
                    break
                else:
                    get_indexes.append(j)
                    temp.append(df.loc[j, 'row_pat'])
            # check if sequence is matched
            if temp == pattern:
                df.loc[get_indexes, 'pat_flag'] = 1
            else:
                # reset if the pattern is not found in given window
                temp = []
                get_indexes = []
        return df

    # apply function to the groups
    df = df.groupby('group_var').apply(get_pattern)

    ## snippet of output
                 date_time group_var  row_pat    values  pat_flag
    41 2018-03-13 21:00:00         C        3  0.731114         0
    42 2018-03-14 05:00:00         C        0  1.350164         0
    43 2018-03-14 11:00:00         C        1 -0.429754         1
    44 2018-03-14 12:00:00         C        2  1.238879         1
    45 2018-03-15 17:00:00         C        2 -0.739192         1
    46 2018-03-18 06:00:00         C        0  0.806509         1
    47 2018-03-20 06:00:00         C        1  0.065105         0
    48 2018-03-20 08:00:00         C        1  0.004336         0
+1

Expanding on Emmet02's answer: use the rolling function for all groups, and set the match column to 1 for all indices that are part of a matched pattern:

    pattern = np.asarray([1, 2, 2, 0])

    # Create a match column in the main dataframe
    df = df.assign(match=False)

    for group_var, group in df.groupby("group_var"):
        # Per group do rolling window matching; the last
        # value of each matching window in 'match' will be True
        match = (group['row_pat']
                 .rolling(window=len(pattern), min_periods=len(pattern))
                 .apply(lambda x: (x == pattern).all()))

        # Get indices of matches in current group
        idx = np.arange(len(group))[match == True]

        # Include all indices of matching pattern,
        # counting back from last index in pattern
        idx = idx.repeat(len(pattern)) - np.tile(np.arange(len(pattern)), len(idx))

        # Update matches
        match.values[idx] = True
        df.loc[group.index, 'match'] = match

    df[df.match == True]

Edit: without the for loop:

    # Do rolling matching in group clause
    match = (df.groupby("group_var")
               .rolling(len(pattern))
               .row_pat.apply(lambda x: (x == pattern).all()))

    # Convert NaNs
    match = (~match.isnull() & match)

    # Get indices of matches in current group
    idx = np.arange(len(df))[match]

    # Include all indices of matching pattern
    idx = idx.repeat(len(pattern)) - np.tile(np.arange(len(pattern)), len(idx))

    # Mark all indices that are selected by "idx" in the match column
    df = df.assign(match=df.index.isin(df.index[idx]))
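One caveat with this version: df.groupby(...).rolling(...) returns a Series with a (group_var, original index) MultiIndex in groupby order, so the positional indexing via np.arange(len(df)) lines up here only because df was already sorted by group_var; in exchange, the per-group rolling guarantees that a window never straddles two groups.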
+1

You can do this by specifying a custom aggregation function, then using it in a groupby, and finally merging the result back into the original frame. Something like this:

Aggregation function:

    def pattern_detect(column):
        # define any other pattern to detect here
        p0, p1, p2, p3 = 1, 2, 2, 0
        flags = column.eq(p0) & \
                column.shift(-1).eq(p1) & \
                column.shift(-2).eq(p2) & \
                column.shift(-3).eq(p3)
        return flags.any()

Use it in the groupby:

    grp = df.groupby('group_var').agg([pattern_detect])['row_pat']

Now merge it back into the original dataframe:

    df = df.merge(grp, left_on='group_var', right_index=True, how='left')
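Note that this flags entire groups that contain the pattern somewhere, rather than the individual matching rows. A minimal sketch on toy data (hypothetical values, just to show the shape of the result):

    import pandas as pd

    def pattern_detect(column):
        p0, p1, p2, p3 = 1, 2, 2, 0
        flags = (column.eq(p0)
                 & column.shift(-1).eq(p1)
                 & column.shift(-2).eq(p2)
                 & column.shift(-3).eq(p3))
        return flags.any()

    toy = pd.DataFrame({'group_var': list('AAAABB'),
                        'row_pat':   [1, 2, 2, 0, 1, 3]})
    grp = toy.groupby('group_var').agg([pattern_detect])['row_pat']
    out = toy.merge(grp, left_on='group_var', right_index=True, how='left')
    print(out)  # every row of group A gets pattern_detect == True, group B gets False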
+1
