How to count the number of time slots that correspond to a logical state in a pandas frame?

I have pandas dfwith time series in column1and boolean condition in column2. This describes continuous time intervals that satisfy a particular condition. Please note that the time intervals are of unequal length.

Timestamp   Boolean_condition
1           1
2           1
3           0
4           1
5           1
6           1
7           0
8           0
9           1
10          0

How to calculate the total number of time intervals in the entire series that satisfy this condition?

The desired result should look like this:

Timestamp   Boolean_condition   Event_number
1           1                   1
2           1                   1
3           0                   NaN
4           1                   2
5           1                   2
6           1                   2
7           0                   NaN
8           0                   NaN
9           1                   3
10          0                   NaN
+4
source share
4 answers

You can create Seriesusing cumsumtwo masks, and then create NaNby function Series.mask:

mask0 = df.Boolean_condition.eq(0)
mask2 = df.Boolean_condition.ne(df.Boolean_condition.shift(1))
print ((mask2 & mask0).cumsum().add(1))
0    1
1    1
2    2
3    2
4    2
5    2
6    3
7    3
8    3
9    4
Name: Boolean_condition, dtype: int32

df['Event_number'] = (mask2 & mask0).cumsum().add(1).mask(mask0)
print (df)
   Timestamp  Boolean_condition  Event_number
0          1                  1           1.0
1          2                  1           1.0
2          3                  0           NaN
3          4                  1           2.0
4          5                  1           2.0
5          6                  1           2.0
6          7                  0           NaN
7          8                  0           NaN
8          9                  1           3.0
9         10                  0           NaN

Delay

#[100000 rows x 2 columns
df = pd.concat([df]*10000).reset_index(drop=True)
df1 = df.copy()
df2 = df.copy()

def nick(df):
    isone = df.Boolean_condition[df.Boolean_condition.eq(1)]
    idx = isone.index
    grp = (isone != idx.to_series().diff().eq(1)).cumsum()
    df.loc[idx, 'Event_number'] = pd.Categorical(grp).codes + 1
    return df

def jez(df):
    mask0 = df.Boolean_condition.eq(0)
    mask2 = df.Boolean_condition.ne(df.Boolean_condition.shift(1))
    df['Event_number'] = (mask2 & mask0).cumsum().add(1).mask(mask0)
    return (df)

def jez1(df):
    mask0 = ~df.Boolean_condition
    mask2 = df.Boolean_condition.ne(df.Boolean_condition.shift(1))
    df['Event_number'] = (mask2 & mask0).cumsum().add(1).mask(mask0)
    return (df)

In [68]: %timeit (jez1(df))
100 loops, best of 3: 6.45 ms per loop

In [69]: %timeit (nick(df1))
100 loops, best of 3: 12 ms per loop

In [70]: %timeit (jez(df2))
100 loops, best of 3: 5.34 ms per loop
+3

:

1) True (, 1), isone

2) , , . , 1. .

3) isone , , ( ). .

4) loc isone, , grp , Event_number.


isone = df.Bolean_condition[df.Bolean_condition.eq(1)]
idx = isone.index
grp = (isone != idx.to_series().diff().eq(1)).cumsum()
df.loc[idx, 'Event_number'] = pd.Categorical(grp).codes + 1

enter image description here


:

numpy:

1) .

2) (1's) .

3) NaN , , , .

4) , Nan's , .

5) , 1, , . , 1's.

6) .


def nick(df):
    b = df.Bolean_condition.values
    slc = np.flatnonzero(b)
    slc_pl_1 = np.append(np.nan, slc)
    nan_arr = np.full(b.size, fill_value=np.nan)
    nan_arr[slc] = np.cumsum(slc_pl_1[1:] - slc_pl_1[:-1] != 1)
    df['Event_number'] = nan_arr
    return df

:

a DF 10 000 :

np.random.seed(42)
df1 = pd.DataFrame(dict(
        Timestamp=np.arange(10000),
        Bolean_condition=np.random.choice(np.array([0,1]), 10000, p=[0.4, 0.6]))
                  )

df1.shape
# (10000, 2)

def jez(df):
    mask0 = df.Bolean_condition.eq(0)
    mask2 = df.Bolean_condition.ne(df.Bolean_condition.shift(1))
    df['Event_number'] = (mask2 & mask0).cumsum().mask(mask0)
    return (df)

nick(df1).equals(jez(df1))
# True

%%timeit
nick(df1)
1000 loops, best of 3: 362 µs per loop

%%timeit
jez(df1)
100 loops, best of 3: 1.56 ms per loop

a DF, 1 :

np.random.seed(42)
df1 = pd.DataFrame(dict(
        Timestamp=np.arange(1000000),
        Bolean_condition=np.random.choice(np.array([0,1]), 1000000, p=[0.4, 0.6]))
                  )

df1.shape
# (1000000, 2)

nick(df1).equals(jez(df1))
# True

%%timeit
nick(df1)
10 loops, best of 3: 34.9 ms per loop

%%timeit
jez(df1)
10 loops, best of 3: 50.1 ms per loop
+3

. Matlab:

Boolean_condition = [1 1 0 1 1 1 0 0 1 0];
Event_number = [NA NA NA NA NA NA NA NA NA NA];
loop_event_number = 1;

for timestamp=1:10
if Boolean_condition(timestamp)==1
Event_number(timestamp) = loop_event_number;
last_event_number = loop_event_number;
else
loop_event_number = last_event_number +1;
end
end

% Event_number = 1 1 NA 2 2 2 NA NA 3 NA
+1

, df.

df = pd.concat([df,pd.Series([0]*len(df), name = '2')], axis = 1)
if df.iloc[0,1] == 1:
        counter = 1
        df.iloc[0, 2] = counter
    else:
        counter = 0
        df.iloc[0,2] = 0
    previous = df.iloc[0,1]
    for y,x in df.iloc[1:,].iterrows():
        print(y)
        if x[1] == 1 and previous == 1:
            previous = x[1]
            df.iloc[y, 2] = counter
        if x[1] == 0:
            previous = x[1]
            df.iloc[y,2] = 0
        if x[1] == 1 and previous == 0:
            counter += 1
            previous = x[1]
            df.iloc[y,2] = counter
+1

Source: https://habr.com/ru/post/1667239/


All Articles