Pandas: Interpolating a DataFrame in Indexed Partitions

My sample code is as follows:

import pandas as pd


# Sample data: the string 'nan' marks a missing reading and becomes a real
# NaN once the frame is cast to float.
dictx = {
    'col1': [1, 'nan', 'nan', 'nan', 5, 'nan', 7, 'nan', 9, 'nan', 'nan', 'nan', 13],
    'col2': [20, 'nan', 'nan', 'nan', 22, 'nan', 25, 'nan', 30, 'nan', 'nan', 'nan', 25],
    'col3': [15, 'nan', 'nan', 'nan', 10, 'nan', 14, 'nan', 13, 'nan', 'nan', 'nan', 9],
}
df = pd.DataFrame(dictx).astype(float)

I am trying to interpolate only some of the segments that contain NaN values.
For context, I'm trying to track bus speeds using GPS data provided by the city (São Paulo, Brazil). The data is sparse and has stretches with no information at all. For some of those stretches I know the buses are actually stopped (for example, at dawn), but that information also comes in as NaN.

What I need:
I experimented with the parameters of `DataFrame.interpolate()` (`limit` and `limit_direction`), but they fell short. If I use `df.interpolate(limit=2)`, it interpolates not only the data I need but also data that should not be interpolated. So I need to interpolate only within the sections defined by the limit.

Desired output:

Out[7]: 
    col1   col2   col3
0    1.0  20.00  15.00
1    nan  nan    nan
2    nan  nan    nan
3    nan  nan    nan
4    5.0  22.00  10.00
5    6.0  23.50  12.00
6    7.0  25.00  14.00
7    8.0  27.50  13.50
8    9.0  30.00  13.00
9    nan  nan    nan
10   nan  nan    nan
11   nan  nan    nan
12   13.0 25.00  9.00

The logic I'm trying to apply is basically to find the NaNs, calculate the difference between their indices, build a temporary dataframe_temp for interpolation from that, and append it to another one to create a new dataframe_final. But this has been difficult to achieve because `NaN == NaN` returns `False`.

+4
source share
3 answers

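Note that pandas 0.23 adds a `limit_area` keyword to `DataFrame.interpolate` (see the release notes linked below). The snippet that follows instead combines a forward and a backward pass with `limit=1`.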

https://pandas-docs.imtqy.com/pandas-docs-travis/whatsnew.html#dataframe-interpolate-has-gained-the-limit-area-kwarg

# Pass 1: fill at most one NaN after each observed value.
df_fw = df.interpolate(limit=1, limit_direction='forward')
# Pass 2: fill at most one NaN before each observed value.
df_bk = df.interpolate(limit_direction='backward', limit=1)

# Blank out every cell of the forward pass that the backward pass left empty.
df_fw.mask(df_bk.isna())

    col1  col2  col3
0    1.0  20.0  15.0
1    NaN   NaN   NaN
2    NaN   NaN   NaN
3    NaN   NaN   NaN
4    5.0  22.0  10.0
5    6.0  23.5  12.0
6    7.0  25.0  14.0
7    8.0  27.5  13.5
8    9.0  30.0  13.0
9    NaN   NaN   NaN
10   NaN   NaN   NaN
11   NaN   NaN   NaN
12  13.0  25.0   9.0


.
.

def interp(df, limit):
    """Linearly interpolate only the NaN gaps that are at most ``limit`` long.

    ``DataFrame.interpolate(limit=limit)`` on its own also fills the first
    ``limit`` cells of longer gaps. This wrapper masks those partial fills
    back to NaN, so a gap between observed values is either filled
    completely or left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to interpolate; each column is processed along the rows.
    limit : int
        Maximum number of consecutive NaNs that may be filled. It is passed
        on unchanged to ``DataFrame.interpolate``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    window = limit + 1
    # 1.0 where the window of ``limit + 1`` rows ending at this row holds at
    # least one observed value. The first ``limit`` rows have no complete
    # window yet and are treated as satisfied.
    observed = df.notna().astype(float).rolling(window).max().fillna(1)

    # A cell is kept only if every complete window containing it holds an
    # observed value, i.e. it is not part of a NaN run longer than ``limit``.
    # Windows that would run past the last row count as satisfied.
    # A plain loop is used rather than ``concat(...).prod(level=1)`` because
    # the ``level`` argument of ``prod`` was removed in pandas 2.0.
    keep = observed
    for offset in range(1, window):
        keep = keep * observed.shift(-offset).fillna(1)

    return df.interpolate(limit=limit).where(keep.astype(bool))

df.pipe(interp, 1)

    col1  col2  col3
0    1.0  20.0  15.0
1    NaN   NaN   NaN
2    NaN   NaN   NaN
3    NaN   NaN   NaN
4    5.0  22.0  10.0
5    6.0  23.5  12.0
6    7.0  25.0  14.0
7    8.0  27.5  13.5
8    9.0  30.0  13.0
9    NaN   NaN   NaN
10   NaN   NaN   NaN
11   NaN   NaN   NaN
12  13.0  25.0   9.0

Now consider NaNs that fall at different positions in each column, using this df:

# Second sample: the gaps fall at different rows in each column. The string
# 'nan' marks a missing reading and becomes a real NaN after the float cast.
dictx = {
    'col1': [1, 'nan', 'nan', 'nan', 5, 'nan', 'nan', 7, 'nan', 9, 'nan', 'nan', 'nan', 13],
    'col2': [20, 'nan', 'nan', 'nan', 22, 'nan', 25, 'nan', 'nan', 30, 'nan', 'nan', 'nan', 25],
    'col3': [15, 'nan', 'nan', 'nan', 10, 'nan', 14, 'nan', 13, 'nan', 'nan', 'nan', 9, 'nan'],
}
df = pd.DataFrame(dictx).astype(float)
df

    col1  col2  col3
0    1.0  20.0  15.0
1    NaN   NaN   NaN
2    NaN   NaN   NaN
3    NaN   NaN   NaN
4    5.0  22.0  10.0
5    NaN   NaN   NaN
6    NaN  25.0  14.0
7    7.0   NaN   NaN
8    NaN   NaN  13.0
9    9.0  30.0   NaN
10   NaN   NaN   NaN
11   NaN   NaN   NaN
12   NaN   NaN   9.0
13  13.0  25.0   NaN

limit=1

df.pipe(interp, 1)

    col1  col2  col3
0    1.0  20.0  15.0
1    NaN   NaN   NaN
2    NaN   NaN   NaN
3    NaN   NaN   NaN
4    5.0  22.0  10.0
5    NaN  23.5  12.0
6    NaN  25.0  14.0
7    7.0   NaN  13.5
8    8.0   NaN  13.0
9    9.0  30.0   NaN
10   NaN   NaN   NaN
11   NaN   NaN   NaN
12   NaN   NaN   9.0
13  13.0  25.0   9.0

limit=2

df.pipe(interp, 2).round(2)

     col1   col2  col3
0    1.00  20.00  15.0
1     NaN    NaN   NaN
2     NaN    NaN   NaN
3     NaN    NaN   NaN
4    5.00  22.00  10.0
5    5.67  23.50  12.0
6    6.33  25.00  14.0
7    7.00  26.67  13.5
8    8.00  28.33  13.0
9    9.00  30.00   NaN
10    NaN    NaN   NaN
11    NaN    NaN   NaN
12    NaN    NaN   9.0
13  13.00  25.00   9.0
+1

Here is a way to skip runs of consecutive NaN rows, with the cutoff controlled by `limit`:

import numpy as np
import pandas as pd

dictx = {
    'col1': [1, 'nan', 'nan', 'nan', 5, 'nan', 7, 'nan', 9, 'nan', 'nan', 'nan', 13],
    'col2': [20, 'nan', 'nan', 'nan', 22, 'nan', 25, 'nan', 30, 'nan', 'nan', 'nan', 25],
    'col3': [15, 'nan', 'nan', 'nan', 10, 'nan', 14, 'nan', 13, 'nan', 'nan', 'nan', 9],
}
df = pd.DataFrame(dictx).astype(float)

limit = 2
# True for rows in which every column holds a value.
notnull = df.notna().all(axis=1)
# Label the rows with a running group number: a fully populated row opens a new
# group, and the rows with missing values that follow it share its number.
group = notnull.cumsum()
# Row labels belonging to groups that contain more than `limit` rows.
rows_per_group = group.map(group.value_counts())
ignore = df.index[(rows_per_group > limit).to_numpy()]
# Of those, only the rows with missing values are actually skipped.
ignore = df.index[(~notnull).to_numpy()].intersection(ignore)
keep = df.index.difference(ignore)
# Interpolate over the kept rows only and write the result back into df.
df.loc[keep] = df.loc[keep].interpolate()

print(df)

    col1  col2  col3
0    1.0  20.0  15.0
1    NaN   NaN   NaN
2    NaN   NaN   NaN
3    NaN   NaN   NaN
4    5.0  22.0  10.0
5    6.0  23.5  12.0
6    7.0  25.0  14.0
7    8.0  27.5  13.5
8    9.0  30.0  13.0
9    NaN   NaN   NaN
10   NaN   NaN   NaN
11   NaN   NaN   NaN
12  13.0  25.0   9.0

limit, , , .

+1

.

# Replace every missing value (NaN) in ``df`` with 0, modifying ``df`` in place.
# One vectorised ``fillna`` stands in for the element-by-element loop: chained
# assignment of the form ``df[col][row] = 0`` is not guaranteed to write back
# to ``df``, and testing for NaN with ``not value > -100`` would also zero any
# real value that is <= -100.
df.fillna(0, inplace=True)

df

col1    col2    col3
0   1.0     20.0    15.0
1   0.0     0.0     0.0
2   0.0     0.0     0.0
3   0.0     0.0     0.0
4   5.0     22.0    10.0
5   0.0     0.0     0.0
6   7.0     25.0    14.0
7   0.0     0.0     0.0
8   9.0     30.0    13.0
9   0.0     0.0     0.0
10  0.0     0.0     0.0
11  0.0     0.0     0.0
12  13.0    25.0    9.0

df["col1"][1] == df["col2"][1]
True
0

Source: https://habr.com/ru/post/1694007/


All Articles