You can use value_counts
with boolean indexing
and isin
:
df = pd.DataFrame({
'LeafID':[1,1,2,1,3,3,1,6,3,5,1],
'pidx':[10,10,300,10,30,40,20,10,30,45,20],
'pidy':[20,20,400,20,15,20,12,43,54,112,23],
'count':[10,20,30,40,80,10,20,50,30,10,70],
'score':[10,10,10,22,22,3,4,5,9,0,1]
})
print (df)
LeafID count pidx pidy score
0 1 10 10 20 10
1 1 20 10 20 10
2 2 30 300 400 10
3 1 40 10 20 22
4 3 80 30 15 22
5 3 10 40 20 3
6 1 20 20 12 4
7 6 50 10 43 5
8 3 30 30 54 9
9 5 10 45 112 0
10 1 70 20 23 1
s = df.pidx.value_counts()
idx = s[s>2].index
print (df[df.pidx.isin(idx)])
LeafID count pidx pidy score
0 1 10 10 20 10
1 1 20 10 20 10
3 1 40 10 20 22
7 6 50 10 43 5
Delay
np.random.seed(123)
N = 1000000
L1 = list('abcdefghijklmnopqrstu')
L2 = list('efghijklmnopqrstuvwxyz')
df = pd.DataFrame({'LeafId':np.random.randint(1000, size=N),
'pidx': np.random.randint(10000, size=N),
'pidy': np.random.choice(L2, N),
'count':np.random.randint(1000, size=N)})
print (df)
print (df.groupby('pidx').filter(lambda x: len(x) > 120))
def jez(df):
s = df.pidx.value_counts()
return df[df.pidx.isin(s[s>120].index)]
print (jez(df))
In [55]: %timeit (df.groupby('pidx').filter(lambda x: len(x) > 120))
1 loop, best of 3: 1.17 s per loop
In [56]: %timeit (jez(df))
10 loops, best of 3: 141 ms per loop
In [62]: %timeit (df[df.groupby('pidx').pidx.transform('size') > 120])
10 loops, best of 3: 102 ms per loop
In [63]: %timeit (df[df.groupby('pidx').pidx.transform(len) > 120])
1 loop, best of 3: 685 ms per loop
In [64]: %timeit (df[df.groupby('pidx').pidx.transform('count') > 120])
10 loops, best of 3: 104 ms per loop
For final_score
you can use:
df['final_score'] = df['count'].mul(.4).add(df.score.mul(.6))