Here's a NumPy approach using np.random.choice -
# Grab the 'value' column of df as a NumPy array.
# NOTE(review): the remainder of this snippet appears to be missing at this
# point; f4 in the runtime test further down starts with this same statement
# and shows a complete version of the approach.
a = df.value.values
Sample run -
In [237]: df = pd.DataFrame(np.random.randint(0,2,(100,2)),columns=['id','value'])
In [238]: (df.value==1).sum()
Alternatively, a more pandas-oriented approach is -
# 0-based positions of the rows whose 'value' entry is non-zero.
idx = np.flatnonzero(df['value'].values)
# Choose 10% of those positions, without replacement.
picked = np.random.choice(idx, size=int(0.1 * len(idx)), replace=False)
# `.ix` was removed in pandas 1.0. `picked` holds positions (not labels), so
# the positional indexer `.iloc` is the matching replacement.
df.iloc[picked, df.columns.get_loc('value')] = 0
Runtime test
All approaches published so far -
def f1(df):  # @piRSquared soln1
    """Set 'value' to 0 for a random 10% of the rows where it equals 1.

    Uses ``DataFrame.query`` + ``sample(frac=.1)`` to pick the rows.
    Modifies ``df`` in place; returns None.
    """
    df.loc[df.query('value == 1').sample(frac=.1).index, 'value'] = 0


def f2(df):  # @piRSquared soln2
    """Redraw every 'value' == 1 entry as 0 (p=0.1) or 1 (p=0.9).

    Each entry is drawn independently, so the number of entries that end up
    as 0 varies from call to call. Modifies ``df`` in place; returns None.
    """
    v = df.value.values == 1
    df.loc[v, 'value'] = np.random.choice((0, 1), v.sum(), p=(.1, .9))


def f3(df):  # @Roman Pekar soln
    """Set 'value' to 0 for a random tenth of the rows where it equals 1.

    Rows are picked by index label, without replacement.
    Modifies ``df`` in place; returns None.
    """
    idx = df.index[df.value == 1]
    # Floor division keeps `size` an integer on Python 3 (a float size is
    # rejected by np.random.choice).
    chosen = np.random.choice(idx, size=idx.size // 10, replace=False)
    # Assign through a single .loc call: `df.loc[rows].value = 0` would write
    # to a temporary object and leave `df` unchanged.
    df.loc[chosen, 'value'] = 0


def f4(df):  # @Mine soln1
    """Set 'value' to 0 for a random 10% of the non-zero entries (NumPy).

    Modifies ``df`` in place; returns None.
    """
    # Work on a copy and write it back: the array returned by `.values` may
    # be read-only (pandas Copy-on-Write), so mutating it directly is unsafe.
    a = df['value'].values.copy()
    idx = np.flatnonzero(a)
    a[np.random.choice(idx, size=int(0.1 * len(idx)), replace=False)] = 0
    df['value'] = a


def f5(df):  # @Mine soln2
    """Set 'value' to 0 for a random 10% of the non-zero entries (pandas).

    Modifies ``df`` in place; returns None.
    """
    idx = np.flatnonzero(df['value'].values)
    chosen = np.random.choice(idx, size=int(0.1 * len(idx)), replace=False)
    # `.ix` was removed in pandas 1.0; `chosen` holds positions, so use the
    # positional indexer together with the column's position.
    df.iloc[chosen, df.columns.get_loc('value')] = 0
Timings -
In [2]:
source share