, , ... ,
() ind,
import pandas as pd
import numpy as np
size = 10
df = pd.DataFrame(columns=['a'])
df['a'] = np.arange(size)
ind = np.array([[5, 8, 9, 5, 0],
[0, 1, 7, 6, 9],
[2, 4, 5, 2, 4],
[2, 4, 7, 7, 9],
[1, 7, 0, 6, 9],
[9, 7, 6, 9, 1],
[0, 1, 8, 8, 3],
[9, 8, 7, 3, 6],
[5, 1, 9, 3, 4],
[8, 1, 4, 0, 3]])
def group(row):
return df.loc[ind[df.index.get_loc(row.name)], 'a'].mean()
df['avg'] = df.apply(group, axis=1)
df['comparison'] = df.a.values[ind].mean(axis=1)
In [86]: (df['comparison'] == df['avg']).all()
Out[86]: True
0.52635884284973140.014391899108886719bincount 0.03328204154968262

timeit ( ), ,
import timeit
sizes = [10, 100, 1000, 10000]
res_mine = map(mine, sizes)
res_bincount = map(bincount, sizes)
res_original = map(original, sizes[:-1])
def bincount(size):
return min(timeit.repeat(
"""lengths = np.array([len(x) for x in ind])
positions = np.arange(len(ind))
values = df.a.values
avg = np.bincount(positions.repeat(lengths), values[np.concatenate(ind)]) / lengths
df.assign(avg=avg)""",
"""import pandas as pd
import numpy as np
size = {size}
df = pd.DataFrame(columns=['a'])
df['a'] = np.arange(size)
np.random.seed(1)
ind = np.array([np.random.randint(0, size, size=5) for _ in range(size)])
def group(row):
return df.loc[ind[df.index.get_loc(row.name)], 'a'].mean()""".format(size=size),
number=100, repeat=10))
def original(size):
return min(timeit.repeat(
"""df['avg'] = df.apply(group, axis=1)""",
"""import pandas as pd
import numpy as np
size = {size}
df = pd.DataFrame(columns=['a'])
df['a'] = np.arange(size)
np.random.seed(1)
ind = np.array([np.random.randint(0, size, size=5) for _ in range(size)])
def group(row):
return df.loc[ind[df.index.get_loc(row.name)], 'a'].mean()""".format(size=size),
repeat=10, number=1))
def mine(size):
return min(timeit.repeat("""df['comparison'] = df.a.values[ind].mean(axis=1)""",
"""import pandas as pd
import numpy as np
size = {size}
df = pd.DataFrame(columns=['a'])
df['a'] = np.arange(size)
np.random.seed(1)
ind = np.array([np.random.randint(0, size, size=5) for _ in range(size)])
def group(row):
return df.loc[ind[df.index.get_loc(row.name)], 'a'].mean()""".format(size=size),
repeat=100, number=10))
import matplotlib.pyplot as plt
fig = plt.figure()
ax = plt.axes()
ax.plot(sizes, res_mine, label='mine')
ax.plot(sizes, res_bincount, label='bincount')
ax.plot(sizes[:-1], res_original, label='original')
plt.yscale('log')
plt.xscale('log')
plt.legend()
plt.xlabel('size of dataframe')
plt.ylabel('run time (s)')
plt.show()
, ,