Calculate the average of groups of DataFrame rows given by two-dimensional index lists with unequal lengths

I have a DataFrame with n rows. I also have a 2 dimensional array of indices. This array also has n rows, however each row can be variable in length. I need to group rows of a DataFrame by indexes and calculate the average value of a column.

For instance:

If I have a DataFrame df and an ind array, I need to get

[df.loc[ind[n], col_name].mean() for n in range(len(ind))].

I implemented this with the pandas apply function:

# Build a frame whose single column 'a' holds the integers 0 .. size-1.
size = 100000
df = pd.DataFrame(columns=['a'])
df['a'] = np.arange(size)
# Fixed seed so the random index lists below are reproducible.
np.random.seed(1)
# One entry per DataFrame row: 5 random integers drawn from [0, size).
ind = np.array([np.random.randint(0, size, size=5) for _ in range(size)])
def group(row):
    """Return the mean of column 'a' over the rows that ``ind`` lists for ``row``."""
    # row.name is the index label of the row being processed; translate it
    # to an integer position so the matching entry of ind can be fetched.
    position = df.index.get_loc(row.name)
    selected = ind[position]
    return df.loc[selected, 'a'].mean()


df['avg'] = df.apply(group, axis=1)

but it is slow and scales poorly. In this case, it’s much faster to do

df.a.values[ind].mean(axis=1)

However, as I understand it, this only works because all elements of ind have the same length, and the following code does not work:

# Convert to nested Python lists and drop one index from the first row,
# so the rows no longer all have the same length.
new_ind = ind.tolist()
new_ind[0].pop()
# new_ind is now ragged; this is the expression reported as not working.
df.a.values[new_ind].mean(axis=1)

Is there a way to do this, perhaps with pandas groupby?

+4
2


np.random.seed(1)

size = 10
df = pd.DataFrame(dict(a=np.arange(size)))

# variable length sub-arrays: each one holds between 1 and 10 random
# row positions drawn from [0, size)
sub_arrays = [
    np.random.randint(
        0, size, size=np.random.randint(1, 11)
    ) for _ in range(size)
]

# Store them in a 1-D object array, filled element by element.
# np.array(sub_arrays) cannot be used here: recent NumPy releases raise a
# ValueError when asked to build an array from sequences of unequal length,
# and filling explicitly also keeps the result 1-D if all lengths happen
# to coincide.
ind = np.empty(len(sub_arrays), dtype=object)
for k, sub in enumerate(sub_arrays):
    ind[k] = sub


This can be done with np.bincount and its weights argument.

# number of indices held by each sub-array
lengths = np.array([len(sub) for sub in ind])
# running position 0 .. len(ind)-1 of each sub-array
positions = np.arange(len(ind))
# raw NumPy values behind column `'a'`
values = df.a.values

# tag every flattened index with the position of the sub-array it came
# from: position p is repeated lengths[p] times
bin_ids = positions.repeat(lengths)
# value of `'a'` at every index of every sub-array, flattened in order
picked = values[np.concatenate(ind)]
# a weighted bincount adds the picked values up per sub-array ...
sums = np.bincount(bin_ids, picked)
# ... and dividing the sums by the lengths gives the averages
avg = sums / lengths

df.assign(avg=avg)

   a       avg
0  0  3.833333
1  1  4.250000
2  2  6.200000
3  3  6.000000
4  4  5.200000
5  5  5.400000
6  6  2.000000
7  7  3.750000
8  8  6.500000
9  9  6.200000

, , . , .

Method pir      mcf Best
Size                    
10       1  12.3746  pir
30       1  44.0495  pir
100      1  124.054  pir
300      1    270.6  pir
1000     1  576.505  pir
3000     1  819.034  pir
10000    1  990.847  pir

enter image description here

def mcf(d, i):
    """Return a copy of ``d`` with an ``avg`` column computed row by row.

    For every row of ``d`` the matching entry of ``i`` (looked up by the
    row's integer position in ``d.index``) is used to select rows of ``d``
    with ``.loc``, and the mean of their ``'a'`` values is stored in
    ``avg``.  ``d`` itself is not modified.
    """
    def row_mean(row):
        # row.name is the index label; get_loc turns it into a position in i
        position = d.index.get_loc(row.name)
        return d.loc[i[position], 'a'].mean()

    averages = d.apply(row_mean, axis=1)
    return d.assign(avg=averages)

def pir(d, i):
    """Return a copy of ``d`` with an ``avg`` column of per-group means.

    Parameters
    ----------
    d : DataFrame with a column ``a``.
    i : sequence of index sequences, one per row of ``d``.  The sequences
        may have different lengths and may be empty.  Their entries are
        used as integer positions into ``d.a.values``.

    Returns
    -------
    A new DataFrame (``d`` is not modified) where ``avg[k]`` is the mean of
    ``a`` over the positions in ``i[k]``; an empty ``i[k]`` yields NaN.
    """
    n_groups = len(i)
    lengths = np.array([len(x) for x in i], dtype=np.intp)
    positions = np.arange(n_groups)
    values = d.a.values

    # Flatten all index sequences.  The explicit integer dtype keeps the
    # result usable as an index when a sequence is empty (an empty list
    # would otherwise become a float array), and the trailing empty array
    # lets np.concatenate cope with i itself being empty.
    flat = np.concatenate(
        [np.asarray(x, dtype=np.intp) for x in i]
        + [np.empty(0, dtype=np.intp)]
    )

    # Weighted bincount sums the selected values per group.  minlength
    # guarantees one bin per group even when the last groups are empty;
    # without it the result would be shorter than lengths.
    sums = np.bincount(
        positions.repeat(lengths),
        weights=values[flat],
        minlength=n_groups
    )

    # 0 / 0 for an empty group is reported as NaN without a warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        avg = sums / lengths

    return d.assign(avg=avg)

from timeit import timeit

# One row per input size, one column per method.  float dtype keeps the
# timings numeric so that round() and idxmin() below operate on numbers.
results = pd.DataFrame(
    index=pd.Index([10, 30, 100, 300, 1000, 3000, 10000], name='Size'),
    columns=pd.Index(['pir', 'mcf'], name='Method'),
    dtype=float
)

for i in results.index:

    df = pd.DataFrame(dict(a=np.arange(i)))
    # i sub-arrays of 1 to 10 random positions each
    sub_arrays = [
        np.random.randint(
            0, i, size=np.random.randint(1, 11)
        ) for _ in range(i)
    ]
    # The sub-arrays differ in length, so they go into a 1-D object array;
    # np.array(sub_arrays) raises on recent NumPy for ragged input.
    ind = np.empty(i, dtype=object)
    for k, sub in enumerate(sub_arrays):
        ind[k] = sub

    for j in results.columns:

        # time 10 calls of the method named by the column label; the
        # statement is evaluated against this module's globals so it can
        # see df, ind and the method itself
        stmt = '{}(df, ind)'.format(j)
        # .at is the scalar setter (DataFrame.set_value has been removed
        # from pandas)
        results.at[i, j] = timeit(stmt, globals=globals(), number=10)

# timings relative to the fastest method for each size
relative = results.div(results.min(axis=1), axis=0).round(2)
relative.pipe(lambda d: d.assign(Best=d.idxmin(axis=1)))

fig, (a1, a2) = plt.subplots(2, 1, figsize=(6, 6))
# absolute timings on log-log axes, relative timings as log-scaled bars
results.plot(loglog=True, lw=3, ax=a1)
relative.plot.bar(logy=True, ax=a2)
+2

, , ... ,

() ind,

import pandas as pd
import numpy as np
size = 10
df = pd.DataFrame(columns=['a'])
df['a'] = np.arange(size)

# fixed 10 x 5 table of row positions, one row of indices per DataFrame row
ind = np.array([
    [5, 8, 9, 5, 0],
    [0, 1, 7, 6, 9],
    [2, 4, 5, 2, 4],
    [2, 4, 7, 7, 9],
    [1, 7, 0, 6, 9],
    [9, 7, 6, 9, 1],
    [0, 1, 8, 8, 3],
    [9, 8, 7, 3, 6],
    [5, 1, 9, 3, 4],
    [8, 1, 4, 0, 3],
])


def group(row):
    """Return the mean of column 'a' over the rows that ``ind`` lists for ``row``."""
    # translate the row's index label into its integer position
    position = df.index.get_loc(row.name)
    selected = ind[position]
    return df.loc[selected, 'a'].mean()


# row-by-row version
df['avg'] = df.apply(group, axis=1)

# vectorised version: fancy-index the raw values with the whole 2-D table
# and average along each row
picked = df['a'].values[ind]
df['comparison'] = picked.mean(axis=1)

In [86]: (df['comparison'] == df['avg']).all()
Out[86]: True

  • 0.5263588428497314
  • 0.014391899108886719
  • bincount 0.03328204154968262

enter image description here

timeit ( ), ,

import timeit
sizes = [10, 100, 1000, 10000]
# NOTE: mine, bincount and original are defined further down in this
# listing; they have to exist before these lines are executed.
# Lists are built eagerly because in Python 3 map() returns a lazy,
# single-use iterator, which cannot be handed to the plotting calls.
res_mine = [mine(s) for s in sizes]
res_bincount = [bincount(s) for s in sizes]
# the original version is not timed at the largest size
res_original = [original(s) for s in sizes[:-1]]

def bincount(size):
    """Return the best time, in seconds per execution, of the bincount method.

    The statement is timed with ``timeit.repeat`` (10 repeats of 100
    executions each) on a frame of ``size`` rows with 5 random indices per
    row.  The fastest repeat is divided by the number of executions so the
    result is the time of a single execution and can be compared with
    timers that use a different ``number``.
    """
    executions = 100
    stmt = """lengths = np.array([len(x) for x in ind])
positions = np.arange(len(ind))
values = df.a.values
avg = np.bincount(positions.repeat(lengths), values[np.concatenate(ind)]) / lengths
df.assign(avg=avg)"""
    setup = """import pandas as pd
import numpy as np
size = {size}
df = pd.DataFrame(columns=['a'])
df['a'] = np.arange(size)
np.random.seed(1)
ind = np.array([np.random.randint(0, size, size=5) for _ in range(size)])
def group(row):
    return df.loc[ind[df.index.get_loc(row.name)], 'a'].mean()""".format(size=size)
    # timeit.repeat reports the total time of `executions` runs per repeat
    totals = timeit.repeat(stmt, setup, number=executions, repeat=10)
    return min(totals) / executions

def original(size):
    """Return the best time in seconds of the apply-based method.

    The statement is timed with ``timeit.repeat`` (10 repeats of a single
    execution each) on a frame of ``size`` rows with 5 random indices per
    row; the fastest repeat is returned.
    """
    stmt = """df['avg'] = df.apply(group, axis=1)"""
    setup_template = """import pandas as pd
import numpy as np    
size = {size}             
df = pd.DataFrame(columns=['a'])
df['a'] = np.arange(size)       
np.random.seed(1)               
ind = np.array([np.random.randint(0, size, size=5) for _ in range(size)])
def group(row):                                                          
    return df.loc[ind[df.index.get_loc(row.name)], 'a'].mean()"""
    setup = setup_template.format(size=size)
    timings = timeit.repeat(stmt, setup, repeat=10, number=1)
    return min(timings)

def mine(size):
    """Return the best time, in seconds per execution, of the fancy-indexing method.

    The statement is timed with ``timeit.repeat`` (100 repeats of 10
    executions each) on a frame of ``size`` rows with 5 random indices per
    row.  The fastest repeat is divided by the number of executions so the
    result is the time of a single execution and can be compared with
    timers that use a different ``number``.
    """
    executions = 10
    stmt = """df['comparison'] = df.a.values[ind].mean(axis=1)"""
    setup = """import pandas as pd
import numpy as np    
size = {size}             
df = pd.DataFrame(columns=['a'])
df['a'] = np.arange(size)       
np.random.seed(1)               
ind = np.array([np.random.randint(0, size, size=5) for _ in range(size)])
def group(row):                                                          
    return df.loc[ind[df.index.get_loc(row.name)], 'a'].mean()""".format(size=size)
    # timeit.repeat reports the total time of `executions` runs per repeat
    totals = timeit.repeat(stmt, setup, repeat=100, number=executions)
    return min(totals) / executions

import matplotlib.pyplot as plt
fig = plt.figure()
ax = plt.axes()

# one curve per method; the original method has no timing for the last size
curves = [
    (sizes, res_mine, 'mine'),
    (sizes, res_bincount, 'bincount'),
    (sizes[:-1], res_original, 'original'),
]
for xs, ys, name in curves:
    ax.plot(xs, ys, label=name)

# ax is the current axes, so its methods are used for the scales and labels
ax.set_yscale('log')
ax.set_xscale('log')
ax.legend()
ax.set_xlabel('size of dataframe')
ax.set_ylabel('run time (s)')
plt.show()

, ,

+1

Source: https://habr.com/ru/post/1683479/


All Articles