Use `agg` on the grouped `document_id` column:
# Show the input frame; the parenthesised form works on both Python 2 and 3.
print(apyData)
afx year document_id company
0 0 1999 3 Orange
1 1 1999 5 Orange
2 2 1999 3 Orange
3 3 2001 41 Banana
4 4 2001 21 Strawberry
5 5 2001 18 Strawberry
6 6 2002 44 Orange
# Per-group aggregations for document_id: the number of distinct ids and the
# raw ids themselves (collected as a tuple, duplicates kept).
f = {'nbDocument' : lambda x: len(x.unique()), 'document_id' : lambda x: tuple(x)}
# Unpack the spec as keyword ("named") aggregations. Passing the dict itself to
# SeriesGroupBy.agg is the old dict-renaming form, which current pandas rejects.
count2 = apyData.groupby(['year','company']).document_id.agg(**f).reset_index()
print(count2)
year company nbDocument document_id
0 1999 Orange 2 (3, 5, 3)
1 2001 Banana 1 (41,)
2 2001 Strawberry 2 (21, 18)
3 2002 Orange 1 (44,)
# Turn the tuple aggregates into lists, then put the columns in display order.
count2['document_id'] = count2['document_id'].apply(list)
count2 = count2[['year', 'document_id', 'company', 'nbDocument']]
print(count2)
year document_id company nbDocument
0 1999 [3, 5, 3] Orange 2
1 2001 [41] Banana 1
2 2001 [21, 18] Strawberry 2
3 2002 [44] Orange 1
EDIT:
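Using `'document_id' : lambda x: list(x)` inside `agg` raises an error:
ValueError
so the ids are aggregated as a `tuple` first and converted to a `list` afterwards.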
EDIT1:
:
def je(apyData):
    """Summarise ``document_id`` per (year, company) group.

    Returns a frame with the columns ``year``, ``company``, ``nbDocument``
    (number of distinct ``document_id`` values in the group) and
    ``document_id`` (list of every id in the group, duplicates kept).
    """
    grouped_ids = apyData.groupby(['year', 'company']).document_id
    # Named aggregation (keyword=function) replaces the dict-renaming form of
    # SeriesGroupBy.agg, which current pandas no longer accepts.
    count2 = grouped_ids.agg(
        nbDocument=lambda ids: len(ids.unique()),
        # Aggregate to a tuple and convert afterwards, as the original did.
        document_id=lambda ids: tuple(ids),
    ).reset_index()
    count2['document_id'] = count2['document_id'].apply(list)
    return count2
def mm(df):
    """Collect the distinct ``document_id`` values per (year, company) group.

    Returns a frame with the columns ``year``, ``company``, ``document_id``
    (list of the group's ids with duplicates dropped, first occurrence kept)
    and ``nbDocument`` (length of that list), sorted by year then company.
    """
    # Select the column before applying so the function only sees the ids,
    # not the grouping columns.
    grouped = df.groupby(['year', 'company'])['document_id']
    unique_ids = grouped.apply(lambda ids: list(ids.drop_duplicates()))
    # The original stored the lists under 'nbDocument' and the counts under
    # 'document_id'; the names are assigned the right way round here.
    out = unique_ids.to_frame('document_id')
    out['nbDocument'] = out['document_id'].map(len)
    return out.reset_index().sort_values(['year', 'company'])
def st(df):
    """Collect the distinct ``document_id`` values per (company, year) group.

    Returns a frame with the columns ``company``, ``year``, ``document_id``
    (list of the group's ids with duplicates dropped, first occurrence kept)
    and ``nbDocument`` (length of that list), sorted by company then year.
    """
    # Select the column before applying so the function only sees the ids,
    # not the grouping columns.
    grouped_ids = df.groupby(['company', 'year'])['document_id']
    unique_ids = grouped_ids.apply(lambda ids: list(ids.drop_duplicates()))
    result = unique_ids.to_frame('document_id')
    result['nbDocument'] = result['document_id'].map(len)
    return result.reset_index().sort_values(['company', 'year'])
# Print each approach's result for comparison; the parenthesised form works
# on both Python 2 and 3.
print(mm(apyData))
print(st(apyData))
print(je(apyData))
Timings:
In [48]: %timeit je(apyData)
100 loops, best of 3: 3.08 ms per loop
In [49]: %timeit mm(apyData)
100 loops, best of 3: 5.73 ms per loop
In [50]: %timeit st(apyData)
100 loops, best of 3: 5.8 ms per loop