defaultdict
from collections import defaultdict
# Accumulate, per study_id, the union of every value found in that study's
# list_value lists. Rows whose list_value is NaN are dropped first so that
# set() is only ever called on real lists.
d = defaultdict(set)
for t in df.dropna(subset=['list_value']).itertuples():
    d[t.study_id] |= set(t.list_value)
# Turn each set into a sorted list and map it back onto every row by study_id.
# assign() returns a new DataFrame; df itself is not modified.
df.assign(list_value=df.study_id.map(pd.Series(d).apply(sorted)))
study_id list_value
0 1 [a, b, c]
1 1 [a, b, c]
2 1 [a, b, c]
3 2 [a, b, d, e, z]
4 2 [a, b, d, e, z]
5 2 [a, b, d, e, z]
np.unique and other tricks
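Note that np.unique returns its result as an ndarray, so each list_value below is an ndarray rather than a list.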
# Index by study_id, drop NaN entries, then sum within each study: summing
# Python lists concatenates them. np.unique reduces each concatenated list to
# its sorted unique values (as an ndarray), which is mapped back onto every row.
# groupby(level=0).sum() is used instead of Series.sum(level=0), which was
# deprecated and then removed in pandas 2.0.
df.assign(
    list_value=df.study_id.map(
        df.set_index('study_id').list_value.dropna()
        .groupby(level=0).sum().apply(np.unique)
    )
)
study_id list_value
0 1 [a, b, c]
1 1 [a, b, c]
2 1 [a, b, c]
3 2 [a, b, d, e, z]
4 2 [a, b, d, e, z]
5 2 [a, b, d, e, z]
To get plain lists instead of ndarrays, additionally apply sorted:
# Same approach as the np.unique version, with a final sorted() pass so each
# list_value is a Python list rather than an ndarray.
# groupby(level=0).sum() is used instead of Series.sum(level=0), which was
# deprecated and then removed in pandas 2.0.
df.assign(
    list_value=df.study_id.map(
        df.set_index('study_id').list_value.dropna()
        .groupby(level=0).sum()
        .apply(np.unique).apply(sorted)
    )
)
String join/split alternative:
# Encode each row's list as a single '|'-delimited string (NaN rows stay NaN),
# then, per study_id, join the non-null strings, split them back into
# individual values, de-duplicate and sort. The per-study result is mapped
# back onto every row of that study.
df.assign(
    list_value=df.study_id.map(
        df.list_value.str.join('|')
        .groupby(df.study_id)
        .apply(lambda joined: sorted({*'|'.join(joined.dropna()).split('|')}))
    )
)
study_id list_value
0 1 [a, b, c]
1 1 [a, b, c]
2 1 [a, b, c]
3 2 [a, b, d, e, z]
4 2 [a, b, d, e, z]
5 2 [a, b, d, e, z]
# Sample data: six rows across two studies. Each list_value is a list of
# strings, except one row in study 2 which is missing (NaN).
df = pd.DataFrame(
    {
        'study_id': [1, 1, 1, 2, 2, 2],
        'list_value': [
            ['a', 'b'],
            ['a'],
            ['c'],
            ['d', 'e', 'a'],
            np.nan,
            ['z', 'a', 'b'],
        ],
    },
    columns=['study_id', 'list_value'],
)