A simple list comprehension is the fastest when working with strings like this:
In [112]: df['word_overlap'] = [set(x[0].split()) & set(x[1].split()) for x in df.values] In [113]: df['overlap_count'] = df['word_overlap'].str.len() In [114]: df Out[114]: friend_description friend_definition word_overlap overlap_count 0 James is dumb dumb dude {dumb} 1 1 Jacob is smart smart guy {smart} 1 2 Jane is pretty she looks pretty {pretty} 1 3 Susan is rich she is rich {rich, is} 2
single apply(..., axis=1) :
In [85]: df['word_overlap'] = df.apply(lambda r: set(r['friend_description'].split()) & ...: set(r['friend_definition'].split()), ...: axis=1) ...: In [86]: df['overlap_count'] = df['word_overlap'].str.len() In [87]: df Out[87]: friend_description friend_definition word_overlap overlap_count 0 James is dumb dumb dude {dumb} 1 1 Jacob is smart smart guy {smart} 1 2 Jane is pretty she looks pretty {pretty} 1 3 Susan is rich she is rich {rich, is} 2
apply().apply(..., axis=1) :
In [23]: df['word_overlap'] = (df.apply(lambda x: x.str.split(expand=False)) ...: .apply(lambda r: set(r['friend_description']) & set(r['friend_definition']), ...: axis=1)) ...: In [24]: df['overlap_count'] = df['word_overlap'].str.len() In [25]: df Out[25]: friend_description friend_definition word_overlap overlap_count 0 James is dumb dumb dude {dumb} 1 1 Jacob is smart smart guy {smart} 1 2 Jane is pretty she looks pretty {pretty} 1 3 Susan is rich she is rich {is, rich} 2
Timing against 40,000 lines of DF:
In [104]: df = pd.concat([df] * 10**4, ignore_index=True) In [105]: df.shape Out[105]: (40000, 2) In [106]: %timeit [set(x[0].split()) & set(x[1].split()) for x in df.values] 223 ms Β± 19.4 ms per loop (mean Β± std. dev. of 7 runs, 1 loop each) In [107]: %timeit df.apply(lambda r: set(r['friend_description'].split()) & set(r['friend_definition'].split()), axis=1) 3.65 s Β± 46.6 ms per loop (mean Β± std. dev. of 7 runs, 1 loop each) In [108]: %timeit df.apply(lambda x: x.str.split(expand=False)).apply(lambda r: set(r['friend_description']) & set(r['friend_definition']), ...: axis=1) 4.63 s Β± 84.7 ms per loop (mean Β± std. dev. of 7 runs, 1 loop each)
source share