pandas only with str.get_dummies
dummies = df.AIRPORT.str.get_dummies()
df.join(
dummies * pd.Series('Y', dummies.columns)
).replace('', np.nan)
AIRPORT BHX EWR JAX
0 EWR|JAX nan Y Y
1 EWR|BHX Y Y nan
2 EWR|BHX Y Y nan
3 EWR|BHX Y Y nan
4 EWR|BHX Y Y nan
pandasand numpywithnp.where
dummies = df.AIRPORT.str.get_dummies()
d1 = pd.DataFrame(
np.where(dummies.values == 1, 'Y', np.nan),
dummies.index, dummies.columns
)
d2 = df.join(d1)
print(d2)
AIRPORT BHX EWR JAX
0 EWR|JAX nan Y Y
1 EWR|BHX Y Y nan
2 EWR|BHX Y Y nan
3 EWR|BHX Y Y nan
4 EWR|BHX Y Y nan
Timing
small data
%%timeit
df.join(
df.AIRPORT.str.get_dummies() * pd.Series('Y', dummies.columns)
).replace('', np.nan)
100 loops, best of 3: 2.31 ms per loop
%timeit df.assign(**df.AIRPORT.str.get_dummies().replace({1:'Y',0:np.nan}))
100 loops, best of 3: 2.78 ms per loop
%%timeit
dummies = df.AIRPORT.str.get_dummies()
d1 = pd.DataFrame(
np.where(dummies.values == 1, 'Y', np.nan),
dummies.index, dummies.columns
)
df.join(d1)
1000 loops, best of 3: 1.65 ms per loop
big data
from string import ascii_uppercase
np.random.seed([3,1415])
source = pd.DataFrame(
np.random.choice(list(ascii_uppercase), [100, 3])
).sum(1).unique()
df = pd.DataFrame(
np.random.choice(source, [10000, 2]), columns=['A', 'B']
).query('A != B').apply('|'.join, 1).to_frame('AIRPORT')
%%timeit
dummies = df.AIRPORT.str.get_dummies()
df.join(
dummies * pd.Series('Y', dummies.columns)
).replace('', np.nan)
1 loop, best of 3: 594 ms per loop
%timeit df.assign(**df.AIRPORT.str.get_dummies().replace({1:'Y',0:np.nan}))
1 loop, best of 3: 629 ms per loop
%%timeit
dummies = df.AIRPORT.str.get_dummies()
d1 = pd.DataFrame(
np.where(dummies.values == 1, 'Y', np.nan),
dummies.index, dummies.columns
)
df.join(d1)
1 loop, best of 3: 592 ms per loop