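Option 1
Use groupby with transform and 'first'.
transform broadcasts the first 'Dummy_Account' value found in each 'Account_Number' group to every row of that group.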
# Replace every Account_Number with the first Dummy_Account seen in its group.
df.assign(
    Account_Number=df.groupby('Account_Number')['Dummy_Account'].transform('first')
)
Account_Number Dummy_Account
0 ACC0000000000001 ACC0000000000001
1 ACC0000000000002 ACC0000000000002
2 ACC0000000000001 ACC0000000169532
3 ACC0000000123005 ACC0000000123005
4 ACC0000000000004 ACC0000000000004
5 ACC0000000000005 ACC0000000000005
6 ACC0000000123005 ACC0000000000040
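Option 2
Use Numpy's np.unique to get the index of each first occurrence and the inverse array.
The index values (idx) mark where each unique 'Account_Number' first occurs; I use them to slice 'Dummy_Account'. The inverse array (inv) normally puts the unique values back in their original positions, but here I apply it to the sliced 'Dummy_Account' values instead.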
# idx: position of the first occurrence of each unique account number.
# inv: for every row, the position of its account number within the uniques.
u, idx, inv = np.unique(
    df['Account_Number'].values, return_index=True, return_inverse=True
)
# Take the Dummy_Account at each first occurrence, then expand back to all rows.
df.assign(Account_Number=df['Dummy_Account'].values[idx][inv])
Account_Number Dummy_Account
0 ACC0000000000001 ACC0000000000001
1 ACC0000000000002 ACC0000000000002
2 ACC0000000000001 ACC0000000169532
3 ACC0000000123005 ACC0000000123005
4 ACC0000000000004 ACC0000000000004
5 ACC0000000000005 ACC0000000000005
6 ACC0000000123005 ACC0000000000040
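Option 3
Use pd.factorize and pd.Series.duplicated.
Similar to Option 2, but duplicated identifies where the first occurrences are. I slice 'Dummy_Account' with the resulting boolean array, then use the codes from pd.factorize to put the values back in place. f plays the same role here as inv in Option 2.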
# d: True on the first occurrence of each account number.
d = ~df['Account_Number'].duplicated().values
# f: per-row integer code of the account number, numbered by first appearance.
f, u = pd.factorize(df['Account_Number'].values)
# First-occurrence Dummy_Account values, expanded back to every row via the codes.
df.assign(Account_Number=df['Dummy_Account'].values[d][f])
Account_Number Dummy_Account
0 ACC0000000000001 ACC0000000000001
1 ACC0000000000002 ACC0000000000002
2 ACC0000000000001 ACC0000000169532
3 ACC0000000123005 ACC0000000123005
4 ACC0000000000004 ACC0000000000004
5 ACC0000000000005 ACC0000000000005
6 ACC0000000123005 ACC0000000000040
Time tests
results
# Raw timings on log-log axes.
res.plot(loglog=True)

# Each row divided by its own minimum, so the fastest method per size reads 1.0.
res.div(res.min(axis=1), axis=0)
jez cold pir1 pir2 pir3
10 4.524811 3.819322 2.870916 1.000000 1.238144
30 4.833144 4.093932 3.310285 1.000000 1.382189
100 4.863337 4.048008 3.146154 1.000000 1.320060
300 5.144460 3.894850 3.157636 1.000000 1.357779
1000 4.870499 3.742524 2.348021 1.000000 1.069559
3000 5.375105 3.432398 1.852771 1.126024 1.000000
10000 7.100372 4.335100 1.890134 1.551161 1.000000
30000 7.227139 3.993985 1.530002 1.594531 1.000000
100000 8.052324 3.811728 1.380440 1.708170 1.000000
300000 8.690613 4.204664 1.539624 1.942090 1.000000
1000000 7.787494 3.668117 1.498758 2.129085 1.000000
Setup
def jez(d):
    """Replace each Account_Number with the first Dummy_Account of its group.

    Sorts by 'Account_Number', blanks 'Dummy_Account' on every repeated
    account, forward-fills from the first row of each group, and restores
    the original row order.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with 'Account_Number' and 'Dummy_Account' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in the original index order; ``d`` is not modified.
    """
    # A stable sort keeps rows of the same account in their original relative
    # order, so the first row of each sorted group is the first occurrence.
    v = d.sort_values('Account_Number', kind='mergesort')
    is_repeat = v.duplicated('Account_Number')
    # Only first occurrences keep their value; ffill spreads it over the group.
    firsts = v['Dummy_Account'].mask(is_repeat).ffill()
    return v.assign(Account_Number=firsts).sort_index()
def cold(d):
    """Replace each Account_Number with the first Dummy_Account of its group.

    Builds a lookup Series (account number -> Dummy_Account of its first
    occurrence) and maps the 'Account_Number' column through it.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with 'Account_Number' and 'Dummy_Account' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'Account_Number' replaced; ``d`` is not modified.
    """
    # One row per account number: its first occurrence.
    firsts = d.drop_duplicates('Account_Number', keep='first')
    lookup = firsts.set_index('Account_Number')['Dummy_Account']
    return d.assign(Account_Number=d['Account_Number'].map(lookup))
def pir1(d):
    """Replace each Account_Number with its group's first Dummy_Account.

    Groups by 'Account_Number' and uses ``transform('first')`` to broadcast
    the group's first 'Dummy_Account' value to every row of that group.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with 'Account_Number' and 'Dummy_Account' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'Account_Number' replaced; ``d`` is not modified.
    """
    first_per_row = d.groupby('Account_Number')['Dummy_Account'].transform('first')
    return d.assign(Account_Number=first_per_row)
def pir2(d):
    """Replace each Account_Number with the first Dummy_Account of its group.

    Uses ``np.unique`` on the account numbers to obtain the position of each
    first occurrence and the inverse mapping, then indexes the
    'Dummy_Account' values with both.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with 'Account_Number' and 'Dummy_Account' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'Account_Number' replaced; ``d`` is not modified.
    """
    # The unique values themselves are not needed, only the two index arrays.
    _, first_pos, inverse = np.unique(
        d['Account_Number'].values,
        return_index=True,
        return_inverse=True,
    )
    # Dummy_Account at each first occurrence, expanded back to one per row.
    first_dummy = d['Dummy_Account'].values[first_pos]
    return d.assign(Account_Number=first_dummy[inverse])
def pir3(d):
    """Replace each Account_Number with the first Dummy_Account of its group.

    ``duplicated`` marks the first occurrence of every account number and
    ``pd.factorize`` supplies a per-row integer code; the first-occurrence
    'Dummy_Account' values are indexed by those codes.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with 'Account_Number' and 'Dummy_Account' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'Account_Number' replaced; ``d`` is not modified.
    """
    # True exactly on the first row of each distinct account number.
    is_first = ~d['Account_Number'].duplicated().values
    # The unique values returned by factorize are not needed, only the codes.
    codes, _ = pd.factorize(d['Account_Number'].values)
    first_dummy = d['Dummy_Account'].values[is_first]
    return d.assign(Account_Number=first_dummy[codes])
# Timing table: one row per input size, one column per candidate function.
res = pd.DataFrame(
    index=[10, 30, 100, 300, 1000, 3000, 10000,
           30000, 100000, 300000, 1000000],
    columns='jez cold pir1 pir2 pir3'.split(),
    dtype=float
)

# Fixed seed so every run times the same randomly generated frames.
np.random.seed([3, 1415])
for i in res.index:
    # Build a frame of exactly i rows, so the row label in `res` is the real
    # input size. Account numbers are drawn from [0, i // 2).
    d = pd.DataFrame(dict(
        Account_Number=np.random.randint(i // 2, size=i),
        Dummy_Account=range(i)
    ))
    for j in res.columns:
        stmt = f'{j}(d)'
        # timeit runs in its own namespace; pull the function and frame in.
        setp = f'from __main__ import {j}, d'
        res.at[i, j] = timeit(stmt, setp, number=100)