Test data (2M rows):
import re
import time

import pandas as pd
# Brand names searched for in item titles. Raw strings keep the backslash as
# literal data without relying on the invalid escape sequence '\ ', which
# newer Python versions warn about.
# NOTE(review): the backslash looks like the remains of an escaped apostrophe
# ("Victoria's Secret") - confirm the intended spelling.
top_brands = ['Coca Cola', 'Apple', r'Victoria\ Secret', 'Samsung']

# Four sample listings: one with a known brand and three whose brand is
# recorded with the placeholder 'missing'.
items = pd.DataFrame(
    [['Apple 6S', 'Apple'],
     [r'New Victoria\ Secret', 'missing'],
     ['Used Samsung TV', 'missing'],
     ['Used bike', 'missing']],
    columns=['item_title', 'brand_name'])

# Tile the four rows 500000 times (2M rows, fresh 0..N-1 index). Selecting
# repeated row positions gives the same frame as concatenating 500000 copies
# but avoids the per-frame overhead of a half-million-way concat.
items = items.iloc[list(range(len(items))) * 500000].reset_index(drop=True)
Code Block 1 - baseline: a row-wise `apply` that looks for each brand as a whole word in the title:
""" Code Block 1 """
# Benchmark on a private copy so the shared `items` frame stays untouched.
items1 = items.copy(deep=True)
# Start the wall-clock timer only after the copy has been made.
t = time.time()
def get_brand_name_v1(row, brands=None):
    """Return the brand for one item row, searching the title if needed.

    Args:
        row: Mapping-like row with 'brand_name' and 'item_title' entries.
        brands: Optional iterable of brand names to look for. Defaults to
            the module-level ``top_brands``.

    Returns:
        The existing brand name when it is not the placeholder 'missing';
        otherwise the first brand that occurs in the title as a
        space-delimited word (or word sequence); otherwise 'missing'.
    """
    if row['brand_name'] != 'missing':
        return row['brand_name']
    if brands is None:
        brands = top_brands
    # Padding the title with spaces lets a single containment test cover a
    # brand at the start, in the middle, at the end, or as the whole title.
    padded_title = ' ' + row['item_title'] + ' '
    for brand in brands:
        if ' ' + brand + ' ' in padded_title:
            return brand
    return 'missing'
# Resolve the brand row by row; the function is handed to apply directly.
items1['brand_name'] = items1.apply(get_brand_name_v1, axis=1)
# Report the wall-clock seconds elapsed since the timer was started.
print('Code Block 1 time: {:f}'.format(time.time() - t))
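Code Block 2 - a faster row-wise version:
Use NaN instead of the string 'missing' as the placeholder, simplify the match to a plain substring test, and avoid repeated lookups by reading each value once (e.g. keep `row['brand_name']` in a local variable `brand_name`).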
''' Code Block 2 '''
# Benchmark on a private copy so the shared `items` frame stays untouched.
items2 = items.copy()
t = time.time()
# Swap the 'missing' placeholder for a real null. The result is assigned back
# to the column: an in-place replace on the temporary returned by .loc does
# not reliably modify the frame (and never does under Copy-on-Write).
items2['brand_name'] = items2['brand_name'].replace(['missing'], [None])
def get_brand_name_v2(row, brands=None):
    """Return the brand for one item row, using a plain substring search.

    Args:
        row: Mapping-like row with 'brand_name' and 'item_title' entries.
        brands: Optional iterable of brand names to look for. Defaults to
            the module-level ``top_brands``.

    Returns:
        The existing brand name when it is not null; otherwise the first
        brand that occurs anywhere in the title; otherwise None.
    """
    brand_name = row['brand_name']
    # pd.notna treats both None and NaN as "no brand"; an identity test
    # against None would pass NaN through as if it were a brand.
    if pd.notna(brand_name):
        return brand_name
    if brands is None:
        brands = top_brands
    item_title = row['item_title']
    for brand in brands:
        if brand in item_title:
            return brand
    return None
# Resolve the brand row by row; rows without a match come back as null.
items2['brand_name'] = items2.apply(get_brand_name_v2, axis=1)
# Restore the 'missing' placeholder. The result is assigned back to the
# column: an in-place fillna on the temporary returned by .loc does not
# reliably modify the frame (and never does under Copy-on-Write).
items2['brand_name'] = items2['brand_name'].fillna('missing')
print('Code Block 2 time: {:f}'.format(time.time()-t))
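Code Block 3 - following Idlehands' suggestion:
Extract the brand from every title with a single vectorized regular expression and use it only for the rows whose brand_name is missing; titles without any brand stay 'missing'.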
''' Code Block 3 '''
# Benchmark on a private copy so the shared `items` frame stays untouched.
items3 = items.copy()
# Swap the 'missing' placeholder for a real null. Results are assigned back
# to the column throughout this block: in-place calls on the temporary
# returned by .loc do not reliably modify the frame (and never do under
# Copy-on-Write).
items3['brand_name'] = items3['brand_name'].replace(['missing'], [None])
t = time.time()
# One alternation over all brands. Each name is escaped so characters that
# are special in a regular expression (such as the backslash in one of the
# brand names) are matched literally instead of being interpreted.
brand_pattern = '(?P<brand_name>{})'.format(
    "|".join(re.escape(brand) for brand in top_brands))
# Vectorized extraction: the first brand found in each title, null if none.
brands = items3['item_title'].str.extract(brand_pattern, expand=True)
brands['brand_name'] = brands['brand_name'].fillna('missing')
# Keep brands that were already known; fill only the gaps from the titles.
items3['brand_name'] = items3['brand_name'].fillna(brands['brand_name'])
print('Code Block 3 time: {:f}'.format(time.time()-t))
You can do it even faster if you can afford to use NaN instead of 'missing' in your dataset and delete all operations that replace NaN with 'missing'.