Smart way to check if a string contains an item in a list - python

The list top_brands contains a list of brands, for example:

top_brands = ['Coca Cola', 'Apple', 'Victoria\'s Secret', ....]

items is a pandas.DataFrame with the structure shown below. My task is to fill in brand_name from item_title when brand_name is missing.

row     item_title                 brand_name

1    |  Apple 6S                  |  Apple
2    |  New Victoria's Secret     |  missing  <-- need to fill with Victoria's Secret
3    |  Used Samsung TV           |  missing  <-- need to fill with Samsung
4    |  Used bike                 |  missing  <-- no need to do anything because there is no brand name in the title
    ....

My code is as follows. The problem is that it is too slow for a data frame containing 2 million records. Is there any way I can use pandas or numpy to handle this task?

def get_brand_name(row):
    """Return the brand for one row, looking it up in the title if needed.

    ``row`` is indexed by ``'brand_name'`` and ``'item_title'``. When the
    stored brand is anything other than the placeholder ``'missing'`` it is
    returned unchanged. Otherwise every entry of the module-level
    ``top_brands`` is tried in order, and the first one that appears in the
    title as a space-delimited word (surrounded by spaces, or at the very
    start/end of the title next to a space) is printed and returned.
    ``'missing'`` is returned when no brand matches.
    """
    current = row['brand_name']
    if current != 'missing':
        return current

    title = row['item_title']

    for candidate in top_brands:
        prefix = candidate + ' '
        infix = ' ' + candidate + ' '
        suffix = ' ' + candidate
        # Skip brands that are neither inside, at the end, nor at the start.
        if infix not in title and not title.endswith(suffix) and not title.startswith(prefix):
            continue
        print(candidate)
        return candidate

    return 'missing'


# Recompute the brand column row by row (axis=1 passes each row to the function).
items['brand_name'] = items.apply(get_brand_name, axis=1)
+4
source share
4 answers

Try the following:

pd.concat([df['item_title'], df['item_title'].str.extract('(?P<brand_name>{})'.format("|".join(top_brands)), expand=True).fillna('missing')], axis=1)

Output:

              item_title         brand_name
0               Apple 6S              Apple
1  New Victoria's Secret  Victoria's Secret
2        Used Samsung TV            Samsung
3              Used Bike            missing

I ran against a random sample of 2 million items on my machine:

def read_file():
    """Load ``file1.txt`` and pair each item title with an extracted brand.

    The CSV is read with ``pd.read_csv``. A regular expression is built by
    joining the module-level ``top_brands`` with ``|`` inside a named group
    ``brand_name``; ``str.extract`` applies it to the ``item_title`` column
    and titles without a match get ``'missing'``.

    Returns a new DataFrame made of the ``item_title`` column followed by
    the extracted ``brand_name`` column.
    """
    df = pd.read_csv('file1.txt')
    titles = df['item_title']
    brand_regex = '(?P<brand_name>{})'.format("|".join(top_brands))
    brands = titles.str.extract(brand_regex, expand=True).fillna('missing')
    return pd.concat([titles, brands], axis=1)

start = time.time()
print(read_file())
end = time.time() - start
print(f'Took {end}s to process')

Output:

                                   item_title         brand_name
0                                    LG watch                 LG
1                                  Sony watch               Sony
2                                 Used Burger            missing
3                                    New Bike            missing
4                               New underwear            missing
5                                    New Sony               Sony
6                        Used Apple underwear              Apple
7                       Refurbished Panasonic          Panasonic
8                   Used Victoria's Secret TV  Victoria's Secret
9                                Disney phone             Disney
10                                Used laptop            missing
...                                       ...                ...
1999990             Refurbished Disney tablet             Disney
1999991                    Refurbished laptop            missing
1999992                       Nintendo Coffee           Nintendo
1999993                      Nintendo desktop           Nintendo
1999994         Refurbished Victoria's Secret  Victoria's Secret
1999995                           Used Burger            missing
1999996                    Nintendo underwear           Nintendo
1999997                     Refurbished Apple              Apple
1999998                      Refurbished Sony               Sony
1999999                      New Google phone             Google

[2000000 rows x 2 columns]
Took 3.2660000324249268s to process

My machine's specifications:

Windows 7 Pro 64bit Intel i7-4770 @ 3.40GHZ RAM 12.0 GB

3,266 ... ?

+1

, , NER ( ).

Split item_title into n-grams, for example:

['New', 'New Victoria\'s', 'New Victoria\'s Secret', 'Victoria\'s', 'Victoria\'s Secret', 'Secret']

.

If item_title may contain misspelled brand names, compare each n-gram with the brand names using the Levenshtein distance.

0

, - :

# Brand names to look for in the item titles.
# NOTE: the names are joined into a regular expression below, so a brand that
# contains regex metacharacters (e.g. '+' or '.') would need re.escape().
top_brands = ['Coca Cola', 'Apple', "Victoria's Secret", 'Samsung']

df = pd.DataFrame({
         'item_title': ['Apple 6S', "New Victoria's Secret", 'Used Samsung TV', 'Used bike'],
         'brand_name': ['Apple', 'missing', 'missing', 'missing']
         }, columns=['item_title', 'brand_name'])

#               item_title brand_name
# 0               Apple 6S      Apple
# 1  New Victoria's Secret    missing
# 2        Used Samsung TV    missing
# 3              Used bike    missing

# concatenate brand names into one regex alternation
# wrapped in a single capture group
brand_pattern = '(' + '|'.join(top_brands) + ')'

# "(Coca Cola|Apple|Victoria's Secret|Samsung)"

# first brand found in each title; NaN where no brand matches
# (one capture group + expand=False gives a Series, not a DataFrame)
extracted = df['item_title'].str.extract(brand_pattern, expand=False)

# only fill rows whose brand is still the 'missing' placeholder,
# so brand names that are already known are never overwritten
needs_brand = df['brand_name'] == 'missing'
df.loc[needs_brand, 'brand_name'] = extracted[needs_brand].fillna('missing')

#               item_title         brand_name
# 0               Apple 6S              Apple
# 1  New Victoria's Secret  Victoria's Secret
# 2        Used Samsung TV            Samsung
# 3              Used bike            missing
0

Benchmark setup with 2M rows:

import pandas as pd
import time
# Brand names searched for in the item titles by the code blocks below.
# NOTE(review): 'Victoria\ Secret' looks like a garbled "Victoria's Secret"
# (the backslash-space is also an invalid string escape) — confirm against
# the original post before running.
top_brands = ['Coca Cola', 'Apple', 'Victoria\ Secret', 'Samsung']
# Four sample rows; only the first one already carries a brand name.
items = pd.DataFrame(
        [['Apple 6S', 'Apple'],
         ['New Victoria\ Secret', 'missing'],
         ['Used Samsung TV', 'missing'],
         ['Used bike', 'missing']],
         columns=['item_title', 'brand_name'])
# Repeat the 4 sample rows 500000 times (2,000,000 rows) with a fresh index.
items = pd.concat([items]*500000, ignore_index=True)

:

''' Code Block 1 '''
items1 = items.copy()
t = time.time()
def get_brand_name_v1(row):
    """Return the row's brand, searching the title when it is 'missing'.

    A stored brand other than ``'missing'`` is returned as-is. Otherwise the
    entries of the module-level ``top_brands`` are checked in order and the
    first one found in ``row['item_title']`` as a space-delimited word is
    returned; ``'missing'`` is returned when none is found.
    """
    existing = row['brand_name']
    if existing != 'missing':
        return existing
    title = row['item_title']
    for name in top_brands:
        leading = name + ' '
        middle = ' ' + name + ' '
        trailing = ' ' + name
        # Same three checks, in the same order: inside, at the end, at the start.
        if middle in title:
            return name
        if title.endswith(trailing):
            return name
        if title.startswith(leading):
            return name
    return 'missing'
items1['brand_name'] = items1.apply(lambda x: get_brand_name_v1(x), axis=1)
print('Code Block 1 time: {:f}'.format(time.time()-t))

# Code Block 1 time: 53.718933

: NAN , 'missing'. , , "" (, brand_name ['brand_name' ] )

''' Code Block 2 '''
items2 = items.copy()
t = time.time()
# Swap the 'missing' placeholder for None so get_brand_name_v2 can test for it.
# The result is assigned back to the column: an inplace call on the Series
# returned by .loc[...] is not guaranteed to modify items2 (with pandas
# Copy-on-Write it never does).
items2['brand_name'] = items2['brand_name'].replace(['missing'], [None])
def get_brand_name_v2(row):
    """Return the row's brand, or the first known brand found in its title.

    When ``row['brand_name']`` is not ``None`` it is returned unchanged.
    Otherwise the entries of the module-level ``top_brands`` are tried in
    order with a plain substring test against ``row['item_title']``; the
    first hit is returned, and ``None`` is returned when nothing matches.
    """
    known = row['brand_name']
    if known is None:
        title = row['item_title']
        # First brand contained anywhere in the title, else None.
        return next((b for b in top_brands if b in title), None)
    return known
items2['brand_name'] = items2.apply(lambda x: get_brand_name_v2(x), axis=1)
# Rows where no brand was found come back as null; restore the placeholder.
# Assign the filled column back instead of calling fillna(inplace=True) on a
# .loc[...] selection, which may operate on a temporary copy.
items2['brand_name'] = items2['brand_name'].fillna('missing')
print('Code Block 2 time: {:f}'.format(time.time()-t))

# Code Block 2 time: 47.940444

Idlehands : brand_name missing. , .

''' Code Block 3 '''
items3 = items.copy()
# Turn the 'missing' placeholder into a real null so fillna() below can fill it.
# Results are assigned back to the columns throughout this block: an inplace
# call on the Series returned by .loc[...] is not guaranteed to modify the
# parent frame (with pandas Copy-on-Write it never does).
items3['brand_name'] = items3['brand_name'].replace(['missing'], [None])
t = time.time()
# One vectorised regex pass: the first brand found in each title lands in a
# column named after the capture group, 'brand_name' (null when no match).
brands = items3['item_title'].str.extract(
        '(?P<brand_name>{})'.format("|".join(top_brands)), expand=True)
brands['brand_name'] = brands['brand_name'].fillna('missing')
# Only the rows whose brand is still null take the extracted value.
items3['brand_name'] = items3['brand_name'].fillna(brands['brand_name'])
print('Code Block 3 time: {:f}'.format(time.time()-t))

# Code Block 3 time: 3.388266

You can do it even faster if you can afford to use NaN instead of 'missing' in your dataset and delete all the operations that replace NaN with 'missing'.

0
source

Source: https://habr.com/ru/post/1692994/


All Articles