How can I match up two data frames in Python (using pandas or other tools)?

I have one pandas DataFrame containing the names of the world's cities, together with the countries to which the cities belong,

city.head(3)

    city    country
0   Qal eh-ye Now   Afghanistan
1   Chaghcharan Afghanistan
2   Lashkar Gah Afghanistan

and another data frame consisting of the addresses of world universities, which is shown below:

df.head(3)
    university
0   Inst Huizhou, Huihzhou 516001, Guangdong, Peop...
1   Guangxi Acad Sci, Nanning 530004, Guangxi, Peo...
2   Shenzhen VisuCA Key Lab SIAT, Shenzhen, People...

The position of the city name varies from one address to another. I would like to match the city names against the addresses of the universities, that is, to find out which city each university is located in. The matched city name should end up in the same row as the address of the corresponding university.

I tried the following, but it does not work because the city name does not appear at a fixed position in each row.

df['university'].str.split(',').str[0]
+4
6

apply

# Take the candidate names from the `city` column: a DataFrame has no
# .tolist() of its own, only its columns (Series) do.
city_list = city['city'].tolist()


def match_city(row):
    """Return the first known city name that occurs in the row's address.

    Parameters
    ----------
    row : mapping or pandas.Series
        One row of ``df``; only ``row['university']`` is read.

    Returns
    -------
    str
        The first entry of ``city_list`` that is a substring of the address,
        or the string ``'None'`` when no entry matches.
    """
    address = row['university']
    # `name` rather than `city`, so the loop does not shadow the `city` frame.
    for name in city_list:
        if name in address:
            return name
    return 'None'


df['city'] = df.apply(match_city, axis=1)

, . , match_city.

+2

, . , .

numpy NaN, , . , , NaN.

import re

import numpy as np
import pandas as pd

# Sample university addresses; the last entry is an empty address.
data = [
    "Inst Huizhou, Huihzhou 516001, Guangdong, People Republic of China",
    "Guangxi Acad Sci, Nanning 530004, Guangxi, People Republic of China",
    "Shenzhen VisuCA Key Lab SIAT, Shenzhen, People Republic of China",
    "New York University, New York, New York 10012, United States of America",
    "",
]
# One row per address, in a single column named 'university'.
df = pd.DataFrame({'university': data})

def extract_city(row):
    """Extract the city name from a comma-separated university address.

    The city is taken to be the second comma-separated field, with every run
    of digits (e.g. a postal code) removed and surrounding whitespace
    stripped.

    Parameters
    ----------
    row : str
        One address, e.g. ``"Guangxi Acad Sci, Nanning 530004, Guangxi, ..."``.

    Returns
    -------
    str or float
        The cleaned second field, or ``numpy.nan`` when the address contains
        fewer than two commas or is not a string (e.g. a missing value).
    """
    try:
        # Raw string so regex escapes are never read as string escapes.
        match = re.match(r'^[^,]*,([^,]*),', row)
    except TypeError:
        # Missing values arrive as NaN/None, which `re` cannot search.
        return np.nan
    if match is None:
        return np.nan
    return re.sub(r'\d+', '', match.group(1)).strip()


df.university.apply(extract_city)

:

0    Huihzhou
1     Nanning
2    Shenzhen
3    New York
4         NaN
Name: university, dtype: object
+2

, , ( , , , ..), .

, levenshtein jaro-winkler, .

:

class DLDistance:
    """Damerau-Levenshtein edit distance from one fixed string to others.

    Insertions, deletions, substitutions and swaps of two adjacent
    characters each cost 1. The base string is fixed at construction so the
    same instance can be compared against many candidates.

    Attributes:
        s1: the base string.
        lenstr1: ``len(s1)``.
        d: dict mapping ``(i, j)`` to the distance between ``s1[:i + 1]`` and
            ``s2[:j + 1]``; index ``-1`` stands for the empty prefix. It is
            reused between calls, and every cell a call reads is rewritten
            by that call first.
    """

    def __init__(self, s1):
        """Store the base string and fill the border column of the table."""
        self.s1 = s1
        self.d = {}
        self.lenstr1 = len(self.s1)
        # `range` (not the Python 2-only `xrange`) runs on Python 2 and 3.
        # Distance from a prefix of s1 to the empty string = prefix length.
        for i in range(-1, self.lenstr1 + 1):
            self.d[(i, -1)] = i + 1

    def distance(self, s2):
        """Return the edit distance between the base string and ``s2``."""
        lenstr2 = len(s2)
        # Distance from the empty string to a prefix of s2 = prefix length.
        for j in range(-1, lenstr2 + 1):
            self.d[(-1, j)] = j + 1

        for i in range(self.lenstr1):
            for j in range(lenstr2):
                if self.s1[i] == s2[j]:
                    cost = 0
                else:
                    cost = 1
                self.d[(i, j)] = min(
                    self.d[(i - 1, j)] + 1,  # deletion
                    self.d[(i, j - 1)] + 1,  # insertion
                    self.d[(i - 1, j - 1)] + cost,  # substitution
                )
                # The last two characters of both prefixes are swapped.
                if (i and j and self.s1[i] == s2[j - 1]
                        and self.s1[i - 1] == s2[j]):
                    self.d[(i, j)] = min(
                        self.d[(i, j)],
                        self.d[(i - 2, j - 2)] + cost,  # transposition
                    )

        return self.d[(self.lenstr1 - 1, lenstr2 - 1)]

if __name__ == '__main__':
    # Small demo: distance from one base string to a few candidates.
    base = u'abs'
    cmpstrs = [u'abs', u'sdfbasz', u'asdf', u'hfghfg']
    dl = DLDistance(base)

    for s in cmpstrs:
        # Single-argument print(...) behaves the same on Python 2 and 3,
        # whereas the print statement is a syntax error on Python 3.
        print("damerau_levenshtein")
        print(dl.distance(s))

, , N * M , N , M . ( , , , , )

levenshtein distance: https://en.wikipedia.org/wiki/Levenshtein_distance

jaro-winkler: https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance

+2

, , , , . , , , , .

, :

def address_to_dict(address):
    """Map every comma-separated part of ``address`` to the full address.

    Parts are stripped of surrounding whitespace so they can be compared
    directly with city names; without this every part after the first keeps
    its leading space and never equals a city name. Empty parts are skipped.

    Parameters
    ----------
    address : str
        A comma-separated address string.

    Returns
    -------
    dict
        ``{part: address}`` for each non-empty, stripped part.
    """
    parts = (part.strip() for part in address.split(","))
    return {part: address for part in parts if part}

, , ,... , , . . : N- Python

, :

# Build one {word: address} dict per university, expand the list of dicts
# into a frame with one column per distinct word, then stack it into a long
# Series indexed by (row number, word).
word_to_address_mapping = pd.DataFrame(df.university.apply(address_to_dict ).tolist()).stack()

# Wrap the stacked values in a frame under the column name "address".
# NOTE(review): relies on how pd.DataFrame treats an unnamed Series combined
# with `columns=` - confirm on the pandas version in use.
word_to_address_mapping = pd.DataFrame(word_to_address_mapping, 
                                       columns=["address"])
# Drop the row-number level so the index holds only the word.
word_to_address_mapping.index = word_to_address_mapping.index.droplevel(level=0)
# Bare expression: displays the result in an interactive session.
word_to_address_mapping

- :

enter image description here

, , , : word_to_address_mapping, , .

# Join the word index of word_to_address_mapping against the "city" column.
# The outer join here should ensure that several universities in the
# same city do not overwrite each other.
pd.merge(left=word_to_address_mapping, right=city,
         left_index=True, right_on="city",
         how="outer")
+1

. . , .

In [22]: def get_city(univ_name_split):
   ....:     # find country from university address
   ....:     for name in univ_name_split:
   ....:         if name in city['country'].values:
   ....:             country = name
   ....:     else:
   ....:         country = None

   ....:     if country:
   ....:         cities = city[city.country == country].city.values
   ....:     else:
   ....:         cities = city['city'].values

   ....:     # find city from university address
   ....:     for name in univ_name_split:
   ....:         if name in cities:
   ....:             return name
   ....:     else:
   ....:         return None
   ....:     


In [1]: import pandas as pd

In [2]: city = pd.read_csv('city.csv')

In [3]: df = pd.read_csv('university.csv')

In [4]: # splitting university name and address

In [5]: df_split = df['university'].str.split(',')

In [6]: df_split = df_split.apply(lambda x:[i.strip() for i in x])

In [10]: df
Out[10]: 
                                          university
0  Kongu Engineering College, Perundurai, Erode, ...
1           Anna University - Guindy, Chennai, India
2  Birla Institute of Technology and Science, Pil...

In [11]: df_split
Out[11]: 
0    [Kongu Engineering College, Perundurai, Erode,...
1           [Anna University - Guindy, Chennai, India]
2    [Birla Institute of Technology and Science, Pi...
Name: university, dtype: object

In [12]: city
Out[12]: 
         city country
0   Bangalore   India
1     Chennai   India
2  Coimbatore   India
3       Delhi   India
4       Erode   India


#This function is shorter version of above function
In [14]: def get_city(univ_name_split):
   ....:     for name in univ_name_split:
   ....:         if name in city['city'].values:
   ....:             return name
   ....:     else:
   ....:         return None
   ....:     

In [15]: df['city'] = df_split.apply(get_city)

In [16]: df
Out[16]: 
                                          university     city
0  Kongu Engineering College, Perundurai, Erode, ...    Erode
1           Anna University - Guindy, Chennai, India  Chennai
2  Birla Institute of Technology and Science, Pil...     None
0

I wrote a small library for my own projects, specifically for fuzzy joins. It may not be the fastest solution, but it may help; feel free to use it. Link to my GitHub repository

0
source

Source: https://habr.com/ru/post/1653814/


All Articles