Matching states and cities with a few possible words

I have a Python list, for example, the following elements:

['Alabama[edit]',
 'Auburn (Auburn University)[1]',
 'Florence (University of North Alabama)',
 'Jacksonville (Jacksonville State University)[2]',
 'Livingston (University of West Alabama)[2]',
 'Montevallo (University of Montevallo)[2]',
 'Troy (Troy University)[2]',
 'Tuscaloosa (University of Alabama, Stillman College, Shelton State)[3][4]',
 'Tuskegee (Tuskegee University)[5]',
 'Alaska[edit]',
 'Fairbanks (University of Alaska Fairbanks)[2]',
 'Arizona[edit]',
 'Flagstaff (Northern Arizona University)[6]',
 'Tempe (Arizona State University)',
 'Tucson (University of Arizona)',
 'Arkansas[edit]',
 'Arkadelphia (Henderson State University, Ouachita Baptist University)[2]',
 'Conway (Central Baptist College, Hendrix College, University of Central Arkansas)[2]',
 'Fayetteville (University of Arkansas)[7]']

The list is not complete, but enough to give you an idea of ​​what is in it.

The data is structured as follows:

Each US state name is followed by the names of some cities in that state. As you can see, a state name ends in "[edit]", and a city name ends either with a number in square brackets (for example, "[1]" or "[2]") or with the university name in parentheses (for example, "(University of North Alabama)").

(Find the full help file for this problem here )

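I want to build a Python dictionary in which each state name is a key and its value is the list of all city names in that state. For the list above, the result should look like this: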

{'Alabama': ['Auburn', 'Florence', 'Jacksonville'...], 'Arizona': ['Flagstaff', 'Tempe', 'Tucson', ....], ......}

Here is the code I have written so far:

import numpy as np
import pandas as pd

    def get_list_of_university_towns():
        '''
        Returns a DataFrame of towns and the states they are in from the 
        university_towns.txt list. The format of the DataFrame should be:
        DataFrame( [ ["Michigan", "Ann Arbor"], ["Michigan", "Yipsilanti"] ], 
        columns=["State", "RegionName"]  )

        The following cleaning needs to be done:

        1. For "State", removing characters from "[" to the end.
        2. For "RegionName", when applicable, removing every character from " (" to the end.
        3. Depending on how you read the data, you may need to remove newline character '\n'. 

        '''

        # Read the whole file and split it into lines.
        # NOTE(review): the file handle is never closed explicitly.
        fhandle = open("university_towns.txt")
        ftext = fhandle.read().split("\n")

        # Keep only the first space-separated word of every line.
        # NOTE(review): this truncates multi-word names ("San Diego" -> "San",
        # "New York[edit]" -> "New"), so such state headers no longer end in
        # "[edit]" and are treated as towns by the loop below.
        reftext = list()
        for item in ftext:
            reftext.append(item.split(" ")[0])

        #pos = reftext[0].find("[")
        #reftext[0] = reftext[0][:pos]

        towns = list()   # towns collected for the state currently being read
        dic = dict()     # state name -> list of towns

        for item in reftext:
            # The first header is special-cased so that no previous state is
            # stored before any towns have been collected.
            if item == "Alabama[edit]":
                state = "Alabama"

            # Any later header: store the towns gathered for the previous
            # state, then start a fresh list for the new one.
            elif item.endswith("[edit]"):
                dic[state] = towns
                towns = list()
                pos = item.find("[")
                item = item[:pos]
                state = item

            else:
                towns.append(item)

        # NOTE(review): the towns of the last state are never stored in dic,
        # because dic is only written when the *next* header is seen.
        # NOTE(review): the raw list of lines is returned here, not dic and
        # not the DataFrame described in the docstring.
        return ftext

    get_list_of_university_towns()

When I build the dictionary this way, I get the following output:

{'Alabama': ['Auburn',
  'Florence',
  'Jacksonville',
  'Livingston',
  'Montevallo',
  'Troy',
  'Tuscaloosa',
  'Tuskegee'],
 'Alaska': ['Fairbanks'],
 'Arizona': ['Flagstaff', 'Tempe', 'Tucson'],
 'Arkansas': ['Arkadelphia',
  'Conway',
  'Fayetteville',
  'Jonesboro',
  'Magnolia',
  'Monticello',
  'Russellville',
  'Searcy'],
 'California': ['Angwin',
  'Arcata',
  'Berkeley',
  'Chico',
  'Claremont',
  'Cotati',
  'Davis',
  'Irvine',
  'Isla',
  'University',
  'Merced',
  'Orange',
  'Palo',
  'Pomona',
  'Redlands',
  'Riverside',
  'Sacramento',
  'University',
  'San',
  'San',
  'Santa',
  'Santa',
  'Turlock',
  'Westwood,',
  'Whittier'],
 'Colorado': ['Alamosa',
  'Boulder',
  'Durango',
  'Fort',
  'Golden',
  'Grand',
  'Greeley',
  'Gunnison',
  'Pueblo,'],
 'Connecticut': ['Fairfield',
  'Middletown',
  'New',
  'New',
  'New',
  'Storrs',
  'Willimantic'],
 'Delaware': ['Dover', 'Newark'],
 'Florida': ['Ave',
  'Boca',
  'Coral',
  'DeLand',
  'Estero',
  'Gainesville',
  'Orlando',
  'Sarasota',
  'St.',
  'St.',
  'Tallahassee',
  'Tampa'],
 'Georgia': ['Albany',
  'Athens',
  'Atlanta',
  'Carrollton',
  'Demorest',
  'Fort',
  'Kennesaw',
  'Milledgeville',
  'Mount',
  'Oxford',
  'Rome',
  'Savannah',
  'Statesboro',
  'Valdosta',
  'Waleska',
  'Young'],
 'Hawaii': ['Manoa'],

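The problem: for city names that contain a space (for example, "San Diego"), only the first word is kept. The rest of the name is lost.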

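How can I extract the full city name, including any spaces, while still dropping the university names and the bracketed numbers? Should I use a regex for this?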

+4
4

You [c/sh]ould change

fhandle = open("university_towns.txt")
ftext = fhandle.read().split("\n") 

# to

with open("university_towns.txt","r") as f:
    d = f.readlines()

# file is autoclosed here, lines are autosplit by readlines()

:

def save(state, city, dic):
    '''Record city under state in dic, creating the state's list if needed.

    state -- dictionary key (the state name).
    city  -- city name to append; an empty string means "only make sure the
             state exists" and nothing is appended.
    dic   -- dict mapping state name -> list of city names; modified in place.

    Returns None.
    '''
    # setdefault creates the empty list on first sight of a state and returns
    # the existing list otherwise, so a city is never lost just because its
    # state had not been registered yet.
    cities = dic.setdefault(state, [])
    # Blank lines and bare state registrations arrive as "", which must not
    # show up as a city name.
    if city:
        cities.append(city)

dic = {}
state = ""

with open("university_towns.txt", "r") as f:
    d = f.readlines()

for line in d:
    if "[edit]" not in line:
        # City line: keep the text before the first "(" and before the
        # first "[", without surrounding blanks.
        before_paren = line.split("(")[0]
        city = before_paren.split("[")[0].strip()
        save(state, city, dic)
        continue

    # State header: drop the "[edit]" marker and register the state right
    # away, so that a state with no cities still gets an entry.
    state = line.replace("[edit]", "").strip()
    save(state, "", dic)

print (dic)

():

{
 'Alabama' : ['Auburn', 'Florence', 'Jacksonville', 'Livingston',
              'Montevallo', 'Troy', 'Tuscaloosa', 'Tuskegee'], 
 'Alaska'  : ['Fairbanks'], 
 'Arizona' : ['Flagstaff', 'Tempe', 'Tucson'], 
 'Arkansas': ['Arkadelphia', 'Conway', 'Fayetteville']
}
+2

, :

# Matches one state section: a header line "<state>[edit]" followed by all
# text (lazily) up to, but not including, the next line that ends in
# "[edit]", or up to the end of the string.
states_rx = re.compile(r'''
^
(?P<state>.+?)\[edit\]
(?P<cities>[\s\S]+?)
(?=^.*\[edit\]$|\Z)
''', re.MULTILINE | re.VERBOSE)

# Matches the start of a line up to the first "(" or ")" (or the line end).
# NOTE(review): a city line without parentheses (e.g. "Name[2]") would be
# matched whole, "[2]" included -- confirm the data has no such lines.
cities_rx = re.compile(r'''^[^()\n]+''', re.MULTILINE)

# lst_ is the input list of strings (not defined in this snippet); it is
# joined into one newline-separated text so the multiline patterns apply.
transformed = '\n'.join(lst_)

# state name -> list of city names; rstrip removes the blank left in front
# of the opening parenthesis.
result = {state.group('state'): [city.group(0).rstrip() 
        for city in cities_rx.finditer(state.group('cities'))] 
        for state in states_rx.finditer(transformed)}
print(result)

{'Alabama': ['Auburn', 'Florence', 'Jacksonville', 'Livingston', 'Montevallo', 'Troy', 'Tuscaloosa', 'Tuskegee'], 'Alaska': ['Fairbanks'], 'Arizona': ['Flagstaff', 'Tempe', 'Tucson'], 'Arkansas': ['Arkadelphia', 'Conway', 'Fayetteville']}


:

, :

  • \n
  • dict


transformed = '\n'.join(your_list)

^                      # match start of the line
(?P<state>.+?)\[edit\] # capture anything in that line up to [edit]
(?P<cities>[\s\S]+?)   # afterwards match anything up to
(?=^.*\[edit\]$|\Z)    # ... either another state or the very end of the string

regex101.com.

^[^()\n]+              # match start of the line, anything not a newline character or ( or )

regex101.com.

result = {state.group('state'): [city.group(0).rstrip() for city in cities_rx.finditer(state.group('cities'))] for state in states_rx.finditer(transformed)}

:

for state in states_rx.finditer(transformed):
    # state is in state.group('state')
    for city in cities_rx.finditer(state.group('cities')):
        # city is in city.group(0), possibly with whitespaces
        # hence the rstrip


, :
import timeit
print(timeit.timeit(findstatesandcities, number=10**5))
# 12.234304904000965

So, on my machine, running this 100,000 times takes about 12 seconds, which should be fast enough.

+4

:

:

First, read the file line by line and, whenever a state line is found, put a marker, "pos_flag", in front of it:

import re

# Raw string: "\w" and "\[" are regex escapes, not string escapes; in a plain
# literal they are invalid escape sequences that newer Pythons warn about.
pattern = r'\w+(?=\[edit\])'

# Flat list of the form ['pos_flag', state, city, city, ..., 'pos_flag', ...].
track = []
with open('mon.txt', 'r') as f:
    for line in f:
        if re.search(pattern, line):
            # State header: emit the marker, then the name without "[edit]".
            track.append('pos_flag')
            track.append(line.strip().split('[')[0])
        else:
            # City line: keep the text in front of the first "(".
            track.append(line.strip().split('(')[0])

- :

['pos_flag', 'Alabama', 'Auburn ', 'Florence ', 'Jacksonville ', 'Livingston ', 'Montevallo ', 'Troy ', 'Tuscaloosa ', 'Tuskegee ', 'pos_flag', 'Alaska', 'Fairbanks ', 'pos_flag', 'Arizona', 'Flagstaff ', 'Tempe ', 'Tucson ', 'pos_flag', 'Arkansas', 'Arkadelphia ', 'Conway ', 'Fayetteville ', 'Jonesboro ', 'Magnolia ', 'Monticello ', 'Russellville ', 'Searcy ', 'pos_flag', 

, "pos_flag" :

:

pos_flag :

# Positions in track at which a state section starts.
index_no = [position for position, entry in enumerate(track) if entry == 'pos_flag']

:

[0, 10, 13, 18, 28, 55, 66, 75, 79, 93, 111, 114, 119, 131, 146, 161, 169, 182, 192, 203, 215, 236, 258, 274, 281, 292, 297, 306, 310, 319, 331, 338, 371, 391, 395, 419, 432, 444, 489, 493, 506, 512, 527, 551, 559, 567, 581, 588, 599, 614]

no, :

:

Slice the list at the marker indexes, and use the first word of each slice as the dict key and the rest as the dict value:

city_dict = {}
# Pair every marker position with the following one; the last section has no
# following marker and simply runs to the end of track.
section_ends = index_no[1:] + [len(track)]
for start, end in zip(index_no, section_ends):
    section = track[start:end]   # ['pos_flag', state, city, city, ...]
    # Cities start at index 2 for every section, the last one included, so
    # the state name never ends up in its own city list.
    city_dict[section[1]] = section[2:]

print(city_dict)

output:

Since dicts are not ordered in Python 3.5, the output order differs from the order in the input file:

{'Kentucky': ['Bowling Green ', 'Columbia ', 'Georgetown ', 'Highland Heights ', 'Lexington ', 'Louisville ', 'Morehead ', 'Murray ', 'Richmond ', 'Williamsburg ', 'Wilmore '], 'Mississippi': ['Cleveland ', 'Hattiesburg ', 'Itta Bena ', 'Oxford ', 'Starkville '], 'Wisconsin': ['Appleton ', 'Eau Claire ', 'Green Bay ', 'La Crosse ', 'Madison ', 'Menomonie ', 'Milwaukee ', 

full_code:

import re

# Raw string: "\w" and "\[" are regex escapes, not string escapes; in a plain
# literal they are invalid escape sequences that newer Pythons warn about.
pattern = r'\w+(?=\[edit\])'

# Step 1: flatten the file into
# ['pos_flag', state, city, city, ..., 'pos_flag', state, ...].
track = []
with open('mon.txt', 'r') as f:
    for line in f:
        if re.search(pattern, line):
            # State header: emit the marker, then the name without "[edit]".
            track.append('pos_flag')
            track.append(line.strip().split('[')[0])
        else:
            # City line: keep the text in front of the first "(".
            track.append(line.strip().split('(')[0])

# Step 2: positions in track at which a state section starts.
index_no = [position for position, entry in enumerate(track) if entry == 'pos_flag']

# Step 3: cut track into sections.  Every marker is paired with the next one;
# the last section has no following marker and runs to the end of track.
city_dict = {}
section_ends = index_no[1:] + [len(track)]
for start, end in zip(index_no, section_ends):
    section = track[start:end]   # ['pos_flag', state, city, city, ...]
    # Cities start at index 2 for every section, the last one included, so
    # the state name never ends up in its own city list.
    city_dict[section[1]] = section[2:]

print(city_dict)

The second solution:

If you want to use regex, try this small solution:

import re

# Group 1: the state name, i.e. everything in front of "[edit]" on a header
#          line.  [^\[\n]+ (rather than \w+) lets multi-word names such as
#          "New York" match, and the leading ^ keeps a header from starting
#          in the middle of a line.
# Group 2: everything after the header up to the next header line (or the
#          end of the text); the negative lookahead stops the scan there.
pattern = r'^([^\[\n]+)\[edit\]((?:(?!^[^\[\n]+\[edit\]).)*)'

with open('file.txt', 'r') as f:
    text = f.read()

for section in re.finditer(pattern, text, re.DOTALL | re.MULTILINE):
    state = section.group(1).strip()
    # Keep the text in front of the first "(" or "[" of every non-blank line.
    # Filtering blank lines (instead of chopping off the last element) keeps
    # the final city even when the file has no trailing newline.
    cities = [
        line.split('(')[0].split('[')[0].strip()
        for line in section.group(2).splitlines()
        if line.strip()
    ]
    # One single-entry dict is printed per state.
    print({state: cities})

It will give:

{'Alabama': ['Auburn', 'Florence', 'Jacksonville', 'Livingston', 'Montevallo', 'Troy', 'Tuscaloosa', 'Tuskegee']}
{'Alaska': ['Fairbanks']}
{'Arizona': ['Flagstaff', 'Tempe', 'Tucson']}
{'Arkansas': ['Arkadelphia', 'Conway', 'Fayetteville', 'Jonesboro', 'Magnolia', 'Monticello', 'Russellville', 'Searcy']}
{'California': ['Angwin', 'Arcata', 'Berkeley', 'Chico', 'Claremont', 'Cotati', 'Davis', 'Irvine', 'Isla Vista', 'University Park, Los Angeles', 'Merced', 'Orange', 'Palo Alto', 'Pomona', 'Redlands', 'Riverside', 'Sacramento', 'University District, San Bernardino', 'San Diego', 'San Luis Obispo', 'Santa Barbara', 'Santa Cruz', 'Turlock', 'Westwood, Los Angeles', 'Whittier']}
{'Colorado': ['Alamosa', 'Boulder', 'Durango', 'Fort Collins', 'Golden', 'Grand Junction', 'Greeley', 'Gunnison', 'Pueblo, Colorado']}

demo:

+3
source

I tried to eliminate the need for more than one regular expression.

import re

def mkdict(data):
  """Build a {state: [city, ...]} dictionary from the raw text.

  data -- the whole text as one string, one entry per line.  A line
          containing "[edit]" is a state header; any other non-empty line
          is a city whose name is the text in front of the first "(", "["
          or ":".

  Returns a dict mapping each state name to the list of its city names, in
  order of appearance.

  Raises KeyError if a city line appears before the first state header.
  """
  state, towns_by_state = None, {}
  # Group 1: state name (text in front of "[edit]" on a header line).
  # Group 2: leading part of any other line, up to "(", "[", ":" or the end.
  rx = re.compile(r'^(?:(.+)\[edit\]|([^(\[\n:]+))', re.M)
  for m in rx.finditer(data):
    header, city = m.groups()
    if header is not None:
      # The name is captured without the marker.  str.rstrip('[edit]') would
      # strip a *set of characters* and eat the tail of names such as
      # "Delaware" or "Rhode Island".
      state = header
      towns_by_state[state] = []
    else:
      towns_by_state[state].append(city.rstrip())
  return towns_by_state

if __name__ == '__main__':
  import functools
  import sys
  import timeit

  # Read the whole input from stdin, time 1000 runs of mkdict on it, then
  # print the resulting dictionary once.
  data = sys.stdin.read()
  timer = timeit.Timer(functools.partial(mkdict, data))
  print(timer.timeit(10**3))
  print(mkdict(data))

Try it online .

+2
source

Source: https://habr.com/ru/post/1691579/


All Articles