Python regular expression to capture a comma-separated list

I have a list of weather forecasts that all start with a similar prefix, which I would like to remove. I would also like to capture the names of the cities:

Some examples:

If you have vacation or wedding plans in Phoenix, Tucson, Flagstaff, Salt Lake City, Park City, Denver, Estes Park, Colorado Springs, Pueblo, or Albuquerque, the week will be ...

If you have vacation or wedding plans in Miami, Jacksonville, Macon, Charlotte, or Charleston, expect a couple of systems ...

If you have vacation or wedding plans in Pittsburgh, Philadelphia, Atlantic City, Newark, Baltimore, D.C., Richmond, Charleston or Dover, expect a week ...

Every line begins with the common prefix "If you have vacation or wedding plans in", and the last city is preceded by "or". The list of cities has a variable length.

I tried this:

>>> text = 'If you have vacation or wedding plans in NYC, Boston, Manchester, Concord, Providence, or Portland'
>>> re.search(r'^If you have vacation or wedding plans in ((\b\w+\b), ?)+ or (\w+)', text).groups()
('Providence,', 'Providence', 'Portland')
>>>

I think I'm pretty close, but obviously it doesn't work. I have never tried to capture a variable number of items before; any guidance would be greatly appreciated.

+4
source share
4 answers
import re

s = "If you have vacation or wedding plans for Miami, Jacksonville, Macon, Charlotte, or Charleston, expect a couple systems"

# Compile once: the same pattern is applied to every forecast line.
# Group 1 is everything between the prefix and the final "or" (lazy, so it
# stops at the first " or "); group 2 is the last city, up to the next comma.
# Matching up to commas instead of using \w+ keeps multi-word names such as
# "Salt Lake City" intact, and ",?" accepts lists written without a comma
# before "or".
p = re.compile(r"If you have vacation or wedding plans (?:in|for) (.+?),? or ([^,]+)")
m = p.match(s)
if m is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError("text does not match the expected forecast prefix")
print(m.group(1))  # output: Miami, Jacksonville, Macon, Charlotte
cities = m.group(1).split(", ")  # ['Miami', 'Jacksonville', 'Macon', 'Charlotte']
cities.append(m.group(2))  # add the city after "or"
print(cities)  # ['Miami', 'Jacksonville', 'Macon', 'Charlotte', 'Charleston']

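The cities before "or" are captured as a single comma-separated group and the last city is captured after "or"; the first group is then split on ", " to obtain the individual city names.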

By the way, since the pattern is applied to many lines, it is preferable to work with a compiled pattern object.

PS: the pattern accepts either "for" or "in" after the prefix.

0

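A different approach: use the csv module to parse the lines (here they are assumed to be stored in a file, data.csv). For every row: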

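  • split the line on commas,
  • drop the trailing "..." text after the last city,
  • strip the common prefix and the "or " (before the last city)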

Code:

import csv


def cleanup(row):
    """Return the city names contained in one comma-split forecast row.

    ``row`` is the list of fields obtained by splitting a forecast sentence
    on commas. The last field (the text following the city list) is dropped,
    the sentence prefix is removed from the first field, and the "or" that
    introduces the last city is removed. The input list is not modified.

    An empty row, or a row with a single field, yields an empty list.
    """
    prefixes = (
        'If you have vacation or wedding plans in ',
        'If you have vacation or wedding plans for ',
    )

    new_row = list(row[:-1])
    if not new_row:
        # Blank line or no city fields: nothing to clean.
        return new_row

    # Strip the prefix only where it actually is: at the start of field 0.
    for prefix in prefixes:
        if new_row[0].startswith(prefix):
            new_row[0] = new_row[0][len(prefix):]
            break

    last = new_row[-1]
    if last.startswith('or '):
        # Remove only the leading "or "; a blanket replace would also eat
        # the "or " inside names such as "Windsor Locks".
        new_row[-1] = last[len('or '):]
    elif ' or ' in last:
        # No comma before "or" ("Charleston or Dover"): two cities share
        # one field, so split them apart.
        head, _, tail = last.rpartition(' or ')
        new_row[-1:] = [head, tail]
    return new_row

if __name__ == '__main__':
    # Each line of data.csv is one forecast sentence. The csv reader splits
    # it on commas; skipinitialspace drops the blank that follows each comma.
    with open('data.csv') as f:
        reader = csv.reader(f, skipinitialspace=True)
        for row in reader:
            if not row:
                # Blank lines come back as an empty list; skip them.
                continue
            # Parenthesised single-argument print works on Python 2 and 3.
            print(cleanup(row))

Output:

['Phoenix', 'Tucson', 'Flagstaff', 'Salt Lake City', 'Park City', 'Denver', 'Estes Park', 'Colorado Springs', 'Pueblo', 'Albuquerque']
['Miami', 'Jacksonville', 'Macon', 'Charlotte', 'Charleston']
['Pittsburgh', 'Philadelphia', 'Atlantic City', 'Newark', 'Baltimore', 'D.C.', 'Richmond', 'Charleston', 'Dover']
+1

Another approach is to use nltk and its Named Entity Recognition: chunk the tagged sentences with nltk.chunk.ne_chunk_sents() and collect the named entities:

import nltk


def extract_entity_names(t):
    """Collect the text of every node labelled 'NE' beneath ``t``.

    A node is anything that has a ``label`` attribute; objects without one
    (the leaves) contribute nothing. For a node whose ``label()`` is 'NE',
    the first item of each child is joined with single spaces into one name.
    Any other node is searched recursively, child by child, in order.

    Returns a new list of names (possibly empty).
    """
    # Leaves have no label attribute and carry no names of their own.
    if not hasattr(t, 'label') or not t.label:
        return []

    # A named-entity node: its children are the words of the name.
    if t.label() == 'NE':
        return [' '.join(leaf[0] for leaf in t)]

    # Any other node: gather names from each child subtree.
    found = []
    for subtree in t:
        found += extract_entity_names(subtree)
    return found


sample = "If you have vacation or wedding plans in Phoenix, Tucson, Flagstaff, Salt Lake City, Park City, Denver, Estes Park, Colorado Springs, Pueblo, or Albuquerque, the week will..."

# Pipeline: split into sentences, tokenize each sentence into words,
# part-of-speech tag the words, then chunk the tagged sentences.
sentences = nltk.sent_tokenize(sample)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
# NOTE(review): binary=True presumably makes the chunker emit the single
# 'NE' label that extract_entity_names looks for; confirm against the nltk
# documentation.
chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

# Gather the entity names from every chunked sentence, in order.
entity_names = []
for tree in chunked_sentences:
    entity_names.extend(extract_entity_names(tree))

# Parenthesised single-argument print works on Python 2 and 3.
print(entity_names)

Output:

['Phoenix', 'Tucson', 'Flagstaff', 'Salt Lake City', 'Park City', 'Denver', 'Estes Park', 'Colorado Springs', 'Pueblo', 'Albuquerque']
+1

>>> text = 'If you have vacation or wedding plans for Phoenix, Tucson, Flagstaff, Salt Lake City, Park City, Denver, Estes Park, Colorado Springs, Pueblo, or Albuquerque, the week will'
>>> match = re.search(r'^If you have vacation or wedding plans (in?|for?) ([\w+ ,]+)',text).groups()[1].split(", ")

>>> match
['Phoenix', 'Tucson', 'Flagstaff', 'Salt Lake City', 'Park City', 'Denver', 'Estes Park', 'Colorado Springs', 'Pueblo', 'or Albuquerque', 'the week will']
0

Source: https://habr.com/ru/post/1617156/


All Articles