Complex string matching

Question

Complex string matching

I want to find the first index of substrings in a larger string. I just want it to match the whole words, and I would like it to be case insensitive, except that I want it to treat CamelCase as separate words.

The code below does the trick, but it is slow. I would like to speed it up. Any suggestions? I tried some regular expressions but couldn't find one that handled all edge cases.

def word_start_index(text, seek_word): start_index = 0 curr_word = "" def case_change(): return curr_word and ch.isupper() and curr_word[-1].islower() def is_match(): return curr_word.lower() == seek_word.lower() for i, ch in enumerate(text): if case_change() or not ch.isalnum(): if is_match(): return start_index curr_word = "" start_index = None if ch.isalnum(): if start_index is None: start_index = i curr_word += ch if is_match(): return start_index if __name__ == "__main__": # 01234567890123456789012345 test_text = "a_foobar_FooBar baz golf_CART" test_words = ["a", "foo", "bar", "baz", "golf", "cart", "fred"] for word in test_words: match_start = word_start_index(test_text, word) print match_start, word

Output:

 0 a 9 foo 12 bar 16 baz 20 golf 25 cart None fred

+4

python string regex find

Jesse aldridge Jan 24 '10 at 13:25

source share

3 answers

word_emitter (below) takes a text string and gives lowercase "words" as they are found one at a time (together with their positions).

It replaces all underscores with spaces. Then it breaks the text into a list. For instance,

 "a_foobar_FooBar baz golf_CART Foo"

becomes

 ['a', 'foobar', 'FooBar', 'baz', 'golf', 'CART', 'Foo']

Of course, you also want camelCase words to be treated as separate words. Therefore, for each part in the above list, we use the regex pattern '(.*[az])(?=[AZ])' separate the words camelCase. This regex uses the re operator to view in standby mode (?=...) . Perhaps this is the hardest part of all this.

word_emitter then displays the words one at a time along with their respective positions.

Once you have a feature that breaks text into “words,” the rest is easy.

I also switch the order of your loops, so you only skip test_text once. This will speed up the process if test_text is very long compared to test_words.

 import re import string import itertools nonspace=re.compile('(\S+)') table = string.maketrans( '_.,!?;:"(){}@#$%^&*-+='+"'", ' ', ) def piece_emitter(text): # This generator splits text into 2-tuples of (positions,pieces). # Given "a_foobar_FooBar" it returns # ((0,'a'), # (2,'foobar'), # (9,'FooBar'), # ) pos=0 it=itertools.groupby(text,lambda w: w.isspace()) for k,g in it: w=''.join(g) w=w.translate(table) it2=itertools.groupby(w,lambda w: w.isspace()) for isspace,g2 in it2: word=''.join(g2) if not isspace: yield pos,word pos+=len(word) def camel_splitter(word): # Given a word like 'FooBar', this generator yields # 'Foo', then 'Bar'. it=itertools.groupby(word,lambda w: w.isupper()) for k,g in it: w=''.join(g) if len(w)==1: try: k1,g1=next(it) w+=''.join(g1) except StopIteration: pass yield w def word_emitter(piece): # Given 'getFooBar', this generator yields in turn the elements of the sequence # ((0,'get'), # (0,'getFoo'), # (0,'getFooBar'), # (3,'Foo'), # (3,'FooBar'), # (6,'Bar'), # ) # In each 2-tuple, the number is the starting position of the string, # followed by the fragment of camelCase word generated by camel_splitter. words=list(camel_splitter(piece)) num_words=len(words) for i in range(0,num_words+1): prefix=''.join(words[:i]) for step in range(1,num_words-i+1): word=''.join(words[i:i+step]) yield len(prefix),word def camel_search(text,words): words=dict.fromkeys(words,False) for pos,piece in piece_emitter(text): if not all(words[test_word] for test_word in words): for subpos,word in word_emitter(piece): for test_word in words: if not words[test_word] and word.lower() == test_word.lower(): yield pos+subpos,word words[test_word]=True break else: break for word in words: if not words[word]: yield None,word if __name__ == "__main__": # 01234567890123456789012345 test_text = "a_foobar_FooBar baz golf_CART" test_words = ["a", "foo", "bar", "baz", "golf", "cart", "fred"] for pos,word in camel_search(test_text,test_words): print pos,word.lower()

Here are the tests I used to test the program:

 import unittest import sys import camel import itertools class Test(unittest.TestCase): def check(self,result,answer): for r,a in itertools.izip_longest(result,answer): if r!=a: print('%s != %s'%(r,a)) self.assertTrue(r==a) def test_piece_emitter(self): tests=(("a_foobar_FooBar baz? golf_CART Foo 'food' getFooBaz", ((0,'a'), (2,'foobar'), (9,'FooBar'), (16,'baz'), (21,'golf'), (26,'CART'), (31,'Foo'), (36,'food'), (42,'getFooBaz'), ) ), ) for text,answer in tests: result=list(camel.piece_emitter(text)) print(result) self.check(result,answer) def test_camel_splitter(self): tests=(('getFooBar',('get','Foo','Bar')), ('getFOObar',('get','FOO','bar')), ('Foo',('Foo',)), ('getFoo',('get','Foo')), ('foobar',('foobar',)), ('fooBar',('foo','Bar')), ('FooBar',('Foo','Bar')), ('a',('a',)), ('fooB',('foo','B')), ('FooB',('Foo','B')), ('FOOb',('FOO','b')), ) for word,answer in tests: result=camel.camel_splitter(word) self.check(result,answer) def test_word_emitter(self): tests=(("a", ((0,'a'),) ), ('getFooBar', ((0,'get'), (0,'getFoo'), (0,'getFooBar'), (3,'Foo'), (3,'FooBar'), (6,'Bar'), ) ) ) for text,answer in tests: result=list(camel.word_emitter(text)) print(result) self.check(result,answer) def test_camel_search(self): tests=(("a_foobar_FooBar baz? golf_CART Foo 'food' getFooBaz", ("a", "foo", "bar", "baz", "golf", "cart", "fred", "food", 'FooBaz'), ((0,'a'), (9,'Foo'), (12,'Bar'), (16,'baz'), (21,'golf'), (26,'CART'), (36,'food'), (45,'FooBaz'), (None,'fred') ) ), ("\"Foo\"",('Foo',),((1,'Foo'),)), ("getFooBar",('FooBar',),((3,'FooBar'),)), ) for text,search_words,answer in tests: result=list(camel.camel_search(text,search_words)) print(result) self.check(result,answer) if __name__ == '__main__': unittest.main(argv = unittest.sys.argv + ['--verbose'])

+3

unutbu Jan 24 '10 at 14:21

source share

With an index to speed up your search :-)

 from collections import defaultdict class IndexedText(object): """ a indexed text """ def __init__(self, text): self.text = text self._index() def word_start_index(self, word): l = len(word) w = word.lower() return self.index[word] def _index(self): self.index = defaultdict( list ) def index( word, pos): self.index[word.lower()].append( pos ) start = 0 it = enumerate(self.text) lpos, lchar = it.next() WS = (' ','_') for pos, char in it: if lchar in WS and char not in WS: index( self.text[start:lpos], start ) start = pos elif lchar.islower() and char.isupper(): # camelcase index( self.text[start:pos], start ) start = pos lpos, lchar = pos, char # last word is missing index( self.text[start:], start ) if __name__ == "__main__": # 01234567890123456789012345 test_text = "a_foobar_FooBar baz golf_CART" test_words = ["a", "foo", "bar", "baz", "golf", "cart", "fred"] index = IndexedText( test_text ) for word in test_words: match_start = index.word_start_index( word ) print match_start, word

+1

Jochen ritzel Jan 24 '10 at 14:52

source share

Mark byers · Accepted Answer · 2010-01-24T13:42:07+0000

If I did this with regular expressions, I would do it like this:

 def word_start_index2(text, seek_word): camel_case = seek_word[0].upper() + seek_word[1:].lower() seek_word_i = ''.join('[' + c.lower() + c.upper() + ']' for c in seek_word) regex1 = r'(?:(?<=[^a-zA-Z])|^)' + seek_word_i + r'(?=$|[^a-zA-Z])' regex2 = r'(?:(?<=[az]|[^AZ])|^)' + camel_case + r'(?=$|[AZ]|[^az])' regex = '%s|%s' % (regex1, regex2) import re m = re.search(regex, text) if not m: return None else: return m.start()

I have not tested this version against your version, but you can try it to see if it is better or worse, or let us know.

My answer may give you different results in some cases, but in your comments you said that you do not care about these cases.

Also, I tried using the notation (?i) to mark part of the regular expression as case insensitive, but for some reason this does not work correctly. I can’t explain why.

Final self-nitpick: the function should check its arguments, but this code is omitted for clarity. You should add checks, at least for the following:

the text should be a string
seek_word must be a string matching '[a-zA-Z] +'

Complex string matching

More articles: