word_emitter (below) takes a text string and gives lowercase "words" as they are found one at a time (together with their positions).
It replaces all underscores with spaces. Then it breaks the text into a list. For instance,
"a_foobar_FooBar baz golf_CART Foo"
becomes
['a', 'foobar', 'FooBar', 'baz', 'golf', 'CART', 'Foo']
Of course, you also want camelCase words to be treated as separate words. Therefore, for each piece in the above list, we use the regex pattern '(.*[a-z])(?=[A-Z])' to split the camelCase words. This regex uses the re module's look-ahead operator (?=...). Perhaps this is the trickiest part of the whole thing.
word_emitter then yields the words one at a time along with their respective positions.
Once you have a function that breaks text into "words," the rest is easy.
I also switched the order of your loops, so you only scan test_text once. This will speed up the process if test_text is very long compared to test_words.
import re
import string
import itertools

nonspace = re.compile(r'(\S+)')

# Characters that separate words, in addition to ordinary whitespace.
_SEPARATORS = '_.,!?;:"(){}@#$%^&*-+=' + "'"

# maketrans lives on the string module in Python 2 and on str in Python 3.
try:
    _maketrans = str.maketrans
except AttributeError:
    _maketrans = string.maketrans

# Translation table mapping every separator to a single space.  maketrans
# requires both arguments to have the same length, so the replacement
# string is sized from the separator list instead of being typed by hand.
table = _maketrans(_SEPARATORS, ' ' * len(_SEPARATORS))


def piece_emitter(text):
    """Split text into (position, piece) 2-tuples.

    Separator characters (see ``table``) are treated like whitespace, and
    each maximal run of the remaining characters is yielded together with
    the index in ``text`` at which it starts.

    Given "a_foobar_FooBar" this generator yields
        (0, 'a'), (2, 'foobar'), (9, 'FooBar')
    """
    pos = 0
    # Translation is one-character-for-one, so indexes into the translated
    # string are also valid indexes into the original text.
    translated = text.translate(table)
    for is_space, chars in itertools.groupby(translated,
                                             lambda c: c.isspace()):
        run = ''.join(chars)
        if not is_space:
            yield pos, run
        pos += len(run)


def camel_splitter(word):
    """Yield the camelCase components of word, in order.

    Given 'FooBar' this generator yields 'Foo', then 'Bar'.  A lone capital
    letter is joined with the lowercase run that follows it; a run of two or
    more capitals is yielded as its own component, so 'getFOObar' yields
    'get', 'FOO', 'bar'.
    """
    groups = itertools.groupby(word, lambda c: c.isupper())
    for is_upper, chars in groups:
        run = ''.join(chars)
        # Only a single *uppercase* letter starts a capitalised word.  A
        # one-letter lowercase run (the 'a' in 'aFoo') must stay separate,
        # otherwise it would swallow the capital that follows it.
        if is_upper and len(run) == 1:
            following = next(groups, None)
            if following is not None:
                run += ''.join(following[1])
        yield run


def word_emitter(piece):
    """Yield (offset, fragment) for every run of adjacent camelCase parts.

    Given 'getFooBar', this generator yields in turn
        (0, 'get'), (0, 'getFoo'), (0, 'getFooBar'),
        (3, 'Foo'), (3, 'FooBar'),
        (6, 'Bar')
    In each 2-tuple the number is the starting position of the fragment
    inside piece, and the fragment is built from one or more consecutive
    components produced by camel_splitter.
    """
    parts = list(camel_splitter(piece))
    count = len(parts)
    offset = 0
    for first in range(count):
        for last in range(first + 1, count + 1):
            yield offset, ''.join(parts[first:last])
        # The next starting component begins right after this one.
        offset += len(parts[first])


def camel_search(text, words):
    """Locate each of words in text, case-insensitively.

    For every search word found, yields (position, matched_text) for its
    first occurrence, in the order the matches appear in text;
    matched_text keeps the capitalisation used in text.  After the text has
    been scanned, yields (None, word) for every search word that was never
    found.  Duplicate entries in words are reported once.
    """
    found = dict.fromkeys(words, False)
    remaining = len(found)
    for pos, piece in piece_emitter(text):
        if not remaining:
            # Every search word has been located; stop scanning early.
            break
        for subpos, word in word_emitter(piece):
            lowered = word.lower()
            for test_word in found:
                if not found[test_word] and lowered == test_word.lower():
                    yield pos + subpos, word
                    found[test_word] = True
                    remaining -= 1
                    break
    for test_word in found:
        if not found[test_word]:
            yield None, test_word


if __name__ == "__main__":
    #            01234567890123456789012345
    test_text = "a_foobar_FooBar baz golf_CART"
    test_words = ["a", "foo", "bar", "baz", "golf", "cart", "fred"]
    for pos, word in camel_search(test_text, test_words):
        # Formatted explicitly so the output is identical on Python 2 and 3.
        print('%s %s' % (pos, word.lower()))
Here are the tests I used to test the program:
import unittest
import sys
import camel
import itertools

# izip_longest was renamed to zip_longest in Python 3; support both.
try:
    _zip_longest = itertools.izip_longest
except AttributeError:
    _zip_longest = itertools.zip_longest


class Test(unittest.TestCase):
    """Unit tests for the generators in the camel module."""

    def check(self, result, answer):
        """Assert that result and answer hold equal items in the same order.

        The two iterables are paired with zip_longest, so a length mismatch
        shows up as a comparison against None rather than being silently
        truncated.
        """
        for r, a in _zip_longest(result, answer):
            # assertEqual reports both values in its failure message.
            self.assertEqual(r, a)

    def test_piece_emitter(self):
        """piece_emitter splits on whitespace and punctuation, with offsets."""
        tests = (
            ("a_foobar_FooBar baz? golf_CART Foo 'food' getFooBaz",
             ((0, 'a'),
              (2, 'foobar'),
              (9, 'FooBar'),
              (16, 'baz'),
              (21, 'golf'),
              (26, 'CART'),
              (31, 'Foo'),
              (36, 'food'),
              (42, 'getFooBaz'),
              )
             ),
        )
        for text, answer in tests:
            result = list(camel.piece_emitter(text))
            print(result)
            self.check(result, answer)

    def test_camel_splitter(self):
        """camel_splitter breaks a piece into its camelCase components."""
        tests = (
            ('getFooBar', ('get', 'Foo', 'Bar')),
            ('getFOObar', ('get', 'FOO', 'bar')),
            ('Foo', ('Foo',)),
            ('getFoo', ('get', 'Foo')),
            ('foobar', ('foobar',)),
            ('fooBar', ('foo', 'Bar')),
            ('FooBar', ('Foo', 'Bar')),
            ('a', ('a',)),
            ('fooB', ('foo', 'B')),
            ('FooB', ('Foo', 'B')),
            ('FOOb', ('FOO', 'b')),
        )
        for word, answer in tests:
            result = camel.camel_splitter(word)
            self.check(result, answer)

    def test_word_emitter(self):
        """word_emitter yields every run of adjacent components."""
        tests = (
            ("a",
             ((0, 'a'),)
             ),
            ('getFooBar',
             ((0, 'get'),
              (0, 'getFoo'),
              (0, 'getFooBar'),
              (3, 'Foo'),
              (3, 'FooBar'),
              (6, 'Bar'),
              )
             )
        )
        for text, answer in tests:
            result = list(camel.word_emitter(text))
            print(result)
            self.check(result, answer)

    def test_camel_search(self):
        """camel_search reports first matches, then the words not found."""
        tests = (
            ("a_foobar_FooBar baz? golf_CART Foo 'food' getFooBaz",
             ("a", "foo", "bar", "baz", "golf", "cart", "fred", "food",
              'FooBaz'),
             ((0, 'a'),
              (9, 'Foo'),
              (12, 'Bar'),
              (16, 'baz'),
              (21, 'golf'),
              (26, 'CART'),
              (36, 'food'),
              (45, 'FooBaz'),
              (None, 'fred')
              )
             ),
            ("\"Foo\"", ('Foo',), ((1, 'Foo'),)),
            ("getFooBar", ('FooBar',), ((3, 'FooBar'),)),
        )
        for text, search_words, answer in tests:
            result = list(camel.camel_search(text, search_words))
            print(result)
            self.check(result, answer)


if __name__ == '__main__':
    # Use the sys module imported above rather than reaching through unittest.
    unittest.main(argv=sys.argv + ['--verbose'])