String mask and regex offset

I have a string for which I am trying to create a regular-expression mask that shows N words starting at a given word offset. Let's say I have the following string:

"The quick, brown fox jumps over the lazy dog."

I want to show 3 words at a time:

offset 0: "The quick, brown"
offset 1: "quick, brown fox"
offset 2: "brown fox jumps"
offset 3: "fox jumps over"
offset 4: "jumps over the"
offset 5: "over the lazy"
offset 6: "the lazy dog."

I use Python, and I used the following simple regular expression to detect three words:

>>> import re
>>> s = "The quick, brown fox jumps over the lazy dog."
>>> re.search(r'(\w+\W*){3}', s).group()
'The quick, brown '

But I can’t figure out how to have a mask to show the next 3 words, not the initial ones. I need to keep punctuation marks.

+3
4

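One option, if you want to stay with regex, is to build the offset into the pattern: skip that many words, then capture the next three.

For example: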

import re
s = "The quick, brown fox jumps over the lazy dog."

print re.search(r'(?:\w+\W*){0}((?:\w+\W*){3})', s).group(1)
# The quick, brown 
print re.search(r'(?:\w+\W*){1}((?:\w+\W*){3})', s).group(1)
# quick, brown fox      
print re.search(r'(?:\w+\W*){2}((?:\w+\W*){3})', s).group(1)
# brown fox jumps 

Here is how the pattern breaks down:

 _"word"_      _"word"_
/        \    /        \
(?:\w+\W*){2}((?:\w+\W*){3})
             \_____________/
                group 1

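So this works like a mask: it skips 2 words first, then captures the next 3 in group 1.

The (?:...) construct is a non-capturing group; it is used here only so that the repetition can be applied to the whole "word" pattern.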


"word"

Note that \w+\W* is a rather poor choice for a "word" pattern, as the following example shows:

import re
s = "nothing"
print re.search(r'(\w+\W*){3}', s).group()
# nothing

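There are not 3 words here, but the regex still matches, because \W* is allowed to match the empty string.

A better pattern might be something like: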

\w+(?:\W+|$)

That is, a \w+ that is followed by either a \W+ or the end of the string $.


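As Kobi suggested in a comment, a simpler option is to use a single static pattern. It uses findall to capture all the matches at once (see ideone.com):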

import re
s = "The quick, brown fox jumps over the lazy dog."

triplets = re.findall(r"\b(?=((?:\w+(?:\W+|$)){3}))", s)

print triplets
# ['The quick, brown ', 'quick, brown fox ', 'brown fox jumps ',
#  'fox jumps over ', 'jumps over the ', 'over the lazy ', 'the lazy dog.']

print triplets[3]
# fox jumps over 

This works by matching at each zero-width word boundary \b and using a lookahead to capture 3 "words" in group 1.

    ______lookahead______
   /      ___"word"__    \
  /      /           \    \
\b(?=((?:\w+(?:\W+|$)){3}))
     \___________________/
           group 1

+5

One approach is to split the string and take slices of the resulting word list:

words = re.split(r"\s+", s)
for i in range(len(words) - 2):
    print ' '.join(words[i:i+3])

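This assumes, of course, that the words are separated by single spaces, or that you do not mind every run of whitespace being collapsed into a single space.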

+2

>>> s = "The quick, brown fox jumps over the lazy dog."
>>> for offset in range(7):
...     print 'offset {0}: "{1}"'.format(offset, ' '.join(s.split()[offset:][:3]))
... 
offset 0: "The quick, brown"
offset 1: "quick, brown fox"
offset 2: "brown fox jumps"
offset 3: "fox jumps over"
offset 4: "jumps over the"
offset 5: "over the lazy"
offset 6: "the lazy dog."
+1

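There are two orthogonal problems here:

  • How to split the string into words.
  • How to build groups of n consecutive elements.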

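For 1, you could use regular expressions or, as others have pointed out, a simple str.split should suffice. For 2, note that this looks very similar to the pairwise recipe from the itertools documentation: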

http://docs.python.org/library/itertools.html#recipes

So we write a generalized n-wise function:

import itertools

def nwise(iterable, n):
    """Return overlapping windows of ``n`` consecutive items from ``iterable``.

    This generalizes the ``pairwise`` itertools recipe to windows of any
    width.

    >>> list(nwise(iter([1, 2, 3, 4, 5]), 3))
    [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

    Args:
        iterable: Any iterable; it is consumed lazily through ``itertools.tee``.
        n: Window width (number of items per tuple).

    Returns:
        A lazy iterator of ``n``-tuples. It is empty when ``iterable`` has
        fewer than ``n`` items or when ``n`` is 0.

    Raises:
        ValueError: If ``n`` is negative (raised by ``itertools.tee``).
    """
    # ``izip`` exists only on Python 2; on Python 3 the builtin ``zip`` is
    # already lazy, so fall back to it to keep the function portable.
    lazy_zip = getattr(itertools, "izip", zip)
    # n independent iterators over the same data ...
    iterables = itertools.tee(iterable, n)
    # ... where the idx-th iterator is advanced past its first idx items, so
    # zipping them lines up each item with its n - 1 successors.
    slices = (itertools.islice(it, idx, None) for (idx, it) in enumerate(iterables))
    return lazy_zip(*slices)

And we get a simple and modular code:

>>> s = "The quick, brown fox jumps over the lazy dog."
>>> list(nwise(s.split(), 3))
[('The', 'quick,', 'brown'), ('quick,', 'brown', 'fox'), ('brown', 'fox', 'jumps'), ('fox', 'jumps', 'over'), ('jumps', 'over', 'the'), ('over', 'the', 'lazy'), ('the', 'lazy', 'dog.')]

Or as you requested:

>>> # also: map(" ".join, nwise(s.split(), 3))
>>> [" ".join(words) for words in nwise(s.split(), 3)]
['The quick, brown', 'quick, brown fox', 'brown fox jumps', 'fox jumps over', 'jumps over the', 'over the lazy', 'the lazy dog.']
+1
source

Source: https://habr.com/ru/post/1755180/


All Articles