# Count every group of up to N consecutive words in a file using a Counter:
import re
import mmap
from itertools import islice, izip, tee
from collections import Counter
from pprint import pprint
def word_grouper(filename, size, include_tail=False):
    """Count every run of 1 to ``size`` consecutive words in a file.

    A "word" is a maximal run of the ASCII letters ``a``-``z``; every other
    byte acts as a separator.  The file is memory-mapped and scanned in a
    single pass, so it is never loaded into memory as a whole.

    Groups are taken from a sliding window that is ``size + 1`` words wide:
    each time the window is full, its prefixes of length 1..``size`` are
    counted.  As a consequence, by default no group *starting* in the last
    ``size`` words of the file is counted, and a file with at most ``size``
    words yields an empty result.  Pass ``include_tail=True`` to also count
    those trailing (necessarily shorter) groups.

    Args:
        filename: Path of the file to scan.
        size: Maximum number of words in a group; must be >= 0.
        include_tail: When true, also count the groups that start in the
            final ``size`` words of the file.

    Returns:
        A ``Counter`` mapping tuples of words (``str``) to occurrence counts.

    Raises:
        ValueError: If ``size`` is negative.
    """
    if size < 0:
        raise ValueError('size must be >= 0')

    counts = Counter()
    width = size + 1
    window = ()
    with open(filename, 'rb') as fin:
        # mmap refuses to map a zero-length file; it simply has no words.
        fin.seek(0, 2)  # 2 == os.SEEK_END
        if fin.tell() == 0:
            return counts

        mm = mmap.mmap(fin.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            # The mapping is a bytes-like buffer, so the pattern must be bytes.
            for match in re.finditer(b'[a-z]+', mm):
                word = match.group()
                if not isinstance(word, str):
                    # Python 3 yields bytes; [a-z] guarantees pure ASCII.
                    word = word.decode('ascii')
                # Keep only the most recent `width` words.
                window = (window + (word,))[-width:]
                if len(window) == width:
                    counts.update(window[:n] for n in range(1, width))
        finally:
            mm.close()

    if include_tail:
        # If the window filled up, groups starting at its first word were
        # already counted inside the loop; otherwise nothing was counted yet.
        tail = window[1:] if len(window) == width else window
        while tail:
            counts.update(tail[:n] for n in range(1, len(tail) + 1))
            tail = tail[1:]
    return counts
# Tally every group of up to four consecutive words in the input file,
# then pretty-print the groups ordered from most to least frequent.
counts = word_grouper(filename='input filename', size=4)
pprint(counts.most_common())
# Sample output:
[(('file',), 2),
(('input', 'file'), 2),
(('input',), 2),
(('a', 'sample', 'input'), 1),
(('file', 'will', 'always', 'be'), 1),
(('sample', 'input', 'file', 'an'), 1),
(('this', 'is', 'a', 'sample'), 1),
(('this', 'is'), 1),
(('will',), 1),
(('lower', 'case', 'with'), 1),
(('an', 'input', 'file', 'will'), 1),
(('sample', 'input'), 1),
(('is', 'a'), 1),
(('all', 'lower', 'case', 'with'), 1),
(('input', 'file', 'will'), 1),
(('an',), 1),
(('always', 'be'), 1),
(('lower', 'case', 'with', 'no'), 1),
(('an', 'input'), 1),
(('be', 'all', 'lower'), 1),
(('this',), 1),
(('be', 'all', 'lower', 'case'), 1),
(('this', 'is', 'a'), 1),
(('sample',), 1),
(('sample', 'input', 'file'), 1),
(('will', 'always', 'be', 'all'), 1),
(('a',), 1),
(('a', 'sample'), 1),
(('is', 'a', 'sample'), 1),
(('will', 'always'), 1),
(('lower',), 1),
(('lower', 'case'), 1),
(('file', 'an'), 1),
(('file', 'an', 'input'), 1),
(('file', 'will'), 1),
(('is',), 1),
(('all', 'lower'), 1),
(('input', 'file', 'an', 'input'), 1),
(('always', 'be', 'all', 'lower'), 1),
(('an', 'input', 'file'), 1),
(('input', 'file', 'an'), 1),
(('be', 'all'), 1),
(('input', 'file', 'will', 'always'), 1),
(('be',), 1),
(('all',), 1),
(('always', 'be', 'all'), 1),
(('is', 'a', 'sample', 'input'), 1),
(('always',), 1),
(('all', 'lower', 'case'), 1),
(('file', 'an', 'input', 'file'), 1),
(('file', 'will', 'always'), 1),
(('a', 'sample', 'input', 'file'), 1),
(('will', 'always', 'be'), 1)]