Be Pythonic: Collecting Custom Strings - Indexer

First, the code below works as is. I'm much more of a Ruby programmer, so I'm still feeling my way around Python, and I'm sure there must be a much DRYer way to accomplish what I'm doing below.

I'm writing an indexer that builds a dictionary of the terms (words and phrases) that occur in a document, along with a count for each, and then prints the terms with their counts. Right now it supports phrases of up to four words. Is there a better way to structure this logic so that I can do the same thing for phrases of arbitrary length, without having to add more and more conditional statements?

import sys
file=open(sys.argv[1],"r")
wordcount = {}
last_word = ""
last_last_word = ""
last_last_last_word = ""

for word in file.read().split():
    if word not in wordcount:
        wordcount[word] = 1
    else:
        wordcount[word] += 1

    if last_last_last_word != "":
        if "{} {} {} {}".format(last_last_last_word,last_last_word,last_word,word) not in wordcount:
            wordcount[last_last_last_word + " " + last_last_word + " " + last_word + " " + word ] = 1
        else: 
            wordcount[last_last_last_word + " " + last_last_word + " " + last_word + " " + word ] += 1
    last_last_last_word = last_last_word

    if last_last_word != "":
        if last_last_word + " " + last_word + " " + word not in wordcount:
            wordcount[last_last_word + " " + last_word + " " + word ] = 1
        else: 
            wordcount[last_last_word + " " + last_word + " " + word ] += 1
    last_last_word = last_word

    if last_word != "":
        if last_word + " " + word not in wordcount:
            wordcount[last_word + " " + word] = 1
        else: 
            wordcount[last_word + " " + word] += 1
    last_word = word

for k,v in sorted(wordcount.items(), key=lambda x:x[1], reverse=True):
    print k,v

I'm including a more extensive sample of input and output. I apologize for the length, but the nature of what this code does tends to produce large output.

Input:

this is a sample input file an input file will always be all lower case with no punctuation

Output:

file 2
input 2
input file 2
an input file 1
all 1
lower case 1
be 1
is 1
file will always 1
an 1
sample 1
case 1
always be all lower 1
this is a 1
will always be 1
sample input file 1
will always 1
is a sample 1
all lower 1
lower case with no 1
no 1
with 1
with no 1
file will always be 1
with no punctuation 1
lower 1
be all lower case 1
no punctuation 1
an input file will 1
input file an 1
file an 1
input file an input 1
always be 1
file an input file 1
be all 1
is a 1
input file will 1
file will 1
an input 1
input file will always 1
will always be all 1
always be all 1
lower case with 1
a sample 1
a sample input file 1
a sample input 1
is a sample input 1
be all lower 1
a 1
sample input file an 1
sample input 1
case with no punctuation 1
all lower case with 1
this 1
always 1
file an input 1
case with 1
case with no 1
will 1
all lower case 1
punctuation 1
this is 1
this is a sample 1

, , , . , .

+4
6

. , .

string="this is a sample input file an input file will always be all lower case with no punctuation"

def words(count):
    """Return every run of `count` consecutive words in the module-level `string`.

    Args:
        count: Number of words per phrase.

    Returns:
        A list of space-joined phrases in the order they start in `string`.
        The list is empty when `count` is less than 1 or greater than the
        number of words available.
    """
    if count < 1:
        return []
    # Split once instead of re-splitting for every candidate slice.
    tokens = string.split()
    # A phrase starting at `start` is exactly tokens[start:start + count], so
    # there is no need to scan other end positions and filter by length.
    return [" ".join(tokens[start:start + count])
            for start in range(len(tokens) - count + 1)]

.

, .

# Collect the three-word phrases of the sample string; repeats are kept.
lst = words(3)

,

for word in set(lst):
    print word, lst.count(word)

an input file 1
file will always 1
is a sample 1
be all lower 1
file an input 1
with no punctuation 1
input file will 1
lower case with 1
this is a 1
always be all 1
will always be 1
sample input file 1
a sample input 1
all lower case 1
case with no 1
input file an 1

, , , .

, , , , , .count().

, , , , , .

words_list = string.split()
words_dict = {}

for a in range(len(words_list)):
    for b in range(a):
        phrase = " ".join(words_list[b:a])
        if phrase in words_dict:
            words_dict[phrase] += 1
        else:
            words_dict[phrase] = 1

for i in words_dict:
    print i, words_dict[i]

.

0

(, , , ), ( ) , N , Counter , :

import re
import mmap
from itertools import islice, izip, tee
from collections import Counter
from pprint import pprint

def word_grouper(filename, size):
    """Count every phrase of 1 to `size` consecutive words in a file.

    Words are the maximal runs of lowercase ASCII letters (``[a-z]+``) found
    in the memory-mapped file contents. Phrases near the end of the file are
    counted too, including those shorter than `size` that start within the
    last `size` words.

    Args:
        filename: Path of the file to index.
        size: Maximum number of words in a counted phrase.

    Returns:
        A Counter mapping each phrase, as a tuple of words, to its number of
        occurrences. Under Python 3 the words are ``bytes``, because the mmap
        is matched as bytes.
    """
    # Local import: the surrounding snippet's imports do not provide deque.
    from collections import deque

    counts = Counter()
    # Holds the last `size` words seen; older words fall off the left end.
    # max() keeps a negative size from raising, yielding an empty result.
    window = deque(maxlen=max(size, 0))
    with open(filename) as fin:
        mm = mmap.mmap(fin.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            # The b prefix is a no-op on Python 2 and is what Python 3
            # requires to run a pattern over an mmap.
            for match in re.finditer(b'[a-z]+', mm):
                window.append(match.group())
                recent = tuple(window)
                # Count every phrase that ends at the current word. Each
                # phrase in the file has exactly one last word, so each is
                # counted exactly once.
                counts.update(recent[start:] for start in range(len(recent)))
        finally:
            # Release the mapping explicitly instead of waiting for GC.
            mm.close()

    return counts

# Index the named file, counting phrases of up to four words.
counts = word_grouper('input filename', 4)
# do appropriate formatting instead of just `pprint`ing
# most_common() returns (phrase, count) pairs ordered from highest count down.
pprint(counts.most_common())

( ):

[(('file',), 2),
 (('input', 'file'), 2),
 (('input',), 2),
 (('a', 'sample', 'input'), 1),
 (('file', 'will', 'always', 'be'), 1),
 (('sample', 'input', 'file', 'an'), 1),
 (('this', 'is', 'a', 'sample'), 1),
 (('this', 'is'), 1),
 (('will',), 1),
 (('lower', 'case', 'with'), 1),
 (('an', 'input', 'file', 'will'), 1),
 (('sample', 'input'), 1),
 (('is', 'a'), 1),
 (('all', 'lower', 'case', 'with'), 1),
 (('input', 'file', 'will'), 1),
 (('an',), 1),
 (('always', 'be'), 1),
 (('lower', 'case', 'with', 'no'), 1),
 (('an', 'input'), 1),
 (('be', 'all', 'lower'), 1),
 (('this',), 1),
 (('be', 'all', 'lower', 'case'), 1),
 (('this', 'is', 'a'), 1),
 (('sample',), 1),
 (('sample', 'input', 'file'), 1),
 (('will', 'always', 'be', 'all'), 1),
 (('a',), 1),
 (('a', 'sample'), 1),
 (('is', 'a', 'sample'), 1),
 (('will', 'always'), 1),
 (('lower',), 1),
 (('lower', 'case'), 1),
 (('file', 'an'), 1),
 (('file', 'an', 'input'), 1),
 (('file', 'will'), 1),
 (('is',), 1),
 (('all', 'lower'), 1),
 (('input', 'file', 'an', 'input'), 1),
 (('always', 'be', 'all', 'lower'), 1),
 (('an', 'input', 'file'), 1),
 (('input', 'file', 'an'), 1),
 (('be', 'all'), 1),
 (('input', 'file', 'will', 'always'), 1),
 (('be',), 1),
 (('all',), 1),
 (('always', 'be', 'all'), 1),
 (('is', 'a', 'sample', 'input'), 1),
 (('always',), 1),
 (('all', 'lower', 'case'), 1),
 (('file', 'an', 'input', 'file'), 1),
 (('file', 'will', 'always'), 1),
 (('a', 'sample', 'input', 'file'), 1),
 (('will', 'always', 'be'), 1)]
+3

, defaultdict - .

, .

import sys
from collections import defaultdict

# Input file named by the first command-line argument.
# NOTE(review): the handle is never closed explicitly here, and the name
# shadows the Python 2 builtin `file`.
file=open(sys.argv[1],"r")

# Phrase -> occurrence count; defaultdict(int) starts unseen phrases at 0.
wordcount = defaultdict(int)
# Window of recent words, pre-filled with as many empty-string placeholders as
# the second command-line argument says.
wordlist = ["" for i in range(int(sys.argv[2]))]

def check(wordcount, wordlist, word):
    """Count every phrase that ends with `word` and return the advanced window.

    Args:
        wordcount: Mapping of phrase -> count, updated in place. It must
            supply 0 for unseen keys (e.g. ``defaultdict(int)``).
        wordlist: The previous words in reading order, with ``""`` standing in
            for positions not filled yet. It is not modified.
        word: The word that was just read.

    Returns:
        A new list of the same length as `wordlist`: the old window with its
        oldest entry dropped and `word` added at the end.
    """
    # Build a fresh list rather than appending to the caller's one.
    window = wordlist + [word]

    for start, first in enumerate(window):
        # Placeholders sit at the front of the window; a phrase may only
        # start at a real word.
        if first != "":
            # " ".join puts spaces between words only, so keys carry no
            # trailing space.
            wordcount[" ".join(window[start:])] += 1

    return window[1:]

for word in file.read().split():
    wordlist = check(wordcount, wordlist, word)

for k,v in sorted(wordcount.items(), key=lambda x:x[1], reverse=True):
    print k,v
0

from collections import Counter
import itertools
import operator as op


def count_phrases(words, phrase_len):
    """Count the phrases of each requested length in a sequence of words.

    Args:
        words: Sequence of words (must support len() and slicing).
        phrase_len: Iterable of phrase lengths to count, e.g. ``[1, 2]``.

    Returns:
        A Counter mapping each phrase, as a tuple of words, to its number of
        occurrences. It is empty when `phrase_len` is empty.
    """
    # Accumulate into one Counter: this avoids building and adding a separate
    # Counter per length, and gives a sensible result for no lengths at all.
    counts = Counter()
    for length in phrase_len:
        counts.update(tuple(words[i:i + length])
                      for i in range(len(words) - length + 1))
    return counts

:

words = "a b c a a".split()
for phrase, count in count_phrases(words, [1, 2]).iteritems():
    print " ".join(phrase), counts

:

b c 1
a 3
c 1
b 1
c a 1
a a 1
a b 1
0

:

def parser(data,size):
    """Return every phrase of `size` consecutive words in `data`.

    Args:
        data: Text to split on whitespace.
        size: Number of words per phrase.

    Returns:
        A list of space-joined phrases in reading order; empty when `data`
        has fewer than `size` words.
    """
    chunked = data.split()
    # The last phrase starts at index len(chunked) - size, so the range must
    # include that index (hence the + 1).
    return [' '.join(chunked[i:size + i])
            for i in range(len(chunked) - size + 1)]

def parse_file(fname,size):
    """Count the phrases found on each line of a file.

    Every line is handled on its own, so no phrase spans a line break. For
    each line, parser() is asked for phrases of 1, 2, ... up to ``size - 1``
    words; `size` itself is an exclusive bound.

    Args:
        fname: Path of the file to read.
        size: Exclusive upper bound on the number of words per phrase.

    Returns:
        A Counter mapping each phrase string to its number of occurrences.
    """
    tally = Counter()
    with open(fname, 'r') as handle:
        for line in handle:
            text = line.strip()
            for width in range(1, size):
                tally.update(parser(text, width))

    return tally


result= parse_file('file.txt',4) 
print sorted(result.items(),key=lambda x:x[1],reverse=True)

[('file', 2),
 ('input', 2),
 ('input file', 2),
 ('an input file', 1),
 ('all', 1),
 ('always be all', 1),
 ('is', 1),
 ('an', 1),
 ('sample', 1),
 ('this is a', 1),
 ('will always be', 1),
 ('sample input file', 1),
 ('will always', 1),
 ('is a sample', 1),
 ('all lower', 1),
 ('no', 1),
 ('with no', 1),
 ('lower case', 1),
 ('case', 1),
 ('input file will', 1),
 ('case with no', 1),
 ('input file an', 1),
 ('file an', 1),
 ('be', 1),
 ('always be', 1),
 ('be all lower', 1),
 ('be all', 1),
 ('lower', 1),
 ('is a', 1),
 ('an input', 1),
 ('a sample input', 1),
 ('lower case with', 1),
 ('a sample', 1),
 ('file will', 1),
 ('with', 1),
 ('a', 1),
 ('file will always', 1),
 ('sample input', 1),
 ('this', 1),
 ('always', 1),
 ('file an input', 1),
 ('case with', 1),
 ('will', 1),
 ('all lower case', 1),
 ('this is', 1)]
0

import sys
file=open(sys.argv[1],"r")
wordcount = {}
nb_words = 4
last_words = []

for word in file.read().split():
    last_words = [word] + last_words 
    if len (last_words) > nb_words:
        last_words.pop()
    for i in range(len(last_words)-1,-1,-1):
        if last_words[i] != "":
            key = ' '.join(last_words[:i+1])
            if key not in wordcount:
                wordcount[key] = 1
            else: 
                wordcount[key] += 1

for k,v in sorted(wordcount.items(), key=lambda x:x[1], reverse=True):
    print k,v

. , 4 . : ,

0
source

Source: https://habr.com/ru/post/1622434/


All Articles