Python: list list dictionary

def makecounter():
     return collections.defaultdict(int)

class RankedIndex(object):
  def __init__(self):
    self._inverted_index = collections.defaultdict(list)
    self._documents = []
    self._inverted_index = collections.defaultdict(makecounter)


def index_dir(self, base_path):
    num_files_indexed = 0
    allfiles = os.listdir(base_path)
    self._documents = os.listdir(base_path)
    num_files_indexed = len(allfiles)
    docnumber = 0
    self._inverted_index = collections.defaultdict(list)

    docnumlist = []
    for file in allfiles: 
            self.documents = [base_path+file] #list of all text files
            f = open(base_path+file, 'r')
            lines = f.read()

            tokens = self.tokenize(lines)
            docnumber = docnumber + 1
            for term in tokens:  
                if term not in sorted(self._inverted_index.keys()):
                    self._inverted_index[term] = [docnumber]
                    self._inverted_index[term][docnumber] +=1                                           
                else:
                    if docnumber not in self._inverted_index.get(term):
                        docnumlist = self._inverted_index.get(term)
                        docnumlist = docnumlist.append(docnumber)
            f.close()
    print '\n \n'
    print 'Dictionary contents: \n'
    for term in sorted(self._inverted_index):
        print term, '->', self._inverted_index.get(term)
    return num_files_indexed
    return 0

I get an index error while executing this code: index index is out of range.

The above code generates a dictionary index that stores the “term” as a key and the number of documents in which the term appears as a list. For example: if the term “cat” appears in documents 1.txt, 5.txt and 7.txt, the dictionary will have: cat <- [1,5,7]

Now I have to change it to add the frequency of the term, so if the word cat appears twice in document 1, three times in document 5 and once in document 7: expected result: term <- [[docnumber, term freq], [docnumber, term freq]] <- list of lists in dict! cat <- [[1,2], [5,3], [7,1]]

, . , .

.

+3
3

factory. :

def makecounter():
    return collections.defaultdict(int)

self._inverted_index = collections.defaultdict(makecounter)

for term in tokens:,

        for term in tokens:  
                self._inverted_index[term][docnumber] +=1

self._inverted_index[term] a dict,

{1:2,5:3,7:1}

. self._inverted_index[term] , :

self._inverted_index = dict((t,[d,v[d] for d in sorted(v)])
                            for t in self._inverted_index)

( - !), , , , , ), , , ..).

+6

, , . dict, .

filedicts = {}
for file in allfiles:
  filedicts[file] = {}

  for term in terms:
    filedict.setdefault(term, 0)
    filedict[term] += 1
+1

Perhaps you could just create a simple class for (docname, frequency).

Then your dict may have lists of this new data type. You can also make a list of lists, but a separate data type will be cleaner.

0
source

Source: https://habr.com/ru/post/1767994/


All Articles