Python: comparing each line in a file with every other line.

I am implementing a statistical program and have run into a performance bottleneck; I was hoping the community could point me toward an optimization.

I create a set for each line in the file and find the intersection of that set with the set of every other line in the file. Then I use the size of this intersection to filter out specific sets from the output. The problem is that I have a nested loop (O(n²)), and the standard size of the files fed to the program is just over 20,000 lines. I timed the algorithm: for up to 500 lines it finishes in about 20 minutes, but for the large files it takes about 8 hours.

I have 16 GB of RAM and a significantly faster 4-core Intel i7 processor. I did not notice significant differences in memory usage by copying list1 and using the second list for comparison, instead of opening the file again (maybe this is because I have an SSD?). I thought the open mechanism reads / writes directly to the hard drive, which is slower, but did not notice the difference when using the two lists. In fact, a program rarely uses more than 1 GB of RAM during operation.

I hope that other people use a specific data type or perhaps better understand multiprocessing in Python and can help me speed up the process. I appreciate any help, and I hope my code is not too poorly written.

import ast, sys, os, shutil
list1 = []
end = 0
# Rows sharing >= filterValue elements with another row are filtered out.
filterValue = 3

# NOTE(review): arg1 and arg2 are never defined in this snippet —
# presumably they come from the command line in the full program; confirm.
# creates output file with filterValue appended to name
with open(arg2 + arg1 + "/filteredSets" + str(filterValue) , "w") as outfile:
    with open(arg2 + arg1 + "/file", "r") as infile:
        # create a list of sets of rows in file
        for row in infile:
            list1.append(set(ast.literal_eval(row)))

            # NOTE(review): seeking to 0 and re-iterating the SAME file
            # handle inside the outer `for row in infile` loop corrupts the
            # outer iteration — the two loops share one file position.
            infile.seek(0)
            for row in infile:
                # if file only has one row, no comparisons need to be made
                if not(len(list1) == 1):
                # get the first set from the list and...
                    set1 = set(ast.literal_eval(row))
                    # ...find the intersection of every other set in the file
                    for i in range(0, len(list1)):
                        # don't compare the set with itself
                        # NOTE(review): `pos` is read before it is ever
                        # assigned — this raises NameError on first use.
                        if not(pos == i):
                            set2 = list1[i]
                            set3 = set1.intersection(set2)
                            # if the two sets have less than 3 items in common
                            if(len(set3) < filterValue):
                                # and you've reached the end of the file
                                # NOTE(review): unreachable — range(0, len(list1))
                                # stops at len(list1) - 1, so i == len(list1)
                                # is never true and nothing is ever written here.
                                if(i == len(list1)):
                                    # append the row in outfile
                                    outfile.write(row)
                                    # increase position in infile
                                    pos += 1
                            else:
                                break
                        else:
                            outfile.write(row)

An input sample is a file with the following format:

[userID1, userID2, userID3]
[userID5, userID3, userID9]
[userID10, userID2, userID3, userID1]
[userID8, userID20, userID11, userID1]

The output file, if it was an input file, would be:

[userID5, userID3, userID9]
[userID8, userID20, userID11, userID1]

(A visual example followed in the original post.)

Answer (+4 votes, 2 comments):

First, structure your code into functions so you can measure where the time actually goes:

def get_data(*args):
    """Placeholder: load and return the dataset(s) named by *args*."""
    # get the data.

def find_intersections_sets(list1, list2):
    """Placeholder: compute the pairwise intersections of the two set lists."""
    # do the intersections part.

def loop_over_some_result(result):
    """Placeholder: post-process *result*; guard against a None input."""
    # insert assertions so that you don't end up looping in infinity:
    assert result is not None
    ...

def myfunc(*args):
    """Sketch of the main driver: load both sources, then intersect them."""
    src_a, src_b = args
    data_a = get_data(src_a)
    data_b = get_data(src_b)
    intersects = find_intersections_sets(data_a, data_b)
    ...

if __name__ == "__main__":
    # NOTE(review): myfunc() is called with no arguments, but its body
    # unpacks *args into two sources — this call would raise as written.
    myfunc()

Then profile the whole run:

if __name__ == "__main__":
    import cProfile
    # Profile the full run to see which function dominates the runtime.
    cProfile.run('myfunc()')

See the cProfile documentation and the Stack Overflow question "How can I profile a Python script?".

If a call risks running too long (an accidental infinite loop, perhaps?), wrap it with a timeout decorator — see this recipe (Python 2) or this one (Python 3):

Then myfunc becomes:

def get_data(*args):
    """Placeholder: load and return the dataset(s) named by *args*."""
    # get the data.

def find_intersections_sets(list1, list2):
    """Placeholder: compute the pairwise intersections of the two set lists."""
    # do the intersections part.

def myfunc(*args):
    """Sketch: load both sources, then time-box the expensive call."""
    source1, source2 = args
    L1, L2 = get_data(source1), get_data(source2)

    # NOTE(review): decorating an assignment is not valid Python syntax —
    # a decorator may only precede a def/class statement. This is pseudocode;
    # wrap find_intersections_sets in a decorated function instead.
    @timeout(10) # seconds <---- the clever bit!
    intersects = find_intersections_sets(L1,L2)
    ...

That way, calls that take too long are aborted instead of hanging the run.

A cleaned-up version of your program:

import ast 

def get_data(filename):
    """Read *filename* and return all its lines (newlines preserved)."""
    with open(filename, 'r') as handle:
        return handle.readlines()

def get_ast_set(line):
    """Parse *line* as a Python literal and return its elements as a set."""
    parsed = ast.literal_eval(line)
    return set(parsed)

def less_than_x_in_common(set1, set2, limit=3):
    """Return True if *set1* and *set2* share fewer than *limit* elements.

    The comparison expression is already a bool, so return it directly
    instead of the redundant `if ...: return True / else: return False`.
    """
    return len(set1.intersection(set2)) < limit

def check_infile(datafile, savefile, filtervalue=3):
    """Filter rows of *datafile*, keeping only rows that share fewer than
    *filtervalue* elements with every other row, and write them to *savefile*.

    Each input line must be a Python literal list (as accepted by
    ast.literal_eval); kept lines are written out verbatim.

    Fixes two bugs in the previous version: `any(...)` over the initially
    empty outlist was always False, so nothing was ever appended (and the
    keep-condition was inverted — a row is kept only when it is dissimilar
    to ALL other rows); and `writelines` was handed sets, not strings.
    """
    with open(datafile, 'r') as fi:
        rows = fi.readlines()
    # Parse every line exactly once, instead of re-parsing per comparison.
    sets = [set(ast.literal_eval(row)) for row in rows]
    with open(savefile, 'w') as fo:
        for idx, current in enumerate(sets):
            # Keep the row only if it is dissimilar to every OTHER row.
            if all(len(current & other) < filtervalue
                   for j, other in enumerate(sets) if j != idx):
                fo.write(rows[idx])

if __name__ == "__main__":
    # NOTE(review): arg1, arg2 and filterValue are never defined in this
    # snippet — presumably they come from sys.argv in the full program;
    # confirm against the caller before running this as-is.
    datafile = str(arg2 + arg1 + "/file")
    savefile = str(arg2 + arg1 + "/filteredSets" + str(filterValue))
    check_infile(datafile, savefile)
-1

Answer (+2 votes): The bottleneck is algorithmic — comparing every pair of rows is O(n²) — so change the algorithm rather than micro-optimizing.


Instead, build an inverted index mapping each userID to the set of rows that contain it.

# First pass: build an inverted index mapping each userID to the set of
# row indices that contain it. `Rows` is assumed to be the parsed input
# (one iterable of userIDs per row) — defined elsewhere in the program.
Sets = dict()
for rowID, row in enumerate(Rows):
    for userID in row:
        # setdefault replaces the explicit "check for None, then create"
        # dance with a single idiomatic lookup-or-initialize call.
        Sets.setdefault(userID, set()).add(rowID)

Then, for each row, count how many elements it shares with every other row that has at least one userID in common with it.

# Second pass: flag every row that shares >= filterValue elements with
# some other row. Only rows reachable through the inverted index are
# visited, so pairs with an empty intersection are never compared at all.
BadRows = set()
for rowID, row in enumerate(Rows):
  Intersections = dict()
  for userID in row:
    for rowID_cmp in Sets[userID]: 
      if rowID_cmp != rowID:
        Intersections[rowID_cmp] = Intersections.get(rowID_cmp, 0) + 1
  # Now Intersections maps each other row's index to the number of
  # elements it has in common with the current row.
  filteredOut = False
  for rowID_cmp in Intersections:
    if Intersections[rowID_cmp] >= filterValue:
      BadRows.add(rowID_cmp)
      filteredOut = True
  # If any counterpart crossed the threshold, this row is filtered too.
  if filteredOut:
    BadRows.add(rowID)

Finally, skip the rows collected in BadRows when writing the output:

# Third pass: emit only the rows that were never flagged.
# NOTE(review): the `# output row` placeholder leaves the if-suite empty,
# so this is pseudocode — substitute the actual write call.
for rowID, row in enumerate(Rows):
  if rowID not in BadRows:
    # output row

These three passes run in roughly O(n log n) instead of O(n²), assuming individual rows are small relative to the number of rows. (The exact complexity claim was garbled in translation — verify against the original post.)

This is plain Python with no extra dependencies.

0

Source: https://habr.com/ru/post/1628609/


All Articles