How to print only the interval records from one file that do not overlap the intervals in another file

I have two files representing records at intervals.

file1.txt

a 5 10
a 13 19
a 27 39
b 4 9
b 15 19
c 20 33
c 39 45

and

file2.txt

something id1 a 4 9 commentx
something id2 a 14 18 commenty
something id3 a 1 4 commentz
something id5 b 3 9 commentbla
something id6 b 16 18 commentbla
something id7 b 25 29 commentblabla
something id8 c 5 59 hihi
something id9 c 40 45 hoho
something id10 c 32 43 haha

What I would like to do is create a file containing only those entries of file2 for which, among the file1 records whose column 1 matches column 3 of file2, the range (columns 4 and 5 of file2) does not overlap any of the file1 ranges (columns 2 and 3).

The expected output should be written to the file

test.result

something id3 a 1 4 commentz
something id7 b 25 29 commentblabla

I tried using the following python code:

import csv
# Asker's attempt: for every file2 row, scan the file1 rows and write the
# file2 row whenever its range falls outside a file1 interval with the same id.
# NOTE(review): "r+" opens test.result for update without truncating it and
# raises if the file does not exist yet.
with open ('file2') as protein, open('file1') as position, open ('test.result',"r+") as fallout:
    writer = csv.writer(fallout, delimiter=' ')
    for rowinprot in csv.reader(protein, delimiter=' '):
        # NOTE(review): `position` is never rewound, so this inner reader only
        # yields rows during the first iteration of the outer loop.
        for rowinpos in csv.reader(position, delimiter=' '):
            if rowinprot[2]==rowinpos[0]:
                # NOTE(review): the csv fields are still strings here, so `<`
                # and `>` compare text, not numbers (e.g. '4' > '10' is True).
                if rowinprot[4]<rowinpos[1] or rowinprot[3]>rowinpos[2]:
                    # Runs once per interval that passes the test above, so a
                    # row can be written several times.
                    writer.writerow(rowinprot)

This did not work; I got the following result:

something id1 a 4 9 commentx
something id1 a 4 9 commentx
something id1 a 4 9 commentx

which is clearly not what I want.

What did I do wrong? The problem seems to be in the nested loops and conditions, but I could not work out what it is.

+4
source share
4

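Read file1 into a dict keyed by identifier. Then, for each row of file2, look up its identifier in the dict and write the row only if its range overlaps none of the stored intervals: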

import csv

# Filter file2.csv against the intervals listed in file1.csv and write the
# surviving rows to result.csv.
# The csv module (Python 3) expects text-mode files opened with newline="".
with open("file1.csv", "r", newline="") as protein, \
        open("file2.csv", "r", newline="") as position, \
        open("result.csv", "w", newline="") as fallout:
    writer = csv.writer(fallout, delimiter=' ')

    # Map each identifier (column 1) to the list of its (start, end) intervals.
    protein_dict = {}
    for rowinprt in csv.reader(protein, delimiter=' '):
        sub_value = (int(rowinprt[1]), int(rowinprt[2]))
        protein_dict.setdefault(rowinprt[0], []).append(sub_value)

    for pos in csv.reader(position, delimiter=' '):
        id_key = pos[2]
        id_min = int(pos[3])
        id_max = int(pos[4])
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        # A row is kept only when its identifier is known and its range lies
        # entirely before or entirely after every stored interval.
        if id_key in protein_dict and all(
                id_max < _min or _max < id_min
                for _min, _max in protein_dict[id_key]):
            writer.writerow(pos)
+1

Another approach, using a small Interval class:

import csv

class Interval(object):
    """A closed interval [lowerbound, upperbound].

    The two endpoints may be given in either order and may be of any
    mutually comparable type (numbers, datetime.date, ...).
    """

    def __init__(self, a, b):
        # Normalise the endpoints so that lowerbound never exceeds upperbound.
        if a < b:
            low, high = a, b
        else:
            low, high = b, a
        self.lowerbound = low
        self.upperbound = high

    def __contains__(self, val):
        # Both endpoints belong to the interval.
        return self.lowerbound <= val and val <= self.upperbound

    def __repr__(self):
        return '{}({}, {})'.format(
            type(self).__name__, self.lowerbound, self.upperbound)

filename1 = 'afile1.txt'   # reference intervals: "<id> <start> <end>"
filename2 = 'afile2.txt'   # candidate rows: id in column 3, range in columns 4-5
filename3 = 'test.result'  # rows of filename2 that overlap no reference interval

intervals = {}  # master dictionary of intervals: id -> list of Interval

# In Python 3 the csv module needs text-mode files opened with newline='';
# with the binary modes ('rb'/'wb') csv.reader and csv.writer raise an error.
with open(filename1, newline='') as f:
    for row in csv.reader(f, delimiter=' '):
        cls, a, b = row[0], int(row[1]), int(row[2])
        intervals.setdefault(cls, []).append(Interval(a, b))

with open(filename2, newline='') as f1, open(filename3, 'w', newline='') as f2:
    writer = csv.writer(f2, delimiter=' ')
    for row in csv.reader(f1, delimiter=' '):
        cls, a, b = row[2], int(row[3]), int(row[4])
        if cls not in intervals:
            continue  # rows with an unknown id are dropped
        for interval in intervals[cls]:
            # Overlap: either endpoint falls inside the interval, or the
            # row's range completely encloses it.
            if ((a in interval) or (b in interval) or
                    (a < interval.lowerbound and b > interval.upperbound)):
                break  # skip this row
        else:
            writer.writerow(row)  # no overlaps
+1

Here is another option:

This is probably not the most elegant way to do it, but to get the right result with something close to what you wrote, you could try something along these lines:

import csv

# Keep each file2 row only if it overlaps none of the file1 intervals that
# share its identifier.
# The result file is opened with "w" so that it is created when missing and
# truncated when present; "r+" fails on a missing file and can leave stale
# content after the newly written rows.
with open('file2.txt', newline='') as protein, \
        open('file1.txt', newline='') as position, \
        open('test.result', 'w', newline='') as fallout:
    writer = csv.writer(fallout, delimiter=' ')
    for rowinprot in csv.reader(protein, delimiter=' '):
        # Rewind file1 so that every file2 row is checked against all intervals.
        position.seek(0)
        valid = True
        for rowinpos in csv.reader(position, delimiter=' '):
            if rowinprot[2] == rowinpos[0]:
                # Compare as integers; the csv fields are strings.
                if not (int(rowinprot[4]) < int(rowinpos[1])
                        or int(rowinprot[3]) > int(rowinpos[2])):
                    valid = False
                    break  # one overlap is enough to reject the row
        if valid:
            writer.writerow(rowinprot)
0
source

Here is an algorithm that works for you:

def is_overlapping(x, y):
    """Return True when the closed ranges *x* and *y* share at least one point.

    Only the first and last items of each argument are read, so any non-empty
    sequence ordered from low to high is accepted: a ``(start, end)`` pair,
    a list, or a ``range``.

    The bounds are compared directly instead of building a ``range`` and
    measuring it, so non-integer bounds and very large spans work too.
    """
    return max(x[0], y[0]) <= min(x[-1], y[-1])

position_file = r"file1.txt"   # reference intervals: "<id> <start> <end>"
protein_file = r"file2.txt"    # candidate lines: id in column 3, range in columns 4-5
fallout_file = r"result.txt"   # candidate lines that overlap no reference interval

# Group the reference intervals by identifier so that every candidate line is
# tested against ALL intervals sharing its id, not only against the line that
# happens to sit at the same position in the other file.
positions = {}
with open(position_file) as position_handle:
    for line in position_handle:
        fields = line.split()
        if not fields:
            continue  # ignore blank lines (e.g. a trailing newline)
        positions.setdefault(fields[0], []).append(
            (int(fields[1]), int(fields[2])))

with open(protein_file) as protein_handle, open(fallout_file, 'w') as fallout:
    for line in protein_handle:
        fields = line.split()
        if not fields:
            continue  # ignore blank lines
        identifier = fields[2]
        # (start, end) pair with both ends inclusive.
        protein_range = (int(fields[3]), int(fields[4]))

        # A line whose identifier has no reference interval is always kept;
        # otherwise it is kept only when it overlaps none of them.
        if any(is_overlapping(protein_range, test_range)
               for test_range in positions.get(identifier, ())):
            continue

        protein_line = line.rstrip('\n')
        print(protein_line)
        # Every kept line is terminated by exactly one newline.
        fallout.write(protein_line + '\n')

As for the overlap test, a good snippet was given here: fooobar.com/questions/257431 / ...

0
source

Source: https://habr.com/ru/post/1648748/


All Articles