Memory limit when converting a FASTA file string to a list

I am using python 2.7. I am working with a fasta file containing the DNA sequence of a modern human Y chromosome. This is actually a long string of 20,000,000 characters, such as ATCGACGATCACACG .... I want to convert this very long string to a list of triad strings, for example, this string:

My_sequence_string= "ATGTACGTCATAG"

to this list:

My_sequence_list= ["ATG","TAC","GTC","ATA"]

This is my code:

# Reads the entire file into memory as one string.
str_Reading_Frame1=open("Ychromosome.fa", "r").read()
# Filled in place by str_to_list with consecutive 3-character pieces.
list_Reading_Frame1=[]
def str_to_list(list, str):
    """Append the first 3 characters of `str` to `list`, then recurse on the rest.

    Recursion stops once fewer than 3 characters remain, so a trailing
    1- or 2-character remainder is dropped. Nothing is returned; `list`
    is mutated in place.

    NOTE(review): every triad costs one nested call plus a fresh copy of
    the remaining string (`str[3:]`), so the call depth and the amount of
    copied data both grow with the input length. Both parameter names
    also shadow the builtins `list` and `str` inside this function.
    """
    if len(str)>2:
        list.append(str[:3])
        # Recurse on a newly built slice holding everything after the triad.
        str_to_list(list, str[3:])
str_to_list(list_Reading_Frame1, str_Reading_Frame1)

But I get a memory limit error. I think the problem is that the function calls itself recursively, but I don't know how to rewrite my code. I don't want to import modules like Biopython; I want to do it myself (with your help :-))

+4
source share
3 answers

The problem I can see is this line:

str_Reading_Frame1=open("Ychromosome.fa", "r").read()

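- it reads the whole file into memory. In addition, every recursive call slices off a new copy of the remaining string, so for a string of length N the total work is O(N^2).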

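Instead, read 3 characters at a time directly from the file, append each triad to the list, and stop as soon as fewer than 3 characters are returned: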

# Collects the triads in file order; it has to exist before the loop appends to it.
My_sequence_list = []

# Pull three characters per read instead of slicing one huge in-memory string.
# NOTE(review): read(3) treats every character alike, so a FASTA header line
# or line breaks inside the file would end up inside the triads - this
# presumably expects a file holding only the bare sequence; confirm.
with open('Ychromosome.fa') as f:
    while True:
        triad = f.read(3)
        # A short read means end of file; an incomplete trailing triad is dropped.
        if len(triad) != 3:
            break
        My_sequence_list.append(triad)


>>> My_sequence_list
['ATG', 'TAC', 'GTC', 'ATA']
+2

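Alternatively, use a generator so that the triads are produced one at a time and never all held in memory.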

def data(x):
    """Yield consecutive 3-character blocks read from the file object x.

    Args:
        x: An open file-like object exposing ``read(size)``.

    Yields:
        Each 3-character string returned by successive ``x.read(3)`` calls.

    Iteration ends at the first read that returns fewer than 3 characters,
    so a trailing 1- or 2-character remainder is discarded.
    """
    while True:
        d = x.read(3)
        if len(d) != 3:
            # A plain return ends the generator on every Python version;
            # an explicit `raise StopIteration` inside a generator is turned
            # into RuntimeError on Python 3.7+ (PEP 479).
            return
        yield d

with open("Ychromosome.fa", "r") as str_Reading_Frame1:
    # Each iteration receives the next 3-character block from data().
    for triad in data(str_Reading_Frame1):
        # use triad one at a time
        pass  # placeholder body; a bare `...` is not a valid statement in Python 2.7
+2

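Thank you all for the answers. I used jamylak's approach. Here is my final code, which writes out the six reading frames, splits each one into triads, and counts how often every codon occurs.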

# The 4 bases of DNA strands
Bases = ["A", "T", "C", "G"]

# Receives every three-letter combination produced by Possible_Codons
# (4 * 4 * 4 = 64 entries for the four bases above).
codons = []


def Possible_Codons(Bases):
    """Append every ordered triple of entries from Bases to the global `codons`.

    The first position varies slowest and the third fastest. Nothing is
    returned; calling the function again appends another full set.
    """
    codons.extend(
        "%s%s%s" % (first, second, third)
        for first in Bases
        for second in Bases
        for third in Bases
    )


Possible_Codons(Bases)

# Imported here so the snippet stays self-contained (Counter exists since Python 2.7).
from collections import Counter

# Generating 6 different reading frames
with open("3.fa", "r") as source_file:
    Code_file = source_file.read()

numbers = ["1", "2", "3", "4", "5", "6"]  # It is used for calling files

# Frames 1-3 start at offsets 0, 1 and 2 of the text as read; frames 4-6 do
# the same on the reversed text.
# NOTE(review): frames 4-6 are the plain reversed string, not the reverse
# complement - confirm that this is what is intended.
# NOTE(review): the whole file content is used as-is, so any header line or
# line breaks in "3.fa" become part of the frames - confirm the input is a
# bare sequence.
frames = [
    Code_file,
    Code_file[1:],
    Code_file[2:],
    Code_file[::-1],
    Code_file[-2::-1],
    Code_file[-3::-1],
]

# Write each frame through a context manager so it is flushed and closed
# before it is read back below.
for number, frame in zip(numbers, frames):
    with open("str_Reading_File" + number + ".fa", "w") as frame_file:
        frame_file.write(frame)

My_sequence_list = []
for number in numbers:
    with open("str_Reading_File" + number + ".fa") as f:
        while True:
            triad = f.read(3)
            # A short read means end of file; an incomplete trailing triad is dropped.
            if len(triad) != 3:
                break
            My_sequence_list.append(triad)

    # One pass over the triads instead of one list.count() scan per codon;
    # codons that never occur look up as 0, matching list.count().
    usage = Counter(My_sequence_list)

    # Single-argument print(...) produces the same output on Python 2.7 and 3.x.
    print("In the reading frame " + number + ", codon usage is:")
    for codon in codons:
        print("%s = %s times" % (codon, usage[codon]))
    My_sequence_list = []
    print("*****************\n")
+1

Source: https://habr.com/ru/post/1568154/


All Articles