First, define two small helper functions (Python):
def GC(seq):
    """Return the GC content of `seq` as a percentage between 0.0 and 100.0.

    Counting is case-insensitive. Every character of `seq` contributes to the
    length, so any symbol other than G/C (including ambiguity codes) lowers
    the resulting percentage.

    An empty sequence yields 0.0 rather than raising ZeroDivisionError.
    """
    s = seq.upper()
    if not s:
        # Nothing to count; avoid dividing by a zero length.
        return 0.0
    return 100.0 * (s.count('G') + s.count('C')) / len(s)
def bin(gc):
    """Map a GC percentage to a bin number from 1 to 8.

    Values below 20 fall into bin 1 and values above 80 into bin 8; anything
    in between is assigned the integer part of ``gc / 10`` (so 20 -> 2,
    79.9 -> 7, 80 -> 8).

    Note: this name shadows the builtin ``bin`` within this module.
    """
    # Guard clauses for the two open-ended outer bins.
    if gc < 20:
        return 1
    if gc > 80:
        return 8
    # Inner bins are ten percentage points wide.
    tens = gc / 10
    return int(tens)
Then you just need to read the entries from the file, calculate the GC content of each one, find the bin it belongs to, and write the entry to the corresponding file. The following example implements this with the Python package that we use in the lab:
from pyteomics import fasta
def split_to_bin_files(multifile):
    """Distribute the entries of a FASTA file into per-bin output files.

    Each entry read from `multifile` is written to a file named
    ``<multifile>_bin_<n>``, where ``n`` is the result of ``bin`` applied to
    the GC percentage of the entry's second element (``entry[1]``).

    `multifile` must be a file name (str), because the output file names are
    built from it by string concatenation.
    """
    for record in fasta.read(multifile):
        # The second element of the record is what gets GC-counted.
        gc_percent = GC(record[1])
        bin_number = bin(gc_percent)
        target_name = multifile + '_bin_' + str(bin_number)
        # fasta.write takes an iterable of entries, hence the 1-tuple.
        fasta.write((record,), target_name)
Then you just call it like split_to_bin_files('mybig.fasta').
source
share