Starting question
I am writing a bioinformatics script in python (3.5) that analyzes a large (sorted and indexed) bam file representing a sequence that reads genome alignment, associates genomic information ("annotations") with these readings and counts the types of annotations encountered. I measure the speed with which my scripts process aligned reads (in batches of 1000 views), and I get the following speed changes:

What can explain this pattern?
, , , .
, , ( 2 script - 0,1% htop).
(. )
pysam . AlignmentFile.fetch , AlignedSegment.
gtf ( bgzip tabix). TabixFile.fetch (- pysam), , frozenset (process_annotations, , frozenset), , AlignedSegment.
Counter. ?
, ?
, cProfile , (pysam/ctabix.pyx:579(__cnext__), . ), , , Cython, samtools C. , .
script, pybedtools python bedtools, gtf (https://daler.imtqy.com/pybedtools/index.html).
Speed ββ
. ( ):
$ time python3 -m cProfile -o tests/total_pybedtools.prof ~/src/bioinfo_utils/small_RNA_seq_annotate.py -b results/bowtie2/mapped_C_elegans/WT_1_21-26_on_C_elegans_sorted.bam -g annotations/all_annotations.gtf -a "pybedtools" -l total_pybedtools.log > total_pybedtools.out
real 5m48.474s
user 5m48.204s
sys 0m1.336s
$ time python3 -m cProfile -o tests/total_tabix.prof ~/src/bioinfo_utils/small_RNA_seq_annotate.py -b results/bowtie2/mapped_C_elegans/WT_1_21-26_on_C_elegans_sorted.bam -g annotations/all_annotations.gtf.gz -a "tabix" -l total_tabix.log > total_tabix.out
real 195m40.990s
user 194m54.356s
sys 0m47.696s
( : . , , .)
:

, , . "bioinformatics" "samtools".
, gprof2dot :
$ gprof2dot -f pstats tests/total_pybedtools.prof \
| dot -Tpng -o tests/total_pybedtools_callgraph.png
$ gprof2dot -f pstats tests/total_tabix.prof \
| dot -Tpng -o tests/total_tabix_callgraph.png
tabix:

pybedtools:

:
@contextmanager
def annotation_context(annot_file, getter_type):
"""Yields a function to get annotations for an AlignedSegment."""
if getter_type == "tabix":
gtf_parser = pysam.ctabix.asGTF()
gtf_file = pysam.TabixFile(annot_file, mode="r")
fetch_annotations = gtf_file.fetch
def get_annotations(ali):
"""Generates an annotation getter for *ali*."""
return fetch_annotations(*ALI2POS_INFO(ali), parser=gtf_parser)
elif getter_type == "pybedtools":
gtf_file = open(annot_file, "r")
fetch_annotations = BedTool(gtf_file).as_intervalfile().all_hits
def get_annotations(ali):
"""Generates an annotation list for *ali*."""
return fetch_annotations(Interval(*ALI2POS_INFO(ali)))
else:
raise NotImplementedError("%s not available" % getter_type)
yield get_annotations
gtf_file.close()
def main():
"""Main function of the program."""
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
"-b", "--bamfile",
required=True,
help="Sorted and indexed bam file containing the mapped reads."
"A given read is expected to be aligned at only one location.")
parser.add_argument(
"-g", "--gtf",
required=True,
help="A sorted, bgzip-compressed gtf file."
"A corresponding .tbi tabix index should exist.")
parser.add_argument(
"-a", "--annotation_getter",
choices=["tabix", "pybedtools"],
default="tabix",
help="Method to use to access annotations from the gtf file.")
parser.add_argument(
"-l", "--logfile",
help="File in which to write logs.")
args = parser.parse_args()
if not args.logfile:
logfilename = "%s.log" % args.annotation_getter
else:
logfilename = args.logfile
logging.basicConfig(
filename=logfilename,
level=logging.DEBUG)
INFO = logging.info
DEBUG = logging.debug
WARNING = logging.warning
process_annotations = make_annotation_processor(args.annotation_getter)
with annotation_context(args.gtf, args.annotation_getter) as get_annotations:
def generate_annotations(bamfile):
"""Generates annotations for the alignments in *bamfile*."""
last_t = perf_counter()
for i, ali in enumerate(bamfile.fetch(), start=1):
if not i % 1000:
now = perf_counter()
INFO("%d alignments processed (%.0f alignments / s)" % (
i,
1000.0 / (now - last_t)))
last_t = perf_counter()
yield process_annotations(get_annotations(ali), ali)
with pysam.AlignmentFile(args.bamfile, "rb") as bamfile:
annot_stats = Counter(generate_annotations(bamfile))
print(*reversed(annot_stats.most_common()), sep="\n")
return 0
( (make_annotation_processor , ), script.)