I hope I post on the right place ? My script is running fine on little genomes but it take hours and days when it comes to work with mammal genomes. I tried many different things but Im out of idea. Can you tell me what cause this script to be so slow ? This script parse fasta file, it works with ID beginning with a > or not. It also calculates somes genomics metrics like N50 and L50 and finally it return a dict with seq IDs and their length.
Thank you very much!
from collections import OrderedDict
import argparse
parser = argparse.ArgumentParser(description = "N50 parser")
#parser = argparse.ArgumentParser(prog="N50Parser.py", usage="N50")
parser.add_argument("-i", "--input", action="store", dest="input", required=True,
help="Input file with sequences")
parser.add_argument("-o", "--output", action="store", dest="output",
help="output file")
parser.add_argument("-o2", "--output2", action="store", dest="output2",
help="output file")
args = parser.parse_args()
def read_file(fasta_file):
"Parse fasta file"
descriptiondict = OrderedDict()
dictionnary = OrderedDict()
with open(fasta_file, 'r') as infile:
for line in infile:
record = line.strip()
if record and record[0] == '>':
seqid = record.split(" ")[0][1:]
print(seqid)
dictionnary[seqid] = ""
toto = record.split(" ", 1)
if len(toto) >= 2 :
description = toto[1]
descriptiondict[seqid] = description
else:
descriptiondict[seqid] = ""
continue
dictionnary[seqid] += record
return dictionnary, descriptiondict
seqdict, descriptdict = read_file(args.input)
lengthdict = OrderedDict()
for sequenceid in seqdict:
lengthdict[sequenceid] = len(seqdict[sequenceid])
length = sum(lengthdict.values())
N_number = sum([seqdict[seqid].count("N") for seqid in seqdict])
print(N_number)
print(length)
all_len = sorted(lengthdict.values(), reverse=True)
print(all_len)
if length > 0:
acum = 0
for y in range (len(all_len)):
if acum <= length / 2:
acum = all_len[y] + acum
n = y #L50
else:
break
n = n + 1
print("The L50 is", n)
print("The N50 is", all_len[n-1])
with open(args.output, 'w') as outfile:
outfile.write("L50\t{}\n".format(n))
outfile.write("N50\t{}\n".format(all_len[n-1]))
with open(args.output2, "w") as file:
for key, value in lengthdict.items():
file.write(f"{key} : {value}\n")