I have some paired end sequencing data. However, the files are not properly paired. There are unpaired reads which need to be removed to make the read pair files consistent. Though there are solutions like using Trimmomatic. I want suggestions as to how I can improve the performance of my own script. The current version processes at about 10K records per second.
import sys
class FastqRecord(object):
def __init__(self, block, unique_col=1, unique_type=int, sep=' '):
self.block = "".join(block[:])
self.header = block[0][1:].rstrip('\n')
self.uid = unique_type(self.header.split(sep)[unique_col])
def __eq__(self, other):
return self.uid == other.uid
def __gt__(self, other):
return self.uid > other.uid
class Fastq(object):
def __init__(self, filename):
self.file = filename
self.handle = open(self.file)
self.count = 0
self.record = self.next()
def __iter__(self):
return self
def next(self):
return FastqRecord([self.handle.next(), self.handle.next(), self.handle.next(), self.handle.next()])
def update(self):
try:
self.record = self.next()
self.count+=1
except StopIteration:
self.record = False
self.handle.close()
def fn_from_path(path):
return path.split('/')[-1].split('.')[0]
def flush_handle(obj, handle):
handle.write(obj.record.block)
for x in obj:
handle.write(x.block)
return True
@profile
def main():
file1, file2, out_dir = (sys.argv[1], sys.argv[2], sys.argv[3].rstrip('/'))
#file1, file2, out_dir = ('./test/SRR2130002_1.fastq', './test/SRR2130002_2.fastq', '.')
out_handles = {
'p1': open('%s/%s.fastq' % (out_dir, fn_from_path(file1)), 'w'),
'p2': open('%s/%s.fastq' % (out_dir, fn_from_path(file2)), 'w'),
's': open('%s/%s_unpaired.fastq' % (out_dir, fn_from_path(file1).split('_')[0]), 'w')
}
f1, f2 = Fastq(file1), Fastq(file2)
while True:
print "\r%9d %9d %9d" % (f1.count, f2.count, f1.count-f2.count),
sys.stdout.flush()
if f1.record is False or f2.record is False:
if f1.record is False:
flush_handle(f2, out_handles['s'])
else:
flush_handle(f1, out_handles['s'])
break
if f1.record == f2.record:
out_handles['p1'].write(f1.record.block)
out_handles['p2'].write(f2.record.block)
f1.update()
f2.update()
elif f1.record > f2.record:
out_handles['s'].write(f2.record.block)
f2.update()
else:
out_handles['s'].write(f1.record.block)
f1.update()
for i in out_handles:
out_handles[i].close()
print "\n Done!"
if __name__ == "__main__":
main()
This is how my FASTQ files look:
head SRR2130002_1.fasta
@SRR2130002.1 1 length=39
CCCTAACCCTAACCCTAACCCTAACCCAAAGACAGGCAA
+SRR2130002.1 1 length=39
AAAAA7F<<FF<F.<FFFFF77F.))))FA.))))))7A
@SRR2130002.2 2 length=39
TTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTATCTCGTA
+SRR2130002.2 2 length=39
<AAAAAA.7FFFFFFFFFF.<...).A.F7F7)A.FAA<
@SRR2130002.3 3 length=39
CTACCCCTAACCCTAACCCTAACAAAACCCCAAAACAAA
Thanks
UPDATE: I ran a line profiler (kernprof) on the main() function. And got following stats:
Timer unit: 1e-06 s
Total time: 0.808629 s File: paired_extractor.py Function: main at line 46
Line # Hits Time Per Hit % Time Line Contents
==============================================================
46 @profile
47 def main():
48 1 4 4.0 0.0 file1, file2, out_dir = (sys.argv[1], sys.argv[2], sys.argv[3].rstrip('/'))
49 #file1, file2, out_dir = ('./test/SRR2130002_1.fastq', './test/SRR2130002_2.fastq', '.')
50 1 1 1.0 0.0 out_handles = {
51 1 4830 4830.0 0.6 'p1': open('%s/%s.fastq' % (out_dir, fn_from_path(file1)), 'w'),
52 1 4118 4118.0 0.5 'p2': open('%s/%s.fastq' % (out_dir, fn_from_path(file2)), 'w'),
53 1 1283 1283.0 0.2 's': open('%s/%s_unpaired.fastq' % (out_dir, fn_from_path(file1).split('_')[0]), 'w')
54 }
55 1 2945 2945.0 0.4 f1, f2 = Fastq(file1), Fastq(file2)
56 25001 23354 0.9 2.9 while True:
57 25001 105393 4.2 13.0 print "\r%9d %9d %9d" % (f1.count, f2.count, f1.count-f2.count),
58 25001 68264 2.7 8.4 sys.stdout.flush()
59 25001 29858 1.2 3.7 if f1.record is False or f2.record is False:
60 1 1 1.0 0.0 if f1.record is False:
61 1 9578 9578.0 1.2 flush_handle(f2, out_handles['s'])
62 else:
63 flush_handle(f1, out_handles['s'])
64 1 1 1.0 0.0 break
65 25000 47062 1.9 5.8 if f1.record == f2.record:
66 23593 41546 1.8 5.1 out_handles['p1'].write(f1.record.block)
67 23593 34040 1.4 4.2 out_handles['p2'].write(f2.record.block)
68 23593 216319 9.2 26.8 f1.update()
69 23593 196077 8.3 24.2 f2.update()
70 1407 2340 1.7 0.3 elif f1.record > f2.record:
71 out_handles['s'].write(f2.record.block)
72 f2.update()
73 else:
74 1407 2341 1.7 0.3 out_handles['s'].write(f1.record.block)
75 1407 13231 9.4 1.6 f1.update()
76 4 9 2.2 0.0 for i in out_handles:
77 3 6023 2007.7 0.7 out_handles[i].close()
78 1 11 11.0 0.0 print "\n Done!"