I have a complex Python pipeline (whose code I can't change) that calls multiple other scripts and executables. The problem is that it takes ages to run over 8000 directories, doing some scientific analyses. So I wrote a simple wrapper (it might not be the most efficient, but it seems to work) using the multiprocessing module.
from os import path, listdir, mkdir, system
from os.path import join as osjoin, exists, isfile
from GffTools import Gene, Element, Transcript
from GffTools import read as gread, write as gwrite, sort as gsort
from re import match
from multiprocessing import JoinableQueue, Process
from sys import argv, exit
# some absolute paths
inbase = "/.../abfgp_in"  # one sub-directory per gene, holding the abfgp input files
outbase = "/.../abfgp_out"  # per-gene output directories and per-worker logs go here
abfgp_cmd = "python /.../abfgp-2.rev/abfgp.py"
refGff = "/.../B0510_manual_reindexed_noSeq.gff"
# the Queue (shared work list of directory names, consumed by the workers)
Q = JoinableQueue()
# number of directories queued; incremented while the queue is filled and
# read by run() to print a progress counter
i = 0
# define number of processes
try:
    num_p = int(argv[1])
except (IndexError, ValueError):
    # IndexError: no argument given at all; ValueError: argument is not an integer
    exit("Wrong CPU argument")
# This is the function calling the abfgp.py script, which in its turn calls a lot of third-party software
def abfgp(id_, pid):
    """Run abfgp.py on one gene directory, appending its stdout to a per-worker log.

    id_ -- name of the gene directory under ``inbase``; also passed as --target.
    pid -- worker number, used only to choose the log file name.

    Returns None. Prints "ABFGP FAILED" when the command cannot be started
    or exits with a non-zero status.
    """
    # Function-scope imports: the file header only pulls ``system`` from os.
    import shlex
    import subprocess

    out = osjoin(outbase, id_)
    if not exists(out):
        mkdir(out)
    # logfile
    log = osjoin(outbase, "log_process_%s" % (pid))
    # Build an argument list instead of a shell string, so paths containing
    # spaces or shell metacharacters reach abfgp.py unchanged.
    cmd = shlex.split(abfgp_cmd) + [
        "--dna", osjoin(inbase, id_, id_ + ".dna.fa"),
        "--multifasta", osjoin(inbase, id_, "informants.mfa"),
        "--target", id_,
        "-o", out,
        "-q",
    ]
    try:
        # call the script; append mode mirrors the former ">>log" redirection
        with open(log, "a") as logfile:
            status = subprocess.call(cmd, stdout=logfile)
    except (OSError, IOError):
        # the interpreter could not be launched or the log could not be opened
        print("ABFGP FAILED")
        return
    # A failing child is reported through its exit status, not an exception,
    # so the status has to be checked explicitly.
    if status != 0:
        print("ABFGP FAILED")
    return
# parse the output for one gene directory (called once per directory after all workers have finished)
def extractGff(id_):
    # code not relevant (body omitted by the author)
    # NOTE(review): presumably reads the abfgp results for id_ under outbase -- confirm against the full source
# function called by multiple processes, using the Queue
def run(Q, pid):
    """Worker loop: take directory names from Q and run abfgp() on each one.

    Q   -- queue of directory names shared by all workers.
    pid -- worker number, forwarded to abfgp() for its log file name.

    Returns when no item arrives within the timeout; exits the process with
    "Interrupted Child" on KeyboardInterrupt.
    """
    try:
        from Queue import Empty  # Python 2
    except ImportError:
        from queue import Empty  # Python 3

    while True:
        try:
            # Checking Q.empty() and then calling a blocking get() lets another
            # worker take the last item in between, leaving this worker
            # blocked forever; a get() with timeout cannot hang.
            d = Q.get(timeout=1)
        except Empty:
            break
        except KeyboardInterrupt:
            exit("Interrupted Child")
        try:
            # progress counter: items queued in total minus items still waiting
            print("%s\t=>>\t%s" % (str(i - Q.qsize()), d))
            abfgp(d, pid)
        except KeyboardInterrupt:
            exit("Interrupted Child")
        finally:
            # mark the item done even when abfgp() raised, so the queue's
            # unfinished-task count stays consistent
            Q.task_done()
# list of directories (listdir already returns a list, no copy needed)
genedirs = listdir(inbase)
genes = gread(refGff)
# fill the queue and count the entries; run() reads i for its progress output
for d in genedirs:
    i += 1
    Q.put(d)
# this loop creates the multiple processes
procs = []
for pid in range(num_p):
    try:
        p = Process(target=run, args=(Q, pid + 1))
        # daemon workers are killed automatically when the parent exits
        p.daemon = True
        procs.append(p)
        p.start()
    except KeyboardInterrupt:
        print("Aborting start of child processes")
        for x in procs:
            x.terminate()
        exit("Interrupted")
# wait for all workers to drain the queue
try:
    for p in procs:
        p.join()
except KeyboardInterrupt:
    # only a user interrupt is handled here; a bare except would also report
    # unrelated errors as "Interrupted" and hide their traceback
    print("Terminating child processes")
    for x in procs:
        x.terminate()
    exit("Interrupted")
print("Parsing output...")
for d in genedirs:
    extractGff(d)
Now the problem is that abfgp.py uses the os.chdir function, which seems to disrupt the parallel processing. I get a lot of errors stating that some (input/output) files or directories cannot be found for reading or writing. This happens even though I call the script through os.system(), which I thought would spawn separate processes and so prevent this.
How can I work around this chdir interference?
Edit: I might change os.system() to subprocess.Popen(cwd="...") with the right directory. I hope this makes a difference.
Thanks.