I have a pipeline written in python calling some processes in Java. The pipeline runs with two possible modes, on local mode (on a single node) or on SGE cluster.
When I set the option to cluster mode, the error messages in the logs are as follows:
Invalid maximum heap size: -Xmx4g -jar
Error: Could not create the Java Virtual Machine.
Error: A fatal exception has occurred. Program will exit.
In local mode, there is no error and no problems.
My question is: what can cause such an error?
My class to run jobs either locally or on the cluster is as follows:
class LocalJobManager(JobManager):
    """Run jobs sequentially on the local machine via the shell."""

    def __init__(self):
        # Commands queued by add_job(); executed when wait() is called.
        self.cmd_strs = []

    def add_job(self, cmd, cmd_args, **kwargs):
        """Queue a command (executable plus arguments) for later execution by wait()."""
        cmd_str = ' '.join([cmd, ] + [str(x) for x in cmd_args])
        self.cmd_strs.append(cmd_str)

    def run_job(self, cmd, cmd_args, **kwargs):
        """Run a command immediately, raising on a non-zero exit status."""
        cmd_str = ' '.join([cmd, ] + [str(x) for x in cmd_args])
        self._run_cmd(cmd_str)

    def wait(self):
        """Run all queued commands in order, then clear the queue.

        The queue is cleared even if a command fails; previously it was
        never cleared, so calling wait() twice re-executed every job.
        """
        try:
            for cmd_str in self.cmd_strs:
                self._run_cmd(cmd_str)
        finally:
            self.cmd_strs = []

    def _run_cmd(self, cmd_str):
        """Execute cmd_str in a shell.

        Raises:
            Exception: if the command exits with a non-zero status.
        """
        process = subprocess.Popen(cmd_str, stdin=subprocess.PIPE, shell=True)
        process.stdin.close()
        # Popen.wait() returns the real exit code; os.waitpid() returned an
        # encoded status word and is not portable across platforms.
        returncode = process.wait()
        if returncode != 0:
            raise Exception('Failed to run {0}\n'.format(cmd_str))
class ClusterJobManager(JobManager):
    """Run jobs on an SGE cluster through the DRMAA API."""

    def __init__(self, log_dir=None):
        # Local import so the drmaa package is only required in cluster mode.
        import drmaa
        self._drmaa = drmaa
        self.log_dir = log_dir
        if self.log_dir is not None:
            make_directory(self.log_dir)
        self.session = self._drmaa.Session()
        self.session.initialize()
        self.job_ids = Queue()
        self._lock = threading.Lock()

    def add_job(self, cmd, cmd_args, mem=4, max_mem=10, num_cpus=1):
        """Submit a job and remember its id so a later wait() can collect it."""
        job_id = self._run_job(cmd, cmd_args, mem, max_mem, num_cpus)
        self.job_ids.put(job_id)

    def run_job(self, cmd, cmd_args, mem=4, max_mem=10, num_cpus=1):
        """Submit a job and block until it finishes; raise if it fails."""
        job_id = self._run_job(cmd, cmd_args, mem, max_mem, num_cpus)
        self._check_exit_status(job_id)

    def wait(self):
        """Block until every job submitted via add_job() has finished.

        Raises:
            Exception: if any job exited with a non-zero status.
        """
        # 'with' guarantees the lock is released even if synchronize()
        # raises; the previous acquire()/release() pair leaked the lock
        # on error, deadlocking any later wait().
        with self._lock:
            job_ids = []
            while not self.job_ids.empty():
                job_ids.append(self.job_ids.get())
            self.session.synchronize(job_ids,
                                     self._drmaa.Session.TIMEOUT_WAIT_FOREVER,
                                     False)
        for job_id in job_ids:
            self._check_exit_status(job_id)

    def close(self):
        """Terminate all remaining jobs and tear down the DRMAA session."""
        self.session.control(self._drmaa.Session.JOB_IDS_SESSION_ALL,
                             self._drmaa.JobControlAction.TERMINATE)
        self.session.exit()

    def _run_job(self, cmd, cmd_args, mem, max_mem, num_cpus):
        """Submit a single job and return its DRMAA job id."""
        job_template = self._init_job_template(cmd, cmd_args, mem, max_mem, num_cpus)
        try:
            return self.session.runJob(job_template)
        finally:
            # Free the template even if submission fails; previously it
            # leaked whenever runJob() raised.
            self.session.deleteJobTemplate(job_template)

    def _init_job_template(self, cmd, cmd_args, mem, max_mem, num_cpus):
        """Build a DRMAA job template with SGE resource requests.

        NOTE(review): DRMAA passes each element of `args` to the remote
        command as a separate argv entry. Passing a joined string such as
        '-Xmx4g -jar' as ONE element hands Java a single malformed token,
        which produces 'Invalid maximum heap size: -Xmx4g -jar' — the
        caller must split Java flags into individual args.
        """
        job_template = self.session.createJobTemplate()
        job_template.remoteCommand = cmd
        job_template.args = [str(x) for x in cmd_args]
        job_template.workingDirectory = os.getcwd()
        if self.log_dir is not None:
            # DRMAA path syntax is '[hostname]:path'; empty hostname means
            # "any host", hence the leading ':'.
            job_template.errorPath = ':' + self.log_dir
            job_template.outputPath = ':' + self.log_dir
        # Explicit kwargs instead of **locals(): same output, no hidden
        # dependency on every local name in scope.
        job_template.nativeSpecification = (
            '-l mem_free={mem}G,mem_token={mem}G,h_vmem={max_mem}G '
            '-V -w n -pe ncpus {num_cpus}'.format(
                mem=mem, max_mem=max_mem, num_cpus=num_cpus))
        return job_template

    def _check_exit_status(self, job_id):
        """Wait for job_id to finish; raise if its exit status is non-zero."""
        return_value = self.session.wait(job_id,
                                         self._drmaa.Session.TIMEOUT_WAIT_FOREVER)
        if return_value.exitStatus != 0:
            raise Exception('Job {0} failed with exit status {1}.'.format(return_value.jobId,
                                                                          return_value.exitStatus))
From what I have read on some forums, the "Could not create the Java Virtual Machine" error is usually caused by a syntax error in the command. However, the command itself is correct and works locally, and the cluster job class shown above runs fine for everything except Java.
Thanks