I rarely post questions to a forum, but this one has me stumped. I'm very curious as to what's causing this (a solution would also be nice, but mostly, I'd like to know why I'm having this issue):
I recently wrote a python script for wrapping the invocation of remote commands which are started by a PBS job:
#! /usr/bin/env python
#
# Copyright (c) 2009 Maciej Brodowicz
# Copyright (c) 2011 Bryce Lelbach
#
# Distributed under the Boost Software License, Version 1.0. (See accompanying
# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
from datetime import datetime
from string import letters, digits
from types import StringType
from optparse import OptionParser
from threading import Thread
# subprocess instantiation wrapper. Unfortunately older Python still lurks on
# some machines.
try:
from subprocess import Popen, STDOUT, PIPE
from types import StringType
class process:
_proc = None
_exec = None
def __init__(self, cmd):
self._proc = Popen(cmd, stderr = STDOUT, stdout = PIPE,
shell = (False, True)[type(cmd) == StringType])
def poll(self):
return self._proc.poll()
def pid(self):
return self._proc.pid
def _call(self):
# annoyingly, KeyboardInterrupts are transported to threads, while most
# other Exceptions aren't in python
try:
self._proc.wait()
except Exception, err:
self._exec = err
def wait(self, timeout=None):
if timeout is not None:
thread = Thread(target=self._call)
thread.start()
# wait for the thread and invoked process to finish
thread.join(timeout)
# be forceful
if thread.is_alive():
self._proc.terminate()
thread.join()
# if an exception happened, re-raise it here in the master thread
if self._exec is not None:
raise self._exec
return (True, self._proc.returncode)
if self._exec is not None:
raise self._exec
return (False, self._proc.returncode)
else:
return (False, self._proc.wait())
def read(self):
return self._proc.stdout.read()
except ImportError, err:
# no "subprocess"; use older popen module
from popen2 import Popen4
from signal import SIGKILL
from os import kill, waitpid, WNOHANG
class process:
_proc = None
def __init__(self, cmd):
self._proc = Popen4(cmd)
def poll(self):
return self._proc.poll()
def pid(self):
return self._proc.pid
def _call(self):
# annoyingly, KeyboardInterrupts are transported to threads, while most
# other Exceptions aren't in python
try:
self._proc.wait()
except Exception, err:
self._exec = err
def wait(self, timeout=None):
if timeout is not None:
thread = Thread(target=self._call)
thread.start()
# wait for the thread and invoked process to finish
thread.join(timeout)
# be forceful
if thread.is_alive():
kill(self._proc.pid, SIGKILL)
waitpid(-1, WNOHANG)
thread.join()
# if an exception happened, re-raise it here in the master thread
if self._exec is not None:
raise self._exec
return (True, self._proc.wait())
if self._exec is not None:
raise self._exec
return (False, self._proc.wait())
else:
return (False, self._proc.wait())
def read(self):
return self._proc.fromchild.read()
def run(cmd, timeout=3600):
start = datetime.now()
proc = process(cmd)
(timed_out, returncode) = proc.wait(timeout)
now = datetime.now()
output = ''
while True:
s = proc.read()
if s:
output += s
else:
break
return (returncode, output, timed_out)
def rstrip_last(s, chars):
if s[-1] in chars:
return s[:-1]
else:
return s
# {{{ main
usage = "usage: %prog [options]"
parser = OptionParser(usage=usage)
parser.add_option("--timeout",
action="store", type="int",
dest="timeout", default=3600,
help="Program timeout (seconds)")
parser.add_option("--program",
action="store", type="string",
dest="program",
help="Program to invoke")
(options, cmd) = parser.parse_args()
if None == options.program:
print "No program specified"
exit(1)
(returncode, output, timed_out) = run(options.program, options.timeout)
if not 0 == len(output):
print rstrip_last(output, '\n')
if timed_out:
print "Program timed out"
exit(returncode)
# }}}
Another python script puts together the command line arguments based on available resources as reported by PBS, similar to mpirun. I use python-paramiko for starting the remote commands over SSH. Initially I just executed the commands directly, but I failed to receive the correct exit codes when one of the remotely run processes exited with a signal (e.g. SIGSEGV). Thus, the need for the above script.
When running this script on my development cluster at work, I noticed that this script is subtly failing to work on my 4-core Debian GNU/Linux nodes, yet it does work on my 48-core RHEL/Linux nodes:
On the Debian nodes:
wash@hermione0:~/sandbox$ python --version
Python 2.6.7
wash@hermione0:~/sandbox$ uname -a
Linux hermione0 2.6.32-5-amd64 #1 SMP Wed Jan 12 03:40:32 UTC 2011 x86_64 GNU/Linux
wash@hermione0:~/sandbox$ time ./hpx_invoke.py --program='sleep 30' --timeout=5
Program timed out
real 0m30.025s
user 0m0.016s
sys 0m0.012s
wash@hermione0:~/sandbox$
On the RHEL nodes:
[22:08:23]:wash@vega:/home/wash/sandbox$ python --version
Python 2.6.6
[22:09:28]:wash@vega:/home/wash/sandbox$ uname -a
Linux vega 2.6.32-131.4.1.el6.x86_64 #1 SMP Fri Jun 10 10:54:26 EDT 2011 x86_64 x86_64 x86_64 GNU/Linux
[22:09:30]:wash@vega:/home/wash/sandbox$ time ./hpx_invoke.py --program='sleep 30' --timeout=5
Program timed out
real 0m5.053s
user 0m0.040s
sys 0m0.020s
[22:09:41]:wash@vega:/home/wash/sandbox$
What could be causing this?
P.S. I'm the sysadmin on these boxes.