#!/usr/bin/python

# Test Cactus and SimFactory on a set of machines

# This script "distribute" and its companion "distribute-watch" allow
# easy testing of SimFactory and its MDB entries on many systems. In
# the simplest case, you run "simfactory/bin/distribute", and this
# tests on the default set of machines. In particular, it:
#    1. syncs the current source tree to the machine (if not local),
#    2. configures and builds from scratch both a debug and a
#       production configuration,
#    3. submits a test job, and
#    4. waits until the test job has finished.
#
# The options --no-sync, --no-build, --no-clean, --no-reconfig, and
# --no-submit fine-tune the behaviour in the obvious way. The options
# --configuration=..., --thornlist=..., and --parfile=... are also
# available.
#
# Non-option arguments specify the set of machines on which the test
# should be performed. For example, I could use
#    ./simfactory/bin/distribute --no-clean redshift
# to build and submit quickly on my local laptop.
#
# [The default list of machines should really be configurable in a
# better way.]
#
# Since this script is designed to test multiple systems
# simultaneously, it executes in the background and redirects output
# for each system into a log file "log/MACHINE.out". Since it runs in
# the background, it cannot ask for passwords to log into remote
# systems. For Athena and Kraken, where one receives a token valid for
# twelve hours, I usually log in once just before calling "distribute"
# to obtain the token ahead of time.
#
#
#
# The companion script "distribute-watch" watches these log files:
#    ./simfactory/bin/distribute-watch
# This displays the status of the distribution on one line per system,
# and updates the display every minute. (It only looks at the log
# files and does not access the remote systems directly.)
#
# The output is sorted by time; running tests appear near the top, and
# finished tests appear near the bottom. "age" indicates the time at
# which the log file was last written to (to see when a test was last
# run, or whether a test has become stuck). "dur'n" is the duration
# for which the test was running (to judge the speed of a system).
# "size" is the size of the log file (also indicating whether a test
# has become stuck). For example, the final link stage on Abe and
# Lincoln can take up to an hour.
#
# The state "working..." for a system indicates that the test is
# running, and "[done]" that the test has finished. The outputs
# "[sim-debug]", "[sim]", and "[success]" indicate that building the
# debug configuration, building the production configuration, and
# running the test simulations succeeded. If these indicators are not
# present, then the corresponding tests failed.
#
# I use my own thorn list for my tests, which includes all of the
# Einstein Toolkit thorns, many non-public production thorns that my
# collaborators and I use, as well as a set of infrastructure and
# development thorns. The currently fastest machine for this thorn
# list is Bethe, a workstation at Caltech, which completes the test in
# 23 minutes. Lonestar, the new HPC cluster at TACC, comes in second
# with 34 minutes. (The standard Einstein Toolkit thorn list should be
# significantly faster than this.)
#
#
#
# [Erik says:] Given that the set of systems to which we have access
# is so diverse (local workstations, local clusters, LONI, LSU, NERSC,
# TeraGrid, ...), there are always some systems which are not available
# or not functional. Furthermore, there are a host of transient
# reasons why a test may fail on a particular system. If a test fails,
# I usually just rerun this test without much analysis (and possibly
# with the --no-clean option, so that the previous build is continued
# where it broke); only if a test fails twice do I begin to analyse
# the problem in detail.

import getopt import math import os import re import time import subprocess import sys BASE_PATH = os.sep.join(os.path.abspath(__file__).split(os.sep)[:-2]) sys.path.append(os.path.join(BASE_PATH, "lib")) # Move the command line arguments away, so that SimFactory doesn't # look at them argv = sys.argv sys.argv = [] import simenv import simlib SimEnvironment = simenv.init(BASE_PATH, __file__, None, None) print ################################################################################ all_machines = [ "abe", "athena", # needs grid certificate "bd", "bethe", #LONI# "bluedawg", "bp", "carver", "croton", "damiana", #LONI# "ducky", "eric", "franklin", "hlrb2", "hopper2", "kraken", # needs grid certificate #LONI# "lacumba", "lincoln", "lonestar", "longhorn", #LONI# "louie", #"mileva", # needs grid certificate #LONI# "neptune", "numrel02", "numrel05", "numrel06", "numrel07", "numrel08", "numrel09", "numrel10", #LONI# "oliver", "orca", #LONI# "painter", #"pelican", # not available "philip", #"pople", # not set up yet #LONI# "poseidon", #LONI# "queenbee", "ranger", "requin", "s-kraken", "saw", #"steele", # not enough disk space "surveyor", #"tezpur", # can't rm -rf "vip", #LONI# "zeke", ] options, argv = getopt.getopt (argv[1:], "", ["sync", "no-sync", "build", "no-build", "clean", "no-clean", "reconfig", "no-reconfig", "submit", "no-submit", "configuration=", "thornlist=", "parfile="]) for x in argv: if re.search(r"^-", x): print 'Error: unrecognoised option "%s"' % x sys.exit(1) def optbool (options, name, default=None): for option, value in options: if option == ("--%s" % name): return True if option == ("--no-%s" % name): return False assert default!=None return default def optstr (options, name, default=None): for option, value in options: if option == ("--%s" % name): return value assert default!=None return default do_sync = optbool (options, "sync" , True) do_build = optbool (options, "build" , True) do_clean = optbool (options, "clean" , True) do_reconfig = 
optbool (options, "reconfig", True) do_submit = optbool (options, "submit" , True) configuration = optstr (options, "configuration", "") thornlist = optstr (options, "thornlist", "manifest/einsteintoolkit.th") #parfile = optstr (options, "parfile", "par/static_tov.par") #configuration = optstr (options, "configuration", "empty") #thornlist = optstr (options, "thornlist", "par/empty.th") parfile = optstr (options, "parfile", "par/empty.par") machines = argv if machines==[]: machines = all_machines logdir = "log" ################################################################################ def flatten(lists): return sum(lists, []) def create_timestamp(): return "[%s]" % time.strftime("%Y-%m-%d %H:%M:%S") def get_local_machine(): return simlib.GetMachineName() def get_remotes(machine): local_machine = get_local_machine() if local_machine != machine: return ["--remote", machine] else: return [] def get_mdb_entry(machine, key): description = simenv.ConfigurationDatabase.GetMachine(machine) assert description return description.GetKey(key) def execute(command): print " Executing: %s" % " ".join(command) sys.stdout.flush() sys.stderr.flush() subprocess.call(command, bufsize=1, stdin=None, stdout=sys.stdout, stderr=sys.stderr) sys.stdout.flush() sys.stderr.flush() def get_output(command): print " Executing: %s" % " ".join(command) sys.stdout.flush() sys.stderr.flush() p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # Note: we can't read stdout and stderr sequentially, because this # could lead to a deadlock output = p.stdout.read() sys.stdout.flush() sys.stderr.flush() return output ################################################################################ def sync(machine): local_machine = get_local_machine() if local_machine != machine: command = ["./simfactory/bin/sim", "sync", machine] execute(command) def build(machine, options): if configuration: if "--debug" in options: configurations = ["%s-debug" % configuration] else: 
configurations = [configuration] else: configurations = [] optionlist = get_mdb_entry(machine, "optionlist") submitscript = get_mdb_entry(machine, "submitscript") runscript = get_mdb_entry(machine, "runscript") optionlists = ["--optionlist=%s" % optionlist] if submitscript: submitscripts = ["--submitscript=%s" % submitscript] else: submitscripts = [] runscripts = ["--runscript=%s" % runscript] thornlists = ["--thornlist=%s" % thornlist] command = flatten([["./simfactory/bin/sim"], get_remotes(machine), ["build"], configurations, options, optionlists, submitscripts, runscripts, thornlists]) execute(command) def list_configurations(machine): command = flatten([["./simfactory/bin/sim"], get_remotes(machine), ["list-configurations", "--noverbose"]]) execute(command) def submit(machine, parfile): (path, file) = os.path.split(parfile) file = re.sub(r"\.[^.]*$", "", file) # remove suffix simulation = "%s-%s-%s" % (file, machine, date) walltime = "1:0:0" memory = float(get_mdb_entry(machine, "memory")) num_threads = int(get_mdb_entry(machine, "num-threads")) ppn = int(get_mdb_entry(machine, "ppn")) max_nodes = int(get_mdb_entry(machine, "nodes")) want_memory = 4.0 # minimum, in GByte want_nodes = 2 # minimum nodes = max(min(want_nodes, max_nodes), int(math.ceil(want_memory / memory))) procs = nodes * ppn submitscript = get_mdb_entry(machine, "submitscript") if submitscript: # Submit command = flatten([["./simfactory/bin/sim"], get_remotes(machine), ["create-submit", simulation, "--parfile=%s" % parfile, "--walltime=%s" % walltime, "--procs=%s" % procs, "--num-threads=%s" % num_threads]]) else: # Run directly command = flatten([["./simfactory/bin/sim"], get_remotes(machine), ["create-run", simulation, "--parfile=%s" % parfile, "--procs=%s" % procs, "--num-threads=%s" % num_threads]]) execute(command) def list_simulations(machine): command = flatten([["./simfactory/bin/sim"], get_remotes(machine), ["list-simulations", "--noverbose"]]) execute(command) def wait(machine, 
parfile): (path, file) = os.path.split(parfile) file = re.sub(r"\.[^.]*$", "", file) # remove suffix simulation = "%s-%s-%s" % (file, machine, date) sleep_time = 60 # 1 minute max_sleep_time = 600 # 10 minutes while True: print create_timestamp() # Examine the simulation status print "Examining simulation status..." command = flatten([["./simfactory/bin/sim"], get_remotes(machine), ["list-simulations", simulation]]) status_output = get_output(command) if not re.search(r"^ +%s +\[ACTIVE" % simulation, status_output, re.MULTILINE): print "Simulation is not active any more." sys.stdout.flush() break print "Waiting for %s seconds..." % sleep_time sys.stdout.flush() sys.stderr.flush() time.sleep(sleep_time) sleep_time = min(max_sleep_time, sleep_time*2) # Wait a bit for things to settle down time.sleep(60) # Examine the simulation output print "Examining simulation output..." command = flatten([["./simfactory/bin/sim"], get_remotes(machine), ["show-output", simulation]]) sim_output = get_output(command) if re.search(r"^Done\.$", sim_output, re.MULTILINE): print "Simulation finished successfully." else: print "Simulation did not finish." sys.stdout.flush() def show_output(machine, parfile): (path, file) = os.path.split(parfile) file = re.sub(r"\.[^.]*$", "", file) # remove suffix simulation = "%s-%s-%s" % (file, machine, date) command = flatten([["./simfactory/bin/sim"], get_remotes(machine), ["show-output", simulation]]) execute(command) def authenticate(machine): command = flatten([["./simfactory/bin/sim"], get_remotes(machine), ["execute", "true"]]) execute(command) def authenticate_access(machine): print "Authenticating access to %s:" % machine authenticate(machine) def sync_build_submit(machine): print "Distributing to %s..." % machine print if do_sync: print "Synchronising..." 
print create_timestamp() sync(machine) print create_timestamp() # Give the file system some time to settle down time.sleep(3) print if do_build: list_configurations(machine) print print "Building debug configuration..." print create_timestamp() build(machine, build_options + ["--debug"]) print create_timestamp() print print "Building optimised configuration..." print create_timestamp() build(machine, build_options) print create_timestamp() print list_configurations(machine) print if do_submit: list_simulations(machine) print print "Submitting test job..." print create_timestamp() submit(machine, parfile) print create_timestamp() print wait(machine, parfile) print create_timestamp() print show_output(machine, parfile) print list_simulations(machine) list_configurations(machine) print print create_timestamp() print "Distribution done." sys.stdout.flush() ################################################################################ print "Distributing the Cactus source tree" # # Check local system consistency # if not os.path.exists(".svn"): # print # print "This program has either been called from the wrong directory" # print "or on the wrong system. Aborting." 
# sys.exit(1) # Set up the tasks print print " %s target machines: %s" % (len(machines), " ".join(machines)) date = time.strftime("%Y.%m.%d-%H.%M.%S") if do_sync: print " - synchronising (if not local)" else: print " - not synchronising" build_options = [] if do_build: if do_clean: build_options += ["--clean"] if do_reconfig: build_options += ["--reconfig"] print " - building with options %s" % " ".join(build_options) else: print " - not building" if do_submit: print " - submitting %s" % parfile else: print " - not submitting" sys.stdout.flush() # Authenticate access #print #for machine in machines: # authenticate_access(machine) # Perform the tasks # # Create a new process and exit the current process, so that the new # # process is detached from the console # pid = os.fork() # if pid>0: # sys.exit(0) # # Wait for the parent to exit # time.sleep(3) print sleep_time = 0 next_sleep_time = 2 for machine in machines: time.sleep(sleep_time) sleep_time = next_sleep_time print "Scheduling distribution to %s..." % machine, sys.stdout.flush() # Create a subprocess for the task pid = os.fork() if pid==0: # # Create a new process and exit the current process, so that # # the new process is detached from the console # pid = os.fork() # if pid>0: # sys.exit(0) # # Wait for the parent to exit # time.sleep(3) # # Close stdin, so that ssh won't ask for passwords # sys.stdin.close() # # Redirect stdin, so that ssh won't ask for passwords # sys.stdin = open("/dev/null", "r") # Redirect stdout and stderr to a log file try: os.mkdir(logdir) except: pass sys.stdout = open(os.path.join(logdir, "%s.out" % machine), "w") sys.stderr = sys.stdout # Unset the DISPLAY environment variable to avoid X11 windows # asking for ssh credentials try: del os.environ["DISPLAY"] # del os.environ["SSH_ASKPASS"] # del os.environ["SSH_TTY"] except: pass # Execute the task sync_build_submit(machine) # Exit the subprocess sys.exit(0) print " (pid %s)" % pid print print "Done."