#!/usr/bin/python

# Test Cactus and SimFactory on a set of machines

# This script "distribute" and its companion "distribute-watch" allow
# easy testing of SimFactory and its MDB entries on many systems. In
# the simplest case, you run "simfactory/bin/distribute", and this
# tests on the default set of machines. In particular, it:
#    1. syncs the current source tree to the machine (if not local),
#    2. configures and builds from scratch both a debug and a
#       production configuration,
#    3. submits a test job, and
#    4. waits until the test job has finished.
#
# The options --no-sync, --no-build, --no-clean, --no-reconfig, and
# --no-submit fine-tune the behaviour in the obvious way. The options
# --configuration=..., --thornlist=..., and --parfile=... are also
# available.
#
# Non-option arguments specify the set of machines on which the test
# should be performed. For example, I could use
#    ./simfactory/bin/distribute --no-clean redshift
# to build and submit quickly on my local laptop.
#
# [The default list of machines should really be configurable in a
# better way.]
#
# Since this script is designed to test multiple systems
# simultaneously, it executes in the background and redirects output
# for each system into a log file "log/MACHINE.out". Since it runs in
# the background, it cannot ask for passwords to log into remote
# systems. For Athena and Kraken, where one receives a token valid for
# twelve hours, I usually log in once just before calling "distribute"
# to obtain the token ahead of time.
#
#
#
# The companion script "distribute-watch" watches these log files:
#    ./simfactory/bin/distribute-watch
# This displays the status of the distribution on one line per system,
# and updates the display every minute. (It only looks at the log
# files and does not access the remote systems directly.)
#
# The output is sorted by time; running tests appear near the top, and
# finished tests appear near the bottom. "age" indicates the time at
# which the log file was last written to (to see when a test was last
# run, or whether a test has become stuck). "dur'n" is the duration
# for which the test was running (to judge the speed of a system).
# "size" is the size of the log file (also indicating whether a test
# has become stuck). For example, the final link stage on Abe and
# Lincoln can take up to an hour.
#
# The state "working..." for a system indicates that the test is
# running, and "[done]" that the test has finished. The outputs
# "[sim-debug]", "[sim]", and "[success]" indicate that building the
# debug configuration, building the production configuration, and
# running the test simulations succeeded. If these indicators are not
# present, then the corresponding tests failed.
#
# I use my own thorn list for my tests, which includes all of the
# Einstein Toolkit thorns, many non-public production thorns that my
# collaborators and I use, as well as a set of infrastructure and
# development thorns. The currently fastest machine for this thorn
# list is Bethe, a workstation at Caltech, which completes the test in
# 23 minutes. Lonestar, the new HPC cluster at TACC, comes in second
# with 34 minutes. (The standard Einstein Toolkit thorn list should be
# significantly faster than this.)
#
#
#
# [Erik says:] Given that the set of systems to which we have access
# is so diverse (local workstations, local clusters, LONI, LSU, NERSC,
# TeraGrid, ...), there are always some systems which are not available
# or not functional. Furthermore, there are a host of transient
# reasons why a test may fail on a particular system. If a test fails,
# I usually just rerun this test without much analysis (and possibly
# with the --no-clean option, so that the previous build is continued
# where it broke); only if a test fails twice do I begin to analyse
# the problem in detail.


import getopt
import math
import os
import re
import time
import subprocess
import sys

# Base directory: the absolute path of this script with its last two
# components (the file name and the directory containing it) removed
BASE_PATH = os.sep.join(os.path.abspath(__file__).split(os.sep)[:-2])
# Make the modules in BASE_PATH/lib importable (simenv and simlib
# below are presumably found there -- verify against the tree layout)
sys.path.append(os.path.join(BASE_PATH, "lib"))

# Move the command line arguments away, so that SimFactory doesn't
# look at them
argv = sys.argv
sys.argv = []

# Imported only after the sys.path and sys.argv changes above
import simenv
import simlib

# Initialise the SimFactory environment; the result is kept in a
# module-level name
SimEnvironment = simenv.init(BASE_PATH, __file__, None, None)
# Emit an empty line
print


################################################################################


# Default set of target machines; it is used when no machine names are
# given on the command line. Entries that are commented out are
# skipped, with the reason noted next to them where known. The
# "#LONI#" prefix presumably marks machines belonging to LONI that are
# currently disabled as a group -- confirm before re-enabling them.
all_machines = [
    "abe",
    "athena",                   # needs grid certificate
    "bd",
    "bethe",
    #LONI# "bluedawg",
    "bp",
    "carver",
    "croton",
    "damiana",
    #LONI# "ducky",
    "eric",
    "franklin",
    "hlrb2",
    "hopper2",
    "kraken",                   # needs grid certificate
    #LONI# "lacumba",
    "lincoln",
    "lonestar",
    "longhorn",
    #LONI# "louie",
    #"mileva",                   # needs grid certificate
    #LONI# "neptune",
    "numrel02",
    "numrel05",
    "numrel06",
    "numrel07",
    "numrel08",
    "numrel09",
    "numrel10",
    #LONI# "oliver",
    "orca",
    #LONI# "painter",
    #"pelican",                  # not available
    "philip",
    #"pople",                    # not set up yet
    #LONI# "poseidon",
    #LONI# "queenbee",
    "ranger",
    "requin",
    "s-kraken",
    "saw",
    #"steele",                   # not enough disk space
    "surveyor",
    #"tezpur",                   # can't rm -rf
    "vip",
    #LONI# "zeke",
    ]

# Parse the command line (the arguments were stashed in "argv" before
# SimFactory was imported). Every stage has a "--STAGE" / "--no-STAGE"
# pair; the last three options take a value. The non-option arguments
# that remain are the names of the target machines.
try:
    options, argv = getopt.getopt (argv[1:], "",
                                   ["sync", "no-sync",
                                    "build", "no-build",
                                    "clean", "no-clean",
                                    "reconfig", "no-reconfig",
                                    "submit", "no-submit",
                                    "configuration=",
                                    "thornlist=",
                                    "parfile="])
except getopt.GetoptError:
    # Unknown or malformed option: report it like the check below does
    # instead of dying with a traceback. (sys.exc_info is used rather
    # than "except ... as" so that old Python 2 versions still work.)
    print('Error: %s' % sys.exc_info()[1])
    sys.exit(1)

# getopt stops at the first non-option argument, so an option that
# follows a machine name ends up among the machine names; reject it
for x in argv:
    if re.search(r"^-", x):
        print('Error: unrecognised option "%s"' % x)
        sys.exit(1)

def optbool (options, name, default=None):
    """Return the value of the "--NAME" / "--no-NAME" switch.

    options is a list of (option, value) pairs as returned by
    getopt.getopt. The first pair whose option is "--NAME" yields
    True, the first whose option is "--no-NAME" yields False. If
    neither form is present, default is returned; a default of None
    means "no default" and triggers an assertion failure.
    """
    enable = "--%s" % name
    disable = "--no-%s" % name
    for option, _ in options:
        if option == enable:
            return True
        if option == disable:
            return False
    assert default is not None, \
        'neither "%s" nor "%s" was given, and there is no default' % (enable,
                                                                     disable)
    return default

def optstr (options, name, default=None):
    """Return the value given for the "--NAME=VALUE" option.

    options is a list of (option, value) pairs as returned by
    getopt.getopt. The value of the first pair whose option is
    "--NAME" is returned. If the option is absent, default is
    returned; a default of None means "no default" and triggers an
    assertion failure. An empty string is a valid default.
    """
    wanted = "--%s" % name
    for option, value in options:
        if option == wanted:
            return value
    assert default is not None, \
        '"%s" was not given, and there is no default' % wanted
    return default

# Stage switches: every "--STAGE" / "--no-STAGE" pair defaults to on
do_sync, do_build, do_clean, do_reconfig, do_submit = tuple(
    optbool (options, stage, True)
    for stage in ("sync", "build", "clean", "reconfig", "submit"))

# What to build and what to run. An empty configuration name means
# that no configuration name is passed to "sim build".
configuration = optstr (options, "configuration", "")
thornlist     = optstr (options, "thornlist",     "manifest/einsteintoolkit.th")
parfile       = optstr (options, "parfile",       "par/empty.par")

# Alternative defaults that were used at some point:
#parfile       = optstr (options, "parfile",       "par/static_tov.par")
#configuration = optstr (options, "configuration", "empty")
#thornlist     = optstr (options, "thornlist",     "par/empty.th")

# Machines named on the command line, or else the default set
machines = argv or all_machines

# Directory that receives one log file per machine
logdir = "log"


################################################################################


def flatten(lists):
    """Concatenate a sequence of lists into one new list.

    The elements of lists are joined with "+", in order, starting
    from an empty list; an empty sequence gives an empty list.
    """
    flat = []
    for part in lists:
        flat = flat + part
    return flat

def create_timestamp():
    """Return the current local time as "[YYYY-MM-DD HH:MM:SS]"."""
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    return "[%s]" % now

def get_local_machine():
    """Return the machine name that simlib.GetMachineName() reports."""
    local_name = simlib.GetMachineName()
    return local_name

def get_remotes(machine):
    """Return the "sim" arguments that select machine as the target.

    This is an empty list if machine is the local machine, and
    ["--remote", machine] otherwise.
    """
    if get_local_machine() == machine:
        return []
    return ["--remote", machine]

def get_mdb_entry(machine, key):
    """Return the value stored under key in the MDB entry of machine.

    The machine must have an entry in the configuration database;
    this is checked with an assertion.
    """
    entry = simenv.ConfigurationDatabase.GetMachine(machine)
    assert entry
    value = entry.GetKey(key)
    return value
    
def execute(command):
    """Run command and let it write to this process's stdout and stderr.

    command is a list of argument strings. The command line is
    printed first. Both streams are flushed before and after the
    call, so that our own output and the command's output do not get
    interleaved out of order. The exit status is ignored.
    """
    def flush_streams():
        sys.stdout.flush()
        sys.stderr.flush()

    print("   Executing: %s" % " ".join(command))
    flush_streams()
    subprocess.call(command, bufsize=1,
                    stdin=None, stdout=sys.stdout, stderr=sys.stderr)
    flush_streams()
    
def get_output(command):
    """Run command and return everything it wrote to stdout and stderr.

    command is a list of argument strings. The command line is
    printed first. The command's stderr is merged into its stdout, so
    the result contains both in the order in which they were written.
    The exit status is ignored.
    """
    print("   Executing: %s" % " ".join(command))
    sys.stdout.flush()
    sys.stderr.flush()
    # stderr is merged into stdout: reading two separate pipes one
    # after the other could deadlock
    p = subprocess.Popen(command,
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    # communicate() reads until end-of-file, closes the pipe, and waits
    # for the child. A plain read() would leave the child unreaped,
    # and this function is called repeatedly while polling.
    output = p.communicate()[0]
    sys.stdout.flush()
    sys.stderr.flush()
    return output


################################################################################


def sync(machine):
    """Synchronise the source tree to machine with "sim sync".

    Nothing is done if machine is the local machine.
    """
    if get_local_machine() == machine:
        return
    execute(["./simfactory/bin/sim", "sync", machine])


def build(machine, options):
    """Build a configuration on machine with "sim build".

    options is a list of extra arguments for "sim build". If the
    global configuration name is non-empty, it is passed on, with
    "-debug" appended when options contains "--debug". The option
    list, submit script (only if the MDB entry has one) and run
    script are taken from the MDB entry of the machine; the thorn list
    is the global thornlist.
    """
    optionlist   = get_mdb_entry(machine, "optionlist")
    submitscript = get_mdb_entry(machine, "submitscript")
    runscript    = get_mdb_entry(machine, "runscript")

    command = ["./simfactory/bin/sim"] + get_remotes(machine) + ["build"]

    # Optional configuration name
    if configuration:
        if "--debug" in options:
            command.append("%s-debug" % configuration)
        else:
            command.append(configuration)

    command.extend(options)

    command.append("--optionlist=%s" % optionlist)
    if submitscript:
        command.append("--submitscript=%s" % submitscript)
    command.append("--runscript=%s" % runscript)
    command.append("--thornlist=%s" % thornlist)

    execute(command)


def list_configurations(machine):
    """Show the configurations on machine ("sim list-configurations")."""
    sim = ["./simfactory/bin/sim"] + get_remotes(machine)
    execute(sim + ["list-configurations", "--noverbose"])


def submit(machine, parfile):
    """Create and start the test simulation for parfile on machine.

    The simulation is named after the parameter file (directory and
    suffix removed), the machine, and the global date stamp. If the
    MDB entry of the machine has a submit script, the simulation is
    submitted with "sim create-submit" and a wall time of one hour;
    otherwise it is run directly with "sim create-run".
    """
    stem = re.sub(r"\.[^.]*$", "", os.path.basename(parfile)) # remove suffix
    simulation = "%s-%s-%s" % (stem, machine, date)

    walltime = "1:0:0"

    # Machine description from the MDB
    memory      = float(get_mdb_entry(machine, "memory"))
    num_threads = int(get_mdb_entry(machine, "num-threads"))
    ppn         = int(get_mdb_entry(machine, "ppn"))
    max_nodes   = int(get_mdb_entry(machine, "nodes"))

    # Use two nodes if the machine has that many, but at least as many
    # nodes as are needed for the wanted amount of memory.
    # NOTE(review): want_memory is in GByte and is divided by the MDB
    # "memory" value as is -- confirm that the MDB uses the same unit.
    want_memory = 4.0           # minimum, in GByte
    want_nodes  = 2             # minimum
    nodes_by_count  = min(want_nodes, max_nodes)
    nodes_by_memory = int(math.ceil(want_memory / memory))
    nodes = max(nodes_by_count, nodes_by_memory)
    procs = nodes * ppn

    if get_mdb_entry(machine, "submitscript"):
        # Submit
        action = "create-submit"
        walltimes = ["--walltime=%s" % walltime]
    else:
        # Run directly
        action = "create-run"
        walltimes = []

    command = (["./simfactory/bin/sim"] + get_remotes(machine) +
               [action, simulation, "--parfile=%s" % parfile] +
               walltimes +
               ["--procs=%s" % procs, "--num-threads=%s" % num_threads])
    execute(command)


def list_simulations(machine):
    """Show the simulations on machine ("sim list-simulations")."""
    sim = ["./simfactory/bin/sim"] + get_remotes(machine)
    execute(sim + ["list-simulations", "--noverbose"])


def wait(machine, parfile):
    """Wait until the test simulation has ended, then check its output.

    The simulation name is derived from the parameter file name
    (directory and suffix removed), the machine name, and the global
    date stamp. "sim list-simulations" is polled until its output no
    longer has a line listing the simulation as ACTIVE; the pause
    between polls starts at one minute and doubles up to ten minutes.
    After one more minute the simulation output is fetched with
    "sim show-output"; a line consisting of "Done." counts as
    success. Progress and the verdict are printed; nothing is
    returned.
    """
    stem = re.sub(r"\.[^.]*$", "", os.path.basename(parfile)) # remove suffix
    simulation = "%s-%s-%s" % (stem, machine, date)

    # The simulation name contains dots (from the date stamp) and may
    # contain other regular expression metacharacters, so it has to be
    # escaped before it is embedded into the pattern
    active_pattern = re.compile(r"^ +%s +\[ACTIVE" % re.escape(simulation),
                                re.MULTILINE)

    sleep_time = 60             # 1 minute
    max_sleep_time = 600        # 10 minutes
    while True:
        print(create_timestamp())
        # Examine the simulation status
        print("Examining simulation status...")
        command = flatten([["./simfactory/bin/sim"], get_remotes(machine),
                           ["list-simulations", simulation]])
        status_output = get_output(command)
        if not active_pattern.search(status_output):
            print("Simulation is not active any more.")
            sys.stdout.flush()
            break
        print("Waiting for %s seconds..." % sleep_time)
        sys.stdout.flush()
        sys.stderr.flush()
        time.sleep(sleep_time)
        sleep_time = min(max_sleep_time, sleep_time*2)
    # Wait a bit for things to settle down
    time.sleep(60)
    # Examine the simulation output
    print("Examining simulation output...")
    command = flatten([["./simfactory/bin/sim"], get_remotes(machine),
                       ["show-output", simulation]])
    sim_output = get_output(command)
    if re.search(r"^Done\.$", sim_output, re.MULTILINE):
        print("Simulation finished successfully.")
    else:
        print("Simulation did not finish.")
    sys.stdout.flush()


def show_output(machine, parfile):
    """Show the output of the test simulation ("sim show-output").

    The simulation name is derived from the parameter file name
    (directory and suffix removed), the machine name, and the global
    date stamp.
    """
    stem = re.sub(r"\.[^.]*$", "", os.path.basename(parfile)) # remove suffix
    simulation = "%s-%s-%s" % (stem, machine, date)

    sim = ["./simfactory/bin/sim"] + get_remotes(machine)
    execute(sim + ["show-output", simulation])


def authenticate(machine):
    """Run the no-op command "true" on machine via "sim execute"."""
    sim = ["./simfactory/bin/sim"] + get_remotes(machine)
    execute(sim + ["execute", "true"])


def authenticate_access(machine):
    """Announce machine, then run authenticate() for it."""
    announcement = "Authenticating access to %s:" % machine
    print(announcement)
    authenticate(machine)


def sync_build_submit(machine):
    """Run the complete test sequence for one machine.

    Depending on the global flags do_sync, do_build and do_submit,
    this synchronises the source tree, builds the debug and the
    optimised configuration with the global build_options, and
    submits the test job for the global parfile and waits for it.
    The configurations and simulations are listed between the stages,
    and every stage is bracketed by timestamps.
    """
    def stamp():
        print(create_timestamp())

    def blank():
        print("")

    print("Distributing to %s..." % machine)
    blank()

    if do_sync:
        print("Synchronising...")
        stamp()
        sync(machine)
        stamp()
        # Give the file system some time to settle down
        time.sleep(3)
        blank()

    if do_build:
        list_configurations(machine)
        blank()
        stages = (
            ("Building debug configuration...", build_options + ["--debug"]),
            ("Building optimised configuration...", build_options),
        )
        for heading, stage_options in stages:
            print(heading)
            stamp()
            build(machine, stage_options)
            stamp()
            blank()

    list_configurations(machine)
    blank()

    if do_submit:
        list_simulations(machine)
        blank()
        print("Submitting test job...")
        stamp()
        submit(machine, parfile)
        stamp()
        blank()
        wait(machine, parfile)
        stamp()
        blank()
        show_output(machine, parfile)
        blank()

    list_simulations(machine)
    list_configurations(machine)
    blank()
    stamp()
    print("Distribution done.")
    sys.stdout.flush()


################################################################################


print("Distributing the Cactus source tree")


# # Check local system consistency
# if not os.path.exists(".svn"):
#     print
#     print "This program has either been called from the wrong directory"
#     print "or on the wrong system. Aborting."
#     sys.exit(1)


# Set up the tasks and print a summary of what is going to happen

print("")
print("   %s target machines: %s" % (len(machines), " ".join(machines)))

# Date stamp that becomes part of the simulation names of this run
date = time.strftime("%Y.%m.%d-%H.%M.%S")

print("   - synchronising (if not local)" if do_sync
      else "   - not synchronising")

# Extra arguments for "sim build"; they stay empty when not building
build_options = []
if not do_build:
    print("   - not building")
else:
    if do_clean:
        build_options.append("--clean")
    if do_reconfig:
        build_options.append("--reconfig")
    print("   - building with options %s" % " ".join(build_options))

print("   - submitting %s" % parfile if do_submit
      else "   - not submitting")

sys.stdout.flush()


# Authenticate access

#print
#for machine in machines:
#    authenticate_access(machine)


# Perform the tasks

# # Create a new process and exit the current process, so that the new
# # process is detached from the console
# pid = os.fork()
# if pid>0:
#     sys.exit(0)
# # Wait for the parent to exit
# time.sleep(3)

print
sleep_time = 0
next_sleep_time = 2
for machine in machines:
    time.sleep(sleep_time)
    sleep_time = next_sleep_time
    print "Scheduling distribution to %s..." % machine,
    sys.stdout.flush()
    # Create a subprocess for the task
    pid = os.fork()
    if pid==0:
        # # Create a new process and exit the current process, so that
        # # the new process is detached from the console
        # pid = os.fork()
        # if pid>0:
        #     sys.exit(0)
        # # Wait for the parent to exit
        # time.sleep(3)
        # # Close stdin, so that ssh won't ask for passwords
        # sys.stdin.close()
        # # Redirect stdin, so that ssh won't ask for passwords
        # sys.stdin = open("/dev/null", "r")
        # Redirect stdout and stderr to a log file
        try:
            os.mkdir(logdir)
        except:
            pass
        sys.stdout = open(os.path.join(logdir, "%s.out" % machine), "w")
        sys.stderr = sys.stdout
        # Unset the DISPLAY environment variable to avoid X11 windows
        # asking for ssh credentials
        try:
            del os.environ["DISPLAY"]
            # del os.environ["SSH_ASKPASS"]
            # del os.environ["SSH_TTY"]
        except:
            pass
        # Execute the task
        sync_build_submit(machine)
        # Exit the subprocess
        sys.exit(0)
    print " (pid %s)" % pid


print
print "Done."