# restartlib -- all the helper functions to assist the SimRestart class.
# this separation allows SimRemote to remain a fairly clean abstraction.

import fnmatch
import math
import os
import random
import re
import shutil
import sys
import time

import pyini
import libutil
import simrestart

from libutil import dprint


def _HasJobId(job_id):
    """Return True unless job_id is the "no job" marker.

    GetJobIdFromOutput() in this module reports a failed match as the string
    "-1", while other code compares against the integer -1; accept both.
    """
    return str(job_id) != "-1"


class WallTime:
    """Parse a wall time string of the form HH:MM[:SS].

    After construction the following attributes are available:
      Walltime          -- normalised "H:MM:SS" string
      walltime_hh/mm/ss -- zero padded two digit strings
      walltime_seconds  -- total number of seconds (int)
      walltime_minutes  -- total number of whole minutes (int)
      walltime_hours    -- total number of hours (float)
    """

    def __init__(self, walltime=None):
        self.Walltime = walltime
        self.parseWalltime()

    def parseWalltime(self):
        """Split self.Walltime into fields and derive the numeric totals.

        Prints a message and exits with status 1 if the string contains no
        ':' separator, has more than three fields, or has a non-integer field.
        """
        if self.Walltime is None:
            # NOTE(review): this default was originally labelled "one year",
            # but 876 hours is about 36.5 days (a year is 8760 hours). The
            # value is kept unchanged.
            self.Walltime = '876:00:00'
            dprint("Warning, Walltime not specified, using %s instead" % self.Walltime)

        if self.Walltime.count(":") == 0:
            dprint("Wall time has invalid format, expecting HH[:MM[:SS]]")
            sys.exit(1)

        parts = self.Walltime.split(":")

        try:
            fields = [int(part) for part in parts]
        except ValueError:
            fields = None

        if fields is None or len(fields) > 3:
            dprint("Wall time has invalid format, expecting HH[:MM[:SS]]")
            sys.exit(1)

        # missing trailing fields default to zero
        while len(fields) < 3:
            fields.append(0)

        (hours, minutes, seconds) = fields

        self.walltime_hh = "%02d" % hours
        self.walltime_mm = "%02d" % minutes
        self.walltime_ss = "%02d" % seconds

        self.Walltime = "%d:%02d:%02d" % (hours, minutes, seconds)

        self.walltime_seconds = (hours * 3600) + (minutes * 60) + seconds
        # floor division keeps this an integer on both Python 2 and Python 3
        self.walltime_minutes = self.walltime_seconds // 60
        self.walltime_hours = self.walltime_seconds / 3600.0


class RestartProperties:
    """Key/value properties of a restart, backed by one section of an ini file.

    Every property is mirrored as an attribute on the instance itself.
    """

    def __init__(self, env):
        self.SimEnvironment = env
        self.SimLib = env.SimLib
        self.IniSection = 'properties'

    def InitBlank(self):
        """Start with an empty parser that is not bound to any file."""
        self.parser = pyini.IniParser()

    def init(self, filename=None):
        """Bind to filename and import its properties if the file exists.

        The file name is remembered for Save() even if the file does not exist
        yet; in that case the parser starts out empty.
        """
        self.Filename = filename

        # os.path.exists(None) raises, so test for None explicitly
        if filename is not None and not os.path.exists(filename):
            filename = None

        self.parser = pyini.IniParser(filename)

        if filename is not None:
            self.ImportProperties()

    def ImportProperties(self):
        """Copy every option of the ini section onto this object as attributes."""
        section = self.parser.GetSectionAsDict(self.IniSection)

        for key in section.keys():
            option = section[key]

            if option.IsBlock:
                option.ConvertToList()

            setattr(self, key, option.Value)

    def HasProperty(self, key):
        """Return True if an attribute called key exists on this object."""
        return hasattr(self, key)

    def GetProperty(self, key):
        """Return the attribute called key, or None if it does not exist."""
        return getattr(self, key, None)

    def AddProperty(self, key, value):
        """Store value under key in the ini section and as an attribute.

        List values are written as block options using the "EOT" identifier.
        """
        if isinstance(value, list):
            block = True
            bi = "EOT"
        else:
            block = False
            bi = None

        op = pyini.IniOption(self.IniSection, key, value, block)
        op.BlockIdentifier = bi

        self.parser.parser.EnterSection(self.IniSection)
        self.parser.parser.WriteKey(self.IniSection, key, op)
        setattr(self, key, value)

    def RemoveProperty(self, key):
        """Remove key from the ini section and drop the matching attribute."""
        self.parser.parser.EnterSection(self.IniSection)
        self.parser.parser.RemoveKey(self.IniSection, key)

        if hasattr(self, key):
            delattr(self, key)

    def toString(self):
        """Return the ini contents as a string."""
        return self.parser.GetIniAsString()

    def Save(self):
        """Write the ini contents to the bound file; exit(1) if there is none."""
        # Filename is only assigned by init(), not by InitBlank()
        filename = getattr(self, 'Filename', None)

        if filename is None:
            dprint("Could not write to filename, filename is undefined")
            sys.exit(1)

        self.SimLib.WriteContents(filename, self.parser.GetIniAsString())


class RestartLib:
    """Helper routines used by SimRestart; all state comes from the environment."""

    def __init__(self, env):
        self.SimEnvironment = env
        self.OptionsManager = self.SimEnvironment.OptionsManager
        self.ConfigurationDatabase = self.SimEnvironment.ConfigurationDatabase
        self.DefineDatabase = self.SimEnvironment.DefineDatabase
        self.SimLib = self.SimEnvironment.SimLib

    def GetNumberOfRestarts(self, maxwalltime, walltime):
        """Return how many chunks of maxwalltime are needed to cover walltime.

        Both arguments are expected to provide a walltime_seconds attribute.
        """
        # force float division so that ceil() can actually round up
        ratio = float(walltime.walltime_seconds) / maxwalltime.walltime_seconds
        return int(math.ceil(ratio))

    def GetConfiguration(self):
        """Return the "configuration" option, or the default configuration."""
        if not self.OptionsManager.HasOption("configuration"):
            config = self.SimLib.GetDefaultConfiguration()
            dprint("Configuration name not specified -- using default configuration \"%s\"" % config)
            return config

        return self.OptionsManager.GetOption("configuration")

    def SetWalltime(self, raw_walltime):
        """Publish all WALLTIME* defines for raw_walltime.

        raw_walltime may be a WallTime instance or anything WallTime() accepts.
        """
        if isinstance(raw_walltime, WallTime):
            walltt = raw_walltime
        else:
            walltt = WallTime(raw_walltime)

        self.DefineDatabase.Set('WALLTIME', walltt.Walltime)
        self.DefineDatabase.Set('WALLTIME_HH', walltt.walltime_hh)
        self.DefineDatabase.Set('WALLTIME_MM', walltt.walltime_mm)
        self.DefineDatabase.Set('WALLTIME_SS', walltt.walltime_ss)
        self.DefineDatabase.Set('WALLTIME_SECONDS', walltt.walltime_seconds)
        self.DefineDatabase.Set('WALLTIME_MINUTES', walltt.walltime_minutes)
        self.DefineDatabase.Set('WALLTIME_HOURS', walltt.walltime_hours)

    def GetJobIdFromOutput(self, output):
        """Extract the job id from output using the machine's submitpattern.

        Returns group 1 of the match, or the string "-1" if nothing matched.
        """
        machineEntry = self.SimEnvironment.LocalMachineEntry
        submitRegex = libutil.ReConvert(machineEntry.submitpattern)

        matches = re.search(submitRegex, output)

        if matches is None:
            return "-1"

        return matches.group(1)

    def ParsePrepSchedule(self, prepFile, numProcs):
        """Parse a prep schedule file into a list of schedule strings.

        The file consists of groups of exactly two non-blank lines separated
        by blank lines. Each group yields "<numProcs>\\n<line1>\\n<line2>\\n".
        Exits with status 1 if the file is missing or a group is malformed.
        """
        if not os.path.exists(prepFile):
            dprint("Error: prep schedule file %s is not readable" % prepFile, libutil.ALWAYS_PRINT)
            sys.exit(1)

        numProcs = str(numProcs)
        contents = self.SimLib.GetFileContents(prepFile)

        preps = list()
        parts = [numProcs]

        for line in contents.split("\n"):
            line = line.strip()

            if len(line) > 0:
                parts.append(line)
                continue

            # a blank line terminates the current group, if there is one
            if len(parts) == 3:
                parts.append(str())
                preps.append("\n".join(parts))
                parts = [numProcs]
            elif len(parts) > 1:
                dprint("Error: syntax error in prep schedule. Each schedule needs to be two lines, followed by a blank line", libutil.ALWAYS_PRINT)
                sys.exit(1)

        # the last group does not need a trailing blank line
        if len(parts) == 3:
            parts.append(str())
            preps.append("\n".join(parts))
        elif len(parts) > 1:
            dprint("Error: syntax error in prep schedule END. Each schedule needs to be two lines, followed by a blank line", libutil.ALWAYS_PRINT)
            sys.exit(1)

        return preps

    def GetExecHost(self, restart):
        """Return the execution host of a running restart, or None.

        Runs the machine's 'exechost' command and returns group 1 of the first
        match of 'exechostpattern' in its output.
        """
        (machine, machineEntry, sourceBaseDir, path) = self.SimLib.GetLocalEnvironment()

        job_id = restart.GetJobId()
        job_status = 'U'

        if _HasJobId(job_id):
            job_status = self.GetJobStatus(job_id)

        if job_status != 'R':
            dprint("Warning: Job is not running, cannot retreive exechost")
            return None

        self.SimLib.VerifyKeys(machineEntry, ['exechost', 'exechostpattern'])

        self.DefineDatabase.reset()
        self.DefineDatabase.Set('JOB_ID', job_id)
        self.DefineDatabase.Set('USER', machineEntry.user)
        self.DefineDatabase.Set('SIMULATION_NAME', restart.SimulationName)

        exechost = self.DefineDatabase.SubAll(machineEntry.GetKey('exechost'))
        exechostpattern = self.DefineDatabase.SubAll(machineEntry.GetKey('exechostpattern'))

        pipe = os.popen(exechost)
        try:
            output = pipe.read()
        finally:
            pipe.close()

        exechostpattern = libutil.ReConvert(exechostpattern)
        rx = re.compile(exechostpattern, re.MULTILINE)

        # match() only ever tries the very start of the string, even with
        # MULTILINE; search() is needed to find the pattern on a later line.
        matches = rx.search(output)

        if matches is None:
            dprint("Warning: Unable to retrieve exechost using pattern %s" % exechostpattern)
            return None

        return matches.group(1)

    def CleanupRestarts(self):
        """Call finish() on every restart whose loadFromRestartId() returns > 0.

        Does nothing without a local machine. VERBOSE is switched off for the
        duration of the scan and put back to its previous value afterwards.
        """
        if self.SimEnvironment.LocalMachine is None:
            return

        previous_verbose = getattr(self.SimEnvironment, 'VERBOSE', True)
        self.SimEnvironment.VERBOSE = False

        try:
            for sim in self.SimLib.GetSimulations():
                restart = simrestart.SimRestart(self.SimEnvironment)
                restart.load(sim)

                for rid in self.GetRestartIds(restart):
                    ret = restart.loadFromRestartId(rid)

                    if ret is not None and ret > 0:
                        restart.finish()
        finally:
            self.SimEnvironment.VERBOSE = previous_verbose

    def GetRestartByJobId(self, sim, job_id):
        """Return the restart of sim whose job id equals job_id, or None."""
        restart = simrestart.SimRestart(self.SimEnvironment)
        restart.load(sim)

        for rid in self.GetRestartIds(restart):
            restart.loadFromRestartId(rid)

            if job_id == restart.GetJobId():
                return restart

        return None

    def StopAllActiveRestarts(self, sim):
        """Stop every restart of sim whose job status is not 'U'."""
        restart = simrestart.SimRestart(self.SimEnvironment)
        restart.load(sim)

        for rid in self.GetRestartIds(restart):
            restart.loadFromRestartId(rid)
            job_id = restart.GetJobId()

            if not _HasJobId(job_id):
                continue

            if self.GetJobStatus(job_id) != 'U':
                restart.stop()

    def GetRestartByRestartId(self, sim, restart_id):
        """Return a SimRestart for sim loaded from restart_id."""
        restart = simrestart.SimRestart(self.SimEnvironment)
        restart.load(sim)
        restart.loadFromRestartId(restart_id)
        return restart

    def GetJobStatus(self, job_id):
        """Return the status of job_id: 'Q', 'R', 'H', or 'U' if undetermined.

        Runs the machine's 'getstatus' command. Lines matching 'statuspattern'
        are tested against the queued, running and (optional) holding patterns.
        Exits with status 1 if more than one distinct pattern matched.
        """
        (machine, machineEntry, sourceBaseDir, path) = self.SimLib.GetLocalEnvironment()

        self.SimLib.VerifyKeys(machineEntry, ['getstatus', 'queuedpattern', 'runningpattern', 'statuspattern', 'user'])

        status_command = machineEntry.GetKey('getstatus')
        status_pattern = libutil.ReConvert(machineEntry.GetKey('statuspattern'))
        queued_pattern = libutil.ReConvert(machineEntry.GetKey('queuedpattern'))
        running_pattern = libutil.ReConvert(machineEntry.GetKey('runningpattern'))

        # holdingpattern is not among the verified keys, so it may be absent
        holding_pattern = machineEntry.GetKey('holdingpattern')
        if holding_pattern is not None:
            holding_pattern = libutil.ReConvert(holding_pattern)

        user = machineEntry.GetKey('user')

        self.DefineDatabase.Set('USER', user)
        self.DefineDatabase.Set('JOB_ID', job_id)

        status_command = self.DefineDatabase.SubAll(status_command)
        status_pattern = self.DefineDatabase.SubAll(status_pattern)
        queued_pattern = self.DefineDatabase.SubAll(queued_pattern)
        running_pattern = self.DefineDatabase.SubAll(running_pattern)

        # tested in this order; the last matching pattern determines status
        checks = [('Q', queued_pattern), ('R', running_pattern)]

        if holding_pattern is not None:
            holding_pattern = self.DefineDatabase.SubAll(holding_pattern)
            checks.append(('H', holding_pattern))

        # capture output.
        output = self.SimLib.ExecuteCommand(status_command, True)

        # U == unknown
        status = 'U'
        matched = list()

        for line in output.split("\n"):
            if re.search(status_pattern, line) is None:
                continue

            for (code, pattern) in checks:
                if re.search(pattern, line) is not None:
                    status = code
                    if pattern not in matched:
                        matched.append(pattern)

        if len(matched) > 1:
            dprint("Error: multiple status patterns matched: %s" % matched)
            sys.exit(1)

        return status

    def GetRestartIds(self, restart):
        """Return the sorted integer ids of all output-<id> entries.

        Only names in restart.SimulationDir that match "output-<digits>"
        exactly are considered.
        """
        rr = re.compile("^output-([0-9]+)$")
        ids = list()

        for entry in os.listdir(restart.SimulationDir):
            matches = rr.match(entry)

            if matches is not None:
                ids.append(int(matches.group(1)))

        ids.sort()
        return ids

    def GetActiveRestartId(self, restart):
        """Return the id whose output-<id>-active entry exists, or 0 if none.

        The id is zero padded to four digits when building the entry name.
        Exits with status 1 if more than one active entry is found.
        """
        simulationdir = restart.SimulationDir
        active = 0

        for rid in self.GetRestartIds(restart):
            afolder = "output-%04d-active" % rid

            if os.path.exists(self.SimLib.BuildPath([simulationdir, afolder])):
                if active != 0:
                    dprint("Error, more than one active restart id found in directory %s" % simulationdir)
                    sys.exit(1)

                active = rid

        return active

    def GetCheckpointFiles(self, workdir, parfile):
        """Return paths below workdir whose name matches *chkpt.it_*.

        Returns an empty list if workdir does not exist. parfile is unused.
        """
        if not os.path.exists(workdir):
            return list()

        # Walk the tree directly instead of shelling out to find(1): the
        # unquoted glob could be expanded by the shell in the current directory.
        files = list()

        for (root, dirnames, filenames) in os.walk(workdir):
            for name in dirnames + filenames:
                if not fnmatch.fnmatchcase(name, "*chkpt.it_*"):
                    continue

                candidate = os.path.join(root, name)

                # skips dangling symlinks
                if os.path.exists(candidate):
                    files.append(candidate)

        return files

    def GetExecutable(self):
        """Return (submitScript, runScript) paths of the current configuration.

        submitScript is None if the file is missing. Exits with status 1 if
        the configuration directory or the RunScript is missing.
        """
        configuration = self.GetConfiguration()

        configPath = self.SimLib.BuildPath([self.SimEnvironment.CONFIGS_PATH, configuration])

        if not self.SimLib.FileExists(configPath):
            dprint("Error, configuration '%s', which has path '%s' does not exist or is not readable" % (configuration, configPath))
            sys.exit(1)

        submitScript = self.SimLib.BuildPath([configPath, "SubmitScript"])

        if not self.SimLib.FileExists(submitScript):
            dprint("Warning: empty submit script for configuration %s" % configuration)
            submitScript = None

        runScript = self.SimLib.BuildPath([configPath, "RunScript"])

        if not self.SimLib.FileExists(runScript):
            dprint("Error: empty/missing run script for configuration %s" % configuration)
            sys.exit(1)

        return (submitScript, runScript)

    def SubmitInteractiveRequest(self, command):
        """Run command with os.system, giving up control to the terminal."""
        os.system(command)

    def CreateSimulationId(self, simulationName):
        """Build an id from name, machine, hostname, user, local time and pid."""
        (machine, machineEntry, sourceBaseDir, path) = self.SimLib.GetLocalEnvironment()

        hostname = machineEntry.hostname

        pipe = os.popen('whoami')
        try:
            user = pipe.read().strip()
        finally:
            pipe.close()

        tt = time.localtime()
        timestamp = "%4d.%02d.%02d-%02d.%02d.%02d" % (tt.tm_year, tt.tm_mon, tt.tm_mday, tt.tm_hour, tt.tm_min, tt.tm_sec)

        pid = os.getpid()

        simulation_id = "simulation-%s-%s-%s-%s-%s-%s" % (simulationName, machine, hostname, user, timestamp, pid)
        return simulation_id

    def CreatePbsSimulationName(self, SimRestart):
        """Return a job name of at most 15 characters for the given restart.

        The name is "<SimulationName>-<LongRestartID>" unless one of the
        'hide', 'hide-boring' or 'hide-dangerous' options is set, in which
        case an obfuscated "sim-..." name is used. Characters outside
        printable ASCII are removed, whitespace becomes '_', and a 'J' is
        prepended if the name does not start with a letter.
        """
        def enabled(option):
            return self.OptionsManager.HasOption(option) and self.OptionsManager.GetOption(option) == True

        simulationName = "%s-%s" % (SimRestart.SimulationName, SimRestart.LongRestartID)
        pid = os.getpid()

        if enabled('hide'):
            shortString = "sim-%06d" % pid
        elif enabled('hide-boring'):
            words = ['headon', 'D3.0', 'a0.6', 'mu0.25', 'PN1.5', 'FMR', '1+log', 'nowaves', 'findAH', 'coarse', 'singleBH', 'PUGH', 'movie']
            random.seed()
            shortString = "sim-%s-%s" % (random.choice(words), pid)
        elif enabled('hide-dangerous'):
            words = ['paramesh', 'D25.0', 'a0.999', 'mu0.01', 'PN4.0', 'CCM', 'spec35', 'maximal', 'string', 'FE', 'tail', 'DSS', 'PRL', 'naked']
            random.seed()
            shortString = "sim-%s-%s" % (random.choice(words), pid)
        else:
            shortString = simulationName

        # drop everything that is not printable ASCII
        shortString = re.sub(r"[^\x20-\x7E]", "", shortString)
        shortString = re.sub(r"\s", "_", shortString)
        shortString = re.sub(r"^(?![A-Za-z])", "J", shortString)

        # limit to 15 characters.
        return shortString[:15]

    def CopyFileWithCaching(self, srcfile, destdir, cachedir):
        """Place srcfile into destdir, sharing the data with cachedir.

        If cachedir already holds a copy that has the same size as srcfile
        and is not older than it, destdir receives a hard link to that copy.
        Otherwise srcfile is copied (keeping its mode) and the cache entry is
        replaced by a hard link to the new copy. Any existing file of the
        same name in destdir is replaced.
        """
        if not os.path.exists(cachedir):
            try:
                os.makedirs(cachedir)
            except OSError:
                dprint("Error, could not create cache directory: %s" % cachedir)
                sys.exit(1)

        filename = self.SimLib.BaseName(srcfile)

        cachefile = self.SimLib.BuildPath([cachedir, filename])
        dstfile = self.SimLib.BuildPath([destdir, filename])

        # both os.link and a fresh copy need the destination to be clear
        if os.path.exists(dstfile):
            try:
                os.remove(dstfile)
            except OSError:
                dprint("Could not remove existing destination file %s" % dstfile)
                sys.exit(1)

        if os.path.exists(cachefile):
            srcstat = os.stat(srcfile)
            cachestat = os.stat(cachefile)

            # copyfile does not keep mtime, so a valid cached copy is never
            # older than the source it was made from
            fresh = srcstat.st_mtime <= cachestat.st_mtime and srcstat.st_size == cachestat.st_size

            if fresh:
                os.link(cachefile, dstfile)
                return

            try:
                os.unlink(cachefile)
            except OSError:
                dprint("Could not remove existing cached executable %s" % cachefile)

        try:
            shutil.copyfile(srcfile, dstfile)
        except EnvironmentError:
            dprint("Error, Could not copy %s to %s" % (srcfile, dstfile))
            sys.exit(1)

        os.chmod(dstfile, os.stat(srcfile).st_mode)

        # skipped only if a stale cache entry could not be removed above
        if not os.path.exists(cachefile):
            os.link(dstfile, cachefile)

    def CreateInternalDirs(self, internaldir):
        """Create the exe, cfg, run, par and data directories in internaldir.

        Returns the list of created paths; exits with status 1 on failure.
        """
        roots = ['exe', 'cfg', 'run', 'par', 'data']
        mdirs = []

        for root in roots:
            fullpath = self.SimLib.BuildPath([internaldir, root])

            try:
                os.makedirs(fullpath)
            except OSError:
                dprint("could not make %s directory: %s" % (root, fullpath))
                sys.exit(1)

            mdirs.append(fullpath)

        return mdirs

    def GetMaxRestartID(self, restart):
        """Return the largest restart id, or -1 if there are none."""
        rids = self.GetRestartIds(restart)

        if len(rids) == 0:
            return -1

        # GetRestartIds returns the ids sorted in ascending order
        return rids[-1]

    def CreateRestartSkeleton(self, simulationName):
        """Create the simulation, internal and CACHE directories.

        Returns (basedir, simulationdir, internaldir, cachedir). Exits with
        status 1 if the base directory is missing, if the simulation or
        internal directory already exists, or if a directory cannot be made.
        """
        (machine, machineEntry, sourceBaseDir, path) = self.SimLib.GetLocalEnvironment()

        basedir = self.SimLib.GetBaseDir(machineEntry)

        if not self.SimLib.FileExists(basedir):
            dprint("Could not access simulation base directory %s for reading and writing" % basedir)
            sys.exit(1)

        simulationdir = self.SimLib.BuildPath([basedir, simulationName])

        if self.SimLib.FileExists(simulationdir):
            dprint("Cannot create job skeleton directory: Directory \"%s\" already exists" % simulationdir)
            sys.exit(1)

        try:
            os.mkdir(simulationdir)
        except OSError:
            dprint("Could not create simulation skeleton directory \"%s\"" % simulationdir)
            sys.exit(1)

        internaldir = self.SimLib.BuildPath([simulationdir, self.SimEnvironment.INTERNALDIRECTORY])

        if self.SimLib.FileExists(internaldir):
            dprint("Cannot create job skeleton directory: Directory \"%s\" already exists" % internaldir)
            sys.exit(1)

        try:
            os.mkdir(internaldir)
        except OSError:
            dprint("Could not create simulation skeleton directory \"%s\"" % internaldir)
            sys.exit(1)

        cachedir = self.SimLib.BuildPath([basedir, 'CACHE'])

        # the cache is shared between simulations, so it may already exist
        if not self.SimLib.FileExists(cachedir):
            try:
                os.mkdir(cachedir)
            except OSError:
                dprint("Could not create simulation skeleton directory \"%s\"" % cachedir)
                sys.exit(1)

        return (basedir, simulationdir, internaldir, cachedir)