import os, sys, re, stat
import libutil
import restartlib
import simlog
import filecmp, shutil
import time
import simarchive

from libutil import dprint

# this class is the encapsulation of a 'Restart'
# which is a submitted simulation created and managed by
# simfactory

# a skeleton right now

class SimRestart:
    """Encapsulation of a 'Restart': a submitted simulation created and
    managed by simfactory.

    The instance keeps references to the shared helpers exposed by the
    environment object (options, configuration, defines, SimLib, RestartLib)
    and tracks the job id, restart id, simulation name and properties of the
    restart it represents.
    """

    def __init__(self, env):
        """Store the environment and cache its helper objects.

        env -- environment object; must expose OptionsManager,
               ConfigurationDatabase, DefineDatabase, SimLib and RestartLib.
        """
        self.SimEnvironment = env
        self.OptionsManager = self.SimEnvironment.OptionsManager
        self.ConfigurationDatabase = self.SimEnvironment.ConfigurationDatabase
        self.DefineDatabase = self.SimEnvironment.DefineDatabase
        self.SimLib = self.SimEnvironment.SimLib
        self.RestartLib = self.SimEnvironment.RestartLib

        # -1 / None mean "not assigned yet"
        self.JobID = -1
        self.RestartID = -1
        self.LongRestartID = -1
        self.SimulationName = None
        self.Properties = None

    def load(self, simulationName):
        """Load an existing simulation by name.

        Sets BaseDir, SimulationDir, InternalDir and CacheDir, reads
        properties.ini from the simulation's internal directory and attaches
        the log.

        Returns 1 on success, -1 if the simulation directory or its
        properties.ini does not exist.
        """
        self.SimulationName = simulationName

        # need basedir, simulationdir, and internaldir
        machine = self.SimEnvironment.LocalMachine
        machineEntry = self.SimEnvironment.LocalMachineEntry

        self.BaseDir = self.SimLib.GetBaseDir(machineEntry)
        self.SimulationDir = self.SimLib.BuildPath([self.BaseDir, self.SimulationName])
        self.InternalDir = self.SimLib.BuildPath([self.SimulationDir, self.SimEnvironment.INTERNALDIRECTORY])
        self.CacheDir = self.SimLib.BuildPath([self.BaseDir, 'CACHE'])

        if not(os.path.exists(self.SimulationDir)):
            if self.SimEnvironment.VERBOSE:
                dprint("Warning: simulation \"%s\" does not exist or is not readable" % self.SimulationName)
            return -1

        propertyFile = self.SimLib.BuildPath([self.InternalDir, "properties.ini"])
        #dprint("Properties file: %s" % propertyFile)

        if not(os.path.exists(propertyFile)):
            if self.SimEnvironment.VERBOSE:
                dprint("Warning: properties.ini does not exist for simulation %s" % self.SimulationName)
            return -1

        self.Properties = restartlib.RestartProperties(self.SimEnvironment)
        self.Properties.init(propertyFile)

        #dprint("load complete, properties:")
        #dprint(self.Properties.toString())

        self.attachLog(self.SimulationDir)
        return 1

    def loadFromRestartId(self, restartid):
        """Load the properties of one specific restart ("output-NNNN").

        Reads self.SimulationDir, so the simulation directory must already be
        set on this instance. Sets RestartID, LongRestartID, RestartDir and
        InternalDir.

        Returns 1 on success, -1 if the restart has no properties.ini.
        """
        self.RestartID = int(restartid)
        self.LongRestartID = "%04d" % int(restartid)

        restart_dir = self.SimLib.BuildPath([self.SimulationDir, "output-%s" % self.LongRestartID])
        internal_dir = self.SimLib.BuildPath([restart_dir, self.SimEnvironment.INTERNALDIRECTORY])

        self.RestartDir = restart_dir
        self.InternalDir = internal_dir

        propertyFile = self.SimLib.BuildPath([internal_dir, "properties.ini"])

        if not(os.path.exists(propertyFile)):
            if self.SimEnvironment.VERBOSE:
                dprint("Warning: could not load from restart id %s, %s does not exist, broken restart" % (self.LongRestartID, propertyFile))
            return -1

        self.Properties = restartlib.RestartProperties(self.SimEnvironment)
        self.Properties.init(propertyFile)
        return 1

    def PrepareCheckpointing(self, max_restart_id):
        """Hard-link checkpoint files of a previous restart into this one.

        The restart to restore from is the 'from-restart-id' property when
        present, otherwise max_restart_id.

        Returns True when checkpoint files were linked, False when there is
        nothing to restore from (negative id, same id as this restart, or no
        checkpoint files found). Exits the process if the id cannot be
        converted to an int or a link cannot be created.
        """
        # make sure it's an int.
        max_restart_id = int(max_restart_id)

        dprint("DEBUG: PrepareCheckpointing: max_restart_id: %s" % max_restart_id)

        # lets see if from-restart-id is set
        if self.Properties.HasProperty('from-restart-id'):
            ii = self.Properties.GetProperty('from-restart-id')
            try:
                restore_restart_id = int(ii)
            except ValueError:
                dprint("Error: Could not coerse provided from-restart-id %s into an integer" % ii)
                self.Log.Write("Error: Could not coerse provided from-restart-id %s into an integer" % ii)
                sys.exit(1)
        else:
            if max_restart_id >= 0:
                restore_restart_id = max_restart_id
            else:
                dprint("DEBUG: max_restart_id is < 0, returning False")
                return False

        if restore_restart_id == self.RestartID:
            dprint("DEBUG: max_restart_id == self.RestartID, returning false")
            return False

        if self.SimEnvironment.VERBOSE:
            dprint("DEBUG: Restoring from restart id %s, 04d is: %04d" % (restore_restart_id, restore_restart_id))

        d_restore_restart_id = "%04d" % restore_restart_id

        restore_dir = self.SimLib.BuildPath([self.SimulationDir, "output-%s" % d_restore_restart_id])
        previousIni = self.SimLib.BuildPath([restore_dir, self.SimEnvironment.INTERNALDIRECTORY, "properties.ini"])

        work_dir = None

        if os.path.exists(previousIni):
            ep = restartlib.RestartProperties(self.SimEnvironment)
            ep.init(previousIni)
            parfile = ep.parfile
            pf = self.SimLib.BaseName(parfile)
            work_dir = self.SimLib.BuildPath([restore_dir, pf])

        chkpoint_files = self.RestartLib.GetCheckpointFiles(restore_dir, self.Properties.parfile)

        if len(chkpoint_files) == 0:
            # look in the working directory for the checkpoint files as well.
            if work_dir is not None:
                chkpoint_files = self.RestartLib.GetCheckpointFiles(work_dir, self.Properties.parfile)

        if len(chkpoint_files) == 0:
            return False

        if self.SimEnvironment.VERBOSE:
            for file in chkpoint_files:
                dprint("DEBUG: checkpoint file: %s" % file)

        for file in chkpoint_files:
            # compute the destination before the try so the error message
            # below can never reference an unbound name
            dfile = file.replace(restore_dir, self.RestartDir).strip()
            try:
                if self.SimEnvironment.VERBOSE:
                    dprint("DEBUG: linking %s to %s" % (file, dfile))
                os.link(file, dfile)
            except OSError:
                dprint("Error: Could not link checkpoint file %s to %s" % (file, dfile))
                sys.exit(1)

        return True

    def submit(self, simulationName=None, chainedJobId=None, chainedRestartId=None):
        """Prepare a new restart directory and submit it to the queue.

        simulationName   -- when given, the simulation is loaded first and
                            created if loading fails.
        chainedJobId     -- job id this submission is chained to; stored in
                            the CHAINED_JOB_ID define. When set, the machine's
                            maximum walltime is used and no further chaining
                            is started from this call.
        chainedRestartId -- when set, FROM_RESTART_COMMAND becomes
                            "--from-restart-id=<id>".

        Requires the 'writer-procs' and 'input-dir' options. Exits the
        process on any preparation error. Returns None.
        """
        if simulationName is not None:
            ret = self.load(simulationName)
            if ret < 0:
                self.create(simulationName)
                self.load(simulationName)

        # need basedir, simulationdir, and internaldir
        machine = self.SimEnvironment.LocalMachine
        machineEntry = self.SimEnvironment.LocalMachineEntry

        if self.OptionsManager.HasOption('writer-procs') == False:
            dprint("Error: required option --writer-procs has not been defined", libutil.ALWAYS_PRINT)
            sys.exit(1)

        self.SwanDir = self.OptionsManager.GetOption("input-dir")

        if self.SwanDir is None:
            dprint("Error, no adcs+swan input directory specified")
            # this is an error path: report failure to the caller's shell
            sys.exit(1)

        if not(os.path.exists(self.SwanDir)):
            dprint("Could not access adcs+swan input directory \"%s\"" % self.SwanDir, libutil.ALWAYS_PRINT)
            sys.exit(1)

        dprint("Input Directory: %s" % self.SwanDir)

        self.SimLib.VerifyKeys(machineEntry, ['scratchdir', 'num-threads', 'memory', 'ppn', 'queue', 'maxwalltime', 'hostname'])

        simulation_id = self.Properties.simulationid

        #allow commandline override of submit script during submission phase
        ss = self.OptionsManager.GetOption('submitscript')

        if ss is not None and os.path.exists(ss):
            submitScript = ss
        else:
            submitScript = self.Properties.submitscript

        if len(submitScript) == 0 or submitScript == "None":
            dprint("Error: no submission script defined for simulation %s, submission disabled" % self.SimulationName)
            sys.exit(1)

        if not(os.path.exists(submitScript)):
            dprint("Error: could not read submit script %s" % submitScript)
            sys.exit(1)

        #make executable
        contents = self.SimLib.GetFileContents(submitScript)

        # we need to assign this job a restart id
        # when the run command is executed, we will pass this restart id to it to use.
        (rids, max_restart_id, my_restart_id) = self.GetRestartId()

        self.Log.Write("Found the following restart_ids: %s" % rids)
        self.Log.Write("Maximum restart id determined to be: %04d" % max_restart_id)
        self.Log.Write("Determined submit restart id: %s " % my_restart_id)

        self.RestartID = my_restart_id
        self.LongRestartID = "%04d" % self.RestartID

        # need to prepare replacements.
        MaxWalltime = restartlib.WallTime(machineEntry.maxwalltime)

        if not(self.OptionsManager.HasOption('walltime')):
            Walltime = MaxWalltime
        else:
            Walltime = restartlib.WallTime(self.OptionsManager.GetOption('walltime'))

        # chain several jobs when more walltime is requested than the machine
        # allows, but only from the first (non-chained) submission
        UseChaining = False

        if MaxWalltime.walltime_seconds < Walltime.walltime_seconds:
            if chainedJobId is None:
                UseChaining = True

        self.RestartDir = self.SimLib.BuildPath([self.SimulationDir, "output-%s" % self.LongRestartID])
        self.InternalDir = self.SimLib.BuildPath([self.RestartDir, self.SimEnvironment.INTERNALDIRECTORY])
        self.WorkDir = self.SimLib.BuildPath([self.RestartDir, "work"])

        # make the directory
        dirs = [self.InternalDir, self.WorkDir]

        for d in dirs:
            try:
                os.makedirs(d)
            except OSError as e:
                dprint("Could not create directory %s, %s" % (d, e))
                sys.exit(1)

        existingProperties = None

        hostname = machineEntry.hostname
        user = self.SimLib.GetUsername()
        memory = self.SimLib.GetMachineOption('memory')
        cpufreq = self.SimLib.GetMachineOption('cpufreq')
        allocation = self.SimLib.GetMachineOption('allocation')
        queue = self.SimLib.GetMachineOption('queue')

        (nodes, ppn_used, procs, ppn, procs_requested, num_procs, num_threads, writer_procs) = self.SimLib.GetProcs(existingProperties)

        newsspath = self.SimLib.BuildPath([self.InternalDir, "PreparedSubmitScript"])
        pbsSimulationName = self.RestartLib.CreatePbsSimulationName(self)

        new_properties = dict()
        new_properties['SOURCEDIR'] = self.Properties.sourcedir
        new_properties['SIMULATION_NAME'] = self.SimulationName
        new_properties['SHORT_SIMULATION_NAME'] = pbsSimulationName
        new_properties['SIMULATION_ID'] = self.Properties.simulationid
        new_properties['RESTART_ID'] = self.RestartID
        new_properties['RUNDIR'] = self.RestartDir
        new_properties['WORKDIR'] = self.WorkDir
        new_properties['SCRIPTFILE'] = self.Properties.submitscript
        new_properties['HOSTNAME'] = hostname
        new_properties['USER'] = user
        new_properties['NODES'] = nodes
        new_properties['PROCS'] = procs
        new_properties['PROCS_REQUESTED'] = procs_requested
        new_properties['PPN'] = ppn
        new_properties['PPN_USED'] = ppn_used
        new_properties['NUM_PROCS'] = num_procs
        new_properties['NUM_THREADS'] = num_threads
        new_properties['MEMORY'] = memory
        new_properties['CPUFREQ'] = cpufreq
        new_properties['ALLOCATION'] = allocation
        new_properties['QUEUE'] = queue
        new_properties['EMAIL'] = machineEntry.email
        new_properties['WRITER_PROCS'] = writer_procs

        if UseChaining or chainedJobId is not None:
            walltt = MaxWalltime
        else:
            walltt = Walltime

        self.RestartLib.SetWalltime(walltt)

        new_properties['SIMFACTORY'] = self.SimEnvironment.EXECUTABLE
        new_properties['SUBMITSCRIPT'] = newsspath
        # overrides the SCRIPTFILE value assigned above
        new_properties['SCRIPTFILE'] = newsspath

        if chainedRestartId is None:
            new_properties['FROM_RESTART_COMMAND'] = ""
        else:
            new_properties['FROM_RESTART_COMMAND'] = "--from-restart-id=%s" % chainedRestartId

        if chainedJobId is None:
            new_properties['CHAINED_JOB_ID'] = ""
        else:
            new_properties['CHAINED_JOB_ID'] = chainedJobId

        for key in new_properties.keys():
            self.DefineDatabase.Set(key, new_properties[key])

        scratchdir = self.DefineDatabase.SubAll(self.SimLib.GetMachineOption('scratchdir'))

        if not(os.path.exists(scratchdir)):
            if not(scratchdir.startswith("/")):
                # relative scratch directories live inside the restart dir
                sspath = self.SimLib.BuildPath([self.RestartDir, scratchdir])
                if not(os.path.exists(sspath)):
                    try:
                        os.makedirs(sspath)
                    except OSError:
                        dprint("Error: could not make scratch directory \"%s\"" % sspath)
                        sys.exit(1)
                scratchdir = sspath
            else:
                try:
                    os.makedirs(scratchdir)
                except OSError:
                    dprint("Error: could not make scratch directory \"%s\"" % scratchdir)
                    sys.exit(1)
                # expose the absolute scratch dir as <restart>/scratch
                sspath = self.SimLib.BuildPath([self.RestartDir, "scratch"])
                os.system("ln -s %s %s" % (scratchdir, sspath))

        self.DefineDatabase.Set("SCRATCHDIR", scratchdir)

        # lets prepare our submit script.
        contents = self.DefineDatabase.SubAll(contents)

        # pull out all of our ENV commands
        (contents, env) = self.DefineDatabase.ParseEnvCommands(contents)

        envFile = self.SimLib.BuildPath([self.InternalDir, 'ENV'])
        envContents = "\n".join(env)
        envContents = self.DefineDatabase.SubAll(envContents)

        # store the rest of our keys
        rr = {
            'nodes': nodes,
            'ppnused': ppn_used,
            'procs': procs,
            'ppn': ppn,
            'procsrequested': procs_requested,
            'numprocs': num_procs,
            'numthreads': num_threads,
            'queue': queue,
            'allocation': allocation,
            'hostname': hostname,
            'user': user,
            'memory': memory,
            'cpufreq': cpufreq,
            'pbsSimulationName': pbsSimulationName,
            'walltime': Walltime.Walltime,
            'writerprocs': writer_procs,
            'swandir': self.SwanDir
        }

        for key in rr.keys():
            self.Properties.AddProperty(key, rr[key])

        # write to our new properties directory.
        self.Properties.Filename = self.SimLib.BuildPath([self.InternalDir, "properties.ini"])

        dprint("writing to internalDir: %s" % self.InternalDir)
        self.Properties.Save()

        dprint("writing to: %s" % newsspath)
        self.SimLib.WriteContents(newsspath, contents)
        self.SimLib.WriteContents(envFile, envContents)

        # make executable!
        os.chmod(newsspath, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

        # set our current working directory to the restart dir in case the
        # submit script writes out any files.
        os.chdir(self.RestartDir)

        # we need to copy the contents of the data/, and hardlink the contents of exe/ to the workdir.
        datadir = self.SimLib.BuildPath([self.SimulationDir, self.SimEnvironment.INTERNALDIRECTORY, 'data'])
        exedir = self.SimLib.BuildPath([self.SimulationDir, self.SimEnvironment.INTERNALDIRECTORY, 'exe'])

        for file in os.listdir(datadir):
            fpath = self.SimLib.BuildPath([datadir, file])
            dest_path = self.SimLib.BuildPath([self.WorkDir, file])
            try:
                shutil.copy(fpath, dest_path)
            except (IOError, OSError) as e:
                # shutil.copy reports failures as IOError on Python 2
                dprint("Error: could not copy data file %s to %s, %s" % (fpath, dest_path, e))
                sys.exit(1)

        for file in os.listdir(exedir):
            fpath = self.SimLib.BuildPath([exedir, file])
            dest_path = self.SimLib.BuildPath([self.WorkDir, file])
            try:
                os.link(fpath, dest_path)
            except OSError as e:
                dprint("Error: could not copy exe file %s to %s, %s" % (fpath, dest_path, e))
                sys.exit(1)

        # copy the contents of the input-dir to the same work directory above.
        for file in os.listdir(self.SwanDir):
            file_path = self.SimLib.BuildPath([self.SwanDir, file])

            if os.path.isdir(file_path):
                continue

            dest_path = self.SimLib.BuildPath([self.WorkDir, file])
            try:
                dprint("Copying file %s to %s" % (file_path, dest_path), libutil.ALWAYS_PRINT)
                shutil.copy(file_path, dest_path)
            except (IOError, OSError) as e:
                print("Error: could not copy adcirc+swan input file %s to %s, %s" % (file_path, dest_path, e))
                sys.exit(1)

            self.Properties.AddProperty(file, dest_path)

        # create our prep files
        adcirc_processors = int(num_procs) - int(writer_procs)

        dprint("Writer procs: %s" % writer_procs)
        dprint("Adcirc procs: %s" % adcirc_processors)

        preps = list()
        preps.append("%s\n1\nfort.14\n" % adcirc_processors)
        preps.append("%s\n2\nfort.26\n" % adcirc_processors)

        if self.OptionsManager.HasOption("adcprep-schedule"):
            preps = self.RestartLib.ParsePrepSchedule(self.OptionsManager.GetOption('adcprep-schedule'), adcirc_processors)

        prep_files = list()

        for i, prep_contents in enumerate(preps):
            prep_path = self.SimLib.BuildPath([self.WorkDir, "prep.%s" % i])
            prep_files.append(prep_path)
            self.SimLib.WriteContents(prep_path, prep_contents)
            self.Properties.AddProperty("prep%s" % i, prep_path)

        submitCommand = self.DefineDatabase.SubAll(machineEntry.submit)
        main_scriptfile = self.DefineDatabase.Get("SCRIPTFILE")

        if self.OptionsManager.GetOption('enable-adcprep-submission') == True:
            # time to do some magic: submit a separate job that only runs
            # adcprep, then chain the real job to it.
            script = self.SimLib.GetFileContents(submitScript)

            #comment out the simfactory line
            script = script.replace("@SIMFACTORY@", "#@SIMFACTORY@")
            script = "%s\n\ncd @WORKDIR@\n\n" % script

            for prep_file in prep_files:
                script = "%s\n\n./adcprep < %s" % (script, prep_file)

            script = "%s\n\n" % script

            script_file = self.SimLib.BuildPath([self.InternalDir, "PreparedPrepScript"])
            self.DefineDatabase.Set("SCRIPTFILE", script_file)

            # fix the walltime if adcprep-walltime was specified.
            if self.OptionsManager.HasOption("adcprep-walltime"):
                PrepWalltime = restartlib.WallTime(self.OptionsManager.GetOption("adcprep-walltime"))
                self.RestartLib.SetWalltime(PrepWalltime)

            script = self.DefineDatabase.SubAll(script)
            self.SimLib.WriteContents(script_file, script)

            prepSubmitCommand = self.DefineDatabase.SubAll(machineEntry.submit)

            dprint("Executing submit command: %s" % prepSubmitCommand)
            output = self.SimLib.ExecuteCommand(prepSubmitCommand, output=True)
            prep_job_id = self.RestartLib.GetJobIdFromOutput(output)

            # compare as text so the failure sentinel is recognised whether
            # the helper hands back -1 or "-1"
            if str(prep_job_id) == "-1":
                dprint("Error: could not determine job_id of adcprep submission", libutil.ALWAYS_PRINT)
                sys.exit(1)

            dprint("ADCPREP job submission id: %s" % prep_job_id)

            self.DefineDatabase.Set("SCRIPTFILE", main_scriptfile)
            self.DefineDatabase.Set("CHAINED_JOB_ID", prep_job_id)

            #reset the walltime back to the original WallTime
            self.RestartLib.SetWalltime(walltt)

            # regenerate the main submit script so it picks up CHAINED_JOB_ID
            contents = self.SimLib.GetFileContents(submitScript)
            contents = self.DefineDatabase.SubAll(contents)
            self.SimLib.WriteContents(newsspath, contents)

        output = self.SimLib.ExecuteCommand(submitCommand, output=True)
        job_id = self.RestartLib.GetJobIdFromOutput(output)

        pjob = restartlib.RestartProperties(self.SimEnvironment)
        pjob.InitBlank()
        pjob.Filename = self.SimLib.BuildPath([self.InternalDir, 'job.ini'])
        pjob.AddProperty('jobid', job_id)
        pjob.Save()

        self.Properties.AddProperty('active', 'no')
        self.Properties.AddProperty('running', 'no')
        self.Properties.AddProperty('queued', 'yes')
        self.Properties.Save()

        if str(job_id) == "-1":
            dprint("Submit either failed or could not determine job id, output:")
            dprint(output)
            return
        else:
            dprint("Submit finished, job id is %s" % job_id)

        self.JobID = job_id
        # environment values must be strings
        os.environ['PBS_JOBID'] = str(self.JobID)

        if UseChaining:
            numChains = self.RestartLib.GetNumberOfRestarts(MaxWalltime, Walltime)

            restart = None
            previousRestart = None

            # we already submitted the first chain
            for i in range(numChains-1):
                if restart is not None:
                    previousRestart = restart

                restart = SimRestart(self.SimEnvironment)
                restart.load(self.SimulationName)

                # each link is chained to the job submitted just before it
                if previousRestart is None:
                    restart.submit(self.SimulationName, self.JobID, self.RestartID)
                else:
                    restart.submit(self.SimulationName, previousRestart.JobID, previousRestart.RestartID)

    def GetRestartId(self):
        """Determine the restart id to use for the next submit/run.

        Uses the 'restart-id' option when given, otherwise one more than the
        current maximum restart id.

        Returns (rids, max_restart_id, my_restart_id). When the chosen id
        equals the current maximum, the returned maximum is decremented by
        one. Exits the process if the id is not an integer, if an
        automatically assigned id is already in use, or if it exceeds 9999.
        """
        rids = self.RestartLib.GetRestartIds(self)
        max_restart_id = self.RestartLib.GetMaxRestartID(self)

        if self.OptionsManager.HasOption("restart-id"):
            ii = self.OptionsManager.GetOption("restart-id")
            try:
                my_restart_id = int(ii)
            except ValueError:
                dprint("Error: Could not coerse provided restart-id %s into an integer" % ii)
                self.Log.Write("Error: Could not coerse provided restart-id %s into an integer" % ii)
                sys.exit(1)
        else:
            my_restart_id = max_restart_id + 1

        if not(self.OptionsManager.HasOption("restart-id")):
            if my_restart_id in rids:
                # the message has two placeholders: restart id and simulation
                dprint("Error: assigned restart id %s for simulation %s already in use!" % (my_restart_id, self.SimulationName))
                self.Log.Write("Error: assigned restart id %s for simulation %s already in use!" % (my_restart_id, self.SimulationName))
                sys.exit(1)

        self.Log.Write("Assigned restart_id of: %04d" % my_restart_id)

        if my_restart_id > 9999:
            dprint("Error: maximum number of restarts reached. Please use sim purge to clear existing restarts")
            self.Log.Write("Error: maximum number of restarts reached. Please use sim purge to clear existing restarts")
            sys.exit(1)

        if max_restart_id == my_restart_id:
            max_restart_id = max_restart_id - 1

        return (rids, max_restart_id, my_restart_id)

    def interactive(self):
        """Run the machine's interactive command.

        Populates the define database with host, processor and walltime
        values, substitutes them into machineEntry.interactivecmd and
        executes the result.
        """
        # need basedir, simulationdir, and internaldir
        machine = self.SimEnvironment.LocalMachine
        machineEntry = self.SimEnvironment.LocalMachineEntry

        self.SimLib.VerifyKeys(machineEntry, ['interactivecmd'])

        # need to prepare replacements.
        MaxWalltime = restartlib.WallTime(machineEntry.maxwalltime)

        if not(self.OptionsManager.HasOption('walltime')):
            Walltime = MaxWalltime
        else:
            Walltime = restartlib.WallTime(self.OptionsManager.GetOption('walltime'))

        hostname = machineEntry.hostname
        user = self.SimLib.GetUsername()
        memory = self.SimLib.GetMachineOption('memory')
        cpufreq = self.SimLib.GetMachineOption('cpufreq')
        allocation = self.SimLib.GetMachineOption('allocation')
        queue = self.SimLib.GetMachineOption('queue')

        # submit() and run() unpack eight values from GetProcs (the last one
        # being writer_procs); only the first seven are needed here, so slice
        # instead of unpacking the whole result.
        procinfo = self.SimLib.GetProcs(None)
        (nodes, ppn_used, procs, ppn, procs_requested, num_procs, num_threads) = procinfo[:7]

        new_properties = dict()
        new_properties['HOSTNAME'] = hostname
        new_properties['USER'] = user
        new_properties['NODES'] = nodes
        new_properties['PROCS'] = procs
        new_properties['EMAIL'] = machineEntry.email
        new_properties['PROCS_REQUESTED'] = procs_requested
        new_properties['PPN'] = ppn
        new_properties['PPN_USED'] = ppn_used
        new_properties['NUM_PROCS'] = num_procs
        new_properties['NUM_THREADS'] = num_threads
        new_properties['MEMORY'] = memory
        new_properties['CPUFREQ'] = cpufreq
        new_properties['ALLOCATION'] = allocation
        new_properties['QUEUE'] = queue
        new_properties['WALLTIME'] = Walltime.Walltime
        new_properties['WALLTIME_HH'] = Walltime.walltime_hh
        new_properties['WALLTIME_MM'] = Walltime.walltime_mm
        new_properties['WALLTIME_SS'] = Walltime.walltime_ss
        new_properties['WALLTIME_SECONDS'] = Walltime.walltime_seconds
        new_properties['WALLTIME_MINUTES'] = Walltime.walltime_minutes
        new_properties['WALLTIME_HOURS'] = Walltime.walltime_hours
        new_properties['SIMFACTORY'] = self.SimEnvironment.EXECUTABLE

        for key in new_properties.keys():
            self.DefineDatabase.Set(key, new_properties[key])

        interactivecmd = self.DefineDatabase.SubAll(machineEntry.interactivecmd)
        self.SimLib.ExecuteCommand(interactivecmd)

    def run(self, simulationName=None, debug=False):
        """Execute a restart of the simulation on the local machine.

        simulationName -- when given, the restart named by the 'restart-id'
                          option is loaded, or else the simulation is loaded
                          by name and created if loading fails.
        debug          -- when True the RUNDEBUG define is set to 1, else 0.

        Prepares the restart directory, marks it active with an
        "output-NNNN-active" symlink, writes the prepared run script, runs
        adcprep on every "prep." file in the work directory and finally
        executes the run script. Exits the process on preparation errors.
        """
        dprint("simulation name is: %s" % simulationName)

        if simulationName is not None:
            if self.OptionsManager.HasOption('restart-id'):
                # NOTE(review): loadFromRestartId reads self.SimulationDir and
                # the code below uses self.Log; presumably the caller has
                # already loaded the simulation -- confirm.
                restart_id = self.OptionsManager.GetOption('restart-id')
                self.loadFromRestartId(restart_id)
            else:
                ret = self.load(simulationName)
                if ret < 0:
                    adcswan_dir = self.OptionsManager.GetOption("input-dir")

                    if adcswan_dir is None:
                        dprint("Error, no adcs+swan input directory specified")
                        sys.exit(1)

                    # create() only accepts the simulation name
                    self.create(simulationName)
                    self.load(simulationName)

        self.Log.Write("Executing run...")

        # need basedir, simulationdir, and internaldir
        machine = self.SimEnvironment.LocalMachine
        machineEntry = self.SimEnvironment.LocalMachineEntry

        self.SimLib.VerifyKeys(machineEntry, ['scratchdir', 'num-threads', 'memory', 'ppn', 'hostname', 'mpi'])

        mpi = machineEntry.mpi

        #if not machineEntry.HasKey('mpirun') or machineEntry.mpirun == None or len(machineEntry.mpirun) == 0:
        #    dprint("Error: mpirun run for machine %s is not defined" % machine)
        #    sys.exit(1)

        simulation_id = self.Properties.simulationid

        # lets figure out our restart situation.
        (rids, max_restart_id, my_restart_id) = self.GetRestartId()

        self.Log.Write("Found the following restart_ids: %s" % rids)
        self.Log.Write("Maximum restart id determined to be: %04d" % max_restart_id)

        self.RestartID = my_restart_id
        self.LongRestartID = "%04d" % self.RestartID

        active_restart_id = self.RestartLib.GetActiveRestartId(self)
        self.Log.Write("Active restart id: %04d" % int(active_restart_id))

        self.RestartDir = self.SimLib.BuildPath([self.SimulationDir, "output-%04d" % self.RestartID])
        self.InternalDir = self.SimLib.BuildPath([self.RestartDir, self.SimEnvironment.INTERNALDIRECTORY])
        self.WorkDir = self.SimLib.BuildPath([self.RestartDir, "work"])

        if not(os.path.exists(self.RestartDir)):
            try:
                os.mkdir(self.RestartDir)
            except OSError:
                dprint("Error: could not create restart directory %s" % self.RestartDir)
                sys.exit(1)

        if not(os.path.exists(self.InternalDir)):
            try:
                os.mkdir(self.InternalDir)
            except OSError:
                dprint("Error: could not create restart internal directory %s" % self.InternalDir)
                sys.exit(1)

        # switch log to new location.
        self.Log.Close()
        self.Log = simlog.SimLog(self.SimEnvironment)
        self.Log.Init(self.SimLib.BuildPath([self.RestartDir, 'LOG']), self)

        attemptedPropertyFile = self.SimLib.BuildPath([self.RestartDir, self.SimEnvironment.INTERNALDIRECTORY, "properties.ini"])
        self.Log.Write("attemptedPropertyFile: %s" % attemptedPropertyFile)

        loadedRestart = False

        # if a specific properties.ini file already exists (eg, created by submit) use it
        # otherwise, just use the one we already have.
        if os.path.exists(attemptedPropertyFile):
            # write to our new RestartDir InternalDir
            self.Properties = restartlib.RestartProperties(self.SimEnvironment)
            self.Properties.init(attemptedPropertyFile)
            loadedRestart = True
            self.Log.Write("Loaded attempted property file: %s" % attemptedPropertyFile)

        existingProperties = None
        pbsSimulationName = self.RestartLib.CreatePbsSimulationName(self)

        # simulationid, executable, submitscript, parfile are all available in self.Properties
        # machine, machineEntry are already defined.
        if not loadedRestart:
            hostname = machineEntry.hostname
            user = self.SimLib.GetUsername()
            memory = self.SimLib.GetMachineOption('memory')
            cpufreq = self.SimLib.GetMachineOption('cpufreq')

            (nodes, ppn_used, procs, ppn, procs_requested, num_procs, num_threads, writer_procs) = self.SimLib.GetProcs(existingProperties)

            # store the rest of our keys
            rr = {
                'nodes': nodes,
                'ppnused': ppn_used,
                'procs': procs,
                'ppn': ppn,
                'procsrequested': procs_requested,
                'numprocs': num_procs,
                'numthreads': num_threads,
                'hostname': hostname,
                'user': user,
                'memory': memory,
                'cpufreq': cpufreq,
                'pbsSimulationName': pbsSimulationName,
                'writerprocs': writer_procs,
                'mpi': mpi
            }

            for key in rr.keys():
                self.Properties.AddProperty(key, rr[key])

        scratchdir = self.SimLib.GetMachineOption('scratchdir')

        paths_to_make = [self.RestartDir, self.WorkDir, self.SimLib.BuildPath([self.RestartDir, self.SimEnvironment.INTERNALDIRECTORY])]

        for path in paths_to_make:
            if not(os.path.exists(path)):
                try:
                    os.mkdir(path)
                except OSError as e:
                    dprint("Error: could not make working directory path \"%s\", %s" % (path, e))
                    sys.exit(1)

        if not loadedRestart:
            # write to our new RestartDir InternalDir
            self.Properties.Filename = self.SimLib.BuildPath([self.RestartDir, self.SimEnvironment.INTERNALDIRECTORY, "properties.ini"])
            self.Properties.Save()

        os.environ['CACTUS_STARTTIME'] = str(int(time.time()))

        # need to remove currently active simulation.
        activedir = self.SimLib.BuildPath([self.SimulationDir, "output-%04d" % int(active_restart_id)])
        activedirwithflag = "%s-active" % activedir

        if os.path.exists(activedirwithflag):
            try:
                os.unlink(activedirwithflag)
            except OSError:
                dprint("Could not remove previous active flag on directory %s" % activedir)
                sys.exit(1)

        try:
            # create a symlink indicating that this job is active.
            os.symlink(self.RestartDir, "%s-active" % self.RestartDir)
        except OSError as e:
            # best effort: the run proceeds without the flag, but leave a trace
            self.Log.Write("Warning: could not create active flag for %s, %s" % (self.RestartDir, e))

        new_properties = dict()
        new_properties['SOURCEDIR'] = self.Properties.sourcedir
        new_properties['SIMULATION_NAME'] = self.SimulationName
        new_properties['SHORT_SIMULATION_NAME'] = self.Properties.pbsSimulationName
        new_properties['SIMULATION_ID'] = self.Properties.simulationid
        new_properties['RESTART_ID'] = self.RestartID
        new_properties['RUNDIR'] = self.RestartDir
        new_properties['WORKDIR'] = self.WorkDir
        new_properties['SCRIPTFILE'] = self.Properties.submitscript
        new_properties['SUBMITSCRIPT'] = self.Properties.submitscript
        new_properties['HOSTNAME'] = self.Properties.hostname
        new_properties['USER'] = self.Properties.user
        new_properties['NODES'] = self.Properties.nodes
        new_properties['PROCS'] = self.Properties.procs
        new_properties['PROCS_REQUESTED'] = self.Properties.procsrequested
        new_properties['PPN'] = self.Properties.ppn
        new_properties['PPN_USED'] = self.Properties.ppnused
        new_properties['NUM_PROCS'] = self.Properties.numprocs
        new_properties['NUM_THREADS'] = self.Properties.numthreads
        new_properties['MEMORY'] = self.Properties.memory
        new_properties['CPUFREQ'] = self.Properties.cpufreq
        new_properties['WRITER_PROCS'] = self.Properties.writerprocs
        new_properties['MPI'] = mpi

        if debug == True:
            new_properties['RUNDEBUG'] = 1
        else:
            new_properties['RUNDEBUG'] = 0

        if self.Properties.HasProperty("walltime"):
            walltime_raw = self.Properties.walltime
            Walltime = restartlib.WallTime(walltime_raw)

            new_properties['WALLTIME'] = Walltime.Walltime
            new_properties['WALLTIME_HH'] = Walltime.walltime_hh
            new_properties['WALLTIME_MM'] = Walltime.walltime_mm
            new_properties['WALLTIME_SS'] = Walltime.walltime_ss
            new_properties['WALLTIME_SECONDS'] = Walltime.walltime_seconds
            new_properties['WALLTIME_MINUTES'] = Walltime.walltime_minutes
            new_properties['WALLTIME_HOURS'] = Walltime.walltime_hours

        new_properties['SCRATCHDIR'] = scratchdir
        #new_properties['CHAINED_JOB_ID'] = ''
        #new_properties['MPI_NODEFILE'] = mpi_nodefile
        #new_properties['NODELIST'] = mpi_nodefile

        for key in new_properties.keys():
            self.DefineDatabase.Set(key, new_properties[key])

        self.Properties.AddProperty('active', 'yes')
        self.Properties.AddProperty('running', 'yes')
        self.Properties.AddProperty('queued', 'no')
        self.Properties.Save()

        # a run script given on the command line wins over the stored one
        rs = self.OptionsManager.GetOption('runscript')

        if rs is not None and os.path.exists(rs):
            runScript = rs
        else:
            if not(self.Properties.HasProperty('runscript')):
                dprint("Error: no runscript defined for simulation %s" % self.SimulationName)
                sys.exit(1)
            runScript = self.Properties.runscript

        contents = self.SimLib.GetFileContents(runScript)
        contents = self.DefineDatabase.SubAll(contents)

        preparedRunScript = self.SimLib.BuildPath([self.InternalDir, 'PreparedRunScript'])
        self.SimLib.WriteContents(preparedRunScript, contents)

        cmd = "/bin/sh %s" % preparedRunScript

        # change to the work directory and then run the simulation.
        os.chdir(self.WorkDir)

        # run adcprep on every prep file found in the work directory
        # (by default prep.0 and prep.1, for fort.14 and fort.26)
        for file in os.listdir(self.WorkDir):
            if file.count("prep.") == 0:
                continue

            fullpath = self.SimLib.BuildPath([self.WorkDir, file])
            prepcmd = "./adcprep < %s" % fullpath

            dprint("Prepping using adcprep with input file %s" % fullpath, True)
            self.SimLib.ExecuteCommand(prepcmd)

        dprint("Executing run command: %s" % cmd)
        retinfo = self.SimLib.ExecuteReplaceCommand(cmd)
        #dprint("return information: %s" % retinfo)

        # report the actual date; os.system("date") returns an exit status
        dprint("Simfactory Done at date: %s" % time.ctime())

    def create(self, simulationName):
        """Create the on-disk skeleton for a new simulation.

        Builds the directory skeleton, attaches the log, resets the define
        database with USER and EMAIL, writes the initial properties.ini,
        copies the adcirc/swan binaries into the exe directory and stores the
        submit and run scripts in the run directory. Exits the process if a
        binary cannot be copied.
        """
        # lets start here.
        #the calling app (sim/sim-job/sim-run), etc, will determine where simulationName, parfile come from
        self.SimulationName = simulationName

        # build configuration
        configuration = self.RestartLib.GetConfiguration()
        configPath = self.SimLib.BuildPath([self.SimEnvironment.CONFIGS_PATH, configuration])

        (self.BaseDir, self.SimulationDir, self.InternalDir, self.CacheDir) = self.RestartLib.CreateRestartSkeleton(self.SimulationName)

        (submitScript, runScript) = self.RestartLib.GetExecutable()

        dprint("SubmitScript: %s" % submitScript)

        #now that we have the simulationdir made, lets attach our log
        self.attachLog(self.SimulationDir)

        self.Log.Write("Skeleton Created")
        self.Log.Write("Job directory: \"%s\"" % self.SimulationDir)

        machine = self.SimEnvironment.LocalMachine
        machineEntry = self.SimEnvironment.LocalMachineEntry

        self.SimLib.VerifyKeys(machineEntry, ['user', 'email'])

        user = machineEntry.user
        email = machineEntry.email

        self.DefineDatabase.Reset()
        self.DefineDatabase.Set("USER", user)
        self.DefineDatabase.Set("EMAIL", email)

        localsourcebasedir = self.SimLib.GetLocalSourceBaseDir()
        dirsuffix = self.SimLib.GetDirSuffix(localsourcebasedir)
        sourcedir = self.SimLib.BuildPath([localsourcebasedir, dirsuffix])

        #dirname == simulationdir

        # need to create a simulation id
        simulation_id = self.RestartLib.CreateSimulationId(self.SimulationName)

        propertyFile = self.SimLib.BuildPath([self.InternalDir, "properties.ini"])

        self.Properties = restartlib.RestartProperties(self.SimEnvironment)
        self.Properties.init(propertyFile)
        self.Properties.AddProperty('machine', machine)
        self.Properties.AddProperty('simulationid', simulation_id)
        self.Properties.AddProperty('sourcedir', sourcedir)
        self.Properties.AddProperty('configuration', configuration)
        self.Properties.Save()

        (exedir, cfgdir, rundir, pardir, datadir) = self.RestartLib.CreateInternalDirs(self.InternalDir)

        # binaries
        binaries = ['adcirc', 'adcprep', 'adcswan', 'padcirc', 'padcswan']

        for binary in binaries:
            binary_path = self.SimLib.BuildPath([self.SimEnvironment.CONFIGS_PATH, configuration, binary])
            dest_path = self.SimLib.BuildPath([exedir, binary])
            try:
                dprint("Copy binary %s to %s" % (binary_path, dest_path), libutil.ALWAYS_PRINT)
                shutil.copy(binary_path, dest_path)
            except (IOError, OSError) as e:
                # shutil.copy reports failures as IOError on Python 2
                print("Error: could not copy binary %s to %s, %s" % (binary_path, dest_path, e))
                sys.exit(1)

            self.Properties.AddProperty(binary, dest_path)

        # submit script
        if submitScript is not None:
            queuefile = self.SimLib.BaseName(submitScript)
            contents = self.SimLib.GetFileContents(submitScript)
            # NOTE(review): the result of SubAll is discarded, so the script
            # is stored unsubstituted; submit() substitutes it later.
            # Presumably intentional -- confirm.
            self.DefineDatabase.SubAll(contents)
            self.SimLib.WriteContents(self.SimLib.BuildPath([rundir, queuefile]), contents)
            self.Log.Write("Submit script: \"%s\"" % submitScript)
            self.Properties.AddProperty('submitscript', self.SimLib.BuildPath([rundir, queuefile]))
        else:
            self.Properties.AddProperty('submitscript', "None")

        # run script
        if runScript is not None:
            runfile = self.SimLib.BaseName(runScript)
            contents = self.SimLib.GetFileContents(runScript)
            # NOTE(review): same as above, the substituted text is not kept.
            self.DefineDatabase.SubAll(contents)
            self.SimLib.WriteContents(self.SimLib.BuildPath([rundir, runfile]), contents)
            self.Log.Write("Run script: \"%s\"" % runScript)
            self.Properties.AddProperty('runscript', self.SimLib.BuildPath([rundir, runfile]))
        else:
            self.Properties.AddProperty('runscript', "None")

        self.Log.Write(self.Properties.toString())
        self.Properties.Save()

    def stop(self):
        job_id = self.GetJobId()

        if job_id == -1:
            dprint("Cannot stop a job without an associated job_id")
            sys.exit(1)

        status = self.RestartLib.GetJobStatus(job_id)

        if status == 'U':
            dprint("Job %s already finished or stopped" % job_id)
            sys.exit(0)

        machine = self.SimLib.GetMachineName()
        machineEntry = self.ConfigurationDatabase.GetMachine(machine)

        stopcmd = machineEntry.stop

        self.DefineDatabase.Reset()
        self.DefineDatabase.Set("JOB_ID", job_id)
        self.DefineDatabase.Set("USER", machineEntry.user)

        # it might be a regex, make sure it's in python format.
stopcmd = self.DefineDatabase.SubAll(libutil.ReConvert(stopcmd)) force = self.OptionsManager.GetOption('force') term_file = self.SimLib.BuildPath([self.RestartDir, 'TERMINATE']) if os.path.exists(term_file): dprint("TERMINATE exists, stopping %s gracefully" % job_id) # write 1 to term_file fptr = open(term_file, 'w+') fptr.write('1') fptr.close() self.finish() return dprint("Forcing %s to stop without using graceful termination" % job_id) self.SimLib.ExecuteCommand(stopcmd) self.finish() return def show_output(self): if self.OptionsManager.HasOption('restart-id'): restart_id = self.OptionsManager.GetOption('restart-id') self.loadFromRestartId(restart_id) else: rids = self.RestartLib.GetRestartIds(self) if len(rids) == 0: dprint("Error: no restarts for simulation %s" % self.SimulationName) sys.exit(1) restart_id = self.RestartLib.GetMaxRestartID(self) self.loadFromRestartId(restart_id) # we're loaded!!!! woooot!!!! (machine, machineEntry, sourceBaseDir, path) = self.SimLib.GetLocalEnvironment() job_id = self.GetJobId() job_status = 'U' if job_id != -1: job_status = self.RestartLib.GetJobStatus(job_id) workdir = self.SimLib.BuildPath([self.RestartDir, 'work']) exechost = self.RestartLib.GetExecHost(self) if exechost == None and job_status == 'R': dprint("Error: could not retreive exechost for running simulation %s" % self.SimulationName) sys.exit(1) self.DefineDatabase.Set("EXECHOST", exechost) output = False stdout_output = "(file does not exist)" stderr_output = "(file does not exist)" adcirc_output = "(file does not exist)" adcirc_file = self.SimLib.BuildPath([workdir, 'output']) if os.path.exists(adcirc_file): adcirc_output = self.SimLib.GetFileContents(adcirc_file) if job_status == 'R': if self.OptionsManager.GetOption('follow') == True: self.VerifyKeys(machineEntry, ['stdout-follow']) followcmd = self.DefineDatabase.SubAll(machineEntry.GetKey('stdout-follow')) self.SimLib.ExecuteReplaceCommand(followcmd) output = True else: self.VerifyKeys(machineEntry, 
['stdout', 'stderr']) stderrcmd = self.DefineDatabase.SubAll(machineEntry.GetKey('stderr')) stdoutcmd = self.DefineDatabase.SubAll(machineEntry.GetKey('stdout')) ff = os.popen(stdoutcmd) stdout_output = ff.read() ret = ff.close() if ret != None: dprint("Warning: stdout command \"%s\" returned status %s" % (stdoutcmd, ret)) ff = os.popen(stderrcmd) stderr_output = ff.read() ret = ff.close() if ret != None: dprint("Warning: stderr command \"%s\" returned status %s" % (stderr, ret)) output = True else: errfile = self.SimLib.BuildPath([self.RestartDir, "%s.err" % self.SimulationName]) outfile = self.SimLib.BuildPath([self.RestartDir, "%s.out" % self.SimulationName]) if os.path.exists(outfile): stdout_output = self.SimLib.GetFileContents(outfile) if os.path.exists(errfile): stderr_output = self.SimLib.GetFileContents(errfile) if output == False: # Show stdout, stderr, and Formaline output sep = "=" * 80 dprint(sep) dprint("The job's ADCIRC output is:") dprint(sep) dprint(adcirc_output) dprint(sep) dprint("The job's stdout is:") dprint(sep) dprint(stdout_output) dprint(sep) dprint("The job's stderr is:") dprint(sep) dprint(stderr_output) dprint(sep) def cleanup(self): # allow force. 
self.finish(True) def finish(self, allowForce=False): #dprint("Finishing restart %s" % self.LongRestartID) force = False if self.OptionsManager.HasOption('force'): force = self.OptionsManager.GetOption('force') job_id = self.GetJobId() job_status = 'U' if job_id != -1: job_status = self.RestartLib.GetJobStatus(job_id) if job_status in ['R', 'Q'] and force == False: self.Properties.AddProperty('active', 'no') self.Properties.AddProperty('running', 'no') self.Properties.AddProperty('queued', 'no') if job_status == 'R': self.Properties.AddProperty('running', 'yes') self.Properties.AddProperty('active', 'yes') if job_status == 'Q': self.Properties.AddProperty('queued', 'yes') self.Properties.Save() #dprint("Restart is active, properties: %s" % self.Properties.toString()) # we're still running, this is a noop. return # clean up. #dprint("Restart is not active") self.Properties.AddProperty('active', 'no') self.Properties.AddProperty('running', 'no') self.Properties.AddProperty('queued', 'no') self.Properties.Save() # step 1. remove active flag active_dir = "%s-active" % self.RestartDir if os.path.exists(active_dir): os.unlink(active_dir) # step 2. make terminate file not world writable term_file = self.SimLib.BuildPath([self.RestartDir, 'TERMINATE']) if os.path.exists(term_file): os.chmod(term_file, stat.S_IRWXU | stat.S_IRGRP | stat.S_IROTH) # step 3. make output files world readable outfile = "%s.out" % self.SimulationName errfile = "%s.err" % self.SimulationName if os.path.exists(outfile): os.chmod(outfile, stat.S_IRWXU | stat.S_IRGRP | stat.S_IROTH) if os.path.exists(errfile): os.chmod(errfile, stat.S_IRWXU | stat.S_IRGRP | stat.S_IROTH) # step 4. remove scratch dir machine = self.SimEnvironment.LocalMachine machineEntry = self.SimEnvironment.LocalMachineEntry scratchdir = machineEntry.GetKey('scratchdir') if not(scratchdir.startswith("/")): sspath = self.SimLib.BuildPath([self.RestartDir, scratchdir]) if os.path.exists(sspath): shutil.rmtree(sspath, True) # step 5. 
remove half written checkpoint files os.system("find %s -name *.chkpt.tmp.it_*.* -exec rm -rf {} \;" % self.RestartDir) # step 6. Hard-link (Formaline) tarballs between different restarts # old_dir = os.getcwd() # os.chdir(self.SimulationDir) # # files = os.popen("find . -name \*.tar.gz" % self.RestartDir).read().split("\n") # # rids = self.RestartLib.GetRestartIds(self) # rids.sort() # # for ff in files: # ff = ff.strip() # if len(ff) == 0: # continue # # if not(ff.endswith(".tar.gz")) or not(os.path.exists(ff)): # continue # # statinfo = os.stat(ff) # # if statinfo.st_nlink != 1: # continue # # # # for rid in rids.reverse(): # restart_path = self.SimLib.BuildPath([self.SimulationDir, "output-%04d" % rid]) # # if not(os.path.exists(restart_path)): # continue # # sspath = self.SimLib.BuildPath([restart_path, file]) # # if filecmp.cmp(sspath, ff) == 0: # os.link(sspath, ff) def archive(self): # we assume this restart is already loaded at this point. # either by a simulation (no restart id) or by a restart id #if restartId != None: # self.loadFromRestartId(restart_id) #else: # if self.OptionsManager.HasOption('restart-id'): # restart_id = self.OptionsManager.GetOption('restart-id') # self.loadFromRestartId(restart_id) machineEntry = self.SimEnvironment.LocalMachineEntry self.SimLib.VerifyKeys(machineEntry, ['archivetype']) archiveType = machineEntry.archivetype ArchiveEngine = simarchive.SimArchive(self.SimEnvironment, archiveType, self) ArchiveEngine.authenticate() ArchiveEngine.store() def GetJobId(self): job_file = self.SimLib.BuildPath([self.InternalDir, 'job.ini']) if not(os.path.exists(job_file)): return -1 pjob = restartlib.RestartProperties(self.SimEnvironment) pjob.init(job_file) return pjob.jobid def attachLog(self, simulationdir): self.Log = simlog.SimLog(self.SimEnvironment) self.Log.Init(self.SimLib.BuildPath([simulationdir, "LOG"]), self.SimulationName) # all of these will be flushed out as i proceed. 
def trash(self):
    """Move this simulation's directory into the TRASH area of the base dir.

    The destination is ``<basedir>/TRASH/<simulationid>``; both directories
    are created when missing.  Before anything is moved, every restart of
    the simulation is checked: if one has a job whose status is not 'U'
    the process exits with status 1.  The process also exits with status 1
    if a trash directory cannot be created.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; the move is assumed to happen unconditionally once the trash
    folder exists -- TODO confirm.
    """
    (machine, machineEntry, sourceBaseDir, path) = self.SimLib.GetLocalEnvironment()
    self.SimLib.VerifyKeys(machineEntry, ['basedir'])

    basedir = self.SimLib.GetBaseDir(machineEntry)
    trashPath = self.SimLib.BuildPath([basedir, 'TRASH'])

    if not os.path.exists(trashPath):
        try:
            os.makedirs(trashPath)
        except OSError:
            # only filesystem failures are reported here; KeyboardInterrupt
            # and SystemExit propagate untouched
            dprint("Error: could not make trash path: %s" % trashPath)
            sys.exit(1)

    # okay, we have trash path. lets find out if any of our restarts are running.
    rids = self.RestartLib.GetRestartIds(self)
    for rid in rids:
        rr = self.RestartLib.GetRestartByRestartId(self.SimulationName, rid)
        job_id = rr.GetJobId()
        if job_id == -1:
            # this restart never had a job attached
            continue

        job_status = self.RestartLib.GetJobStatus(job_id)
        if job_status != 'U':
            dprint("Error: Simulation %s, with Restart ID %s is either queued, holding, or running. " % (self.SimulationName, rid))
            dprint("Error: Job ID %s must be stopped before a purge can happen" % job_id)
            sys.exit(1)

    # okay, we have no running/queued/holding jobs.
    # lets move the simulation to the trash
    trashFolder = self.SimLib.BuildPath([trashPath, self.Properties.simulationid])
    if not os.path.exists(trashFolder):
        try:
            os.makedirs(trashFolder)
        except OSError:
            dprint("Error: could not make trash folder: %s" % trashFolder)
            sys.exit(1)

    shutil.move(self.SimulationDir, trashFolder)
    dprint("Simulation %s has been moved to trash folder %s" % (self.SimulationName, trashFolder))
    return


def Error(self, message):
    """Print *message* prefixed with "Error in Restart: " and exit with status 1."""
    error = "Error in Restart: %s" % message
    dprint(error)
    sys.exit(1)