import os
import re
import shutil
import stat
import sys

import libutil
import restartlib
import simlog


class SimRestart:
    """Encapsulation of a 'Restart': a submitted simulation that is created
    and managed by simfactory.

    This is still a skeleton; several lifecycle methods at the bottom of the
    class are placeholders.
    """

    def __init__(self, env):
        """Cache the collaborators exposed by the simulation environment.

        env -- the simfactory environment object; it must expose
               OptionsManager, ConfigurationDatabase, DefineDatabase,
               SimLib and RestartLib attributes.
        """
        self.SimEnvironment = env
        self.OptionsManager = env.OptionsManager
        self.ConfigurationDatabase = env.ConfigurationDatabase
        self.DefineDatabase = env.DefineDatabase
        self.SimLib = env.SimLib
        self.RestartLib = env.RestartLib

        # Filled in by load() or create().
        self.SimulationName = None
        self.Properties = None

    def _fatal(self, message, log=True):
        """Print message, optionally write it to the attached log, and exit
        the process with status 1.

        Pass log=False where no log entry should be written (for example
        before attachLog() has been called).
        """
        print(message)
        if log:
            self.Log.Write(message)
        sys.exit(1)

    def load(self, simulationName):
        """Bind this object to an existing simulation.

        Derives the base, simulation, internal and cache directories, exits
        if the simulation directory does not exist, loads properties.ini
        from the internal directory and attaches the simulation log.
        """
        self.SimulationName = simulationName

        # need basedir, simulationdir, and internaldir
        machineEntry = self.SimEnvironment.LocalMachineEntry

        self.BaseDir = self.SimLib.GetBaseDir(machineEntry)
        self.SimulationDir = self.SimLib.BuildPath(
            [self.BaseDir, self.SimulationName])
        self.InternalDir = self.SimLib.BuildPath(
            [self.SimulationDir, self.SimEnvironment.INTERNALDIRECTORY])
        self.CacheDir = self.SimLib.BuildPath([self.BaseDir, 'CACHE'])

        if not os.path.exists(self.SimulationDir):
            # The log is not attached yet, so only print.
            self._fatal(
                'Error, simulation "%s" does not exist or is not readable'
                % self.SimulationName,
                log=False)

        propertyFile = self.SimLib.BuildPath(
            [self.InternalDir, "properties.ini"])
        print("Properties file: %s" % propertyFile)

        self.Properties = restartlib.RestartProperties(self.SimEnvironment)
        self.Properties.init(propertyFile)

        print("load complete, properties:")
        print(self.Properties.toString())

        self.attachLog(self.SimulationDir)

    def PrepareCheckpointing(self):
        """Hard-link the checkpoint files of an earlier restart into the
        working directory of the current restart.

        The source restart is taken from the 'from-restart-id' option when
        present, otherwise the maximum existing restart id is used. Relies
        on self.RestartID and self.RestartDir, which run() sets before
        calling this method.

        Returns the RestartProperties loaded from the source restart's
        properties.ini, or None when that file does not exist. Exits the
        process on any unrecoverable problem.
        """
        # NOTE(review): run() passes self to GetMaxRestartID; the call here
        # originally passed nothing. Made consistent with run() -- confirm
        # against RestartLib.
        max_restart_id = self.RestartLib.GetMaxRestartID(self)

        # lets see if from-restart-id is set
        if self.OptionsManager.HasOption('from-restart-id'):
            ii = self.OptionsManager.GetOption('from-restart-id')
            try:
                restore_restart_id = int(ii)
            except (TypeError, ValueError):
                self._fatal(
                    "Error, Could not coerse provided from-restart-id %s "
                    "into an integer" % ii)
        elif max_restart_id != 0:
            restore_restart_id = max_restart_id
        else:
            self._fatal(
                "Error, Could not determine a valid restart_id to recover from")

        self.Log.Write("Restoring from restart id %s, 04d is: %04d"
                       % (restore_restart_id, restore_restart_id))

        d_restore_restart_id = "%04d" % restore_restart_id

        if restore_restart_id == self.RestartID:
            self._fatal(
                "Error, restore restart-id %s equals restart-id %s, "
                "cannot recover a job from itself"
                % (restore_restart_id, self.RestartID))

        restore_dir = self.SimLib.BuildPath(
            [self.SimulationDir, "output-%s" % d_restore_restart_id])

        (checkpoint_files, iteration) = \
            self.RestartLib.GetCheckpointFiles(restore_dir)
        checkpoint_workdir = self.SimLib.BuildPath(
            [restore_dir, self.SimulationName])

        self.Log.Write("Restoring from checkpoint iteration %s" % (iteration))

        # lets link our files.
        parname = self.SimLib.FileBaseName(self.Properties.parfile)
        my_workdir = self.SimLib.BuildPath([self.RestartDir, parname])

        if not os.path.exists(my_workdir):
            try:
                os.mkdir(my_workdir)
            except OSError:
                self._fatal(
                    "Could not create output working directory %s"
                    % my_workdir)

        for checkpoint_file in checkpoint_files:
            # Build both paths before the try so the error message can
            # always reference them.
            sfile = self.SimLib.BuildPath(
                [checkpoint_workdir, checkpoint_file])
            dfile = self.SimLib.BuildPath([my_workdir, checkpoint_file])
            try:
                os.link(sfile, dfile)
            except OSError:
                self._fatal("Could not link checkpoint file %s to %s"
                            % (sfile, dfile))

        existingProperties = None
        previousIni = self.SimLib.BuildPath(
            [checkpoint_workdir,
             self.SimEnvironment.INTERNALDIRECTORY,
             "properties.ini"])

        if not os.path.exists(previousIni):
            print("Warning, could not load properties from "
                  "restore_restart_id %s" % d_restore_restart_id)
        else:
            existingProperties = restartlib.RestartProperties(
                self.SimEnvironment)
            existingProperties.init(previousIni)

        return existingProperties

    def run(self, simulationName=None):
        """Set up a new restart directory for the simulation and execute it
        through the machine's mpirun command.

        simulationName -- when given, load() is called with it first;
                          otherwise the object must already be loaded.

        Exits the process on configuration or filesystem errors.
        """
        if simulationName is not None:
            self.load(simulationName)

        # need basedir, simulationdir, and internaldir
        machine = self.SimEnvironment.LocalMachine
        machineEntry = self.SimEnvironment.LocalMachineEntry

        self.SimLib.VerifyKeys(
            machineEntry,
            ['scratchdir', 'num-threads', 'memory', 'ppn', 'queue',
             'maxwalltime', 'hostname'])

        if (not machineEntry.HasKey('mpirun')
                or machineEntry.mpirun is None
                or len(machineEntry.mpirun) == 0):
            self._fatal(
                "Error, mpirun run for machine %s is not defined" % machine,
                log=False)

        # lets figure out our restart situation.
        rids = self.RestartLib.GetRestartIds(self)
        max_restart_id = self.RestartLib.GetMaxRestartID(self)

        self.Log.Write("Found the following restart_ids: %s" % rids)
        self.Log.Write("Maximum restart id determined to be: %04d"
                       % max_restart_id)

        if self.OptionsManager.HasOption("restart-id"):
            ii = self.OptionsManager.GetOption("restart-id")
            try:
                my_restart_id = int(ii)
            except (TypeError, ValueError):
                self._fatal(
                    "Error, Could not coerse provided restart-id %s "
                    "into an integer" % ii)
        else:
            my_restart_id = max_restart_id + 1

        self.RestartID = my_restart_id
        self.LongRestartID = "%04d" % self.RestartID

        if self.RestartID in rids:
            # The message has two placeholders; supply both values.
            self._fatal(
                "Error, assigned restart id %s for simulation %s "
                "already in use!" % (my_restart_id, self.SimulationName))

        self.Log.Write("Assigned restart_id of: %04d" % my_restart_id)

        if self.RestartID > 9999:
            self._fatal(
                "Error, maximum number of restarts reached. "
                "Please use sim purge to clear existing restarts")

        active_restart_id = self.RestartLib.GetActiveRestartId(self)
        self.Log.Write("Active restart id: %04d" % int(active_restart_id))

        self.RestartDir = self.SimLib.BuildPath(
            [self.SimulationDir, "output-%04d" % self.RestartID])

        recover = False
        if self.OptionsManager.HasOption('recover'):
            recover = self.OptionsManager.GetOption('recover')

        existingProperties = None
        if recover:
            existingProperties = self.PrepareCheckpointing()
        # done with recover stuff.

        pbsSimulationName = self.RestartLib.CreatePbsSimulationName(self)

        # simulationid, executable, submitscript, parfile are all available
        # in self.Properties; machine, machineEntry are already defined.
        hostname = machineEntry.hostname
        user = self.SimLib.GetUsername()
        memory = self.SimLib.GetMachineOption('memory')
        cpufreq = self.SimLib.GetMachineOption('cpufreq')

        if existingProperties is not None:
            # Pass the recovered restart's values as second argument.
            allocation = self.SimLib.GetMachineOption(
                'allocation', existingProperties.allocation)
            queue = self.SimLib.GetMachineOption(
                'queue', existingProperties.queue)
        else:
            allocation = self.SimLib.GetMachineOption('allocation')
            queue = self.SimLib.GetMachineOption('queue')

        (nodes, ppn_used, procs, ppn, procs_requested, num_procs,
         num_threads) = self.SimLib.GetProcs(existingProperties)

        # walltime is a class, see restartlib.WallTime
        walltime = restartlib.WallTime(
            self.OptionsManager.GetOption('walltime'))

        # store the rest of our keys
        rr = {
            'nodes': nodes,
            'ppnused': ppn_used,
            'procs': procs,
            'ppn': ppn,
            'procsrequested': procs_requested,
            'numprocs': num_procs,
            'numthreads': num_threads,
            'queue': queue,
            'allocation': allocation,
            'hostname': hostname,
            'user': user,
            'memory': memory,
            'cpufreq': cpufreq,
            'pbsSimulationName': pbsSimulationName,
            'walltime': walltime.Walltime,
        }
        for key in rr:
            self.Properties.AddProperty(key, rr[key])
        self.Properties.Save()

        scratchdir = self.SimLib.GetMachineOption('scratchdir')

        parname = self.SimLib.FileBaseName(self.Properties.parfile)
        my_workdir = self.SimLib.BuildPath([self.RestartDir, parname])

        paths_to_make = [
            self.RestartDir,
            self.SimLib.BuildPath(
                [self.RestartDir, self.SimEnvironment.INTERNALDIRECTORY]),
            my_workdir,
        ]
        for path in paths_to_make:
            if not os.path.exists(path):
                try:
                    os.mkdir(path)
                except OSError:
                    self._fatal(
                        'Error, could not make working directory path "%s"'
                        % path,
                        log=False)

        mpi_nodefile = self.SimLib.BuildPath(
            [self.RestartDir, 'mpi_nodefile'])
        self.SimLib.WriteContents(mpi_nodefile, 'localhost\n')

        # need to remove currently active simulation.
        activedir = self.SimLib.BuildPath(
            [self.SimulationDir, "output-%04d" % int(active_restart_id)])
        activedirwithflag = "%s-active" % activedir

        if os.path.exists(activedirwithflag):
            try:
                os.unlink(activedirwithflag)
            except OSError:
                self._fatal(
                    "Could not remove previous active flag on directory %s"
                    % activedir,
                    log=False)

        # create a symlink indicating that this job is active.
        os.symlink(self.RestartDir, "%s-active" % self.RestartDir)

        # do parfile substitution
        # NOTE(review): this substitution runs before the defines below are
        # set, exactly as in the original; confirm that ordering is intended.
        parfile = self.Properties.parfile
        pf = self.SimLib.BaseName(parfile)
        contents = self.DefineDatabase.SubAll(
            self.SimLib.GetFileContents(parfile))
        newparpath = self.SimLib.BuildPath([self.RestartDir, pf])
        self.SimLib.WriteContents(newparpath, contents)

        new_properties = {
            'SOURCEDIR': self.Properties.sourcedir,
            'SIMULATION_NAME': self.SimulationName,
            'SHORT_SIMULATION_NAME': self.Properties.pbsSimulationName,
            'SIMULATION_ID': self.Properties.simulationid,
            'RESTART_ID': self.RestartID,
            'RUNDIR': self.RestartDir,
            'SCRIPTFILE': self.Properties.submitscript,
            'EXECUTABLE': self.Properties.executable,
            'PARFILE': newparpath,
            'HOSTNAME': hostname,
            'USER': user,
            'NODES': nodes,
            'PROCS': procs,
            'PROCS_REQUESTED': procs_requested,
            'PPN': ppn,
            'PPN_USED': ppn_used,
            'NUM_PROCS': num_procs,
            'NUM_THREADS': num_threads,
            'MEMORY': memory,
            'CPUFREQ': cpufreq,
            'ALLOCATION': self.Properties.allocation,
            'QUEUE': self.Properties.queue,
            'WALLTIME': walltime.Walltime,
            'WALLTIME_HH': walltime.walltime_hh,
            'WALLTIME_MM': walltime.walltime_mm,
            'WALLTIME_SS': walltime.walltime_ss,
            'WALLTIME_SECONDS': walltime.walltime_seconds,
            'WALLTIME_MINUTES': walltime.walltime_minutes,
            'WALLTIME_HOURS': walltime.walltime_hours,
            'SCRATCHDIR': scratchdir,
            'CHAINED_JOB_ID': 0,
            'MPI_NODEFILE': mpi_nodefile,
        }
        for key in new_properties:
            self.DefineDatabase.Set(key, new_properties[key])

        # do mpirun substitution
        mpirun = self.DefineDatabase.SubAll(
            self.SimLib.GetMachineOption('mpirun'))

        # change to RestartDir and then run the simulation.
        os.chdir(self.RestartDir)
        self.SimLib.ExecuteSubprocess(mpirun)

    def create(self, simulationName, parfile):
        """Create a new simulation skeleton and populate its internal
        directory with the executable, option list, submit script and
        parameter file.

        The calling app (sim/sim-job/sim-run, etc.) determines where
        simulationName and parfile come from. When the 'datadir' option is
        set, that directory is copied into the simulation as well.
        """
        self.SimulationName = simulationName
        self.Parfile = parfile

        if not self.SimLib.FileExists(self.Parfile):
            self.Error("Specified parfile %s does not exist or is not readable"
                       % self.Parfile)

        configuration = self.RestartLib.GetConfiguration()
        configPath = self.SimLib.BuildPath(
            [self.SimEnvironment.CONFIGS_PATH, configuration])

        (exe, submitScript) = self.RestartLib.GetExecutable()

        optionlist = self.SimLib.BuildPath([configPath, 'OptionList'])
        config_id = self.SimLib.GetFileContents(
            self.SimLib.BuildPath([configPath, 'CONFIG-ID']), 'no-config-id')
        build_id = self.SimLib.GetFileContents(
            self.SimLib.BuildPath([configPath, 'BUILD-ID']), 'no-build-id')

        (self.BaseDir, self.SimulationDir, self.InternalDir,
         self.CacheDir) = self.RestartLib.CreateRestartSkeleton(
             self.SimulationName)

        # now that we have the simulationdir made, lets attach our log
        self.attachLog(self.SimulationDir)

        self.Log.Write("Skeleton Created")
        self.Log.Write('Job directory: "%s"' % self.SimulationDir)

        machine = self.SimEnvironment.LocalMachine
        machineEntry = self.SimEnvironment.LocalMachineEntry

        self.SimLib.VerifyKeys(machineEntry, ['user', 'email'])

        user = machineEntry.user
        email = machineEntry.email

        self.DefineDatabase.Reset()
        self.DefineDatabase.Set("USER", user)
        self.DefineDatabase.Set("EMAIL", email)

        localsourcebasedir = self.SimLib.GetLocalSourceBaseDir()
        dirsuffix = self.SimLib.GetDirSuffix(localsourcebasedir)
        sourcedir = self.SimLib.BuildPath([localsourcebasedir, dirsuffix])

        # need to create a simulation id
        simulation_id = self.RestartLib.CreateSimulationId(
            self.SimulationName)

        propertyFile = self.SimLib.BuildPath(
            [self.InternalDir, "properties.ini"])

        self.Properties = restartlib.RestartProperties(self.SimEnvironment)
        self.Properties.init(propertyFile)

        self.Properties.AddProperty('machine', machine)
        self.Properties.AddProperty('simulationid', simulation_id)
        self.Properties.AddProperty('sourcedir', sourcedir)
        self.Properties.AddProperty('configuration', configuration)
        self.Properties.AddProperty('configid', config_id)
        self.Properties.AddProperty('buildid', build_id)
        self.Properties.Save()

        (exedir, cfgdir, rundir, pardir, datadir) = \
            self.RestartLib.CreateInternalDirs(self.InternalDir)

        # exe
        ef = self.SimLib.BaseName(exe)
        exefile = self.SimLib.BuildPath([exedir, ef])

        self.RestartLib.CopyFileWithCaching(
            exe, exedir, self.SimLib.BuildPath([self.CacheDir, 'exe']))
        # rwx for the owner, r-x for group and others.
        os.chmod(exefile,
                 stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP
                 | stat.S_IROTH | stat.S_IXOTH)

        self.Log.Write('Executable: "%s"' % exe)
        self.Properties.AddProperty('executable', exefile)

        # config
        # NOTE(review): the result of SubAll is discarded here and in the
        # two sections below, so the file is written without substitution
        # unless SubAll works by side effect; run() uses the return value.
        # Left unchanged -- confirm intent.
        cfgfile = self.SimLib.BaseName(optionlist)
        contents = self.SimLib.GetFileContents(optionlist)
        self.DefineDatabase.SubAll(contents)
        self.SimLib.WriteContents(
            self.SimLib.BuildPath([cfgdir, cfgfile]), contents)
        self.Log.Write('Option list: "%s"' % optionlist)
        self.Properties.AddProperty(
            'optionlist', self.SimLib.BuildPath([cfgdir, cfgfile]))

        # rundir
        queuefile = self.SimLib.BaseName(submitScript)
        contents = self.SimLib.GetFileContents(submitScript)
        self.DefineDatabase.SubAll(contents)
        self.SimLib.WriteContents(
            self.SimLib.BuildPath([rundir, queuefile]), contents)
        self.Log.Write('Submit script: "%s"' % submitScript)
        self.Properties.AddProperty(
            'submitscript', self.SimLib.BuildPath([rundir, queuefile]))

        # parfile
        par = self.SimLib.BaseName(parfile)
        contents = self.SimLib.GetFileContents(parfile)
        self.DefineDatabase.SubAll(contents)
        self.SimLib.WriteContents(
            self.SimLib.BuildPath([pardir, par]), contents)
        self.Log.Write('Parameter file: "%s"' % parfile)
        self.Properties.AddProperty(
            'parfile', self.SimLib.BuildPath([pardir, par]))

        self.Properties.Save()

        # data directory
        if self.OptionsManager.HasOption('datadir'):
            datasrc = self.OptionsManager.GetOption('datadir')

            # Validate the source the user supplied (the message reports
            # datasrc), not the destination inside the simulation.
            if not os.path.exists(datasrc):
                self._fatal(
                    'Error, could not open data directory "%s" for reading'
                    % datasrc,
                    log=False)

            # NOTE(review): copytree refuses an existing destination; if
            # CreateInternalDirs already created datadir this will raise --
            # verify.
            shutil.copytree(datasrc, datadir, True)
            self.Log.Write('Data Directory: "%s"' % datasrc)

        self.Log.Write(self.Properties.toString())

    def attachLog(self, simulationdir):
        """Create the simulation log object and initialise it on the file
        named LOG inside simulationdir."""
        self.Log = simlog.SimLog(self.SimEnvironment)
        self.Log.Init(self.SimLib.BuildPath([simulationdir, "LOG"]),
                      self.SimulationName)

    # all of these will be fleshed out as i proceed.
    def destroy(self):
        """Placeholder; not implemented yet."""
        pass

    def makeActive(self):
        """Placeholder; not implemented yet."""
        pass

    def makeInactive(self):
        """Placeholder; not implemented yet."""
        pass

    def loadFromRestartId(self, restartId):
        """Placeholder; not implemented yet."""
        pass

    def presubmit(self):
        """Placeholder; not implemented yet."""
        pass

    def Error(self, message):
        """Print message prefixed with 'Error in Restart: ' and exit the
        process with status 1."""
        error = "Error in Restart: %s" % message
        print(error)
        sys.exit(1)