// Copyright (c) 2008 João Abecasis // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #include "classad.hpp" #include "condor_job.hpp" #include "log_processor.hpp" #include namespace saga { namespace adaptors { namespace condor { void log_processor::process_log_entry(::condor::job::classad & c) { using namespace saga::job::attributes; struct logger { logger(::condor::job::classad & ca) : ca_(ca), processed(false) { } ~logger() { if (processed) return; // Log entry std::string msg; SAGA_VERBOSE(SAGA_VERBOSE_LEVEL_INFO) msg += "Skipping unprocessed Condor log entry."; SAGA_VERBOSE(SAGA_VERBOSE_LEVEL_DEBUG) { msg += "ClassAd {\n"; for (::condor::job::classad::attribute_iterator iter = ca_.attributes_begin(), end = ca_.attributes_end(); iter != end; ++iter) { msg += " " + iter->first + ": (" + iter->second.type + ") " "'" + iter->second.value_ + "'\n"; } msg += "}"; } if (!msg.empty()) SAGA_LOG_INFO(msg.c_str()) } ::condor::job::classad & ca_; bool processed; } _on_return(c); int event_type = -1; std::string cluster; std::string process; boost::optional< ::condor::job::classad::value> attr; if ((attr = c.get_attribute("EventTypeNumber")) && "i" == attr->type) event_type = boost::lexical_cast(attr->value_); if (!(attr = c.get_attribute("Cluster"))) return; cluster = attr->value_; if ((attr = c.get_attribute("Proc"))) process = attr->value_; boost::shared_ptr job_data; { synchronized::lock reg(registry_); job_data = reg->find_job(cluster); if (!job_data) job_data = reg->find_job(cluster + "." + process); } if (!job_data) return; shared_job_data::scoped_lock lock(job_data->state_change_mtx); // Event descriptions from section 2.6.6 of the Condor manual. // E.g., from here: // http://www.cs.wisc.edu/condor/manual/v7.1.0/2_6Managing_Job.html switch (event_type) { case 0: // Job submitted // This event occurs when a user submits a job. It is the // first event you will see for a job, and it should only occur // once. case 1: // Job executing // This shows up when a job is running. It might occur more // than once. case 13: // Job was released // The user is requesting that a job on hold be re-run. job_data->state = saga::job::Running; break; case 12: // Job was held // The user has paused the job, perhaps with the condor_ hold // command. It was stopped, and will go back into the queue // again until it is aborted or released. job_data->state = saga::job::Suspended; break; case 2: // Error in executable // The job couldn't be run because the executable was bad. case 4: // Job evicted from machine // A job was removed from a machine before it finished, usually // for a policy reason: perhaps an interactive user has claimed // the computer, or perhaps another job is higher priority. if ((attr = c.get_attribute("EventTime"))) job_data->attributes[finished] = attr->value_; job_data->state = saga::job::Failed; break; case 5: // Job terminated // The job has completed. if ((attr = c.get_attribute("EventTime"))) job_data->attributes[finished] = attr->value_; if ((attr = c.get_attribute("ReturnValue"))) job_data->attributes[exitcode] = attr->value_; job_data->state = saga::job::Done; if ((attr = c.get_attribute("TerminatedBySignal"))) { job_data->attributes[termsig] = attr->value_; job_data->state = saga::job::Failed; } break; case 9: // Job aborted // The user cancelled the job. if ((attr = c.get_attribute("EventTime"))) job_data->attributes[finished] = attr->value_; job_data->state = saga::job::Canceled; break; case 3: // Job was checkpointed // The job's complete state was written to a checkpoint file. // This might happen without the job being removed from a // machine, because the checkpointing can happen periodically. case 6: // Image size of job updated // This is informational. It is referring to the memory that // the job is using while running. It does not reflect the // state of the job. case 7: // Shadow exception // The condor_shadow, a program on the submit computer that // watches over the job and performs some services for the job, // failed for some catastrophic reason. The job will leave the // machine and go back into the queue. case 8: // Generic log event case 10: // Job was suspended // The job is still on the computer, but it is no longer // executing. This is usually for a policy reason, like an // interactive user using the computer. case 11: // Job was unsuspended // The job has resumed execution, after being suspended // earlier. case 14: // Parallel node executed // A parallel (MPI) program is running on a node. case 15: // Parallel node terminated // A parallel (MPI) program has completed on a node. case 16: // POST script terminated // A node in a DAGMan workflow has a script that should be run // after a job. The script is run on the submit host. This // event signals that the post script has completed. case 17: // Job submitted to Globus // A grid job has been delegated to Globus (version 2, 3, or // 4). case 18: // Globus submit failed // The attempt to delegate a job to Globus failed. case 19: // Globus resource up // The Globus resource that a job wants to run on was // unavailable, but is now available. case 20: // Detected Down Globus Resource // The Globus resource that a job wants to run on has become // unavailable. case 21: // Remote error // The condor_starter (which monitors the job on the execution // machine) has failed. case 22: // Remote system call socket lost // The condor_shadow and condor_starter (which communicate // while the job runs) have lost contact. case 23: // Remote system call socket reestablished // The condor_shadow and condor_starter (which communicate // while the job runs) have been able to resume contact before // the job lease expired. case 24: // Remote system call reconnect failure // The condor_shadow and condor_starter (which communicate // while the job runs) were unable to resume contact before the // job lease expired. case 25: // Grid Resource Back Up // A grid resource that was previously unavailable is now // available. case 26: // Detected Down Grid Resource // The grid resource that a job is to run on is unavailable. case 27: // Job submitted to grid resource // A job has been submitted, and is under the auspices of the // grid resource. default: return; } std::set::iterator end = job_data->instances.end(); for (std::set::iterator it = job_data->instances.begin(); it != end; ++it) { (*it)->update_state(job_data->state, job_data->attributes); } job_data->state_change.notify_all(); _on_return.processed = true; } }}} // namespace saga::adaptors::condor