Commit bf1718ca authored by Max
Browse files

Add even more traces

parent d94a753f
......@@ -95,6 +95,8 @@ public class JAliEnCommandcp extends JAliEnBaseCommand {
private boolean noCommit = false;
private String protocolUploadErrors = "";
// public long timingChallenge = 0;
// public boolean isATimeChallenge = false;
......@@ -968,7 +970,8 @@ public class JAliEnCommandcp extends JAliEnBaseCommand {
if (report && (registeredPFNs == null || registeredPFNs.size() != envelopes.size()))
commander.setReturnCode(301,
"From the " + envelopes.size() + " replica with tickets only " + (registeredPFNs != null ? String.valueOf(registeredPFNs.size()) : "null") + " were registered");
"From the " + envelopes.size() + " replica with tickets only " + (registeredPFNs != null ? String.valueOf(registeredPFNs.size()) : "null") +
" were registered. We had the following errors: " + protocolUploadErrors);
}
if (sourceFile != null && envelopes.size() > 0)
......@@ -987,7 +990,7 @@ public class JAliEnCommandcp extends JAliEnBaseCommand {
return true;
}
else if (report) {
commander.setReturnCode(302, "Upload failed, sorry!");
commander.setReturnCode(302, "Upload failed, sorry! We had the following errors: " + protocolUploadErrors);
if (isSilent()) {
final IOException ex = new IOException("Upload failed");
......@@ -1029,6 +1032,7 @@ public class JAliEnCommandcp extends JAliEnBaseCommand {
catch (@SuppressWarnings("unused") final IOException ioe) {
// ignore, will try next protocol or fetch another
// replica to replace this one
protocolUploadErrors+=ioe.toString() + System.lineSeparator();
}
}
catch (@SuppressWarnings("unused") final Exception e) {
......
......@@ -473,7 +473,7 @@ public class JobAgent implements Runnable {
}
}
else
commander.q_api.putJobLog(queueId, "trace", "Local disk space limit (default): " + workdirMaxSizeMB + "MB");
commander.q_api.putJobLog(queueId, "trace", "Local disk space limit (default): " + "Unlimited");
final Integer requestedCPUCores = jdl.getInteger("CPUCores");
......@@ -616,7 +616,13 @@ public class JobAgent implements Runnable {
return 1;
}
if (monitorJob) {
final String process_res_format = String.format("EXEC_TIME", "RES_FRUNTIME", "RES_RUNTIME", "RES_CPUUSAGE", "RES_MEMUSAGE", "RES_CPUTIME", "RES_RMEM", "RES_VMEM",
"RES_NOCPUS", "RES_CPUFAMILY", "RES_CPUMHZ", "RES_RESOURCEUSAGE", "RES_RMEMMAX", "RES_VMEMMAX");
logger.log(Level.INFO, process_res_format);
commander.q_api.putJobLog(queueId, "proc", process_res_format);
wrapperPID = (int) p.pid();
apmon.addJobToMonitor(wrapperPID, jobWorkdir, ce, hostName);
......@@ -630,20 +636,8 @@ public class JobAgent implements Runnable {
final TimerTask killPayload = new TimerTask() {
@Override
public void run() {
final Vector<Integer> childProcs = mj.getChildren();
if (childProcs != null && childProcs.size() > 1) {
try {
Runtime.getRuntime().exec("kill -9 " + getPayloadPid(childProcs));
Thread.sleep(60 * 1000); // Give the JobWrapper 60s to clean things up
}
catch (final Exception e) {
logger.log(Level.INFO, "Cannot kill the child processes " + childProcs, e);
}
}
// If still alive, kill everything, including the JW
if (p.isAlive()) {
p.destroyForcibly();
}
commander.q_api.putJobLog(queueId, "trace", "Timeout has occurred. Killing job!");
killPayload(p);
}
};
......@@ -663,8 +657,9 @@ public class JobAgent implements Runnable {
monitor_loops++;
final String error = checkProcessResources();
if (error != null) {
// killProcess.run(); //TODO: Temporarily disabled
logger.log(Level.SEVERE, "Process overusing resources: " + error);
commander.q_api.putJobLog(queueId, "trace", "Process overusing resources. Killing job!");
// killProcess.run(); //TODO: Temporarily disabled
// return 1;
}
if (monitor_loops == 10) {
......@@ -813,7 +808,7 @@ public class JobAgent implements Runnable {
final Integer iTTL = jdl.getInteger("TTL");
int ttl = (iTTL != null ? iTTL.intValue() : 3600);
commander.q_api.putJobLog(queueId, "trace", "Job asks to run for " + ttl + " seconds");
commander.q_api.putJobLog(queueId, "trace", "Job asks for a TTL of " + ttl + " seconds");
ttl += 300; // extra time (saving)
final String proxyttl = jdl.gets("ProxyTTL");
......@@ -979,6 +974,29 @@ public class JobAgent implements Runnable {
return 0;
}
/**
 * Kills the payload of a given JobWrapper process and, if the JobWrapper is still
 * alive afterwards, forcibly destroys the JobWrapper as well.
 *
 * When more than one child process is reported by <code>mj.getChildren()</code>, a
 * <code>kill -9</code> is issued for the pid returned by <code>getPayloadPid</code> and the
 * calling thread then sleeps for 60s so the JobWrapper can clean up. If the thread is
 * interrupted during that wait, its interrupt status is restored and the method proceeds
 * straight to the final forced kill.
 *
 * @param p JobWrapper process
 */
private void killPayload(final Process p) {
	final Vector<Integer> childProcs = mj.getChildren();

	// NOTE(review): the "> 1" check presumably skips the case where only the JobWrapper itself is listed - confirm
	if (childProcs != null && childProcs.size() > 1) {
		try {
			Runtime.getRuntime().exec("kill -9 " + getPayloadPid(childProcs));
			Thread.sleep(60 * 1000); // Give the JobWrapper 60s to clean things up
		}
		catch (final InterruptedException ie) {
			// Do not swallow the interruption: restore the flag so callers can observe it,
			// then fall through to the forced kill below
			Thread.currentThread().interrupt();
			logger.log(Level.INFO, "Interrupted while waiting for the JobWrapper to clean up", ie);
		}
		catch (final Exception e) {
			logger.log(Level.INFO, "Cannot kill the child processes " + childProcs, e);
		}
	}

	// If still alive, kill everything, including the JW
	if (p.isAlive()) {
		p.destroyForcibly();
	}
}
private final static String[] batchSystemVars = {
"CONDOR_PARENT_ID",
"_CONDOR_JOB_AD",
......
......@@ -184,6 +184,7 @@ public class JobWrapper implements MonitoringObject, Runnable {
final int runCode = runJob();
logger.log(Level.INFO, "JobWrapper has finished execution");
commander.q_api.putJobLog(queueId, "trace", "JobWrapper has finished execution");
if (runCode > 0)
System.exit(0); // Positive runCodes originate from the payload. Ignore. All OK here as far as we're concerned.
......@@ -459,8 +460,8 @@ public class JobWrapper implements MonitoringObject, Runnable {
commander.q_api.putJobLog(queueId, "trace", "Getting InputFile: " + entry.getKey().getCanonicalName());
logger.log(Level.INFO, "GUID g: " + g + " entry.getvalue(): " + entry.getValue());
commander.q_api.putJobLog(queueId, "trace", "GUID g: " + g + " entry.getvalue(): " + entry.getValue());
logger.log(Level.INFO, g + ". entry.getvalue(): " + entry.getValue());
commander.q_api.putJobLog(queueId, "trace", g + ". entry.getvalue(): " + entry.getValue());
final File f = IOUtils.get(g, entry.getValue());
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment