/*
Files
cobald-testenv/conf/common/ganglia.d/00_default_metrics
2021-04-19 13:56:35 +02:00

1082 lines
31 KiB
Plaintext
*/

/*
 * Daemon self-monitoring metrics.
 * Every Name here is built with strcat(MyType, ...), so the published metric
 * name is prefixed with the type of the ad it was read from. TargetType lists
 * the ad types each definition applies to.
 */
[
Name = strcat(MyType,"CondorVersion");
Value = CondorVersion;
Desc = "Version String";
TargetType = "Scheduler,Negotiator";
]
[
Name = strcat(MyType,"UpdatesLost");
Value = UpdatesLost;
Verbosity = 2;
Desc = "Number of ClassAd updates that were sent by this daemon but not received by the collector";
TargetType = "Scheduler,Negotiator,Machine_slot1";
]
[
Name = strcat(MyType,"UpdatesTotal");
Value = UpdatesTotal;
Verbosity = 2;
Desc = "Number of ClassAd updates that were sent by this daemon";
TargetType = "Scheduler,Negotiator,Machine_slot1";
]
/* Scale = 100 with Units = "%" turns the fraction described in Desc into a percentage. */
[
Name = strcat(MyType,"RecentDaemonCoreDutyCycle");
Value = RecentDaemonCoreDutyCycle;
Desc = "Recent fraction of busy time in the daemon event loop";
Scale = 100;
Units = "%";
TargetType = "Scheduler,Negotiator";
]
/* Not a useful graph. Should be converted to a human readable string metric */
[
Name = strcat(MyType,"MonitorSelfAge");
Value = MonitorSelfAge;
Verbosity = 99;
Desc = "Age of this daemon";
Units = "seconds";
TargetType = "Scheduler,Negotiator,Machine_slot1";
]
[
Name = strcat(MyType,"MonitorSelfCPUUsage");
Value = MonitorSelfCPUUsage;
Verbosity = 2;
Desc = "The fraction of one CPU recently used by this daemon";
TargetType = "Scheduler,Negotiator,Machine_slot1";
]
/* Scale = 1024 with Units = "bytes": presumably the attribute is reported in KiB -- confirm against the attribute reference. */
[
Name = strcat(MyType,"MonitorSelfImageSize");
Value = MonitorSelfImageSize;
Verbosity = 1;
Desc = "Memory allocated to this daemon (i.e. virtual image size)";
Units = "bytes";
Scale = 1024;
Type = "float";
TargetType = "Scheduler,Negotiator,Machine_slot1";
]
[
Name = strcat(MyType,"MonitorSelfRegisteredSocketCount");
Value = MonitorSelfRegisteredSocketCount;
Verbosity = 2;
Desc = "Number of sockets registered in this daemon's event loop";
Units = "sockets";
TargetType = "Scheduler,Negotiator,Machine_slot1";
]
/* Same KiB-to-bytes scaling as MonitorSelfImageSize (presumably; confirm). */
[
Name = strcat(MyType,"MonitorSelfResidentSetSize");
Value = MonitorSelfResidentSetSize;
Verbosity = 2;
Desc = "RAM allocated to this daemon";
Units = "bytes";
Scale = 1024;
Type = "float";
TargetType = "Scheduler,Negotiator,Machine_slot1";
]
[
Name = strcat(MyType,"MonitorSelfSecuritySessions");
Value = MonitorSelfSecuritySessions;
Verbosity = 2;
Desc = "Number of security sessions in this daemon's cache";
TargetType = "Scheduler,Negotiator,Machine_slot1";
]
/*
 * Accumulated job time metrics for Scheduler ads.
 * Scale = 0.000277778 (about 1/3600) together with Units = "hours" converts a
 * value counted in seconds into hours. None of these ads sets Value, so the
 * published value presumably comes from the attribute named by Name
 * (confirm against the condor_gangliad documentation).
 */
[
Name = "JobsAccumBadputTime";
Desc = "Runtime of jobs that were aborted (removed or held) or (standard universe only) evicted without a checkpoint.";
Scale = 0.000277778;
Units = "hours";
Type = "float";
TargetType = "Scheduler";
]
[
Name = "JobsAccumExecuteTime";
Desc = "Time spent running jobs. Does not include file transfer and other job handling time.";
Scale = 0.000277778;
Units = "hours";
Type = "float";
TargetType = "Scheduler";
]
[
Name = "JobsAccumPostExecuteTime";
Verbosity = 2;
Desc = "Time spent processing a completed job (includes output file transfer)";
Scale = 0.000277778;
Units = "hours";
Type = "float";
TargetType = "Scheduler";
]
[
Name = "JobsAccumPreExecuteTime";
Verbosity = 2;
Desc = "Time spent preparing to run a job (includes input file transfer)";
Scale = 0.000277778;
Units = "hours";
Type = "float";
TargetType = "Scheduler";
]
[
Name = "JobsAccumRunningTime";
Desc = "Time spent running jobs that were not counted as badput (i.e. not removed or held). Includes file transfer and other handling time.";
Units = "hours";
Scale = 0.000277778;
Type = "float";
TargetType = "Scheduler";
]
[
Name = "JobsAccumTimeToStart";
Verbosity = 2;
Desc = "Time between submit and running of a job";
Scale = 0.000277778;
Units = "hours";
Type = "float";
TargetType = "Scheduler";
]
/*
 * Job and shadow outcome counters for Scheduler ads.
 * Each ad publishes one count; Desc states what is being counted and Units
 * whether the count is in jobs or shadows.
 */
[
Name = "JobsCheckpointed";
Verbosity = 2;
Desc = "Number of job run attempts that were interrupted and successfully checkpointed";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "JobsCompleted";
Desc = "Number of jobs that terminated normally (i.e. not via a signal or abort)";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "JobsCoredumped";
Verbosity = 1;
Desc = "Number of jobs that crashed and generated a core file";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "JobsDebugLogError";
Verbosity = 2;
Desc = "Count of shadows that exited due to debug log errors";
Units = "shadows";
TargetType = "Scheduler";
]
[
Name = "JobsExecFailed";
Verbosity = 1;
Desc = "Count of job run attempts that failed to execute the specified command";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "JobsExited";
Verbosity = 2;
Desc = "Count of job run attempts that have completed (successfully or not)";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "JobsExitedAndClaimClosing";
Verbosity = 2;
Desc = "Count of job run attempts that have completed when claim was not accepting additional jobs";
Units = "jobs";
TargetType = "Scheduler";
]
/* JobsExitedNormally is the same as JobsCompleted, so don't bother. */
/*
[
Name = "JobsExitedNormally";
Desc = "";
Units = "";
TargetType = "Scheduler";
]
*/
[
Name = "JobsExitException";
Verbosity = 2;
Desc = "Count of job run attempts that ended with a job handling exception (shadow exception)";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "JobsKilled";
Verbosity = 1;
Desc = "Count of job run attempts in which the job was killed (i.e. evicted)";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "JobsMissedDeferralTime";
Verbosity = 2;
Desc = "Count of job run attempts that failed because the specified deferral time was missed";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "JobsNotStarted";
Verbosity = 2;
Desc = "Count of job run attempts that failed because the request to activate the claim failed";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "JobsShadowNoMemory";
Verbosity = 2;
Desc = "Count of job run attempts that failed because there was not enough memory (RESERVED_SWAP)";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "JobsShouldHold";
Verbosity = 2;
Desc = "Count of job run attempts that have resulted in the job going on hold";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "JobsShouldRemove";
Verbosity = 2;
Desc = "Count of job run attempts that have resulted in the job being removed (e.g. periodic_remove policy)";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "JobsShouldRequeue";
Verbosity = 2;
Desc = "Count of job run attempts that ended with the job being requeued due to handling failures or OnExitRemove=false";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "JobsStarted";
Verbosity = 1;
Desc = "Number of job run attempts started";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "JobsSubmitted";
Desc = "Number of jobs submitted";
Units = "jobs";
TargetType = "Scheduler";
]
/*
 * Schedd limits, shadow counts and queue totals for Scheduler ads.
 * Ads that set Aggregate = "SUM" name a pool-wide metric and take their input
 * from the attribute given in Value; each one directly follows the per-schedd
 * metric for the same attribute.
 */
[
Name = "MaxJobsRunning";
Verbosity = 1;
Desc = "Configured limit on number of running jobs";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "NumUsers";
Verbosity = 1;
Desc = "Number of different users who currently have jobs in the queue";
Units = "users";
TargetType = "Scheduler";
]
[
Name = "RecentStatsLifetime";
Verbosity = 2;
Desc = "Seconds elapsed since the beginning of the current stats collection window";
Units = "seconds";
TargetType = "Scheduler";
]
[
Name = "ScheddSwapExhausted";
Verbosity = 2;
Desc = "Non-zero when jobs cannot be started due to RESERVED_SWAP";
TargetType = "Scheduler";
]
[
Name = "ShadowsRunning";
Verbosity = 2;
Desc = "Number of shadow processes currently running";
Units = "shadows";
TargetType = "Scheduler";
]
[
Name = "ShadowsStarted";
Verbosity = 2;
Desc = "Number of shadow processes started";
Units = "shadows";
TargetType = "Scheduler";
]
[
Name = "StatsLifetime";
Verbosity = 2;
Desc = "Seconds of elapsed time since the beginning of the schedd lifetime stat collection window";
Units = "seconds";
TargetType = "Scheduler";
]
[
Name = "TotalFlockedJobs";
Desc = "Number of jobs from this schedd that are flocked to other pools";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "TotalHeldJobs";
Desc = "Number of jobs in this schedd that are on hold";
Units = "jobs";
TargetType = "Scheduler";
]
/* Pool-wide sum of TotalHeldJobs. */
[
Aggregate = "SUM";
Name = "Held Jobs in Pool";
Value = TotalHeldJobs;
Desc = "Number of jobs on hold in schedds reporting to this pool";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "TotalIdleJobs";
Desc = "Number of idle jobs in this schedd's queue";
Units = "jobs";
TargetType = "Scheduler";
]
/* Pool-wide sum of TotalIdleJobs. */
[
Aggregate = "SUM";
Name = "Idle Jobs in Pool";
Value = TotalIdleJobs;
Desc = "Number of idle jobs in schedds reporting to this pool";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "TotalJobAds";
Desc = "Number of jobs currently in this schedd's queue";
Units = "jobs";
TargetType = "Scheduler";
]
/* Pool-wide sum of TotalJobAds. */
[
Aggregate = "SUM";
Name = "Jobs in Pool";
Value = TotalJobAds;
Desc = "Number of jobs currently in schedds reporting to this pool";
Units = "jobs";
TargetType = "Scheduler";
]
/* Idle local universe jobs; the running counterpart is TotalLocalJobsRunning. */
[
Name = "TotalLocalJobsIdle";
Verbosity = 2;
Desc = "Number of idle local universe jobs in this schedd's queue";
Units = "jobs";
TargetType = "Scheduler";
]
/*
 * Remaining per-schedd queue totals (local universe, removed, running and
 * scheduler universe jobs), plus the pool-wide sum of TotalRunningJobs.
 */
[
Name = "TotalLocalJobsRunning";
Verbosity = 2;
Desc = "Number of running local universe jobs in this schedd's queue";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "TotalRemovedJobs";
Verbosity = 1;
Desc = "Number of jobs that are in the process of being removed";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "TotalRunningJobs";
Desc = "Number of running jobs in this schedd's queue";
Units = "jobs";
TargetType = "Scheduler";
]
/* Pool-wide sum of TotalRunningJobs. */
[
Aggregate = "SUM";
Name = "Running Jobs in Pool";
Value = TotalRunningJobs;
Desc = "Number of running jobs in schedds reporting to this pool";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "TotalSchedulerJobsIdle";
Verbosity = 2;
Desc = "Number of idle scheduler universe jobs in this schedd's queue";
Units = "jobs";
TargetType = "Scheduler";
]
[
Name = "TotalSchedulerJobsRunning";
Verbosity = 2;
Desc = "Number of running scheduler universe jobs in this schedd's queue";
Units = "jobs";
TargetType = "Scheduler";
]
/*
 * Per-submitter job counts (TargetType = "Submitter", Group = "HTCondor Submitters").
 * Name, Title and Desc are all derived from the submitter ad's own Name
 * attribute. Each job state comes as a pair: a "-Total..." ad with
 * Aggregate = "SUM" and a plain ad without aggregation. Note the plain
 * variants concatenate Name and the suffix with no separator.
 */
[
Name = strcat(Name,"-TotalRunningJobs");
Title = strcat(Name, " Total Running Jobs");
Aggregate = "SUM";
Value = RunningJobs;
Verbosity = 2;
Desc = strcat("Total number of running jobs from user ", Name);
Units = "jobs";
TargetType = "Submitter";
Group = "HTCondor Submitters";
]
[
Name = strcat(Name, "RunningJobs");
Title = strcat(Name, " Running Jobs");
Value = RunningJobs;
Verbosity = 2;
Desc = strcat("Number of running jobs from user ", Name);
Units = "jobs";
TargetType = "Submitter";
Group = "HTCondor Submitters";
]
[
Name = strcat(Name,"-TotalIdleJobs");
Title = strcat(Name, " Total Idle Jobs");
Aggregate = "SUM";
Value = IdleJobs;
Verbosity = 2;
Desc = strcat("Total number of idle jobs from user ", Name);
Units = "jobs";
TargetType = "Submitter";
Group = "HTCondor Submitters";
]
[
Name = strcat(Name, "IdleJobs");
Title = strcat(Name, " Idle Jobs");
Value = IdleJobs;
Verbosity = 2;
Desc = strcat("Number of idle jobs from user ", Name);
Units = "jobs";
TargetType = "Submitter";
Group = "HTCondor Submitters";
]
[
Name = strcat(Name,"-TotalHeldJobs");
Title = strcat(Name, " Total Held Jobs");
Aggregate = "SUM";
Value = HeldJobs;
Verbosity = 2;
Desc = strcat("Total number of held jobs from user ", Name);
Units = "jobs";
TargetType = "Submitter";
Group = "HTCondor Submitters";
]
[
Name = strcat(Name, "HeldJobs");
Title = strcat(Name, " Held Jobs");
Value = HeldJobs;
Verbosity = 2;
Desc = strcat("Number of held jobs from user ", Name);
Units = "jobs";
TargetType = "Submitter";
Group = "HTCondor Submitters";
]
[
Name = strcat(Name,"-TotalFlockedJobs");
Title = strcat(Name, " Total Flocked Jobs");
Aggregate = "SUM";
Value = FlockedJobs;
Verbosity = 2;
Desc = strcat("Total number of flocked jobs from user ", Name);
Units = "jobs";
TargetType = "Submitter";
Group = "HTCondor Submitters";
]
[
Name = strcat(Name, "FlockedJobs");
Title = strcat(Name, " Flocked Jobs");
Value = FlockedJobs;
Verbosity = 2;
Desc = strcat("Number of flocked jobs from user ", Name);
Units = "jobs";
TargetType = "Submitter";
Group = "HTCondor Submitters";
]
/*
 * Per-accounting-group job totals (Aggregate = "SUM" over Submitter ads).
 * The group is taken from splitUserName(Name)[0]: if that string matches
 * "<group>.<user>", the captured <group> part (\\1) is used, otherwise the
 * fixed label "nogroup" / "no group" is used.
 * The regexp() test and the regexps() substitution must use the same
 * pattern; the test previously used the character class "A-z", which also
 * accepts the punctuation between 'Z' and 'a' and so could succeed on a
 * string that the substitution pattern does not match.
 */
[
Name = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "\\1"),"nogroup"),"-TotalRunningJobs");
Title = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "\\1"),"nogroup"), " Total Running Jobs");
Aggregate = "SUM";
Value = RunningJobs;
Verbosity = 2;
Desc = strcat("Total number of running jobs from ", ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "group \\1"),"no group"));
Units = "jobs";
TargetType = "Submitter";
Group = "HTCondor Accounting Groups";
]
[
Name = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "\\1"),"nogroup"),"-TotalIdleJobs");
Title = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "\\1"),"nogroup"), " Total Idle Jobs");
Aggregate = "SUM";
Value = IdleJobs;
Verbosity = 2;
Desc = strcat("Total number of idle jobs from ", ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "group \\1"),"no group"));
Units = "jobs";
TargetType = "Submitter";
Group = "HTCondor Accounting Groups";
]
[
Name = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "\\1"),"nogroup"),"-TotalHeldJobs");
Title = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "\\1"),"nogroup"), " Total Held Jobs");
Aggregate = "SUM";
Value = HeldJobs;
Verbosity = 2;
Desc = strcat("Total number of held jobs from ", ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "group \\1"),"no group"));
Units = "jobs";
TargetType = "Submitter";
Group = "HTCondor Accounting Groups";
]
[
Name = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "\\1"),"nogroup"),"-TotalFlockedJobs");
Title = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "\\1"),"nogroup"), " Total Flocked Jobs");
Aggregate = "SUM";
Value = FlockedJobs;
Verbosity = 2;
Desc = strcat("Total number of flocked jobs from ", ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "group \\1"),"no group"));
Units = "jobs";
TargetType = "Submitter";
Group = "HTCondor Accounting Groups";
]
/*
 * File transfer metrics for Scheduler ads (Group = "HTCondor File Transfer").
 * Ads with Derivative = true are built on cumulative attributes (bytes or
 * seconds); presumably the published value is their rate of change -- confirm
 * against the condor_gangliad documentation. The ad using Regex instead of
 * Name matches per-owner attributes and reuses the captured owner as \\1 in
 * Title and Desc.
 */
[
Name = "FileTransferDownloadBytes";
Verbosity = 1;
Derivative = true;
Title = "File Transfer Download Bandwidth";
Desc = "Output transfers from jobs";
Units = "bytes";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
[
Name = "FileTransferDownloadBytesPerSecond_5m";
Verbosity = 1;
Desc = "Rate of output transfers from jobs";
Units = "bytes/s";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
[
Regex = "Owner_([^_]*)_FileTransferDownloadBytesPerSecond_5m";
Title = "\\1 Download Bytes Per Second";
Verbosity = 2;
Desc = "Rate of output transfers from jobs by user \\1";
Units = "bytes/s";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
[
Name = "FileTransferFileReadLoad_5m";
Verbosity = 1;
Desc = "Number of file transfer processes reading input data from files";
Units = "processes";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
/* This looks like a mismatch of stuff */
/* NOTE(review): a "Seconds" attribute is described as a process count; Verbosity = 99 presumably keeps it unpublished by default. */
[
Name = "FileTransferFileReadSeconds";
Verbosity = 99;
Derivative = true;
Title = "File Transfer File Read Load";
Desc = "Number of file transfer processes reading input data from files";
Units = "processes";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
[
Name = "FileTransferFileWriteLoad_5m";
Verbosity = 1;
Desc = "Number of file transfer processes writing output data to files";
Units = "processes";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
/* This looks like a mismatch of stuff */
[
Name = "FileTransferFileWriteSeconds";
Verbosity = 99;
Derivative = true;
Title = "File Transfer File Write Load";
Desc = "Number of file transfer processes writing output data to files";
Units = "processes";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
[
Name = "FileTransferNetReadLoad_5m";
Verbosity = 1;
Desc = "Number of file transfer processes reading output data from the network";
Units = "processes";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
/* This looks like a mismatch of stuff */
/* Title added so this metric is labelled like the FileRead, FileWrite and NetWrite "Seconds" metrics. */
[
Name = "FileTransferNetReadSeconds";
Verbosity = 99;
Derivative = true;
Title = "File Transfer Net Read Load";
Desc = "Number of file transfer processes reading output data from the network";
Units = "processes";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
/*
 * Network-write load and upload metrics for Scheduler ads
 * (Group = "HTCondor File Transfer"). "Upload" here is described as input
 * transfers to jobs.
 */
[
Name = "FileTransferNetWriteLoad_5m";
Verbosity = 1;
Desc = "Number of file transfer processes writing input data to the network";
Units = "processes";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
/* This looks like a mismatch of stuff */
[
Name = "FileTransferNetWriteSeconds";
Verbosity = 99;
Derivative = true;
Title = "File Transfer Net Write Load";
Desc = "Number of file transfer processes writing input data to the network";
Units = "processes";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
[
Name = "FileTransferUploadBytes";
Derivative = true;
Title = "File Transfer Upload Bandwidth";
Desc = "Input transfers to jobs";
Units = "bytes";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
[
Name = "FileTransferUploadBytesPerSecond_5m";
Verbosity = 1;
Desc = "Rate of input transfers to jobs";
Units = "bytes/s";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
/*
 * Per-owner upload rate. Regex matches the per-owner attribute names and the
 * captured owner is substituted for \\1 in Title and Desc. Uploads are input
 * transfers *to* jobs, as in the FileTransferUploadBytesPerSecond_5m metric.
 */
[
Regex = "Owner_([^_]*)_FileTransferUploadBytesPerSecond_5m";
Title = "\\1 Upload Bytes Per Second";
Verbosity = 2;
Desc = "Rate of input transfers to jobs by user \\1";
Units = "bytes/s";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
/*
 * Transfer queue metrics for Scheduler ads: wait times and counts of jobs
 * transferring or waiting to transfer. The two Aggregate = "MAX" ads publish
 * the largest wait time over all reporting schedds; unlike the per-schedd
 * ads they do not set Group.
 */
[
Name = "TransferQueueDownloadWaitTime";
Desc = "Oldest output file transfer waiting in the transfer queue";
Units = "seconds";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
[
Aggregate = "MAX";
Name = "Pool Max TransferQueueDownloadWaitTime";
Value = TransferQueueDownloadWaitTime;
Desc = "Oldest output file transfer waiting in the transfer queues reporting to this pool";
Units = "seconds";
TargetType = "Scheduler";
]
[
Name = "TransferQueueNumDownloading";
Desc = "Number of jobs actively transferring output";
Units = "jobs";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
[
Name = "TransferQueueNumUploading";
Desc = "Number of jobs actively transferring input";
Units = "jobs";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
[
Name = "TransferQueueNumWaitingToDownload";
Desc = "Number of jobs waiting in the transfer queue to transfer output";
Units = "jobs";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
[
Name = "TransferQueueNumWaitingToUpload";
Desc = "Number of jobs waiting in the transfer queue to transfer input";
Units = "jobs";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
[
Name = "TransferQueueUploadWaitTime";
Desc = "Oldest input file transfer waiting in the transfer queue";
Units = "seconds";
TargetType = "Scheduler";
Group = "HTCondor File Transfer";
]
[
Aggregate = "MAX";
Name = "Pool Max TransferQueueUploadWaitTime";
Value = TransferQueueUploadWaitTime;
Desc = "Oldest input file transfer waiting in the transfer queues reporting to this pool";
Units = "seconds";
TargetType = "Scheduler";
]
/*
 * Negotiation cycle metrics for Negotiator ads.
 * All names carry a trailing "0"; presumably this selects the most recent
 * negotiation cycle -- confirm against the negotiator attribute reference.
 */
[
Name = "LastNegotiationCycleActiveSubmitterCount0";
Verbosity = 1;
Desc = "The number of job submitters considered in the negotiation cycle";
Units = "submitters";
TargetType = "Negotiator";
]
[
Name = "LastNegotiationCycleCandidateSlots0";
Verbosity = 2;
Desc = "The number of slot ClassAds considered for matchmaking (reduced by NEGOTIATOR_SLOT_POOLSIZE_CONSTRAINT if applicable)";
Units = "slots";
TargetType = "Negotiator";
]
[
Name = "LastNegotiationCycleDuration0";
Desc = "The number of seconds that it took to complete the negotiation cycle";
Units = "seconds";
TargetType = "Negotiator";
]
[
Name = "LastNegotiationCycleMatches0";
Verbosity = 1;
Desc = "The number of successful matches that were made in the negotiation cycle";
Units = "matches";
TargetType = "Negotiator";
]
[
Name = "LastNegotiationCycleMatchRate0";
Verbosity = 1;
Desc = "Matches made per second during negotiation cycle";
Units = "matches/s";
TargetType = "Negotiator";
]
[
Name = "LastNegotiationCycleMatchRateSustained0";
Verbosity = 1;
Desc = "Matches made per second, including waiting time between negotiation cycles";
Units = "matches/s";
TargetType = "Negotiator";
]
[
Name = "LastNegotiationCycleNumIdleJobs0";
Verbosity = 1;
Desc = "The number of idle jobs belonging to job submitters";
Units = "jobs";
TargetType = "Negotiator";
]
/* Jobs actually considered in the negotiation cycle; see Desc for why this can be below the idle job count. */
[
Name = "LastNegotiationCycleNumJobsConsidered0";
Verbosity = 1;
Desc = "The number of jobs considered for matchmaking (may be much lower than idle jobs due to auto-cluster optimizations)";
Units = "jobs";
TargetType = "Negotiator";
]
/* Number of schedds taking part in the negotiation cycle (Negotiator ads only). */
[
Name = "LastNegotiationCycleNumSchedulers0";
Verbosity = 2;
Desc = "The number of schedds involved in negotiation for resources";
Units = "schedds";
TargetType = "Negotiator";
]
/* Cycle period, measured end-to-end between consecutive negotiation cycles. */
[
Name = "LastNegotiationCyclePeriod0";
Verbosity = 1;
Desc = "Seconds between the end of one cycle and the end of the next";
Units = "seconds";
TargetType = "Negotiator";
]
/*
 * Negotiation cycle phase durations (phases 1 to 4, in seconds) followed by
 * rejection, iteration and slot counts, all for Negotiator ads.
 */
[
Name = "LastNegotiationCyclePhase1Duration0";
Verbosity = 2;
Desc = "Duration of Phase 1: getting submitter and machine ClassAds";
Units = "seconds";
TargetType = "Negotiator";
]
[
Name = "LastNegotiationCyclePhase2Duration0";
Verbosity = 2;
Desc = "Duration of Phase 2: filtering slots and processing accounting group configuration";
Units = "seconds";
TargetType = "Negotiator";
]
[
Name = "LastNegotiationCyclePhase3Duration0";
Verbosity = 2;
Desc = "Phase 3 of the negotiation cycle: sorting submitters by priority";
Units = "seconds";
TargetType = "Negotiator";
]
[
Name = "LastNegotiationCyclePhase4Duration0";
Verbosity = 2;
Desc = "Phase 4 of the negotiation cycle: matching slots to jobs";
Units = "seconds";
TargetType = "Negotiator";
]
[
Name = "LastNegotiationCycleRejections0";
Verbosity = 1;
Desc = "The number of rejections that occurred in the negotiation cycle (only one per auto-cluster)";
Units = "jobs";
TargetType = "Negotiator";
]
[
Name = "LastNegotiationCycleSlotShareIter0";
Verbosity = 2;
Desc = "The number of iterations in the negotiation cycle";
TargetType = "Negotiator";
]
[
Name = "LastNegotiationCycleTotalSlots0";
Verbosity = 1;
Desc = "The total number of slot ClassAds that matched NEGOTIATOR_SLOT_CONSTRAINT";
Units = "slots";
TargetType = "Negotiator";
]
/* Slot count after preemption filtering; Desc names the configuration knob in its usual all-caps spelling. */
[
Name = "LastNegotiationCycleTrimmedSlots0";
Verbosity = 2;
Desc = "The number of slot ClassAds considered for matchmaking, after filtering by NEGOTIATOR_CONSIDER_PREEMPTION, if applicable";
Units = "slots";
TargetType = "Negotiator";
]
/*
 * Draining expectation metrics for Machine_slot1 ads.
 * The graceful-completion ad sets Value to the attribute minus time(), so it
 * publishes a number relative to the current time rather than the raw
 * attribute value.
 */
[
Name = "ExpectedMachineGracefulDrainingBadput";
Verbosity = 2;
Desc = "Job runtime that would be lost if graceful draining were initiated now.";
Units = "cpus*seconds";
TargetType = "Machine_slot1";
]
[
Name = "ExpectedMachineGracefulDrainingCompletion";
Value = ExpectedMachineGracefulDrainingCompletion - time();
Verbosity = 2;
Desc = "Time graceful draining could take to complete, assuming jobs take full retirement and vacate time and there is no suspension";
Units = "seconds";
TargetType = "Machine_slot1";
]
[
Name = "ExpectedMachineQuickDrainingBadput";
Verbosity = 2;
Desc = "Job runtime that would be lost if quick draining were initiated now.";
Units = "cpus*seconds";
TargetType = "Machine_slot1";
]
/*
 * Remaining time until quick draining could complete. time() is subtracted
 * from the attribute so the metric is a duration in seconds, the same way the
 * ExpectedMachineGracefulDrainingCompletion metric is computed.
 */
[
Name = "ExpectedMachineQuickDrainingCompletion";
Value = ExpectedMachineQuickDrainingCompletion - time();
Verbosity = 2;
Desc = "Time quick draining could take to complete, assuming jobs take full retirement and vacate time and there is no suspension";
Units = "seconds";
TargetType = "Machine_slot1";
]
/*
 * Machine metrics, read from Machine_slot1 ads, plus pool-wide sums.
 * Several ads rename the source attribute (Value differs from Name) and use
 * Scale to convert units: KFlops * 1000 -> FLOPS, Mips * 1000000 ->
 * iterations/sec. The 1024 and 1048576 factors on the "bytes" metrics
 * presumably convert from KiB and MiB -- confirm against the machine
 * attribute reference.
 */
[
Name = "Linpack";
Value = KFlops;
Verbosity = 2;
Desc = "Linpack floating point benchmark";
Units = "FLOPS";
Scale = 1000;
Type = "float";
TargetType = "Machine_slot1";
]
[
Name = "Dhrystone";
Value = Mips;
Verbosity = 2;
Desc = "Dhrystone integer benchmark";
Units = "Iterations/sec";
Scale = 1000000;
Type = "float";
TargetType = "Machine_slot1";
]
[
Name = "TotalCondorLoadAvg";
Verbosity = 1;
Desc = "The CPU load attributed to jobs";
TargetType = "Machine_slot1";
]
[
Name = "TotalCpus";
Verbosity = 2;
Desc = "Number of cores";
Units = "cores";
TargetType = "Machine_slot1";
]
/* Pool-wide sum of TotalCpus. */
[
Aggregate = "SUM";
Name = "Cpus in Pool";
Value = TotalCpus;
Verbosity = 2;
Desc = "Number of cores in the pool";
Units = "cores";
TargetType = "Machine_slot1";
]
[
Name = "TotalDisk";
Verbosity = 2;
Desc = "Disk space in the job execute directory";
Units = "bytes";
Scale = 1024;
Type = "float";
TargetType = "Machine_slot1";
]
[
Name = "TotalLoadAvg";
Verbosity = 2;
Desc = "System load average";
TargetType = "Machine_slot1";
]
[
Name = "TotalMemory";
Verbosity = 2;
Desc = "RAM";
Units = "bytes";
Scale = 1048576;
Type = "float";
TargetType = "Machine_slot1";
]
[
Name = "TotalSlots";
Verbosity = 2;
Desc = "Number of slots";
Units = "slots";
TargetType = "Machine_slot1";
]
/* Pool-wide sum of TotalSlots. */
[
Aggregate = "SUM";
Name = "Pool Slot Count";
Value = TotalSlots;
Desc = "Number of slots in the pool";
Units = "slots";
TargetType = "Machine_slot1";
]
[
Name = "TotalMachineDrainingBadput";
Verbosity = 1;
Desc = "Job runtime that has been lost due to job evictions caused by draining";
Units = "cpus*seconds";
TargetType = "Machine_slot1";
]
[
Name = "TotalMachineDrainingUnclaimedTime";
Verbosity = 1;
Desc = "Time that has not been used due to draining";
Units = "cpus*seconds";
TargetType = "Machine_slot1";
]
[
Name = "TotalVirtualMemory";
Verbosity = 2;
Desc = "Addressable memory (RAM plus swap)";
Units = "bytes";
Scale = 1024;
Type = "float";
TargetType = "Machine_slot1";
]
[
Name = "TotalPreemptions";
Verbosity = 2;
Desc = "Total number of preempted jobs on this startd";
Units = "preemptions";
TargetType = "Machine_slot1";
]
[
Name = "TotalJobStarts";
Verbosity = 2;
Desc = "Total number of jobs started on this startd since boot";
Units = "jobs";
TargetType = "Machine_slot1";
]
/* Pool-wide sums of TotalPreemptions and TotalJobStarts. */
[
Aggregate = "SUM";
Name = "Poolwide Preemptions";
Value = TotalPreemptions;
Verbosity = 2;
Desc = "Poolwide Preemptions";
Units = "preemptions";
TargetType = "Machine_slot1";
]
[
Aggregate = "SUM";
Name = "Poolwide Job Starts";
Value = TotalJobStarts;
Verbosity = 2;
Desc = "Poolwide Job Starts";
Units = "jobs";
TargetType = "Machine_slot1";
]
/* AutoCluster count per schedd, followed by its pool-wide sum. */
[
Name = "AutoClusters";
Desc = "Number of active AutoClusters in the schedd";
Units = "autoclusters";
TargetType = "Scheduler";
]
[
Aggregate = "SUM";
Name = "AutoClusters in Pool";
Value = AutoClusters;
Desc = "Number of active AutoClusters in schedds reporting to this pool";
Units = "autoclusters";
TargetType = "Scheduler";
]
/*
 * Defrag daemon metrics (TargetType = "Defrag").
 * As with the daemon self-monitoring metrics, each Name is prefixed with the
 * ad's MyType via strcat, and Value names the attribute that is published.
 */
[
Name = strcat(MyType,"WholeMachines");
Value = WholeMachines;
Verbosity = 2;
Desc = "Number of machines that were observed to be defragmented in the last polling interval";
TargetType = "Defrag";
]
[
Name = strcat(MyType,"MachinesDraining");
Value = MachinesDraining;
Verbosity = 2;
Desc = "Number of machines that were observed to be draining in the last polling interval";
TargetType = "Defrag";
]
[
Name = strcat(MyType,"RecentDrainSuccesses");
Value = RecentDrainSuccesses;
Verbosity = 2;
Desc = "Count of successful attempts to initiate draining during the past RecentStatsLifetime seconds";
TargetType = "Defrag";
]
[
Name = strcat(MyType,"RecentDrainFailures");
Value = RecentDrainFailures;
Verbosity = 2;
Desc = "Count of failed attempts to initiate draining during the past RecentStatsLifetime seconds";
TargetType = "Defrag";
]
[
Name = strcat(MyType,"AvgDrainingUnclaimed");
Value = AvgDrainingUnclaimed;
Verbosity = 2;
Desc = "Fraction of time CPUs in the pool have spent unclaimed by a user during draining of the machine";
TargetType = "Defrag";
]
[
Name = strcat(MyType,"WholeMachinesPeak");
Value = WholeMachinesPeak;
Verbosity = 2;
Desc = "Largest number of machines that were ever observed to be simultaneously defragmented";
TargetType = "Defrag";
]
[
Name = strcat(MyType,"AvgDrainingBadput");
Value = AvgDrainingBadput;
Verbosity = 2;
Desc = "Fraction of time CPUs in the pool have spent on jobs that were killed during draining of the machine";
TargetType = "Defrag";
]
[
Name = strcat(MyType,"MachinesDrainingPeak");
Value = MachinesDrainingPeak;
Verbosity = 2;
Desc = "Largest number of machines that were ever observed to be draining";
TargetType = "Defrag";
]