From 76f62b0cf4584062bd6f72afbce4f6b767ae8156 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20M=C3=BCller?= Date: Mon, 19 Apr 2021 13:56:35 +0200 Subject: [PATCH] initial version --- conf/common/condor_config | 92 ++ .../condor_ssh_to_job_sshd_config_template | 64 + conf/common/config.d/00-htcondor-9.0.config | 26 + conf/common/ganglia.d/00_default_metrics | 1081 +++++++++++++++++ files/htcondor.Dockerfile | 8 + inv.yml | 6 + play.yml | 200 +++ 7 files changed, 1477 insertions(+) create mode 100644 conf/common/condor_config create mode 100644 conf/common/condor_ssh_to_job_sshd_config_template create mode 100644 conf/common/config.d/00-htcondor-9.0.config create mode 100644 conf/common/ganglia.d/00_default_metrics create mode 100644 files/htcondor.Dockerfile create mode 100644 inv.yml create mode 100644 play.yml diff --git a/conf/common/condor_config b/conf/common/condor_config new file mode 100644 index 0000000..932de5c --- /dev/null +++ b/conf/common/condor_config @@ -0,0 +1,92 @@ +###################################################################### +## +## condor_config +## +## This is the global configuration file for condor. This is where +## you define where the local config file is. Any settings +## made here may potentially be overridden in the local configuration +## file. KEEP THAT IN MIND! To double-check that a variable is +## getting set from the configuration file that you expect, use +## condor_config_val -v +## +## condor_config.annotated is a more detailed sample config file +## +## Unless otherwise specified, settings that are commented out show +## the defaults that are used if you don't define a value. Settings +## that are defined here MUST BE DEFINED since they have no default +## value. +## +###################################################################### + +## Where have you installed the bin, sbin and lib condor directories? +RELEASE_DIR = /usr + +## Where is the local condor directory for each host? 
This is where the local config file(s), logs and +## spool/execute directories are located. this is the default for Linux and Unix systems. +LOCAL_DIR = /var + +## Where is the machine-specific local config file for each host? +# LOCAL_CONFIG_FILE = /etc/condor/condor_config.local +LOCAL_CONFIG_FILE = /etc/condor/condor_config_$(HOSTNAME).local +## If your configuration is on a shared file system, then this might be a better default +#LOCAL_CONFIG_FILE = $(RELEASE_DIR)/etc/$(HOSTNAME).local +## If the local config file is not present, is it an error? (WARNING: This is a potential security issue.) +REQUIRE_LOCAL_CONFIG_FILE = false + +## The normal way to do configuration with RPMs is to read all of the +## files in a given directory that don't match a regex as configuration files. +## Config files are read in lexicographic order. +LOCAL_CONFIG_DIR = /etc/condor/config.d +#LOCAL_CONFIG_DIR_EXCLUDE_REGEXP = ^((\..*)|(.*~)|(#.*)|(.*\.rpmsave)|(.*\.rpmnew))$ + +## +## Do NOT use host-based security by default. +## +## This was the default for the 8.8 series (and earlier), but it is +## intrinsically insecure. To make the 9.0 series secure by default, we +## commented it out. +## +## You should seriously consider improving your security configuration. +## +## To continue to use your old security configuration, knowing that it is +## insecure, add the line 'use SECURITY : HOST_BASED' to your local +## configuration directory. Don't just uncomment the final line in this +## comment block; changes in this file may be lost during your next upgrade. +## The following shell command will make the change on most Linux systems. +## +## echo 'use SECURITY : HOST_BASED' >> $(condor_config_val LOCAL_CONFIG_DIR)/00-insecure.config +## + +## To expand your condor pool beyond a single host, set ALLOW_WRITE to match all of the hosts +#ALLOW_WRITE = *.cs.wisc.edu +## FLOCK_FROM defines the machines that grant access to your pool via flocking. (i.e. these machines can join your pool). 
+#FLOCK_FROM = +## FLOCK_TO defines the central managers that your schedd will advertise itself to (i.e. these pools will give matches to your schedd). +#FLOCK_TO = condor.cs.wisc.edu, cm.example.edu + +##-------------------------------------------------------------------- +## Values set by the rpm patch script: +##-------------------------------------------------------------------- + +## For Unix machines, the path and file name of the file containing +## the pool password for password authentication. +#SEC_PASSWORD_FILE = $(LOCAL_DIR)/lib/condor/pool_password + +## Pathnames +RUN = $(LOCAL_DIR)/run/condor +LOG = $(LOCAL_DIR)/log/condor +LOCK = $(LOCAL_DIR)/lock/condor +SPOOL = $(LOCAL_DIR)/lib/condor/spool +EXECUTE = $(LOCAL_DIR)/lib/condor/execute +BIN = $(RELEASE_DIR)/bin +LIB = $(RELEASE_DIR)/lib64/condor +INCLUDE = $(RELEASE_DIR)/include/condor +SBIN = $(RELEASE_DIR)/sbin +LIBEXEC = $(RELEASE_DIR)/libexec/condor +SHARE = $(RELEASE_DIR)/share/condor + +PROCD_ADDRESS = $(RUN)/procd_pipe + +JAVA_CLASSPATH_DEFAULT = $(SHARE) . + +## Install the minicondor package to run HTCondor on a single node diff --git a/conf/common/condor_ssh_to_job_sshd_config_template b/conf/common/condor_ssh_to_job_sshd_config_template new file mode 100644 index 0000000..fb70d86 --- /dev/null +++ b/conf/common/condor_ssh_to_job_sshd_config_template @@ -0,0 +1,64 @@ + +HostKey _INSERT_HOST_KEY_ +AuthorizedKeysFile _INSERT_AUTHORIZED_KEYS_FILE_ + +# The following option is not supported by all recent versions of OpenSSH, +# so instead we rely on injection of the shell setup command in the authorized +# keys file. 
+#ForceCommand _INSERT_FORCE_COMMAND_ + +# as a convenience to users, allow remote setting of environment +# since sshd is running as the job uid, there isn't really a security concern +AcceptEnv * + +Subsystem sftp /usr/libexec/openssh/sftp-server + +# prevent sshd from freaking out about reading files from inside +# a tmp-like directory +StrictModes no + +# Rejection by tcp wrappers is not logged at INFO or VERBOSE log levels, +# so to make diagnosis of problems easier, we use DEBUG. +LogLevel DEBUG + +X11Forwarding yes + +# By default, OpenSSH uses the ipv6 loopback even if ipv6 is disabled in the +# kernel. This forces OpenSSH to use the "local" network instead +X11UseLocalhost no + +# The following settings are recommended for good security. +# In particular, only FIPS 140-2 algorithms are used. +# URLs for extra information re FIPS security compliance: +# https://rhel7stig.readthedocs.io/en/latest/ +# https://www.stigviewer.com/stig/red_hat_enterprise_linux_7/ +# https://people.redhat.com/swells/scap-security-guide/tables/table-rhel7-stig-manual.html + +#-FIPS High Severity +Protocol 2 +PermitEmptyPasswords no + +#-FIPS Medium Severity +# Note: Ciphers and MACs below will be incompatible with RHEL5 or earlier. +Ciphers aes128-ctr,aes192-ctr,aes256-ctr +MACs hmac-sha2-256,hmac-sha2-512 +PermitRootLogin no +PermitUserEnvironment no +GSSAPIAuthentication no +KerberosAuthentication no +HostbasedAuthentication no +IgnoreRhosts yes +IgnoreUserKnownHosts yes +PrintLastLog yes +UsePrivilegeSeparation sandbox +Compression delayed + +#-Recommended for security, but left out ssh_to_job config +# because they provide minimal value and are likely to annoy +# users or generate needless warnings in the ssh_to_job setting. 
+# +# ClientAliveInterval 600 # Note: condor_submit -i sets TMOUT +# ClientAliveCountMax 0 +# banner=/etc/issue # Set to your warning banner +# StrictModes yes # Can't set due to tmp-like directory +# RhostsRSAAuthentication no # Obsolete Protocol version 1 option diff --git a/conf/common/config.d/00-htcondor-9.0.config b/conf/common/config.d/00-htcondor-9.0.config new file mode 100644 index 0000000..1fe0314 --- /dev/null +++ b/conf/common/config.d/00-htcondor-9.0.config @@ -0,0 +1,26 @@ +## +## Default security settings +## +## Host-based security was the default for the 8.8 series (and earlier). +## +## Host-based security assumes that all users on a machine are trusted. +## For example, if host-based security trusts that a given machine can +## run jobs, then any user who can start a process on that machine can +## start a startd that can "steal" jobs from the system. +# +## To help make HTCondor secure by default, we removed host-based security +## from the default configuration file +## (/etc/condor/condor_config). +## +## New installations of HTCondor should be made using the get_htcondor tool, +## which can automatically establish IDTOKENS-based security across a multi- +## node pool. For existing installations, we recommend you +## consider improving your security configuration. +## +## To continue to use your old security configuration, +## comment out the 'recommended' line below, and uncomment the +## 'host_based' line. 
+## + +# use security : host_based +use security : recommended_v9_0 diff --git a/conf/common/ganglia.d/00_default_metrics b/conf/common/ganglia.d/00_default_metrics new file mode 100644 index 0000000..69a2811 --- /dev/null +++ b/conf/common/ganglia.d/00_default_metrics @@ -0,0 +1,1081 @@ +[ + Name = strcat(MyType,"CondorVersion"); + Value = CondorVersion; + Desc = "Version String"; + TargetType = "Scheduler,Negotiator"; +] +[ + Name = strcat(MyType,"UpdatesLost"); + Value = UpdatesLost; + Verbosity = 2; + Desc = "Number of ClassAd updates that were sent by this daemon but not received by the collector"; + TargetType = "Scheduler,Negotiator,Machine_slot1"; +] +[ + Name = strcat(MyType,"UpdatesTotal"); + Value = UpdatesTotal; + Verbosity = 2; + Desc = "Number of ClassAd updates that were sent by this daemon"; + TargetType = "Scheduler,Negotiator,Machine_slot1"; +] +[ + Name = strcat(MyType,"RecentDaemonCoreDutyCycle"); + Value = RecentDaemonCoreDutyCycle; + Desc = "Recent fraction of busy time in the daemon event loop"; + Scale = 100; + Units = "%"; + TargetType = "Scheduler,Negotiator"; +] +/* Not a useful graph. Should be converted to a human readable string metric */ +[ + Name = strcat(MyType,"MonitorSelfAge"); + Value = MonitorSelfAge; + Verbosity = 99; + Desc = "Age of this daemon"; + Units = "seconds"; + TargetType = "Scheduler,Negotiator,Machine_slot1"; +] +[ + Name = strcat(MyType,"MonitorSelfCPUUsage"); + Value = MonitorSelfCPUUsage; + Verbosity = 2; + Desc = "The fraction of one CPU recently used by this daemon"; + TargetType = "Scheduler,Negotiator,Machine_slot1"; +] +[ + Name = strcat(MyType,"MonitorSelfImageSize"); + Value = MonitorSelfImageSize; + Verbosity = 1; + Desc = "Memory allocated to this daemon (i.e. 
virtual image size)"; + Units = "bytes"; + Scale = 1024; + Type = "float"; + TargetType = "Scheduler,Negotiator,Machine_slot1"; +] +[ + Name = strcat(MyType,"MonitorSelfRegisteredSocketCount"); + Value = MonitorSelfRegisteredSocketCount; + Verbosity = 2; + Desc = "Number of sockets registered in this daemon's event loop"; + Units = "sockets"; + TargetType = "Scheduler,Negotiator,Machine_slot1"; +] +[ + Name = strcat(MyType,"MonitorSelfResidentSetSize"); + Value = MonitorSelfResidentSetSize; + Verbosity = 2; + Desc = "RAM allocated to this daemon"; + Units = "bytes"; + Scale = 1024; + Type = "float"; + TargetType = "Scheduler,Negotiator,Machine_slot1"; +] +[ + Name = strcat(MyType,"MonitorSelfSecuritySessions"); + Value = MonitorSelfSecuritySessions; + Verbosity = 2; + Desc = "Number of security sessions in this daemon's cache"; + TargetType = "Scheduler,Negotiator,Machine_slot1"; +] + +[ + Name = "JobsAccumBadputTime"; + Desc = "Runtime of jobs that were aborted (removed or held) or (standard universe only) evicted without a checkpoint."; + Scale = 0.000277778; + Units = "hours"; + Type = "float"; + TargetType = "Scheduler"; +] +[ + Name = "JobsAccumExecuteTime"; + Desc = "Time spent running jobs. Does not include file transfer and other job handling time."; + Scale = 0.000277778; + Units = "hours"; + Type = "float"; + TargetType = "Scheduler"; +] +[ + Name = "JobsAccumPostExecuteTime"; + Verbosity = 2; + Desc = "Time spent processing a completed job (includes output file transfer)"; + Scale = 0.000277778; + Units = "hours"; + Type = "float"; + TargetType = "Scheduler"; +] +[ + Name = "JobsAccumPreExecuteTime"; + Verbosity = 2; + Desc = "Time spent preparing to run a job (includes input file transfer)"; + Scale = 0.000277778; + Units = "hours"; + Type = "float"; + TargetType = "Scheduler"; +] +[ + Name = "JobsAccumRunningTime"; + Desc = "Time spent running jobs that were not counted as badput (i.e. not removed or held). 
Includes file transfer and other handling time."; + Units = "hours"; + Scale = 0.000277778; + Type = "float"; + TargetType = "Scheduler"; +] +[ + Name = "JobsAccumTimeToStart"; + Verbosity = 2; + Desc = "Time between submit and running of a job"; + Scale = 0.000277778; + Units = "hours"; + Type = "float"; + TargetType = "Scheduler"; +] +[ + Name = "JobsCheckpointed"; + Verbosity = 2; + Desc = "Number of job run attempts that were interrupted and successfully checkpointed"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "JobsCompleted"; + Desc = "Number of jobs that terminated normally (i.e. not via a signal or abort)"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "JobsCoredumped"; + Verbosity = 1; + Desc = "Number of jobs that crashed and generated a core file"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "JobsDebugLogError"; + Verbosity = 2; + Desc = "Count of shadows that exited due to debug log errors"; + Units = "shadows"; + TargetType = "Scheduler"; +] +[ + Name = "JobsExecFailed"; + Verbosity = 1; + Desc = "Count of job run attempts that failed to execute the specified command"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "JobsExited"; + Verbosity = 2; + Desc = "Count of job run attempts that have completed (successfully or not)"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "JobsExitedAndClaimClosing"; + Verbosity = 2; + Desc = "Count of job run attempts that have completed when claim was not accepting additional jobs"; + Units = "jobs"; + TargetType = "Scheduler"; +] +/* JobsExitedNormally is the same as JobsCompleted, so don't bother. 
*/ +/* +[ + Name = "JobsExitedNormally"; + Desc = ""; + Units = ""; + TargetType = "Scheduler"; +] +*/ +[ + Name = "JobsExitException"; + Verbosity = 2; + Desc = "Count of job run attempts that ended with a job handling exception (shadow exception)"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "JobsKilled"; + Verbosity = 1; + Desc = "Count of job run attempts in which the job was killed (i.e. evicted)"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "JobsMissedDeferralTime"; + Verbosity = 2; + Desc = "Count of job run attempts that failed because the specified deferral time was missed"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "JobsNotStarted"; + Verbosity = 2; + Desc = "Count of job run attempts that failed because the request to activate the claim failed"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "JobsShadowNoMemory"; + Verbosity = 2; + Desc = "Count of job run attempts that failed because there was not enough memory (RESERVED_SWAP)"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "JobsShouldHold"; + Verbosity = 2; + Desc = "Count of job run attempts that have resulted in the job going on hold"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "JobsShouldRemove"; + Verbosity = 2; + Desc = "Count of job run attempts that have resulted in the job being removed (e.g. 
periodic_remove policy)"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "JobsShouldRequeue"; + Verbosity = 2; + Desc = "Count of job run attempts that ended with the job being requeued due to handling failures or OnExitRemove=false"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "JobsStarted"; + Verbosity = 1; + Desc = "Number of job run attempts started"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "JobsSubmitted"; + Desc = "Number of jobs submitted"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "MaxJobsRunning"; + Verbosity = 1; + Desc = "Configured limit on number of running jobs"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "NumUsers"; + Verbosity = 1; + Desc = "Number of different users who currently have jobs in the queue"; + Units = "users"; + TargetType = "Scheduler"; +] +[ + Name = "RecentStatsLifetime"; + Verbosity = 2; + Desc = "Seconds elapsed since the beginning of the current stats collection window"; + Units = "seconds"; + TargetType = "Scheduler"; +] +[ + Name = "ScheddSwapExhausted"; + Verbosity = 2; + Desc = "Non-zero when jobs cannot be started due to RESERVED_SWAP"; + TargetType = "Scheduler"; +] +[ + Name = "ShadowsRunning"; + Verbosity = 2; + Desc = "Number of shadow processes currently running"; + Units = "shadows"; + TargetType = "Scheduler"; +] +[ + Name = "ShadowsStarted"; + Verbosity = 2; + Desc = "Number of shadow processes started"; + Units = "shadows"; + TargetType = "Scheduler"; +] +[ + Name = "StatsLifetime"; + Verbosity = 2; + Desc = "Seconds of elapsed time since the beginning of the schedd lifetime stat collection window"; + Units = "seconds"; + TargetType = "Scheduler"; +] +[ + Name = "TotalFlockedJobs"; + Desc = "Number of jobs from this schedd that are flocked to other pools"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "TotalHeldJobs"; + Desc = "Number of jobs in this schedd that are on hold"; + Units = "jobs"; + TargetType = 
"Scheduler"; +] +[ + Aggregate = "SUM"; + Name = "Held Jobs in Pool"; + Value = TotalHeldJobs; + Desc = "Number of jobs on hold in schedds reporting to this pool"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "TotalIdleJobs"; + Desc = "Number of idle jobs in this schedd's queue"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Aggregate = "SUM"; + Name = "Idle Jobs in Pool"; + Value = TotalIdleJobs; + Desc = "Number of idle jobs in schedds reporting to this pool"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "TotalJobAds"; + Desc = "Number of jobs currently in this schedd's queue"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Aggregate = "SUM"; + Name = "Jobs in Pool"; + Value = TotalJobAds; + Desc = "Number of jobs currently in schedds reporting to this pool"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "TotalLocalJobsIdle"; + Verbosity = 2; + Desc = "Number of local universe jobs in this schedd's queue"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "TotalLocalJobsRunning"; + Verbosity = 2; + Desc = "Number of running local universe jobs in this schedd's queue"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "TotalRemovedJobs"; + Verbosity = 1; + Desc = "Number of jobs that are in the process of being removed"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "TotalRunningJobs"; + Desc = "Number of running jobs in this schedd's queue"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Aggregate = "SUM"; + Name = "Running Jobs in Pool"; + Value = TotalRunningJobs; + Desc = "Number of running jobs in schedds reporting to this pool"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "TotalSchedulerJobsIdle"; + Verbosity = 2; + Desc = "Number of idle scheduler universe jobs in this schedd's queue"; + Units = "jobs"; + TargetType = "Scheduler"; +] +[ + Name = "TotalSchedulerJobsRunning"; + Verbosity = 2; + Desc = "Number of running scheduler universe 
jobs in this schedd's queue"; + Units = "jobs"; + TargetType = "Scheduler"; +] + +[ + Name = strcat(Name,"-TotalRunningJobs"); + Title = strcat(Name, " Total Running Jobs"); + Aggregate = "SUM"; + Value = RunningJobs; + Verbosity = 2; + Desc = strcat("Total number of running jobs from user ", Name); + Units = "jobs"; + TargetType = "Submitter"; + Group = "HTCondor Submitters"; +] +[ + Name = strcat(Name, "RunningJobs"); + Title = strcat(Name, " Running Jobs"); + Value = RunningJobs; + Verbosity = 2; + Desc = strcat("Number of running jobs from user ", Name); + Units = "jobs"; + TargetType = "Submitter"; + Group = "HTCondor Submitters"; +] +[ + Name = strcat(Name,"-TotalIdleJobs"); + Title = strcat(Name, " Total Idle Jobs"); + Aggregate = "SUM"; + Value = IdleJobs; + Verbosity = 2; + Desc = strcat("Total number of idle jobs from user ", Name); + Units = "jobs"; + TargetType = "Submitter"; + Group = "HTCondor Submitters"; +] +[ + Name = strcat(Name, "IdleJobs"); + Title = strcat(Name, " Idle Jobs"); + Value = IdleJobs; + Verbosity = 2; + Desc = strcat("Number of idle jobs from user ", Name); + Units = "jobs"; + TargetType = "Submitter"; + Group = "HTCondor Submitters"; +] +[ + Name = strcat(Name,"-TotalHeldJobs"); + Title = strcat(Name, " Total Held Jobs"); + Aggregate = "SUM"; + Value = HeldJobs; + Verbosity = 2; + Desc = strcat("Total number of held jobs from user ", Name); + Units = "jobs"; + TargetType = "Submitter"; + Group = "HTCondor Submitters"; +] +[ + Name = strcat(Name, "HeldJobs"); + Title = strcat(Name, " Held Jobs"); + Value = HeldJobs; + Verbosity = 2; + Desc = strcat("Number of held jobs from user ", Name); + Units = "jobs"; + TargetType = "Submitter"; + Group = "HTCondor Submitters"; +] +[ + Name = strcat(Name,"-TotalFlockedJobs"); + Title = strcat(Name, " Total Flocked Jobs"); + Aggregate = "SUM"; + Value = FlockedJobs; + Verbosity = 2; + Desc = strcat("Total number of flocked jobs from user ", Name); + Units = "jobs"; + TargetType = "Submitter"; + 
Group = "HTCondor Submitters"; +] +[ + Name = strcat(Name, "FlockedJobs"); + Title = strcat(Name, " Flocked Jobs"); + Value = FlockedJobs; + Verbosity = 2; + Desc = strcat("Number of flocked jobs from user ", Name); + Units = "jobs"; + TargetType = "Submitter"; + Group = "HTCondor Submitters"; +] + +[ + Name = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "\\1"),"nogroup"),"-TotalRunningJobs"); + Title = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "\\1"),"nogroup"), " Total Running Jobs"); + Aggregate = "SUM"; + Value = RunningJobs; + Verbosity = 2; + Desc = strcat("Total number of running jobs from ", ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "group \\1"),"no group")); + Units = "jobs"; + TargetType = "Submitter"; + Group = "HTCondor Accounting Groups"; +] +[ + Name = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "\\1"),"nogroup"),"-TotalIdleJobs"); + Title = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "\\1"),"nogroup"), " Total Idle Jobs"); + Aggregate = "SUM"; + Value = IdleJobs; + Verbosity = 2; + Desc = strcat("Total number of idle jobs from ", ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "group \\1"),"no group")); + Units = "jobs"; + TargetType = "Submitter"; + Group = "HTCondor Accounting Groups"; +] +[ + Name = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+",
splitUserName(Name)[0], "\\1"),"nogroup"),"-TotalHeldJobs"); + Title = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "\\1"),"nogroup"), " Total Held Jobs"); + Aggregate = "SUM"; + Value = HeldJobs; + Verbosity = 2; + Desc = strcat("Total number of held jobs from ", ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "group \\1"),"no group")); + Units = "jobs"; + TargetType = "Submitter"; + Group = "HTCondor Accounting Groups"; +] +[ + Name = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "\\1"),"nogroup"),"-TotalFlockedJobs"); + Title = strcat(ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "\\1"),"nogroup"), " Total Flocked Jobs"); + Aggregate = "SUM"; + Value = FlockedJobs; + Verbosity = 2; + Desc = strcat("Total number of flocked jobs from ", ifThenElse(regexp("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0]),regexps("([a-zA-Z0-9.]+)\\.[a-zA-Z0-9]+", splitUserName(Name)[0], "group \\1"),"no group")); + Units = "jobs"; + TargetType = "Submitter"; + Group = "HTCondor Accounting Groups"; +] + + +[ + Name = "FileTransferDownloadBytes"; + Verbosity = 1; + Derivative = true; + Title = "File Transfer Download Bandwidth"; + Desc = "Output transfers from jobs"; + Units = "bytes"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Name = "FileTransferDownloadBytesPerSecond_5m"; + Verbosity = 1; + Desc = "Rate of output transfers from jobs"; + Units = "bytes/s"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Regex = "Owner_([^_]*)_FileTransferDownloadBytesPerSecond_5m"; + Title = "\\1 Download Bytes Per Second"; + Verbosity = 2; + Desc =
"Rate of output transfers from jobs by user \\1"; + Units = "bytes/s"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Name = "FileTransferFileReadLoad_5m"; + Verbosity = 1; + Desc = "Number of file transfer processes reading input data from files"; + Units = "processes"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +/* This looks like a mismatch of stuff */ +[ + Name = "FileTransferFileReadSeconds"; + Verbosity = 99; + Derivative = true; + Title = "File Transfer File Read Load"; + Desc = "Number of file transfer processes reading input data from files"; + Units = "processes"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Name = "FileTransferFileWriteLoad_5m"; + Verbosity = 1; + Desc = "Number of file transfer processes writing output data to files"; + Units = "processes"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +/* This looks like a mismatch of stuff */ +[ + Name = "FileTransferFileWriteSeconds"; + Verbosity = 99; + Derivative = true; + Title = "File Transfer File Write Load"; + Desc = "Number of file transfer processes writing output data to files"; + Units = "processes"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Name = "FileTransferNetReadLoad_5m"; + Verbosity = 1; + Desc = "Number of file transfer processes reading output data from the network"; + Units = "processes"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +/* This looks like a mismatch of stuff */ +[ + Name = "FileTransferNetReadSeconds"; + Verbosity = 99; + Derivative = true; + Desc = "Number of file transfer processes reading output data from the network"; + Units = "processes"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Name = "FileTransferNetWriteLoad_5m"; + Verbosity = 1; + Desc = "Number of file transfer processes writing input data to the network"; + Units = "processes"; + TargetType = "Scheduler"; + Group = "HTCondor 
File Transfer"; +] +/* This looks like a mismatch of stuff */ +[ + Name = "FileTransferNetWriteSeconds"; + Verbosity = 99; + Derivative = true; + Title = "File Transfer Net Write Load"; + Desc = "Number of file transfer processes writing input data to the network"; + Units = "processes"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Name = "FileTransferUploadBytes"; + Derivative = true; + Title = "File Transfer Upload Bandwidth"; + Desc = "Input transfers to jobs"; + Units = "bytes"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Name = "FileTransferUploadBytesPerSecond_5m"; + Verbosity = 1; + Desc = "Rate of input transfers to jobs"; + Units = "bytes/s"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Regex = "Owner_([^_]*)_FileTransferUploadBytesPerSecond_5m"; + Title = "\\1 Upload Bytes Per Second"; + Verbosity = 2; + Desc = "Rate of input transfers from jobs by user \\1"; + Units = "bytes/s"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Name = "TransferQueueDownloadWaitTime"; + Desc = "Oldest output file transfer waiting in the transfer queue"; + Units = "seconds"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Aggregate = "MAX"; + Name = "Pool Max TransferQueueDownloadWaitTime"; + Value = TransferQueueDownloadWaitTime; + Desc = "Oldest output file transfer waiting in the transfer queues reporting to this pool"; + Units = "seconds"; + TargetType = "Scheduler"; +] +[ + Name = "TransferQueueNumDownloading"; + Desc = "Number of jobs actively transferring output"; + Units = "jobs"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Name = "TransferQueueNumUploading"; + Desc = "Number of jobs actively transferring input"; + Units = "jobs"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Name = "TransferQueueNumWaitingToDownload"; + Desc = "Number of jobs waiting in the transfer queue to transfer 
output"; + Units = "jobs"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Name = "TransferQueueNumWaitingToUpload"; + Desc = "Number of jobs waiting in the transfer queue to transfer input"; + Units = "jobs"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Name = "TransferQueueUploadWaitTime"; + Desc = "Oldest input file transfer waiting in the transfer queue"; + Units = "seconds"; + TargetType = "Scheduler"; + Group = "HTCondor File Transfer"; +] +[ + Aggregate = "MAX"; + Name = "Pool Max TransferQueueUploadWaitTime"; + Value = TransferQueueUploadWaitTime; + Desc = "Oldest input file transfer waiting in the transfer queues reporting to this pool"; + Units = "seconds"; + TargetType = "Scheduler"; +] + +[ + Name = "LastNegotiationCycleActiveSubmitterCount0"; + Verbosity = 1; + Desc = "The number of job submitters considered in the negotiation cycle"; + Units = "submitters"; + TargetType = "Negotiator"; +] +[ + Name = "LastNegotiationCycleCandidateSlots0"; + Verbosity = 2; + Desc = "The number of slot ClassAds considered for matchmaking (reduced by NEGOTIATOR_SLOT_POOLSIZE_CONSTRAINT if applicable)"; + Units = "slots"; + TargetType = "Negotiator"; +] +[ + Name = "LastNegotiationCycleDuration0"; + Desc = "The number of seconds that it took to complete the negotiation cycle"; + Units = "seconds"; + TargetType = "Negotiator"; +] +[ + Name = "LastNegotiationCycleMatches0"; + Verbosity = 1; + Desc = "The number of successful matches that were made in the negotiation cycle"; + Units = "matches"; + TargetType = "Negotiator"; +] +[ + Name = "LastNegotiationCycleMatchRate0"; + Verbosity = 1; + Desc = "Matches made per second during negotiation cycle"; + Units = "matches/s"; + TargetType = "Negotiator"; +] +[ + Name = "LastNegotiationCycleMatchRateSustained0"; + Verbosity = 1; + Desc = "Matches made per second, including waiting time between negotiation cycles"; + Units = "matches/s"; + TargetType = "Negotiator"; +] +[ + Name = 
"LastNegotiationCycleNumIdleJobs0"; + Verbosity = 1; + Desc = "The number of idle jobs belonging to job submitters"; + Units = "jobs"; + TargetType = "Negotiator"; +] +[ + Name = "LastNegotiationCycleNumJobsConsidered0"; + Verbosity = 1; + Desc = "The number of jobs considered for matchmaking (may be much lower than idle jobs due to auto-cluster optimizations)"; + Units = "jobs"; + TargetType = "Negotiator"; +] +[ + Name = "LastNegotiationCycleNumSchedulers0"; + Verbosity = 2; + Desc = "The number of schedds involved in negotiation for resources"; + Units = "schedds"; + TargetType = "Negotiator"; +] +[ + Name = "LastNegotiationCyclePeriod0"; + Verbosity = 1; + Desc = "Seconds between the end of one cycle and the end of the next"; + Units = "seconds"; + TargetType = "Negotiator"; +] +[ + Name = "LastNegotiationCyclePhase1Duration0"; + Verbosity = 2; + Desc = "Duration of Phase 1: getting submitter and machine ClassAds"; + Units = "seconds"; + TargetType = "Negotiator"; +] +[ + Name = "LastNegotiationCyclePhase2Duration0"; + Verbosity = 2; + Desc = "Duration of Phase 2: filtering slots and processing accounting group configuration"; + Units = "seconds"; + TargetType = "Negotiator"; +] +[ + Name = "LastNegotiationCyclePhase3Duration0"; + Verbosity = 2; + Desc = "Phase 3 of the negotiation cycle: sorting submitters by priority"; + Units = "seconds"; + TargetType = "Negotiator"; +] +[ + Name = "LastNegotiationCyclePhase4Duration0"; + Verbosity = 2; + Desc = "Phase 4 of the negotiation cycle: matching slots to jobs"; + Units = "seconds"; + TargetType = "Negotiator"; +] +[ + Name = "LastNegotiationCycleRejections0"; + Verbosity = 1; + Desc = "The number of rejections that occurred in the negotiation cycle (only one per auto-cluster)"; + Units = "jobs"; + TargetType = "Negotiator"; +] +[ + Name = "LastNegotiationCycleSlotShareIter0"; + Verbosity = 2; + Desc = "The number of iterations in the negotiation cycle"; + TargetType = "Negotiator"; +] +[ + Name = 
"LastNegotiationCycleTotalSlots0"; + Verbosity = 1; + Desc = "The total number of slot ClassAds that matched NEGOTIATOR_SLOT_CONSTRAINT"; + Units = "slots"; + TargetType = "Negotiator"; +] +[ + Name = "LastNegotiationCycleTrimmedSlots0"; + Verbosity = 2; + Desc = "The number of slot ClassAds considered for matchmaking, after filtering by NEGOTIATOR_CONSIDER_PREEMPTION, if applicable"; + Units = "slots"; + TargetType = "Negotiator"; +] + +[ + Name = "ExpectedMachineGracefulDrainingBadput"; + Verbosity = 2; + Desc = "Job runtime that would be lost if graceful draining were initiated now."; + Units = "cpus*seconds"; + TargetType = "Machine_slot1"; +] +[ + Name = "ExpectedMachineGracefulDrainingCompletion"; + Value = ExpectedMachineGracefulDrainingCompletion - time(); + Verbosity = 2; + Desc = "Time graceful draining could take to complete, assuming jobs take full retirement and vacate time and there is no suspension"; + Units = "seconds"; + TargetType = "Machine_slot1"; +] +[ + Name = "ExpectedMachineQuickDrainingBadput"; + Verbosity = 2; + Desc = "Job runtime that would be lost if quick draining were initiated now."; + Units = "cpus*seconds"; + TargetType = "Machine_slot1"; +] +[ + Name = "ExpectedMachineQuickDrainingCompletion"; + Verbosity = 2; + Desc = "Time quick draining could take to complete, assuming jobs take full retirement and vacate time and there is no suspension"; + Units = "seconds"; + TargetType = "Machine_slot1"; +] +[ + Name = "Linpack"; + Value = KFlops; + Verbosity = 2; + Desc = "Linpack floating point benchmark"; + Units = "FLOPS"; + Scale = 1000; + Type = "float"; + TargetType = "Machine_slot1"; +] +[ + Name = "Dhrystone"; + Value = Mips; + Verbosity = 2; + Desc = "Dhrystone integer benchmark"; + Units = "Iterations/sec"; + Scale = 1000000; + Type = "float"; + TargetType = "Machine_slot1"; +] +[ + Name = "TotalCondorLoadAvg"; + Verbosity = 1; + Desc = "The CPU load attributed to jobs"; + TargetType = "Machine_slot1"; +] +[ + Name = "TotalCpus"; 
+ Verbosity = 2; + Desc = "Number of cores"; + Units = "cores"; + TargetType = "Machine_slot1"; +] +[ + Aggregate = "SUM"; + Name = "Cpus in Pool"; + Value = TotalCpus; + Verbosity = 2; + Desc = "Number of cores in the pool"; + Units = "cores"; + TargetType = "Machine_slot1"; +] +[ + Name = "TotalDisk"; + Verbosity = 2; + Desc = "Disk space in the job execute directory"; + Units = "bytes"; + Scale = 1024; + Type = "float"; + TargetType = "Machine_slot1"; +] +[ + Name = "TotalLoadAvg"; + Verbosity = 2; + Desc = "System load average"; + TargetType = "Machine_slot1"; +] +[ + Name = "TotalMemory"; + Verbosity = 2; + Desc = "RAM"; + Units = "bytes"; + Scale = 1048576; + Type = "float"; + TargetType = "Machine_slot1"; +] +[ + Name = "TotalSlots"; + Verbosity = 2; + Desc = "Number of slots"; + Units = "slots"; + TargetType = "Machine_slot1"; +] +[ + Aggregate = "SUM"; + Name = "Pool Slot Count"; + Value = TotalSlots; + Desc = "Number of slots in the pool"; + Units = "slots"; + TargetType = "Machine_slot1"; +] +[ + Name = "TotalMachineDrainingBadput"; + Verbosity = 1; + Desc = "Job runtime that has been lost due to job evictions caused by draining"; + Units = "cpus*seconds"; + TargetType = "Machine_slot1"; +] +[ + Name = "TotalMachineDrainingUnclaimedTime"; + Verbosity = 1; + Desc = "Time that has not been used due to draining"; + Units = "cpus*seconds"; + TargetType = "Machine_slot1"; +] +[ + Name = "TotalVirtualMemory"; + Verbosity = 2; + Desc = "Addressable memory (RAM plus swap)"; + Units = "bytes"; + Scale = 1024; + Type = "float"; + TargetType = "Machine_slot1"; +] +[ + Name = "TotalPreemptions"; + Verbosity = 2; + Desc = "Total number of preempted jobs on this startd"; + Units = "preemptions"; + TargetType = "Machine_slot1"; +] +[ + Name = "TotalJobStarts"; + Verbosity = 2; + Desc = "Total number of jobs started on this startd since boot"; + Units = "jobs"; + TargetType = "Machine_slot1"; +] +[ + Aggregate = "SUM"; + Name = "Poolwide Preemptions"; + Value = 
TotalPreemptions; + Verbosity = 2; + Desc = "Poolwide Preemptions"; + Units = "preemptions"; + TargetType = "Machine_slot1"; +] +[ + Aggregate = "SUM"; + Name = "Poolwide Job Starts"; + Value = TotalJobStarts; + Verbosity = 2; + Desc = "Poolwide Job Starts"; + Units = "jobs"; + TargetType = "Machine_slot1"; +] +[ + Name = "AutoClusters"; + Desc = "Number of active AutoClusters in the schedd"; + Units = "autoclusters"; + TargetType = "Scheduler"; +] +[ + Aggregate = "SUM"; + Name = "AutoClusters in Pool"; + Value = AutoClusters; + Desc = "Number of active AutoClusters in schedds reporting to this pool"; + Units = "autoclusters"; + TargetType = "Scheduler"; +] +[ + Name = strcat(MyType,"WholeMachines"); + Value = WholeMachines; + Verbosity = 2; + Desc = "Number of machines that were observed to be defragmented in the last polling interval"; + TargetType = "Defrag"; +] +[ + Name = strcat(MyType,"MachinesDraining"); + Value = MachinesDraining; + Verbosity = 2; + Desc = "Number of machines that were observed to be draining in the last polling interval"; + TargetType = "Defrag"; +] +[ + Name = strcat(MyType,"RecentDrainSuccesses"); + Value = RecentDrainSuccesses; + Verbosity = 2; + Desc = "Count of successful attempts to initiate draining during the past RecentStatsLifetime seconds"; + TargetType = "Defrag"; +] +[ + Name = strcat(MyType,"RecentDrainFailures"); + Value = RecentDrainFailures; + Verbosity = 2; + Desc = "Count of failed attempts to initiate draining during the past RecentStatsLifetime seconds"; + TargetType = "Defrag"; +] +[ + Name = strcat(MyType,"AvgDrainingUnclaimed"); + Value = AvgDrainingUnclaimed; + Verbosity = 2; + Desc = "Fraction of time CPUs in the pool have spent unclaimed by a user during draining of the machine"; + TargetType = "Defrag"; +] +[ + Name = strcat(MyType,"WholeMachinesPeak"); + Value = WholeMachinesPeak; + Verbosity = 2; + Desc = "Largest number of machines that were ever observed to be simultaneously defragmented"; + TargetType = 
"Defrag"; +] +[ + Name = strcat(MyType,"AvgDrainingBadput"); + Value = AvgDrainingBadput; + Verbosity = 2; + Desc = "Fraction of time CPUs in the pool have spent on jobs that were killed during draining of the machine"; + TargetType = "Defrag"; +] +[ + Name = strcat(MyType,"MachinesDrainingPeak"); + Value = MachinesDrainingPeak; + Verbosity = 2; + Desc = "Largest number of machines that were ever observed to be draining"; + TargetType = "Defrag"; +] diff --git a/files/htcondor.Dockerfile b/files/htcondor.Dockerfile new file mode 100644 index 0000000..f4ae673 --- /dev/null +++ b/files/htcondor.Dockerfile @@ -0,0 +1,8 @@ +FROM docker.io/library/centos:7 + +RUN yum install -y https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm && \ + yum install --nogpgcheck -y condor && \ + yum install -y less && \ + yum clean all + +CMD bash -c 'cat <({ condor_master -f & tail --retry --pid $! -f /var/log/condor/MasterLog & })' diff --git a/inv.yml b/inv.yml new file mode 100644 index 0000000..7a96607 --- /dev/null +++ b/inv.yml @@ -0,0 +1,6 @@ +all: + hosts: + ed-c7-1: + ansible_user: root + ansible_host: ed-c7-1.virt.magni.thoto.net + # ansible_host: 192.168.122.139 diff --git a/play.yml b/play.yml new file mode 100644 index 0000000..deae3c4 --- /dev/null +++ b/play.yml @@ -0,0 +1,200 @@ +--- +- hosts: ed-c7-1 + tasks: +# - copy: +# dest: /etc/profile.d/vim-alias.sh +# content: "alias vim=/usr/bin/vi" + - yum: + name: + - vim-enhanced + - htop + - screen + state: present + + - yum: + name: https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm + state: present + + - yum: + name: htcondor-ce + state: present + + # FIXME + - yum: + name: minicondor + state: present + + - yum: + name: singularity + state: present + tags: "singularity" + + # enable fakeroot in singularity + - sysctl: + name: user.max_user_namespaces + value: "15000" + sysctl_file: /etc/sysctl.d/90-max_net_namespaces.conf + tags: "singularity" + +# - 
shell: +# command: singularity config fakeroot --add thoto + - name: "enable user thoto for fakeroot access" + lineinfile: + line: "thoto:4294836224:65536" + dest: "{{item}}" + with_items: ["/etc/subuid", "/etc/subgid"] + tags: "singularity" + + - block: + - get_url: + url: "https://download.docker.com/linux/centos/docker-ce.repo" + dest: "/etc/yum.repos.d/docker-ce.repo" + checksum: sha256:8ab5599eef0afcac10cbd3e8670873efee20fcceb5fb3526a62edeade603cec7 + + - yum: + name: docker-ce + state: present + + - parted: + device: /dev/vdb + number: 1 + state: present + # fs_type: xfs + + - filesystem: + dev: /dev/vdb1 + fstype: xfs + opts: "-L image-store" + + - mount: + path: "/container" + src: "/dev/vdb1" + fstype: xfs + opts: "noatime" + state: mounted + + - file: + path: "/container/docker" + state: directory + owner: root + group: root + mode: "u=rwx,g=x,o=x" + + - file: + path: "/var/lib/docker" + src: "/container/docker" + state: link + owner: root + group: root + mode: "u=rwx,g=x,o=x" + + - service: + name: docker + enabled: True + state: started + + - user: + name: thoto + groups: docker + append: True + + - yum: + name: python-docker-py + state: present + tags: "docker" + + - block: + - file: + path: "/container/docker-images/htcondor" + state: directory + owner: thoto + group: docker + mode: "u=rwx,g=rwx,o=rx" + + - copy: + dest: "/container/docker-images/htcondor/Dockerfile" + src: "htcondor.Dockerfile" + owner: thoto + group: docker + register: cp_dockerfile + + - docker_image: + name: "htcondor" +# pull: False + build: + pull: False + path: "/container/docker-images/htcondor" + source: build + force_source: "{{cp_dockerfile.changed}}" + + - copy: + src: "conf/{{item}}/" + dest: "/container/volumes/{{item}}/" + owner: thoto + group: docker + mode: "u=rwx,g=rwx" + with_items: [ "cm", "exec", "sub", "common"] + + - name: "check if pool shared secret exists" + stat: + path: "/container/volumes/common/passwords.d/POOL" + register: pool_pw + + - block: + - name: 
"create temporary password store" + tempfile: + state: directory + register: pool_pw_tmp + + - name: "generate pool password" + copy: + dest: "{{pool_pw_tmp.path}}/poolpw" + content: "{{lookup('password','/dev/null')}}" + no_log: True + + - name: "install pool password" + docker_container: + name: "condor-common" + image: htcondor + state: started + volumes: + - "/container/volumes/common/:/etc/condor/:rw" + - "{{pool_pw_tmp.path}}:/tmp/poolpw:ro" + detach: False + cleanup: True + command: "condor_store_cred add -c -i /tmp/poolpw/poolpw" + + always: + - file: + path: "{{pool_pw_tmp.path}}" + state: absent + when: pool_pw_tmp is defined and pool_pw_tmp.path + when: not pool_pw.stat.exists + + - name: "sync common files to individual containers" + copy: + remote_src: True + force: True + src: "/container/volumes/common/" + dest: "/container/volumes/{{item}}/" + with_items: [ "cm", "exec", "sub"] + + - docker_container: + name: "condor-cm" + image: htcondor + state: started + detach: True + cleanup: True + volumes: + - "/container/volumes/cm/:/etc/condor/:rw" +# auto_remove: True +# mounts: +# src: /container/volumes/cm/ +# dest: /etc/condor/ + + +# - add_host: +# hostname: foo +# ansible_connection: docker_api +# docker_host: ssh://ed-c7-1.virt.magni.thoto.net + tags: "docker-con"