Compare commits
4 Commits
Author | SHA1 | Date | |
---|---|---|---|
fdd4bd6bf0
|
|||
f7dd3bcf02
|
|||
f2cb9b2c6b
|
|||
38a5b89de9 |
13
inv.yml
13
inv.yml
@@ -1,12 +1,15 @@
|
|||||||
all:
|
all:
|
||||||
hosts:
|
hosts:
|
||||||
centos7VM:
|
ed-c7-1:
|
||||||
ansible_user: root
|
ansible_user: root
|
||||||
ansible_host: localhost
|
ansible_host: ed-c7-1.virt.magni.thoto.net
|
||||||
ansible_port: 19322
|
|
||||||
ssh_args: -o ControlMaster=auto -o ControlPersist=60s
|
ssh_args: -o ControlMaster=auto -o ControlPersist=60s
|
||||||
unpriv_user: centos7 # gains privileges to use docker
|
# ansible_host: 192.168.122.139
|
||||||
|
unpriv_user: thoto
|
||||||
children:
|
children:
|
||||||
|
htcondor:
|
||||||
|
hosts:
|
||||||
|
ed-c7-1:
|
||||||
slurm:
|
slurm:
|
||||||
hosts:
|
hosts:
|
||||||
centos7VM
|
ed-c7-1:
|
||||||
|
43
play.yml
43
play.yml
@@ -1,15 +1,46 @@
|
|||||||
---
|
---
|
||||||
- hosts: all
|
- hosts: all
|
||||||
tasks:
|
tasks:
|
||||||
- name: "install epel-release"
|
|
||||||
yum:
|
|
||||||
name: [ epel-release ] # necessary to find htop => separate step
|
|
||||||
state: present
|
|
||||||
- name: "install tools"
|
- name: "install tools"
|
||||||
yum:
|
yum:
|
||||||
name: [ vim-enhanced, htop, screen, tmux, bind-utils, nmap-ncat, net-tools ]
|
name: [ vim-enhanced, htop, screen, bind-utils, nmap-ncat, net-tools ]
|
||||||
state: present
|
state: present
|
||||||
|
|
||||||
|
- hosts: htcondor
|
||||||
|
pre_tasks:
|
||||||
|
- name: "install htcondor repo"
|
||||||
|
yum:
|
||||||
|
name: https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm
|
||||||
|
state: present
|
||||||
|
tags: htcondor
|
||||||
|
|
||||||
|
- name: "install htcondor software "
|
||||||
|
yum:
|
||||||
|
name: htcondor-ce
|
||||||
|
state: present
|
||||||
|
tags: htcondor
|
||||||
|
|
||||||
|
- name: "remove minicondor configuration"
|
||||||
|
yum:
|
||||||
|
name: minicondor
|
||||||
|
state: absent
|
||||||
|
tags: htcondor
|
||||||
|
|
||||||
|
- name: "setup singularity"
|
||||||
|
import_tasks: "singularity.yml"
|
||||||
|
tags: singularity
|
||||||
|
|
||||||
|
roles:
|
||||||
|
- name: "setup docker"
|
||||||
|
role: docker
|
||||||
|
tags: docker
|
||||||
|
|
||||||
|
- name: "setup htcondor test environment in docker containers"
|
||||||
|
role: docker-htcondor
|
||||||
|
tags:
|
||||||
|
- htcondor-containered
|
||||||
|
- htcondor
|
||||||
|
|
||||||
- hosts: slurm
|
- hosts: slurm
|
||||||
vars:
|
vars:
|
||||||
container_privileged: True
|
container_privileged: True
|
||||||
@@ -20,4 +51,6 @@
|
|||||||
tags: docker
|
tags: docker
|
||||||
- name: "setup slurm test environment in docker containers"
|
- name: "setup slurm test environment in docker containers"
|
||||||
role: slurm
|
role: slurm
|
||||||
|
vars:
|
||||||
|
slurm_user: slurm # or root
|
||||||
tags: slurm
|
tags: slurm
|
||||||
|
@@ -0,0 +1,3 @@
|
|||||||
|
use ROLE:centralmanager
|
||||||
|
# ALLOW_WRITE defaults to "*", so the following line has no effect yet
|
||||||
|
ALLOW_WRITE_COLLECTOR=$(ALLOW_WRITE) condor-exec.htc.local condor-sub.htc.local
|
92
roles/docker-htcondor/files/conf/common/condor_config
Normal file
92
roles/docker-htcondor/files/conf/common/condor_config
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
######################################################################
|
||||||
|
##
|
||||||
|
## condor_config
|
||||||
|
##
|
||||||
|
## This is the global configuration file for condor. This is where
|
||||||
|
## you define where the local config file is. Any settings
|
||||||
|
## made here may potentially be overridden in the local configuration
|
||||||
|
## file. KEEP THAT IN MIND! To double-check that a variable is
|
||||||
|
## getting set from the configuration file that you expect, use
|
||||||
|
## condor_config_val -v <variable name>
|
||||||
|
##
|
||||||
|
## condor_config.annotated is a more detailed sample config file
|
||||||
|
##
|
||||||
|
## Unless otherwise specified, settings that are commented out show
|
||||||
|
## the defaults that are used if you don't define a value. Settings
|
||||||
|
## that are defined here MUST BE DEFINED since they have no default
|
||||||
|
## value.
|
||||||
|
##
|
||||||
|
######################################################################
|
||||||
|
|
||||||
|
## Where have you installed the bin, sbin and lib condor directories?
|
||||||
|
RELEASE_DIR = /usr
|
||||||
|
|
||||||
|
## Where is the local condor directory for each host? This is where the local config file(s), logs and
|
||||||
|
## spool/execute directories are located. This is the default for Linux and Unix systems.
|
||||||
|
LOCAL_DIR = /var
|
||||||
|
|
||||||
|
## Where is the machine-specific local config file for each host?
|
||||||
|
# LOCAL_CONFIG_FILE = /etc/condor/condor_config.local
|
||||||
|
LOCAL_CONFIG_FILE = /etc/condor/condor_config_$(HOSTNAME).local
|
||||||
|
## If your configuration is on a shared file system, then this might be a better default
|
||||||
|
#LOCAL_CONFIG_FILE = $(RELEASE_DIR)/etc/$(HOSTNAME).local
|
||||||
|
## If the local config file is not present, is it an error? (WARNING: This is a potential security issue.)
|
||||||
|
REQUIRE_LOCAL_CONFIG_FILE = false
|
||||||
|
|
||||||
|
## The normal way to do configuration with RPMs is to read all of the
|
||||||
|
## files in a given directory that don't match a regex as configuration files.
|
||||||
|
## Config files are read in lexicographic order.
|
||||||
|
LOCAL_CONFIG_DIR = /etc/condor/config.d
|
||||||
|
#LOCAL_CONFIG_DIR_EXCLUDE_REGEXP = ^((\..*)|(.*~)|(#.*)|(.*\.rpmsave)|(.*\.rpmnew))$
|
||||||
|
|
||||||
|
##
|
||||||
|
## Do NOT use host-based security by default.
|
||||||
|
##
|
||||||
|
## This was the default for the 8.8 series (and earlier), but it is
|
||||||
|
## intrinsically insecure. To make the 9.0 series secure by default, we
|
||||||
|
## commented it out.
|
||||||
|
##
|
||||||
|
## You should seriously consider improving your security configuration.
|
||||||
|
##
|
||||||
|
## To continue to use your old security configuration, knowing that it is
|
||||||
|
## insecure, add the line 'use SECURITY : HOST_BASED' to your local
|
||||||
|
## configuration directory. Don't just uncomment the final line in this
|
||||||
|
## comment block; changes in this file may be lost during your next upgrade.
|
||||||
|
## The following shell command will make the change on most Linux systems.
|
||||||
|
##
|
||||||
|
## echo 'use SECURITY : HOST_BASED' >> $(condor_config_val LOCAL_CONFIG_DIR)/00-insecure.config
|
||||||
|
##
|
||||||
|
|
||||||
|
## To expand your condor pool beyond a single host, set ALLOW_WRITE to match all of the hosts
|
||||||
|
#ALLOW_WRITE = *.cs.wisc.edu
|
||||||
|
## FLOCK_FROM defines the machines that grant access to your pool via flocking. (i.e. these machines can join your pool).
|
||||||
|
#FLOCK_FROM =
|
||||||
|
## FLOCK_TO defines the central managers that your schedd will advertise itself to (i.e. these pools will give matches to your schedd).
|
||||||
|
#FLOCK_TO = condor.cs.wisc.edu, cm.example.edu
|
||||||
|
|
||||||
|
##--------------------------------------------------------------------
|
||||||
|
## Values set by the rpm patch script:
|
||||||
|
##--------------------------------------------------------------------
|
||||||
|
|
||||||
|
## For Unix machines, the path and file name of the file containing
|
||||||
|
## the pool password for password authentication.
|
||||||
|
#SEC_PASSWORD_FILE = $(LOCAL_DIR)/lib/condor/pool_password
|
||||||
|
|
||||||
|
## Pathnames
|
||||||
|
RUN = $(LOCAL_DIR)/run/condor
|
||||||
|
LOG = $(LOCAL_DIR)/log/condor
|
||||||
|
LOCK = $(LOCAL_DIR)/lock/condor
|
||||||
|
SPOOL = $(LOCAL_DIR)/lib/condor/spool
|
||||||
|
EXECUTE = $(LOCAL_DIR)/lib/condor/execute
|
||||||
|
BIN = $(RELEASE_DIR)/bin
|
||||||
|
LIB = $(RELEASE_DIR)/lib64/condor
|
||||||
|
INCLUDE = $(RELEASE_DIR)/include/condor
|
||||||
|
SBIN = $(RELEASE_DIR)/sbin
|
||||||
|
LIBEXEC = $(RELEASE_DIR)/libexec/condor
|
||||||
|
SHARE = $(RELEASE_DIR)/share/condor
|
||||||
|
|
||||||
|
PROCD_ADDRESS = $(RUN)/procd_pipe
|
||||||
|
|
||||||
|
JAVA_CLASSPATH_DEFAULT = $(SHARE) .
|
||||||
|
|
||||||
|
## Install the minicondor package to run HTCondor on a single node
|
@@ -0,0 +1,64 @@
|
|||||||
|
|
||||||
|
HostKey _INSERT_HOST_KEY_
|
||||||
|
AuthorizedKeysFile _INSERT_AUTHORIZED_KEYS_FILE_
|
||||||
|
|
||||||
|
# The following option is not supported by all recent versions of OpenSSH,
|
||||||
|
# so instead we rely on injection of the shell setup command in the authorized
|
||||||
|
# keys file.
|
||||||
|
#ForceCommand _INSERT_FORCE_COMMAND_
|
||||||
|
|
||||||
|
# as a convenience to users, allow remote setting of environment
|
||||||
|
# since sshd is running as the job uid, there isn't really a security concern
|
||||||
|
AcceptEnv *
|
||||||
|
|
||||||
|
Subsystem sftp /usr/libexec/openssh/sftp-server
|
||||||
|
|
||||||
|
# prevent sshd from freaking out about reading files from inside
|
||||||
|
# a tmp-like directory
|
||||||
|
StrictModes no
|
||||||
|
|
||||||
|
# Rejection by tcp wrappers is not logged at INFO or VERBOSE log levels,
|
||||||
|
# so to make diagnosis of problems easier, we use DEBUG.
|
||||||
|
LogLevel DEBUG
|
||||||
|
|
||||||
|
X11Forwarding yes
|
||||||
|
|
||||||
|
# By default, OpenSSH uses the ipv6 loopback even if ipv6 is disabled in the
|
||||||
|
# kernel. This forces OpenSSH to use the "local" network instead
|
||||||
|
X11UseLocalhost no
|
||||||
|
|
||||||
|
# The following settings are recommended for good security.
|
||||||
|
# In particular, only FIPS 140-2 algorithms are used.
|
||||||
|
# URLs for extra information re FIPS security compliance:
|
||||||
|
# https://rhel7stig.readthedocs.io/en/latest/
|
||||||
|
# https://www.stigviewer.com/stig/red_hat_enterprise_linux_7/
|
||||||
|
# https://people.redhat.com/swells/scap-security-guide/tables/table-rhel7-stig-manual.html
|
||||||
|
|
||||||
|
#-FIPS High Severity
|
||||||
|
Protocol 2
|
||||||
|
PermitEmptyPasswords no
|
||||||
|
|
||||||
|
#-FIPS Medium Severity
|
||||||
|
# Note: Ciphers and MACs below will be incompatible with RHEL5 or earlier.
|
||||||
|
Ciphers aes128-ctr,aes192-ctr,aes256-ctr
|
||||||
|
MACs hmac-sha2-256,hmac-sha2-512
|
||||||
|
PermitRootLogin no
|
||||||
|
PermitUserEnvironment no
|
||||||
|
GSSAPIAuthentication no
|
||||||
|
KerberosAuthentication no
|
||||||
|
HostbasedAuthentication no
|
||||||
|
IgnoreRhosts yes
|
||||||
|
IgnoreUserKnownHosts yes
|
||||||
|
PrintLastLog yes
|
||||||
|
UsePrivilegeSeparation sandbox
|
||||||
|
Compression delayed
|
||||||
|
|
||||||
|
#-Recommended for security, but left out of the ssh_to_job config
|
||||||
|
# because they provide minimal value and are likely to annoy
|
||||||
|
# users or generate needless warnings in the ssh_to_job setting.
|
||||||
|
#
|
||||||
|
# ClientAliveInterval 600 # Note: condor_submit -i sets TMOUT
|
||||||
|
# ClientAliveCountMax 0
|
||||||
|
# banner=/etc/issue # Set to your warning banner
|
||||||
|
# StrictModes yes # Can't set due to tmp-like directory
|
||||||
|
# RhostsRSAAuthentication no # Obsolete Protocol version 1 option
|
@@ -0,0 +1,26 @@
|
|||||||
|
##
|
||||||
|
## Default security settings
|
||||||
|
##
|
||||||
|
## Host-based security was the default for the 8.8 series (and earlier).
|
||||||
|
##
|
||||||
|
## Host-based security assumes that all users on a machine are trusted.
|
||||||
|
## For example, if host-based security trusts that a given machine can
|
||||||
|
## run jobs, then any user who can start a process on that machine can
|
||||||
|
## start a startd that can "steal" jobs from the system.
|
||||||
|
#
|
||||||
|
## To help make HTCondor secure by default, we removed host-based security
|
||||||
|
## from the default configuration file
|
||||||
|
## (/etc/condor/condor_config).
|
||||||
|
##
|
||||||
|
## New installations of HTCondor should be made using the get_htcondor tool,
|
||||||
|
## which can automatically establish IDTOKENS-based security across a multi-
|
||||||
|
## node pool. For existing installations, we recommend you
|
||||||
|
## consider improving your security configuration.
|
||||||
|
##
|
||||||
|
## To continue to use your old security configuration,
|
||||||
|
## comment out the 'recommended' line below, and uncomment the
|
||||||
|
## 'host_based' line.
|
||||||
|
##
|
||||||
|
|
||||||
|
# use security : host_based
|
||||||
|
use security : recommended_v9_0
|
@@ -0,0 +1 @@
|
|||||||
|
CONDOR_HOST = condor-cm.htc.local
|
1081
roles/docker-htcondor/files/conf/common/ganglia.d/00_default_metrics
Normal file
1081
roles/docker-htcondor/files/conf/common/ganglia.d/00_default_metrics
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1 @@
|
|||||||
|
use ROLE:execute
|
@@ -0,0 +1 @@
|
|||||||
|
use ROLE:submit
|
11
roles/docker-htcondor/files/htcondor.Dockerfile
Normal file
11
roles/docker-htcondor/files/htcondor.Dockerfile
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
FROM docker.io/library/centos:7
|
||||||
|
|
||||||
|
RUN yum install -y https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm && \
|
||||||
|
yum install --nogpgcheck -y condor && \
|
||||||
|
yum install -y less && \
|
||||||
|
yum clean all
|
||||||
|
|
||||||
|
RUN yum install -y iproute bind-utils nmap-ncat net-tools && \
|
||||||
|
yum clean all
|
||||||
|
|
||||||
|
CMD bash -c 'cat <({ condor_master -f & tail --retry --pid $! -f /var/log/condor/MasterLog & })'
|
142
roles/docker-htcondor/tasks/main.yml
Normal file
142
roles/docker-htcondor/tasks/main.yml
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
- name: "htcondor docker image"
|
||||||
|
file:
|
||||||
|
path: "/container/docker-images/htcondor"
|
||||||
|
state: directory
|
||||||
|
owner: "{{unpriv_user}}"
|
||||||
|
group: docker
|
||||||
|
mode: "u=rwx,g=rwx,o=rx"
|
||||||
|
|
||||||
|
- copy:
|
||||||
|
dest: "/container/docker-images/htcondor/Dockerfile"
|
||||||
|
src: "htcondor.Dockerfile"
|
||||||
|
owner: "{{unpriv_user}}"
|
||||||
|
group: docker
|
||||||
|
register: cp_dockerfile
|
||||||
|
|
||||||
|
- docker_image:
|
||||||
|
name: "htcondor"
|
||||||
|
# pull: False
|
||||||
|
build:
|
||||||
|
pull: False
|
||||||
|
path: "/container/docker-images/htcondor"
|
||||||
|
source: build
|
||||||
|
force_source: "{{cp_dockerfile.changed}}"
|
||||||
|
|
||||||
|
- name: "copy htcondor container configuration"
|
||||||
|
copy:
|
||||||
|
src: "conf/{{item}}/"
|
||||||
|
dest: "/container/volumes/{{item}}/"
|
||||||
|
owner: "{{unpriv_user}}"
|
||||||
|
group: docker
|
||||||
|
mode: "u=rwx,g=rwx"
|
||||||
|
with_items: [ "cm", "exec", "sub", "common"]
|
||||||
|
|
||||||
|
- name: "check if pool shared secret exists"
|
||||||
|
stat:
|
||||||
|
path: "/container/volumes/common/passwords.d/POOL"
|
||||||
|
register: pool_pw
|
||||||
|
|
||||||
|
- block:
|
||||||
|
- name: "create temporary password store"
|
||||||
|
tempfile:
|
||||||
|
state: directory
|
||||||
|
register: pool_pw_tmp
|
||||||
|
|
||||||
|
- name: "generate pool password"
|
||||||
|
copy:
|
||||||
|
dest: "{{pool_pw_tmp.path}}/poolpw"
|
||||||
|
content: "{{lookup('password','/dev/null')}}"
|
||||||
|
no_log: True
|
||||||
|
|
||||||
|
- name: "install pool password"
|
||||||
|
docker_container:
|
||||||
|
name: "condor-common"
|
||||||
|
image: htcondor
|
||||||
|
state: started
|
||||||
|
volumes:
|
||||||
|
- "/container/volumes/common/:/etc/condor/:rw"
|
||||||
|
- "{{pool_pw_tmp.path}}:/tmp/poolpw:ro"
|
||||||
|
detach: False
|
||||||
|
cleanup: True
|
||||||
|
command: "condor_store_cred add -c -i /tmp/poolpw/poolpw"
|
||||||
|
|
||||||
|
- name: "remove tokens since pool password (cert) changed"
|
||||||
|
file:
|
||||||
|
path: "/container/volumes/{{item}}/tokens.d/condor@htc.local"
|
||||||
|
state: absent
|
||||||
|
with_items: [ "cm", "exec", "sub" ]
|
||||||
|
|
||||||
|
always:
|
||||||
|
- name: "remove temporary password store"
|
||||||
|
file:
|
||||||
|
path: "{{pool_pw_tmp.path}}"
|
||||||
|
state: absent
|
||||||
|
when: pool_pw_tmp is defined and pool_pw_tmp.path
|
||||||
|
when: not pool_pw.stat.exists
|
||||||
|
|
||||||
|
- name: "sync common files to individual containers"
|
||||||
|
copy:
|
||||||
|
remote_src: True
|
||||||
|
force: True
|
||||||
|
directory_mode: preserve
|
||||||
|
mode: preserve
|
||||||
|
src: "/container/volumes/common/"
|
||||||
|
dest: "/container/volumes/{{item}}/"
|
||||||
|
with_items: [ "cm", "exec", "sub"]
|
||||||
|
|
||||||
|
|
||||||
|
- name: "collect tokens to generate"
|
||||||
|
stat:
|
||||||
|
path: "/container/volumes/{{item}}/tokens.d/condor@htc.local"
|
||||||
|
with_items: [ "cm", "exec", "sub" ]
|
||||||
|
register: tokens_state
|
||||||
|
|
||||||
|
- name: "generate tokens"
|
||||||
|
docker_container:
|
||||||
|
name: "condor-common"
|
||||||
|
image: htcondor
|
||||||
|
state: started
|
||||||
|
volumes:
|
||||||
|
- "/container/volumes/{{item}}/:/etc/condor/:rw"
|
||||||
|
detach: False
|
||||||
|
cleanup: True
|
||||||
|
command: "condor_token_create -identity condor@$(domainname) -token /etc/condor/tokens.d/condor@htc.local"
|
||||||
|
with_items: "{{tokens_state.results | rejectattr('stat.exists') | map(attribute='item') | list }}"
|
||||||
|
|
||||||
|
|
||||||
|
- name: "create docker network to make service discovery work"
|
||||||
|
docker_network:
|
||||||
|
name: condor
|
||||||
|
state: present
|
||||||
|
|
||||||
|
# TODO: reserve an address using docker_network_info and assign it as an aux
|
||||||
|
# address so that cm gets a static address and is reachable from the
|
||||||
|
# htcondor instance running on the docker host, enabling job submission.
|
||||||
|
|
||||||
|
- name: "run htcondor containers"
|
||||||
|
docker_container:
|
||||||
|
name: "condor-{{item}}"
|
||||||
|
hostname: "condor-{{item}}"
|
||||||
|
domainname: "htc.local"
|
||||||
|
image: htcondor
|
||||||
|
state: started
|
||||||
|
detach: True
|
||||||
|
cleanup: True
|
||||||
|
networks_cli_compatible: True
|
||||||
|
networks:
|
||||||
|
- name: "condor"
|
||||||
|
aliases: [ "condor-{{item}}.htc.local" ]
|
||||||
|
volumes:
|
||||||
|
- "/container/volumes/{{item}}/:/etc/condor/:rw"
|
||||||
|
with_items: [ "cm", "exec", "sub"]
|
||||||
|
# auto_remove: True
|
||||||
|
# mounts:
|
||||||
|
# src: /container/volumes/cm/
|
||||||
|
# dest: /etc/condor/
|
||||||
|
|
||||||
|
|
||||||
|
#- add_host:
|
||||||
|
# hostname: foo
|
||||||
|
# ansible_connection: docker_api
|
||||||
|
# docker_host: ssh://ed-c7-1.virt.magni.thoto.net
|
||||||
|
|
@@ -8,6 +8,41 @@
|
|||||||
name: [ "docker-ce", "python-docker-py" ] # latter for ansible modules
|
name: [ "docker-ce", "python-docker-py" ] # latter for ansible modules
|
||||||
state: present
|
state: present
|
||||||
|
|
||||||
|
- name: "partition container image disk"
|
||||||
|
parted:
|
||||||
|
device: /dev/vdb
|
||||||
|
number: 1
|
||||||
|
state: present
|
||||||
|
# fs_type: xfs
|
||||||
|
|
||||||
|
- filesystem:
|
||||||
|
dev: /dev/vdb1
|
||||||
|
fstype: xfs
|
||||||
|
opts: "-L image-store"
|
||||||
|
|
||||||
|
- mount:
|
||||||
|
path: "/container"
|
||||||
|
src: "/dev/vdb1"
|
||||||
|
fstype: xfs
|
||||||
|
opts: "noatime"
|
||||||
|
state: mounted
|
||||||
|
|
||||||
|
- file:
|
||||||
|
path: "/container/docker"
|
||||||
|
state: directory
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: "u=rwx,g=x,o=x"
|
||||||
|
|
||||||
|
- name: "link docker configuration to new container partition"
|
||||||
|
file:
|
||||||
|
path: "/var/lib/docker"
|
||||||
|
src: "/container/docker"
|
||||||
|
state: link
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: "u=rwx,g=x,o=x"
|
||||||
|
|
||||||
- name: "enable docker service in systemd"
|
- name: "enable docker service in systemd"
|
||||||
service:
|
service:
|
||||||
name: docker
|
name: docker
|
||||||
|
@@ -1 +1,5 @@
|
|||||||
container_privileged: False
|
container_privileged: False
|
||||||
|
slurm_user: slurm
|
||||||
|
slurm_log_path_ctld: /var/log/slurm/slurmctld.log
|
||||||
|
slurm_log_path_d: /var/log/slurm/slurmd.log
|
||||||
|
slurm_log_path_sched: /var/log/slurm/slurmsched.log
|
||||||
|
@@ -1,8 +1,9 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
chown munge:munge /etc/munge/munge.key
|
if [ -f "/etc/munge/munge.key" ] ; then
|
||||||
|
chown munge:munge /etc/munge/munge.key
|
||||||
chmod 600 /etc/munge/munge.key
|
chmod 600 /etc/munge/munge.key
|
||||||
|
fi
|
||||||
|
|
||||||
exec "$@"
|
exec "$@"
|
||||||
|
@@ -2,25 +2,42 @@ FROM docker.io/library/centos:7 as base
|
|||||||
|
|
||||||
RUN yum install -y epel-release && \
|
RUN yum install -y epel-release && \
|
||||||
yum install -y slurm && \
|
yum install -y slurm && \
|
||||||
yum clean all
|
yum clean all && rm -rf /var/cache/yum
|
||||||
|
|
||||||
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
|
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
|
||||||
yum clean all
|
yum clean all && rm -rf /var/cache/yum
|
||||||
|
|
||||||
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
|
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
|
||||||
|
|
||||||
RUN chown root:root /usr/local/sbin/entrypoint.sh && chmod 755 /usr/local/sbin/entrypoint.sh
|
RUN chown root:root /usr/local/sbin/entrypoint.sh && \
|
||||||
|
chmod 755 /usr/local/sbin/entrypoint.sh
|
||||||
|
|
||||||
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
|
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
|
||||||
|
|
||||||
|
ARG slurmuser=slurm
|
||||||
|
ENV slurmuser=${slurmuser}
|
||||||
|
|
||||||
|
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
|
||||||
|
slurm-setuser -u $slurmuser -g $slurmuser -y
|
||||||
|
|
||||||
|
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
|
||||||
|
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
|
||||||
|
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
|
||||||
|
|
||||||
FROM base as slurmd
|
FROM base as slurmd
|
||||||
|
|
||||||
RUN yum install -y slurm-slurmd && \
|
RUN yum install -y slurm-slurmd && \
|
||||||
yum clean all
|
yum clean all && rm -rf /var/cache/yum
|
||||||
|
|
||||||
|
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
|
||||||
|
slurmd -D 2>/dev/null 1>/dev/null & \
|
||||||
|
tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
|
||||||
|
|
||||||
FROM base as slurmctld
|
FROM base as slurmctld
|
||||||
|
|
||||||
RUN yum install -y slurm-slurmctld && \
|
RUN yum install -y slurm-slurmctld && \
|
||||||
yum clean all
|
yum clean all && rm -rf /var/cache/yum
|
||||||
|
|
||||||
# CMD bash -c 'cat <({ condor_master -f & tail --retry --pid $! -f /var/log/condor/MasterLog & })'
|
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
|
||||||
|
su -s /bin/sh -c "slurmctld -D" ${slurmuser} 2>/dev/null 1>/dev/null & \
|
||||||
|
tail --retry --pid $! -f ${SLURMCTLD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
|
||||||
|
@@ -1,20 +1,32 @@
|
|||||||
FROM docker.io/library/centos:7 as base
|
FROM docker.io/library/centos:7 as base
|
||||||
|
|
||||||
RUN yum install -y epel-release && \
|
RUN yum install -y epel-release && \
|
||||||
yum install -y slurm slurm-slurmctld && \
|
yum install -y slurm && \
|
||||||
yum clean all
|
yum clean all && rm -rf /var/cache/yum
|
||||||
|
|
||||||
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
|
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
|
||||||
yum clean all
|
yum clean all && rm -rf /var/cache/yum
|
||||||
|
|
||||||
RUN yum install -y slurm-slurmctld && \
|
|
||||||
yum clean all
|
|
||||||
|
|
||||||
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
|
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
|
||||||
|
|
||||||
RUN chown root:root /usr/local/sbin/entrypoint.sh && chmod 755 /usr/local/sbin/entrypoint.sh
|
RUN chown root:root /usr/local/sbin/entrypoint.sh && \
|
||||||
|
chmod 755 /usr/local/sbin/entrypoint.sh
|
||||||
|
|
||||||
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
|
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
|
||||||
|
|
||||||
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & slurmctld -D & })'
|
ARG slurmuser=slurm
|
||||||
# ... & tail --retry --pid $! -f /var/log/condor/MasterLog & })'
|
ENV slurmuser=${slurmuser}
|
||||||
|
|
||||||
|
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
|
||||||
|
slurm-setuser -u $slurmuser -g $slurmuser -y
|
||||||
|
|
||||||
|
RUN yum install -y slurm-slurmctld && \
|
||||||
|
yum clean all && rm -rf /var/cache/yum
|
||||||
|
|
||||||
|
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
|
||||||
|
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
|
||||||
|
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
|
||||||
|
|
||||||
|
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
|
||||||
|
su -s /bin/sh -c "slurmctld -D" ${slurmuser} 2>/dev/null 1>/dev/null & \
|
||||||
|
tail --retry --pid $! -f ${SLURMCTLD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
|
||||||
|
@@ -1,17 +1,32 @@
|
|||||||
FROM docker.io/library/centos:7
|
FROM docker.io/library/centos:7
|
||||||
|
|
||||||
RUN yum install -y epel-release && \
|
RUN yum install -y epel-release && \
|
||||||
yum install -y slurm slurm-slurmd && \
|
yum install -y slurm && \
|
||||||
yum clean all
|
yum clean all && rm -rf /var/cache/yum
|
||||||
|
|
||||||
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
|
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
|
||||||
yum clean all
|
yum clean all && rm -rf /var/cache/yum
|
||||||
|
|
||||||
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
|
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
|
||||||
|
|
||||||
RUN chown root:root /usr/local/sbin/entrypoint.sh && chmod 755 /usr/local/sbin/entrypoint.sh
|
RUN chown root:root /usr/local/sbin/entrypoint.sh && \
|
||||||
|
chmod 755 /usr/local/sbin/entrypoint.sh
|
||||||
|
|
||||||
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
|
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
|
||||||
|
|
||||||
# CMD bash -c 'cat <({ condor_master -f & tail --retry --pid $! -f /var/log/condor/MasterLog & })'
|
ARG slurmuser=slurm
|
||||||
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & slurmd -D & })'
|
ENV slurmuser=${slurmuser}
|
||||||
|
|
||||||
|
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
|
||||||
|
slurm-setuser -u $slurmuser -g $slurmuser -y
|
||||||
|
|
||||||
|
RUN yum install -y slurm-slurmd && \
|
||||||
|
yum clean all && rm -rf /var/cache/yum
|
||||||
|
|
||||||
|
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
|
||||||
|
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
|
||||||
|
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
|
||||||
|
|
||||||
|
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
|
||||||
|
slurmd -D 2>/dev/null 1>/dev/null & \
|
||||||
|
tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
|
||||||
|
@@ -1,19 +1,19 @@
|
|||||||
- file:
|
- file:
|
||||||
path: "/home/centos7/docker-images/{{item}}"
|
path: "/container/docker-images/{{item}}"
|
||||||
state: directory
|
state: directory
|
||||||
owner: "{{unpriv_user}}"
|
owner: "{{unpriv_user}}"
|
||||||
group: docker
|
group: docker
|
||||||
|
|
||||||
- copy:
|
- copy:
|
||||||
src: "{{item}}.Dockerfile"
|
src: "{{item}}.Dockerfile"
|
||||||
dest: "/home/centos7/docker-images/{{item}}/Dockerfile"
|
dest: "/container/docker-images/{{item}}/Dockerfile"
|
||||||
owner: "{{unpriv_user}}"
|
owner: "{{unpriv_user}}"
|
||||||
group: docker
|
group: docker
|
||||||
register: slurm_cp_dockerfile
|
register: slurm_cp_dockerfile
|
||||||
|
|
||||||
- copy:
|
- copy:
|
||||||
src: "entrypoint.sh"
|
src: "entrypoint.sh"
|
||||||
dest: "/home/centos7/docker-images/{{item}}/entrypoint.sh"
|
dest: "/container/docker-images/{{item}}/entrypoint.sh"
|
||||||
owner: root
|
owner: root
|
||||||
group: root
|
group: root
|
||||||
mode: u=rwx,g=rx,o=rx
|
mode: u=rwx,g=rx,o=rx
|
||||||
@@ -24,7 +24,7 @@
|
|||||||
# pull: False
|
# pull: False
|
||||||
build:
|
build:
|
||||||
pull: False
|
pull: False
|
||||||
path: "/home/centos7/docker-images/{{item}}"
|
path: "/container/docker-images/{{item}}"
|
||||||
# target: "{{item}}" # unsupported on old docker-py versions as in el7
|
# target: "{{item}}" # unsupported on old docker-py versions as in el7
|
||||||
source: build
|
source: build
|
||||||
force_source: "{{slurm_cp_dockerfile.changed or slurm_cp_entrypt.changed}}"
|
force_source: "{{slurm_cp_dockerfile.changed or slurm_cp_entrypt.changed}}"
|
||||||
|
@@ -21,7 +21,7 @@
|
|||||||
mode: u=rw,g=,o=
|
mode: u=rw,g=,o=
|
||||||
|
|
||||||
- file:
|
- file:
|
||||||
path: /home/centos7/volumes/munge
|
path: /container/volumes/munge
|
||||||
state: directory
|
state: directory
|
||||||
owner: munge
|
owner: munge
|
||||||
group: munge
|
group: munge
|
||||||
@@ -33,17 +33,17 @@
|
|||||||
force: true
|
force: true
|
||||||
mode: preserve
|
mode: preserve
|
||||||
src: /etc/munge/munge.key
|
src: /etc/munge/munge.key
|
||||||
dest: /home/centos7/volumes/munge/munge.key
|
dest: /container/volumes/munge/munge.key
|
||||||
|
|
||||||
- file:
|
- file:
|
||||||
path: /home/centos7/volumes/slurm/
|
path: /container/volumes/slurm/
|
||||||
state: directory
|
state: directory
|
||||||
|
|
||||||
- name: upload slurm config
|
- name: upload slurm config
|
||||||
template:
|
template:
|
||||||
force: true
|
force: true
|
||||||
src: "{{item}}.j2"
|
src: "{{item}}.j2"
|
||||||
dest: "/home/centos7/volumes/slurm/{{item}}"
|
dest: "/container/volumes/slurm/{{item}}"
|
||||||
loop:
|
loop:
|
||||||
- slurm.conf
|
- slurm.conf
|
||||||
- cgroup.conf
|
- cgroup.conf
|
||||||
@@ -82,6 +82,8 @@
|
|||||||
volumes: "{{default_mounts + ( item.extra_mounts | default([]) ) }}"
|
volumes: "{{default_mounts + ( item.extra_mounts | default([]) ) }}"
|
||||||
networks:
|
networks:
|
||||||
- name: "slurm"
|
- name: "slurm"
|
||||||
|
env:
|
||||||
|
slurmuser: "{{slurm_user}}"
|
||||||
image: "{{item.image}}"
|
image: "{{item.image}}"
|
||||||
state: started
|
state: started
|
||||||
detach: True
|
detach: True
|
||||||
@@ -90,8 +92,8 @@
|
|||||||
networks_cli_compatible: True
|
networks_cli_compatible: True
|
||||||
vars:
|
vars:
|
||||||
default_mounts:
|
default_mounts:
|
||||||
- /home/centos7/volumes/slurm/:/etc/slurm/:rw
|
- /container/volumes/slurm/:/etc/slurm/:rw
|
||||||
- /home/centos7/volumes/munge/munge.key:/etc/munge/munge.key:rw
|
- /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
|
||||||
- slurm-shared:/shared/:rw
|
- slurm-shared:/shared/:rw
|
||||||
slurm_nodes_all: | # add execute nodes
|
slurm_nodes_all: | # add execute nodes
|
||||||
{% for i in range(1, 4) -%}
|
{% for i in range(1, 4) -%}
|
||||||
|
@@ -32,6 +32,7 @@ CryptoType=crypto/munge
|
|||||||
#MaxStepCount=40000
|
#MaxStepCount=40000
|
||||||
#MaxTasksPerNode=128
|
#MaxTasksPerNode=128
|
||||||
MpiDefault=pmix
|
MpiDefault=pmix
|
||||||
|
# When running slurmd as a non-root user, change this to: MpiDefault=none
|
||||||
#MpiParams=ports=#-#
|
#MpiParams=ports=#-#
|
||||||
#PluginDir=
|
#PluginDir=
|
||||||
#PlugStackConfig=
|
#PlugStackConfig=
|
||||||
@@ -57,8 +58,9 @@ SlurmctldPort=6817
|
|||||||
SlurmdPidFile=/var/run/slurm/slurmd.pid
|
SlurmdPidFile=/var/run/slurm/slurmd.pid
|
||||||
SlurmdPort=6818
|
SlurmdPort=6818
|
||||||
SlurmdSpoolDir=/var/spool/slurm/d
|
SlurmdSpoolDir=/var/spool/slurm/d
|
||||||
SlurmUser=root
|
SlurmUser={{slurm_user}}
|
||||||
#SlurmdUser=root
|
SlurmdUser=root
|
||||||
|
# SlurmdUser=slurm -> sbatch does not work
|
||||||
#SrunEpilog=
|
#SrunEpilog=
|
||||||
#SrunProlog=
|
#SrunProlog=
|
||||||
StateSaveLocation=/var/spool/slurm/ctld
|
StateSaveLocation=/var/spool/slurm/ctld
|
||||||
@@ -130,7 +132,7 @@ AccountingStorageType=accounting_storage/none
|
|||||||
#AccountingStorageUser=
|
#AccountingStorageUser=
|
||||||
AccountingStoreJobComment=YES
|
AccountingStoreJobComment=YES
|
||||||
ClusterName=cluster
|
ClusterName=cluster
|
||||||
#DebugFlags=
|
#DebugFlags=Steps,TraceJobs
|
||||||
#JobCompHost=
|
#JobCompHost=
|
||||||
JobCompLoc=/tmp/jobcomp
|
JobCompLoc=/tmp/jobcomp
|
||||||
#JobCompPass=
|
#JobCompPass=
|
||||||
@@ -141,10 +143,10 @@ JobCompType=jobcomp/filetxt
|
|||||||
JobAcctGatherFrequency=30
|
JobAcctGatherFrequency=30
|
||||||
JobAcctGatherType=jobacct_gather/none
|
JobAcctGatherType=jobacct_gather/none
|
||||||
SlurmctldDebug=verbose
|
SlurmctldDebug=verbose
|
||||||
#SlurmctldLogFile=
|
SlurmctldLogFile={{slurm_log_path_ctld}}
|
||||||
SlurmdDebug=verbose
|
SlurmdDebug=verbose
|
||||||
#SlurmdLogFile=
|
SlurmdLogFile={{slurm_log_path_d}}
|
||||||
#SlurmSchedLogFile=
|
SlurmSchedLogFile={{slurm_log_path_sched}}
|
||||||
#SlurmSchedLogLevel=
|
#SlurmSchedLogLevel=
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
@@ -163,4 +165,4 @@ SlurmdDebug=verbose
|
|||||||
# COMPUTE NODES
|
# COMPUTE NODES
|
||||||
NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN
|
NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN
|
||||||
NodeName=slurm-submit1 CPUs=1 State=UNKNOWN
|
NodeName=slurm-submit1 CPUs=1 State=UNKNOWN
|
||||||
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] Default=YES MaxTime=INFINITE State=UP
|
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes=slurm-submit1 Default=YES MaxTime=INFINITE State=UP
|
||||||
|
Reference in New Issue
Block a user