Compare commits

1 commit: 8c1cd6e902
inv.yml (13 lines changed)

@@ -1,15 +1,12 @@
 all:
   hosts:
-    ed-c7-1:
+    centos7VM:
       ansible_user: root
-      ansible_host: ed-c7-1.virt.magni.thoto.net
+      ansible_host: localhost
+      ansible_port: 19322
+      ssh_args: -o ControlMaster=auto -o ControlPersist=60s
       # ansible_host: 192.168.122.139
-      unpriv_user: thoto
+      unpriv_user: centos7 # gains privileges to use docker
   children:
-    htcondor:
-      hosts:
-        ed-c7-1:
     slurm:
       hosts:
-        ed-c7-1:
+        centos7VM:

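The rewritten inventory can be smoke-tested before running any play; these are assumed invocations (standard ansible CLI), not part of the commit:

    ansible-inventory -i inv.yml --graph    # show the resulting group/host layout
    ansible -i inv.yml all -m ping          # verify SSH reachability of centos7VM
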
play.yml (43 lines changed)

@@ -1,46 +1,15 @@
 ---
 - hosts: all
   tasks:
   - name: "install epel-release"
     yum:
       name: [ epel-release ] # necessary to find htop => separate step
       state: present
   - name: "install tools"
     yum:
-      name: [ vim-enhanced, htop, screen, bind-utils, nmap-ncat, net-tools ]
+      name: [ vim-enhanced, htop, screen, tmux, bind-utils, nmap-ncat, net-tools ]
       state: present
 
-- hosts: htcondor
-  pre_tasks:
-  - name: "install htcondor repo"
-    yum:
-      name: https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm
-      state: present
-    tags: htcondor
-
-  - name: "install htcondor software"
-    yum:
-      name: htcondor-ce
-      state: present
-    tags: htcondor
-
-  - name: "remove minicondor configuration"
-    yum:
-      name: minicondor
-      state: absent
-    tags: htcondor
-
-  - name: "setup singularity"
-    import_tasks: "singularity.yml"
-    tags: singularity
-
-  roles:
-  - name: "setup docker"
-    role: docker
-    tags: docker
-
-  - name: "setup htcondor test environment in docker containers"
-    role: docker-htcondor
-    tags:
-    - htcondor-containered
-    - htcondor
-
 - hosts: slurm
   vars:
     container_privileged: True
@@ -51,6 +20,4 @@
     tags: docker
   - name: "setup slurm test environment in docker containers"
     role: slurm
-    vars:
-      slurm_user: slurm # or root
     tags: slurm

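With the htcondor play gone from play.yml, the remaining plays can be run selectively by tag; assumed invocations, not part of the commit:

    ansible-playbook -i inv.yml play.yml --list-tags    # inspect the tags that remain
    ansible-playbook -i inv.yml play.yml --tags slurm   # run only the slurm play
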
@@ -1,3 +0,0 @@
-use ROLE:centralmanager
-# ALLOW_WRITE per default on * -> following has no effect yet
-ALLOW_WRITE_COLLECTOR=$(ALLOW_WRITE) condor-exec.htc.local condor-sub.htc.local

@@ -1,92 +0,0 @@
-######################################################################
-##
-## condor_config
-##
-## This is the global configuration file for condor. This is where
-## you define where the local config file is. Any settings
-## made here may potentially be overridden in the local configuration
-## file. KEEP THAT IN MIND! To double-check that a variable is
-## getting set from the configuration file that you expect, use
-## condor_config_val -v <variable name>
-##
-## condor_config.annotated is a more detailed sample config file
-##
-## Unless otherwise specified, settings that are commented out show
-## the defaults that are used if you don't define a value. Settings
-## that are defined here MUST BE DEFINED since they have no default
-## value.
-##
-######################################################################
-
-## Where have you installed the bin, sbin and lib condor directories?
-RELEASE_DIR = /usr
-
-## Where is the local condor directory for each host? This is where the local config file(s), logs and
-## spool/execute directories are located. this is the default for Linux and Unix systems.
-LOCAL_DIR = /var
-
-## Where is the machine-specific local config file for each host?
-# LOCAL_CONFIG_FILE = /etc/condor/condor_config.local
-LOCAL_CONFIG_FILE = /etc/condor/condor_config_$(HOSTNAME).local
-## If your configuration is on a shared file system, then this might be a better default
-#LOCAL_CONFIG_FILE = $(RELEASE_DIR)/etc/$(HOSTNAME).local
-## If the local config file is not present, is it an error? (WARNING: This is a potential security issue.)
-REQUIRE_LOCAL_CONFIG_FILE = false
-
-## The normal way to do configuration with RPMs is to read all of the
-## files in a given directory that don't match a regex as configuration files.
-## Config files are read in lexicographic order.
-LOCAL_CONFIG_DIR = /etc/condor/config.d
-#LOCAL_CONFIG_DIR_EXCLUDE_REGEXP = ^((\..*)|(.*~)|(#.*)|(.*\.rpmsave)|(.*\.rpmnew))$
-
-##
-## Do NOT use host-based security by default.
-##
-## This was the default for the 8.8 series (and earlier), but it is
-## intrinsically insecure. To make the 9.0 series secure by default, we
-## commented it out.
-##
-## You should seriously consider improving your security configuration.
-##
-## To continue to use your old security configuration, knowing that it is
-## insecure, add the line 'use SECURITY : HOST_BASED' to your local
-## configuration directory. Don't just uncomment the final line in this
-## comment block; changes in this file may be lost during your next upgrade.
-## The following shell command will make the change on most Linux systems.
-##
-## echo 'use SECURITY : HOST_BASED' >> $(condor_config_val LOCAL_CONFIG_DIR)/00-insecure.config
-##
-
-## To expand your condor pool beyond a single host, set ALLOW_WRITE to match all of the hosts
-#ALLOW_WRITE = *.cs.wisc.edu
-## FLOCK_FROM defines the machines that grant access to your pool via flocking. (i.e. these machines can join your pool).
-#FLOCK_FROM =
-## FLOCK_TO defines the central managers that your schedd will advertise itself to (i.e. these pools will give matches to your schedd).
-#FLOCK_TO = condor.cs.wisc.edu, cm.example.edu
-
-##--------------------------------------------------------------------
-## Values set by the rpm patch script:
-##--------------------------------------------------------------------
-
-## For Unix machines, the path and file name of the file containing
-## the pool password for password authentication.
-#SEC_PASSWORD_FILE = $(LOCAL_DIR)/lib/condor/pool_password
-
-## Pathnames
-RUN     = $(LOCAL_DIR)/run/condor
-LOG     = $(LOCAL_DIR)/log/condor
-LOCK    = $(LOCAL_DIR)/lock/condor
-SPOOL   = $(LOCAL_DIR)/lib/condor/spool
-EXECUTE = $(LOCAL_DIR)/lib/condor/execute
-BIN     = $(RELEASE_DIR)/bin
-LIB     = $(RELEASE_DIR)/lib64/condor
-INCLUDE = $(RELEASE_DIR)/include/condor
-SBIN    = $(RELEASE_DIR)/sbin
-LIBEXEC = $(RELEASE_DIR)/libexec/condor
-SHARE   = $(RELEASE_DIR)/share/condor
-
-PROCD_ADDRESS = $(RUN)/procd_pipe
-
-JAVA_CLASSPATH_DEFAULT = $(SHARE) .
-
-## Install the minicondor package to run HTCondor on a single node

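The header of the removed condor_config points at condor_config_val for tracing which file sets a value; for example (assumed invocation on a host with HTCondor installed):

    condor_config_val -v LOCAL_CONFIG_DIR    # prints the value and the file that set it
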
@@ -1,64 +0,0 @@
-
-HostKey _INSERT_HOST_KEY_
-AuthorizedKeysFile _INSERT_AUTHORIZED_KEYS_FILE_
-
-# The following option is not supported by all recent versions of OpenSSH,
-# so instead we rely on injection of the shell setup command in the authorized
-# keys file.
-#ForceCommand _INSERT_FORCE_COMMAND_
-
-# as a convenience to users, allow remote setting of environment
-# since sshd is running as the job uid, there isn't really a security concern
-AcceptEnv *
-
-Subsystem sftp /usr/libexec/openssh/sftp-server
-
-# prevent sshd from freaking out about reading files from inside
-# a tmp-like directory
-StrictModes no
-
-# Rejection by tcp wrappers is not logged at INFO or VERBOSE log levels,
-# so to make diagnosis of problems easier, we use DEBUG.
-LogLevel DEBUG
-
-X11Forwarding yes
-
-# By default, OpenSSH uses the ipv6 loopback even if ipv6 is disabled in the
-# kernel. This forces OpenSSH to use the "local" network instead
-X11UseLocalhost no
-
-# The following settings are recommended for good security.
-# In particular, only FIPS 140-2 algorithms are used.
-# URLs for extra information re FIPS security compliance:
-# https://rhel7stig.readthedocs.io/en/latest/
-# https://www.stigviewer.com/stig/red_hat_enterprise_linux_7/
-# https://people.redhat.com/swells/scap-security-guide/tables/table-rhel7-stig-manual.html
-
-#-FIPS High Severity
-Protocol 2
-PermitEmptyPasswords no
-
-#-FIPS Medium Severity
-# Note: Ciphers and MACs below will be incompatible with RHEL5 or earlier.
-Ciphers aes128-ctr,aes192-ctr,aes256-ctr
-MACs hmac-sha2-256,hmac-sha2-512
-PermitRootLogin no
-PermitUserEnvironment no
-GSSAPIAuthentication no
-KerberosAuthentication no
-HostbasedAuthentication no
-IgnoreRhosts yes
-IgnoreUserKnownHosts yes
-PrintLastLog yes
-UsePrivilegeSeparation sandbox
-Compression delayed
-
-#-Recommended for security, but left out ssh_to_job config
-# because they provide minimal value and are likely to annoy
-# users or generate needless warnings in the ssh_to_job setting.
-#
-# ClientAliveInterval 600  # Note: condor_submit -i sets TMOUT
-# ClientAliveCountMax 0
-# banner=/etc/issue  # Set to your warning banner
-# StrictModes yes  # Can't set due to tmp-like directory
-# RhostsRSAAuthentication no  # Obsolete Protocol version 1 option

@@ -1,26 +0,0 @@
-##
-## Default security settings
-##
-## Host-based security was the default for the 8.8 series (and earlier).
-##
-## Host-based security assumes that all users on a machine are trusted.
-## For example, if host-based security trusts that a given machine can
-## run jobs, then any user who can start a process on that machine can
-## start a startd that can "steal" jobs from the system.
-#
-## To help make HTCondor secure by default, we removed host-based security
-## from the default configuration file
-## (/etc/condor/condor_config).
-##
-## New installations of HTCondor should be made using the get_htcondor tool,
-## which can automatically establish IDTOKENS-based security across a multi-
-## node pool. For existing installations, we recommend you
-## consider improving your security configuration.
-##
-## To continue to use your old security configuration,
-## comment out the 'recommended' line below, and uncomment the
-## 'host_based' line.
-##
-
-# use security : host_based
-use security : recommended_v9_0

@@ -1 +0,0 @@
-CONDOR_HOST = condor-cm.htc.local

(File diff suppressed because it is too large)

@@ -1 +0,0 @@
-use ROLE:execute

@@ -1 +0,0 @@
-use ROLE:submit

@@ -1,11 +0,0 @@
-FROM docker.io/library/centos:7
-
-RUN yum install -y https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm && \
-    yum install --nogpgcheck -y condor && \
-    yum install -y less && \
-    yum clean all
-
-RUN yum install -y iproute bind-utils nmap-ncat net-tools && \
-    yum clean all
-
-CMD bash -c 'cat <({ condor_master -f & tail --retry --pid $! -f /var/log/condor/MasterLog & })'

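For anyone resurrecting the removed setup: this image was built out of the directory the (also removed) tasks below populate, mirroring their docker_image call; assumed invocation, not part of the commit:

    docker build -t htcondor /container/docker-images/htcondor
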
@@ -1,142 +0,0 @@
-- name: "htcondor docker image"
-  file:
-    path: "/container/docker-images/htcondor"
-    state: directory
-    owner: "{{unpriv_user}}"
-    group: docker
-    mode: "u=rwx,g=rwx,o=rx"
-
-- copy:
-    dest: "/container/docker-images/htcondor/Dockerfile"
-    src: "htcondor.Dockerfile"
-    owner: "{{unpriv_user}}"
-    group: docker
-  register: cp_dockerfile
-
-- docker_image:
-    name: "htcondor"
-    # pull: False
-    build:
-      pull: False
-      path: "/container/docker-images/htcondor"
-    source: build
-    force_source: "{{cp_dockerfile.changed}}"
-
-- name: "copy htcondor container configuration"
-  copy:
-    src: "conf/{{item}}/"
-    dest: "/container/volumes/{{item}}/"
-    owner: "{{unpriv_user}}"
-    group: docker
-    mode: "u=rwx,g=rwx"
-  with_items: [ "cm", "exec", "sub", "common" ]
-
-- name: "check if pool shared secret exists"
-  stat:
-    path: "/container/volumes/common/passwords.d/POOL"
-  register: pool_pw
-
-- block:
-  - name: "create temporary password store"
-    tempfile:
-      state: directory
-    register: pool_pw_tmp
-
-  - name: "generate pool password"
-    copy:
-      dest: "{{pool_pw_tmp.path}}/poolpw"
-      content: "{{lookup('password','/dev/null')}}"
-    no_log: True
-
-  - name: "install pool password"
-    docker_container:
-      name: "condor-common"
-      image: htcondor
-      state: started
-      volumes:
-      - "/container/volumes/common/:/etc/condor/:rw"
-      - "{{pool_pw_tmp.path}}:/tmp/poolpw:ro"
-      detach: False
-      cleanup: True
-      command: "condor_store_cred add -c -i /tmp/poolpw/poolpw"
-
-  - name: "remove tokens since pool password (cert) changed"
-    file:
-      path: "/container/volumes/{{item}}/tokens.d/condor@htc.local"
-      state: absent
-    with_items: [ "cm", "exec", "sub" ]
-
-  always:
-  - name: "remove temporary password store"
-    file:
-      path: "{{pool_pw_tmp.path}}"
-      state: absent
-    when: pool_pw_tmp is defined and pool_pw_tmp.path
-  when: not pool_pw.stat.exists
-
-- name: "sync common files to individual containers"
-  copy_2:
-    remote_src: True
-    force: True
-    directory_mode: preserve
-    mode: preserve
-    src: "/container/volumes/common/"
-    dest: "/container/volumes/{{item}}/"
-  with_items: [ "cm", "exec", "sub" ]
-
-- name: "collect tokens to generate"
-  stat:
-    path: "/container/volumes/{{item}}/tokens.d/condor@htc.local"
-  with_items: [ "cm", "exec", "sub" ]
-  register: tokens_state
-
-- name: "generate tokens"
-  docker_container:
-    name: "condor-common"
-    image: htcondor
-    state: started
-    volumes:
-    - "/container/volumes/{{item}}/:/etc/condor/:rw"
-    detach: False
-    cleanup: True
-    command: "condor_token_create -identity condor@$(domainname) -token /etc/condor/tokens.d/condor@htc.local"
-  with_items: "{{tokens_state.results | rejectattr('stat.exists') | map(attribute='item') | list }}"
-
-- name: "create docker network to make service discovery work"
-  docker_network:
-    name: condor
-    state: present
-
-# TODO: reserve some address using docker_network_info and assign as aux
-# address to enable cm to get a static address in order to be reachable from
-# htcondor running on docker host to enable submitting jobs.
-
-- name: "run htcondor containers"
-  docker_container:
-    name: "condor-{{item}}"
-    hostname: "condor-{{item}}"
-    domainname: "htc.local"
-    image: htcondor
-    state: started
-    detach: True
-    cleanup: True
-    networks_cli_compatible: True
-    networks:
-    - name: "condor"
-      aliases: [ "condor-{{item}}.htc.local" ]
-    volumes:
-    - "/container/volumes/{{item}}/:/etc/condor/:rw"
-  with_items: [ "cm", "exec", "sub" ]
-#    auto_remove: True
-#    mounts:
-#      src: /container/volumes/cm/
-#      dest: /etc/condor/
-
-#- add_host:
-#    hostname: foo
-#    ansible_connection: docker_api
-#    docker_host: ssh://ed-c7-1.virt.magni.thoto.net

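The removed token workflow can be verified from inside a running container: condor_token_list prints the tokens found under tokens.d. Container name taken from the tasks above; invocation assumed, not part of the commit:

    docker exec condor-cm condor_token_list
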
@@ -8,41 +8,6 @@
       name: [ "docker-ce", "python-docker-py" ] # latter for ansible modules
       state: present
 
-- name: "partition container image disk"
-  parted:
-    device: /dev/vdb
-    number: 1
-    state: present
-    # fs_type: xfs
-
-- filesystem:
-    dev: /dev/vdb1
-    fstype: xfs
-    opts: "-L image-store"
-
-- mount:
-    path: "/container"
-    src: "/dev/vdb1"
-    fstype: xfs
-    opts: "noatime"
-    state: mounted
-
-- file:
-    path: "/container/docker"
-    state: directory
-    owner: root
-    group: root
-    mode: "u=rwx,g=x,o=x"
-
-- name: "link docker configuration to new container partition"
-  file:
-    path: "/var/lib/docker"
-    src: "/container/docker"
-    state: link
-    owner: root
-    group: root
-    mode: "u=rwx,g=x,o=x"
-
 - name: "enable docker service in systemd"
   service:
     name: docker

@@ -1,5 +1 @@
 container_privileged: False
-slurm_user: slurm
-slurm_log_path_ctld: /var/log/slurm/slurmctld.log
-slurm_log_path_d: /var/log/slurm/slurmd.log
-slurm_log_path_sched: /var/log/slurm/slurmsched.log

@@ -1,9 +1,8 @@
 #!/usr/bin/env bash
 set -e
 
-if [ -f "/etc/munge/munge.key" ] ; then
-  chown munge:munge /etc/munge/munge.key
-  chmod 600 /etc/munge/munge.key
-fi
+chown munge:munge /etc/munge/munge.key
+
+chmod 600 /etc/munge/munge.key
 
 exec "$@"

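The entrypoint now assumes /etc/munge/munge.key is always mounted in. On the EL7 host a key can be created with the tool shipped in the munge package; assumed invocation, not part of the commit:

    /usr/sbin/create-munge-key    # writes /etc/munge/munge.key with munge:munge 0600
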
@@ -2,42 +2,25 @@ FROM docker.io/library/centos:7 as base
 
 RUN yum install -y epel-release && \
     yum install -y slurm && \
-    yum clean all && rm -rf /var/cache/yum
+    yum clean all
 
 RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
-    yum clean all && rm -rf /var/cache/yum
+    yum clean all
 
 COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
 
-RUN chown root:root /usr/local/sbin/entrypoint.sh && \
-    chmod 755 /usr/local/sbin/entrypoint.sh
+RUN chown root:root /usr/local/sbin/entrypoint.sh && chmod 755 /usr/local/sbin/entrypoint.sh
 
 ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
 
 ARG slurmuser=slurm
 ENV slurmuser=${slurmuser}
 
 RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
     slurm-setuser -u $slurmuser -g $slurmuser -y
 
-ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
-ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
-ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
 
 FROM base as slurmd
 
 RUN yum install -y slurm-slurmd && \
-    yum clean all && rm -rf /var/cache/yum
-
-CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
-    slurmd -D 2>/dev/null 1>/dev/null & \
-    tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
+    yum clean all
 
 FROM base as slurmctld
 
 RUN yum install -y slurm-slurmctld && \
-    yum clean all && rm -rf /var/cache/yum
+    yum clean all
 
-CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
-    su -s /bin/sh -c "slurmctld -D" ${slurmuser} 2>/dev/null 1>/dev/null & \
-    tail --retry --pid $! -f ${SLURMCTLD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
+# CMD bash -c 'cat <({ condor_master -f & tail --retry --pid $! -f /var/log/condor/MasterLog & })'

@@ -1,32 +1,20 @@
 FROM docker.io/library/centos:7 as base
 
 RUN yum install -y epel-release && \
-    yum install -y slurm && \
-    yum clean all && rm -rf /var/cache/yum
+    yum install -y slurm slurm-slurmctld && \
+    yum clean all
 
 RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
-    yum clean all && rm -rf /var/cache/yum
+    yum clean all
 
+RUN yum install -y slurm-slurmctld && \
+    yum clean all
+
 COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
 
-RUN chown root:root /usr/local/sbin/entrypoint.sh && \
-    chmod 755 /usr/local/sbin/entrypoint.sh
+RUN chown root:root /usr/local/sbin/entrypoint.sh && chmod 755 /usr/local/sbin/entrypoint.sh
 
 ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
 
-ARG slurmuser=slurm
-ENV slurmuser=${slurmuser}
-
-RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
-    slurm-setuser -u $slurmuser -g $slurmuser -y
-
-RUN yum install -y slurm-slurmctld && \
-    yum clean all && rm -rf /var/cache/yum
-
-ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
-ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
-ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
-
-CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
-    su -s /bin/sh -c "slurmctld -D" ${slurmuser} 2>/dev/null 1>/dev/null & \
-    tail --retry --pid $! -f ${SLURMCTLD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
+CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & slurmctld -D & })'
+# ... & tail --retry --pid $! -f /var/log/condor/MasterLog & })'

@@ -1,32 +1,17 @@
 FROM docker.io/library/centos:7
 
 RUN yum install -y epel-release && \
-    yum install -y slurm && \
-    yum clean all && rm -rf /var/cache/yum
+    yum install -y slurm slurm-slurmd && \
+    yum clean all
 
 RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
-    yum clean all && rm -rf /var/cache/yum
+    yum clean all
 
 COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
 
-RUN chown root:root /usr/local/sbin/entrypoint.sh && \
-    chmod 755 /usr/local/sbin/entrypoint.sh
+RUN chown root:root /usr/local/sbin/entrypoint.sh && chmod 755 /usr/local/sbin/entrypoint.sh
 
 ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
 
-ARG slurmuser=slurm
-ENV slurmuser=${slurmuser}
-
-RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
-    slurm-setuser -u $slurmuser -g $slurmuser -y
-
-RUN yum install -y slurm-slurmd && \
-    yum clean all && rm -rf /var/cache/yum
-
-ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
-ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
-ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
-
-CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
-    slurmd -D 2>/dev/null 1>/dev/null & \
-    tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
-# CMD bash -c 'cat <({ condor_master -f & tail --retry --pid $! -f /var/log/condor/MasterLog & })'
+CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & slurmd -D & })'

@@ -1,19 +1,19 @@
 - file:
-    path: "/container/docker-images/{{item}}"
+    path: "/home/centos7/docker-images/{{item}}"
     state: directory
     owner: "{{unpriv_user}}"
     group: docker
 
 - copy:
     src: "{{item}}.Dockerfile"
-    dest: "/container/docker-images/{{item}}/Dockerfile"
+    dest: "/home/centos7/docker-images/{{item}}/Dockerfile"
     owner: "{{unpriv_user}}"
     group: docker
   register: slurm_cp_dockerfile
 
 - copy:
     src: "entrypoint.sh"
-    dest: "/container/docker-images/{{item}}/entrypoint.sh"
+    dest: "/home/centos7/docker-images/{{item}}/entrypoint.sh"
     owner: root
     group: root
     mode: u=rwx,g=rx,o=rx
@@ -24,7 +24,7 @@
     # pull: False
     build:
       pull: False
-      path: "/container/docker-images/{{item}}"
+      path: "/home/centos7/docker-images/{{item}}"
       # target: "{{item}}" # unsupported on old docker-py versions as in el7
     source: build
     force_source: "{{slurm_cp_dockerfile.changed or slurm_cp_entrypt.changed}}"
@@ -21,7 +21,7 @@
     mode: u=rw,g=,o=
 
 - file:
-    path: /container/volumes/munge
+    path: /home/centos7/volumes/munge
     state: directory
     owner: munge
     group: munge
@@ -33,17 +33,17 @@
     force: true
     mode: preserve
     src: /etc/munge/munge.key
-    dest: /container/volumes/munge/munge.key
+    dest: /home/centos7/volumes/munge/munge.key
 
 - file:
-    path: /container/volumes/slurm/
+    path: /home/centos7/volumes/slurm/
     state: directory
 
 - name: upload slurm config
   template:
     force: true
     src: "{{item}}.j2"
-    dest: "/container/volumes/slurm/{{item}}"
+    dest: "/home/centos7/volumes/slurm/{{item}}"
   loop:
   - slurm.conf
   - cgroup.conf
@@ -82,8 +82,6 @@
     volumes: "{{default_mounts + ( item.extra_mounts | default([]) ) }}"
     networks:
     - name: "slurm"
-    env:
-      slurmuser: "{{slurm_user}}"
     image: "{{item.image}}"
     state: started
     detach: True
@@ -92,8 +90,8 @@
   networks_cli_compatible: True
   vars:
     default_mounts:
-    - /container/volumes/slurm/:/etc/slurm/:rw
-    - /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
+    - /home/centos7/volumes/slurm/:/etc/slurm/:rw
+    - /home/centos7/volumes/munge/munge.key:/etc/munge/munge.key:rw
     - slurm-shared:/shared/:rw
     slurm_nodes_all: | # add execute nodes
       {% for i in range(1, 4) -%}

@@ -32,7 +32,6 @@ CryptoType=crypto/munge
 #MaxStepCount=40000
 #MaxTasksPerNode=128
-MpiDefault=pmix
 # when running slurmd as user change to: MpiDefault=none
 #MpiParams=ports=#-#
 #PluginDir=
 #PlugStackConfig=
@@ -58,9 +57,8 @@ SlurmctldPort=6817
 SlurmdPidFile=/var/run/slurm/slurmd.pid
 SlurmdPort=6818
 SlurmdSpoolDir=/var/spool/slurm/d
-SlurmUser={{slurm_user}}
-SlurmdUser=root
-# SlurmdUser=slurm -> sbatch does not work
+SlurmUser=root
+#SlurmdUser=root
 #SrunEpilog=
 #SrunProlog=
 StateSaveLocation=/var/spool/slurm/ctld
@@ -132,7 +130,7 @@ AccountingStorageType=accounting_storage/none
 #AccountingStorageUser=
 AccountingStoreJobComment=YES
 ClusterName=cluster
-#DebugFlags=Steps,TraceJobs
+#DebugFlags=
 #JobCompHost=
 JobCompLoc=/tmp/jobcomp
 #JobCompPass=
@@ -143,10 +141,10 @@ JobCompType=jobcomp/filetxt
 JobAcctGatherFrequency=30
 JobAcctGatherType=jobacct_gather/none
 SlurmctldDebug=verbose
-SlurmctldLogFile={{slurm_log_path_ctld}}
+#SlurmctldLogFile=
 SlurmdDebug=verbose
-SlurmdLogFile={{slurm_log_path_d}}
-SlurmSchedLogFile={{slurm_log_path_sched}}
+#SlurmdLogFile=
+#SlurmSchedLogFile=
 #SlurmSchedLogLevel=
 #
 #
@@ -165,4 +163,4 @@ SlurmSchedLogFile={{slurm_log_path_sched}}
 # COMPUTE NODES
 NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN
 NodeName=slurm-submit1 CPUs=1 State=UNKNOWN
-PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes=slurm-submit1 Default=YES MaxTime=INFINITE State=UP
+PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] Default=YES MaxTime=INFINITE State=UP

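Once the containers are up, the reshaped debug partition can be smoke-tested from the submit node. Assumed invocations (container name assumed to match its NodeName), not part of the commit:

    docker exec slurm-submit1 sinfo                       # nodes and partition state
    docker exec slurm-submit1 sbatch --wrap hostname      # queue a trivial test job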