Compare commits

...

19 Commits

SHA1 Message Date
4c63f2a825 fix: slurm host system access 2021-07-06 15:14:33 +02:00
51390bb321 improved file loading 2021-07-06 14:54:18 +02:00
52022a3013 WIP: cobald tardis config 2021-07-05 18:52:41 +02:00
574d2fcb4e drone shutdown 2021-07-05 18:25:26 +02:00
2919c98d5f cons res 2021-07-05 18:24:24 +02:00
f73fef1473 cgroups inside singularity container 2021-07-02 00:47:32 +02:00
8bc2f717e0 slurm container running when drone started 2021-07-01 15:19:35 +02:00
d88761ca7d singularity for cobald 2021-06-30 16:31:06 +02:00
3be5025442 alias to make running drones work 2021-06-30 10:17:51 +02:00
4c4c4da79d parametric additional partition 2021-06-29 22:25:09 +02:00
1a952a4e7a option for docker host access to cluster 2021-06-28 17:51:45 +02:00
74a760cf98 screen scrolling 2021-06-28 17:44:39 +02:00
cd7dea8fda fix: variables hostname telegraf/influx 2021-06-28 12:14:31 +02:00
7e767c3716 memory / cpu resources fix 2021-06-25 12:19:00 +02:00
a61d08d118 restructured playbooks, cleanup 2021-06-25 01:55:14 +02:00
188a9215a9 tags #2 2021-06-24 16:37:46 +02:00
9499ce49ae fix: wrong network 2021-06-24 16:37:10 +02:00
9237d736d8 tags 2021-06-24 14:17:16 +02:00
e979ea4d6e fix hostname of cobald slurm node
makes cobald able to run slurm jobs, which previously failed with
permission denied.
2021-06-24 14:07:35 +02:00
27 changed files with 498 additions and 182 deletions

base.yml Normal file

@@ -0,0 +1,27 @@
---
- hosts: all
tasks:
- name: "install epel repo" # for htop etc.
yum:
name: epel-release
state: present
- name: "install tools"
yum:
name: [ vim-enhanced, htop, screen, bind-utils, nmap-ncat, net-tools ]
state: present
- name: "screenrc native scrolling in tmux"
copy:
content: "termcapinfo xterm* ti@:te@\ntermcapinfo screen* ti@:te@\n"
dest: "{{item}}"
with_items:
- "~{{unpriv_user}}/.screenrc"
- "~root/.screenrc"
- name: "install ssh-key"
authorized_key:
user: "{{cfg_unpriv_user}}"
key: "{{cfg_ssh_key}}"
state: present

cobald.yml Normal file

@@ -0,0 +1,61 @@
---
- hosts: slurm, cobald
vars:
container_privileged: True
slurm_num_nodes: 10
tasks:
- name: "setup docker"
import_role: name=docker
tags: docker
- name: "get facts from existing cobald instance (i.e. hostname)"
include_role:
name: cobald
tasks_from: facts
apply:
tags: slurm, cobald, slurm-config
tags: slurm, cobald, slurm-config
vars:
container_name: cobald
- name: "setup slurm test environment in docker containers"
include_role:
name: slurm
apply:
tags: slurm
vars:
slurm_user: slurm # or root
slurm_user_accounts:
- name: cobald
dir: /var/lib/cobald
num_nodes: "{{slurm_num_nodes}}"
extra_nodes:
- name: cobald
hostname: "{{cobald_container_hostname}}" # from cobald/facts.yml above
# hostname is used as NodeHostname, which is used by slurm's "networking
# code" (https://bugs.schedmd.com/show_bug.cgi?id=8615).
# It works either way around, but one of NodeName or NodeHostname has
# to match the container name (-n flag, not --hostname) since, when
# submitting tasks to the slurm controller, it matches access
# permissions against a reverse lookup of the submitting ip address.
# Docker always and unconfigurably resolves the container ip in any
# network to containername.netname, where containername is the
# container's runtime name (not the supplied hostname!) and netname is
# the network name in the host environment. We should run our own dns...
docker_network: slurm
slurm_hostsystem_cluster_access: True
when: '"slurm" in group_names'
tags: slurm, cobald, influxdb, slurm-config
# the cobald role requires some slurm facts, so the cobald tag is included here
- name: "install cobald"
include_role:
name: cobald
apply:
tags: cobald
vars:
cobald_slurm: True
container_name: cobald
# docker_network: slurm # overridden by vars/slurm.yml
when: '"cobald" in group_names'
tags: cobald, influxdb, singularity
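
The NodeHostname comment above can be checked directly: Docker's embedded DNS answers reverse lookups on a user-defined network with containername.netname. A minimal check, run on the docker host against the container/network names used in this playbook (the IP address is only an example):

docker inspect -f '{{.NetworkSettings.Networks.slurm.IPAddress}}' cobald
# e.g. 172.18.0.5
docker exec slurm-ctl getent hosts 172.18.0.5
# expected, roughly: 172.18.0.5   cobald.slurm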

htcondor.yml Normal file

@@ -0,0 +1,32 @@
---
- hosts: htcondor
tasks:
- name: "install htcondor repo"
yum:
name: https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm
state: present
tags: htcondor
- name: "install htcondor software"
yum:
name: htcondor-ce
state: present
tags: htcondor
- name: "remove minicondor configuration"
yum:
name: minicondor
state: absent
tags: htcondor
- name: "setup singularity"
import_role: name="singularity"
tags: singularity
- name: "setup docker"
import_role: name=docker
tags: docker
- name: "setup htcondor test environment in docker containers"
import_role: name=docker-htcondor
tags: htcondor-containered, htcondor

View File

@@ -6,12 +6,14 @@ all:
ssh_args: -o ControlMaster=auto -o ControlPersist=60s
# ansible_host: 192.168.122.139
unpriv_user: thoto
cfg_unpriv_user: thoto
ed-c7-2:
ansible_user: root
ansible_host: ed-c7-2.virt.uller.thoto.net
# ansible_host: 192.168.123.60 # + jumphost
ssh_args: -o ControlMaster=auto -o ControlPersist=60s
unpriv_user: thoto
cfg_unpriv_user: thoto
children:
htcondor:
hosts:

play.yml

@@ -1,96 +1,10 @@
---
- hosts: all
vars_files:
- vars-auth.yml
- vars-influx.yml
tasks:
- name: "install epel repo" # for htop etc.
yum:
name: epel-release
state: present
- name: base setup
import_playbook: base.yml
- name: "install tools"
yum:
name: [ vim-enhanced, htop, screen, bind-utils, nmap-ncat, net-tools ]
state: present
- name: setup htcondor
import_playbook: htcondor.yml
when: '"htcondor" in group_names'
- name: "install ssh-key"
authorized_key:
user: thoto
key: "{{ssh_key}}"
state: present
- hosts: htcondor
pre_tasks:
- name: "install htcondor repo"
yum:
name: https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm
state: present
tags: htcondor
- name: "install htcondor software "
yum:
name: htcondor-ce
state: present
tags: htcondor
- name: "remove minicondor configuration"
yum:
name: minicondor
state: absent
tags: htcondor
- name: "setup singularity"
import_tasks: "singularity.yml"
tags: singularity
roles:
- name: "setup docker"
role: docker
tags: docker
- name: "setup htcondor test environment in docker containers"
role: docker-htcondor
tags:
- htcondor-containered
- htcondor
- hosts: slurm, cobald
vars:
container_privileged: True
slurm_num_nodes: 10
tasks:
- name: "setup docker"
import_role: name=docker
tags: docker
- name: "get facts from existing cobald instance (i.e. hostname)"
import_role:
name: cobald
tasks_from: facts
vars:
container_name: cobald
tags: [ slurm, cobald ]
- name: "setup slurm test environment in docker containers"
import_role: name=slurm
vars:
slurm_user: slurm # or root
slurm_user_accounts:
- name: cobald
dir: /var/lib/cobald
num_nodes: "{{slurm_num_nodes}}"
extra_nodes:
- "{{cobald_container_hostname}}" # from cobald_facts, read or generated
docker_network: slurm
when: '"slurm" in group_names'
tags: slurm
- name: "install cobald"
import_role: name=cobald
vars:
cobald_slurm: True
container_name: cobald
# docker_network: slurm # overriden by vars/slurm.yml
when: '"cobald" in group_names'
tags: cobald
- name: setup slurm and cobald
import_playbook: cobald.yml

View File

@@ -0,0 +1,3 @@
#!/bin/sh
[ /slurm-singimage/slurmd.sif -nt /shared/slurmd.sif ] && \
cp /slurm-singimage/slurmd.sif /shared/slurmd.sif

View File

@@ -0,0 +1,3 @@
#!/bin/sh
slurmd --conf-server ${slurmctld} -D -N ${nodename} 2>/dev/null 1>/dev/null &
tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}

View File

@@ -0,0 +1,11 @@
###
#
# Slurm cgroup support configuration file
#
# See man slurm.conf and man cgroup.conf for further
# information on cgroup configuration parameters
#--
CgroupAutomount=no
ConstrainCores=no
ConstrainRAMSpace=no

View File

@@ -0,0 +1,31 @@
---
pipeline:
- __type__: cobald.controller.linear.LinearController
low_utilisation: 0.9
high_allocation: 0.9
rate: 0.10
- !Limiter
minimum: 3
- !TelegrafPipelineMonitor
poll: True
- !TardisPoolFactory
configuration: /etc/cobald/tardis.yaml
logging:
version: 1
root:
level: DEBUG
handlers: [console, file]
handlers:
console:
class: logging.StreamHandler
formatter: test
level: DEBUG
stream: ext://sys.stderr
file:
class: logging.handlers.RotatingFileHandler
formatter: test
level: WARNING
filename: /var/log/cobald/cobald-tardis.log
formatters:
test:
format: " %(name)s %(message)s"

View File

@@ -0,0 +1,41 @@
Plugins:
SqliteRegistry:
db_file: /tmp/drone_registry.db
TelegrafMonitoring:
host: ed-telegraf
port: 8094
#BatchSystem:
# adapter: FakeBatchSystem
# allocation: 1.0
# utilisation: !PeriodicValue
# period: 60
# amplitude: 0.15
# offset: 0.80
## phase: 1.
# phase: 1.6
# machine_status: Available
BatchSystem:
adapter: Slurm
max_age: 0.1
options:
partition: cobald
Sites:
- name: slurmtest
adapter: Slurm
quota: 20
slurmtest:
# executor: ...
StatusUpdate: 0.1
MachineTypes:
- m1.a
MachineTypeConfiguration:
m1.a:
Walltime: 5
Partition: container
StartupCommand: /usr/local/bin/start-drone
# SubmitOptions: ...
MachineMetaData:
m1.a:
Cores: 3 # cores
Memory: 1 # GB
Disk: 4 # not passed

View File

@@ -0,0 +1,31 @@
Bootstrap: docker-daemon
From: slurm:slurmd
%files
31-slurmd-configless /etc/docker-init.d/31-slurm-configless
/container/volumes/munge/munge.key /etc/munge/munge.key
cgroup.conf.noautomount /etc/slurm/cgroup.conf
%post
rm /etc/docker-init.d/30-slurmd
chmod 755 /etc/docker-init.d/31-slurm-configless
%startscript
if [ -z "${1}" -o -z "${2}" ] ; then
echo "undefined variables slurmctld or nodename"
exit 1
fi
export slurmctld="${1}"
export nodename="${2}"
echo ${slurmctld} ${nodename} ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
exec /usr/local/sbin/entrypoint.sh /usr/local/sbin/docker-init
%runscript
if [ -z "${1}" -o -z "${2}" ] ; then
echo "undefined variables slurmctld or nodename"
exit 1
fi
export slurmctld="${1}"
export nodename="${2}"
echo ${slurmctld} ${nodename} ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
exec /usr/local/sbin/entrypoint.sh /usr/local/sbin/docker-init
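
The instance started from this image takes the controller name and the drone's node name as positional arguments (they become ${slurmctld} and ${nodename} above). A minimal manual start, mirroring the call in the drone start script below (the node name is an example):

singularity instance start --writable-tmpfs /shared/slurmd.sif slurm-drone slurm-ctl drone1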

View File

@@ -0,0 +1,59 @@
#!/bin/sh
#SBATCH -D /shared
export
echo $@
nodename=$(hostname | awk '{ print "drone" substr($1,match($1, "([[:digit:]]+)")) }')
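# e.g. a job landing on slurm-exec3 yields nodename drone3, matching the
# drone<N> network aliases added for the exec containers in the slurm role defaults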
SHUTDOWN_DONE=0
handler_quit(){
[ $SHUTDOWN_DONE -ne 0 ] && return
set -x
echo "drain container"
scontrol update NodeName=${nodename} State=DRAIN Reason="cobald node quit"
shutdown_jobs=$(squeue -w ${nodename} --noheader -O jobid)
[ -n "${shutdown_jobs}" ] && scancel ${shutdown_jobs}
#scancel -w ${nodename}
i=$(( $(scontrol show config | grep KillWait | \
sed 's/^KillWait.*= \([0-9]*\) sec/\1/') - 2 ))
while [ -n "$(squeue -w ${nodename} --noheader -O jobid)" -o ${i} -lt 1 ]
do
i=$(( ${i} - 1 ))
sleep 1
done
scancel -s KILL -w ${nodename} # hard kill all remaining jobs
echo "shutdown container"
scontrol update NodeName=${nodename} State=DOWN Reason=shutdown
singularity instance stop slurm-drone
scontrol update NodeName=${nodename} State=FUTURE
umount /inner-cgroup/freezer
umount /inner-cgroup
SHUTDOWN_DONE=1
exit 0
}
# set -x
trap handler_quit EXIT
echo "mounting cgroups"
mkdir /inner-cgroup
mount -t tmpfs none /inner-cgroup
mkdir /inner-cgroup/freezer/
mount --bind /sys/fs/cgroup/freezer/slurm/ /inner-cgroup/freezer/
mount -o remount,ro /inner-cgroup
echo "starting ${nodename}"
scontrol update NodeName=${nodename} State=RESUME # revoke last DRAIN
scontrol update NodeName=${nodename} State=FUTURE
singularity instance start \
-B /inner-cgroup/:/sys/fs/cgroup/ \
--writable-tmpfs /shared/slurmd.sif slurm-drone \
slurm-ctl ${nodename}
# scontrol update NodeName=${nodename} NodeHostname=${SLURM_JOB_ID}
scontrol update NodeName=${nodename} NodeHostname=${TardisDroneUuid}
if [ $? -eq 0 ] ; then
echo "container started, sleeping $(( 60 * ${SLURM_Walltime} - 2 ))"
sleep $(( 60 * ${SLURM_Walltime} - 2 ))
fi
handler_quit

View File

@@ -14,11 +14,12 @@
- name: run grafana
docker_container:
name: ed-grafana
image: grafana/grafana:7.5.7
image: docker.io/grafana/grafana:7.5.7
hostname: ed-grafana
domainname: cobald.local
networks:
- name: "{{docker_network}}"
- name: "{{cobald_docker_network}}"
networks_cli_compatible: True
published_ports:
- "3000:3000"
state: started
@@ -42,7 +43,6 @@
version: Flux
additional_secure_json_data:
token: "{{influx_grafana_token.token}}"
register: das
- community.grafana.grafana_dashboard:
grafana_url: http://localhost:3000

View File

@@ -1,11 +1,12 @@
- name: run influxdb in docker container
docker_container:
name: ed-influxdb
image: influxdb
hostname: "{{influx_hostname}}"
domainname: "{{influx_domainname}}"
image: docker.io/library/influxdb:2.0
hostname: "{{cobald_influx_hostname}}"
domainname: "{{cobald_domainname}}"
networks:
- name: "{{ docker_network }}"
- name: "{{ cobald_docker_network }}"
networks_cli_compatible: True
published_ports:
- "{{influx_pubport}}:8086"
volumes:
@@ -21,7 +22,6 @@
state: started
detach: True
cleanup: True
networks_cli_compatible: True
- name: add ansible connection to influxdb container
add_host:

View File

@@ -1,5 +1,6 @@
- include_vars: cobald-slurm.yml
when: cobald_slurm | default(False)
tags: always
- name: build cobald:slurm docker image
include_role:
@@ -9,11 +10,16 @@
slurm_image_prefix: cobald
image_name: "{{cobald_image_tag}}"
dockerfile: "{{ lookup('template', 'cobald.Dockerfile') }}"
files:
- dest: cobald-entrypoint.sh
content: "{{ lookup('file', 'cobald-entrypoint.sh') }}"
- dest: init-cobaldmodules.sh
content: "{{ lookup('file', 'init-cobaldmodules.sh') }}"
files_list:
- cobald-entrypoint.sh
- init-cobaldmodules.sh
- start-drone
- 28-sync-container-slurmd
files: "
{%- set files = [] -%} {%- for i in files_list -%}
{%- set files = files.append(
{ 'dest': i, 'content': lookup('file', i) }) -%}
{%- endfor %}{{ files }}"
when: cobald_slurm | default(False)
- name: build generic cobald docker image
@@ -27,13 +33,14 @@
owner: "{{unpriv_user}}"
group: docker
- name: copy cobald config (does nothing yet)
- name: copy cobald config
copy:
src: cobald-config/
dest: /container/volumes/cobald
dest: "~{{unpriv_user}}/cobald/"
force: False
owner: "{{unpriv_user}}"
group: docker
when: False
mode: "0644"
- name: ensure network for cobald container exists
docker_network:
@@ -63,7 +70,7 @@
repo: https://github.com/thoto/cobald
dest: "~{{unpriv_user}}/cobald-src"
version: bugfix/mixed_construction_methods
update: no
update: no # FIXME
become: yes
become_user: "{{unpriv_user}}"
register: cobald_git_pull
@@ -73,6 +80,7 @@
repo: https://github.com/MatterMiners/tardis
dest: "~{{unpriv_user}}/tardis-src"
version: master
update: no # FIXME
become: yes
become_user: "{{unpriv_user}}"
register: tardis_git_pull
@@ -108,6 +116,13 @@
include_tasks: facts.yml
when: cobald_container_hostname is not defined
- name: build singularity container
include_tasks:
file: singularity.yml
apply:
tags: singularity
tags: singularity
- name: run cobald container
docker_container:
name: "{{ container_name | default('cobald') }}"
@@ -118,9 +133,6 @@
networks:
- name: "{{cobald_docker_network}}"
networks_cli_compatible: True
# env:
# slurmuser: "{{slurm_user}}"
# privileged: "{{ container_privileged | bool }}"
state: started
detach: True
cleanup: True
@@ -131,6 +143,7 @@
cobald_mounts:
- "~{{unpriv_user}}/cobald:/etc/cobald"
# - /container/volumes/cobald:/etc/cobald:ro
- "/container/docker-images/sing-slurmd/build/:/slurm-singimage/:ro"
- "~{{unpriv_user}}/cobald/modules:/usr/local/src/cobaldmodules"
- "~{{unpriv_user}}/cobald-src:/usr/local/src/cobald:ro"
- "~{{unpriv_user}}/tardis-src:/usr/local/src/tardis:ro"

View File

@@ -0,0 +1,48 @@
- name: setup singularity
import_role: name="singularity"
tags: singularity
- name: make singularity image build directory
file:
state: directory
path: "{{item}}"
owner: "{{unpriv_user}}"
group: "docker"
mode: "0755"
loop:
- /container/docker-images/sing-slurmd
- /container/docker-images/sing-slurmd/cache
- /container/docker-images/sing-slurmd/build
- name: copy slurm singularity container files
copy:
src: "{{item}}"
dest: "/container/docker-images/sing-slurmd/{{item}}"
owner: "{{unpriv_user}}"
group: "docker"
loop:
- slurm-slurmd.def
- 31-slurmd-configless
- cgroup.conf.noautomount
register: cobald_copy_sing_files
- name: remove old container
file:
path: /container/docker-images/sing-slurmd/build/slurmd.sif
state: absent
when: cobald_copy_sing_files.changed
- name: build container
shell:
chdir: /container/docker-images/sing-slurmd/
cmd: SINGULARITY_TMPDIR=/container/docker-images/sing-slurmd/cache
singularity build --disable-cache
/container/docker-images/sing-slurmd/build/slurmd.sif
/container/docker-images/sing-slurmd/slurm-slurmd.def
creates: /container/docker-images/sing-slurmd/build/slurmd.sif
register: cobald_sing_build
- debug: msg="{{[cobald_sing_build.stdout, cobald_sing_build.stderr]}}"
tags: [ never, debug ]
# TODO: trigger copy in cobald container when slurmd.sif rebuilt

View File

@@ -43,7 +43,7 @@
group: docker
vars:
influx_token: "{{influx_telegraf_token.token}}"
influx_url: "http://{{influx_hostname}}:8086"
influx_url: "http://{{cobald_influx_hostname}}:8086"
register: telegraf_config_gen
- name: run telegraf container
@@ -51,9 +51,10 @@
name: ed-telegraf
image: ed-telegraf
hostname: telegraf
domainname: cobald.local
domainname: "{{ cobald_domainname }}"
networks:
- name: "{{docker_network | default('bridge') }}"
- name: "{{ cobald_docker_network }}"
aliases: ["ed-telegraf"]
volumes:
- "/container/volumes/telegraf/telegraf.conf:/etc/telegraf/telegraf.conf:ro"
state: started
@@ -64,5 +65,5 @@
- import_tasks: grafana.yml
vars:
influx_url: "http://{{influx_hostname}}:8086"
influx_url: "http://{{cobald_influx_hostname}}:8086"
tags: influxdb

View File

@@ -48,6 +48,10 @@ COPY init-cobaldmodules.sh /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
RUN chmod 755 /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
COPY start-drone /usr/local/bin/start-drone
COPY 28-sync-container-slurmd /etc/docker-init.d/28-sync-container-slurmd
RUN chmod 755 /usr/local/bin/start-drone /etc/docker-init.d/28-sync-container-slurmd
RUN echo -e "#!/bin/sh\npython3 -m cobald.daemon /etc/cobald/config.yaml" >> /etc/docker-init.d/70-cobald && chmod 755 /etc/docker-init.d/70-cobald
{% if cobald_docker_default_command | default(True) -%}
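
The RUN echo -e line above writes a small init script; inside the image, /etc/docker-init.d/70-cobald then contains:

#!/bin/sh
python3 -m cobald.daemon /etc/cobald/config.yaml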

View File

@@ -1,5 +1,5 @@
cobald_image_tag: slurm
cobald_docker_base_image: "slurm:slurmd"
cobald_docker_base_image: "{{slurm.base_image}}"
cobald_docker_default_command: False
cobald_docker_network: "{{slurm.network}}"
cobald_domainname: "{{slurm.domain}}"

View File

@@ -1,3 +1,2 @@
cobald_docker_network: "{{docker_network}}"
influx_domainname: "{{ cobald_domainname }}"
influx_hostname: "ed-influxdb"
cobald_influx_hostname: "ed-influxdb"

View File

@@ -8,7 +8,7 @@
value: "15000"
sysctl_file: /etc/sysctl.d/90-max_net_namespaces.conf
- name: "enable user thoto for fakeroot access"
- name: "enable user {{unpriv_user}} for fakeroot access"
lineinfile:
line: "{{unpriv_user}}:4294836224:65536"
dest: "{{item}}"

View File

@@ -1,43 +0,0 @@
FROM docker.io/library/centos:7 as base
RUN yum install -y epel-release && \
yum install -y slurm && \
yum clean all && rm -rf /var/cache/yum
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
yum clean all && rm -rf /var/cache/yum
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
RUN chown root:root /usr/local/sbin/entrypoint.sh && \
chmod 755 /usr/local/sbin/entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
ARG slurmuser=slurm
ENV slurmuser=${slurmuser}
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
slurm-setuser -u $slurmuser -g $slurmuser -y
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
FROM base as slurmd
RUN yum install -y slurm-slurmd && \
yum clean all && rm -rf /var/cache/yum
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
slurmd -D 2>/dev/null 1>/dev/null & \
tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
FROM base as slurmctld
RUN yum install -y slurm-slurmctld && \
yum clean all && rm -rf /var/cache/yum
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
su -s /bin/sh -c "slurmctld -D" ${slurmuser} 2>/dev/null 1>/dev/null & \
tail --retry --pid $! -f ${SLURMCTLD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'

View File

@@ -9,3 +9,6 @@ RUN chmod 755 /etc/docker-init.d/30-slurmd
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
RUN yum install -y singularity && \
yum clean all && rm -rf /var/cache/yum

View File

@@ -0,0 +1,35 @@
# TODO: this does not work quite right since slurm-ctl does not reach the host
# system. sinfo, scontrol etc. work but srun does not!
- name: "get addresses from docker network"
docker_network_info:
name: "{{ docker_network }}"
register: slurm_network_data
- name: link host slurm config
file:
path: "/etc/slurm/slurm.conf"
src: "/container/volumes/slurm/slurm.conf"
force: True
state: link
backup: True
- name: create slurm user
user:
name: slurm
system: True
- name: place entry of slurm-ctl in host /etc/hosts
lineinfile:
line: "{{slurm_network_data.network.Containers | dict2items
| json_query('[?value.Name==`slurm-ctl`].value.IPv4Address') | first
| ipaddr('address') }}\tslurm-ctl"
regexp: "^(\\S*)(\\s*)slurm-ctl$"
path: /etc/hosts
backup: True
- name: start munge locally
service:
name: munge
enabled: True
state: started
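
A quick check of the limitation noted in the TODO above, run on the docker host once munge is running and the linked slurm.conf is in place (partition name from the slurm.conf template):

sinfo -p container               # works: talks to the controller via the slurm-ctl hosts entry
srun -p container -N1 hostname   # does not work: slurm-ctl cannot reach back to the host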

View File

@@ -58,6 +58,13 @@
path: /container/volumes/slurm/
state: directory
- name: "create docker network to make service discovery work"
docker_network:
name: "{{ docker_network }}"
state: present
register: slurm_network_data
tags: slurm-config
- name: upload slurm config
template:
force: true
@@ -67,24 +74,29 @@
- slurm.conf
- cgroup.conf
vars:
alloc_nodes: "{{ [ slurm_prefix+'-submit1' ] + extra_nodes | default([])}}"
slurm_exec_node_cores: 3
slurm_exec_node_mem: 5000 # RealMemory=5964
slurm_alloc_nodes_default:
- name: "{{slurm_prefix+'-submit1'}}"
- name: "{{ inventory_hostname }}"
addr: "{{ slurm_network_data.network.IPAM.Config[0].Gateway }}"
alloc_nodes: "{{ slurm_alloc_nodes_default + extra_nodes | default([])}}"
partitions:
- name: cobald
nodeprefix: drone
num_nodes: 10
node_cores: 3
node_mem: 4900
port: 16818
initstate: FUTURE
notify: reconfigure slurm
tags: [ slurm-config ]
- name: "create docker network to make service discovery work"
docker_network:
name: "{{ docker_network }}"
state: present
tags: slurm-config
- name: "create docker volume for shared access between nodes"
docker_volume:
name: slurm-shared
state: present
# TODO: reserve an address using docker_network_info and assign it as an aux
# address so slurmctld gets a static address and is reachable from slurm
# running on the docker host, enabling job submission from there.
- name: run slurm docker containers
docker_container:
name: "{{ slurm_prefix }}-{{ item.machine }}"
@@ -94,6 +106,7 @@
ports: "{{ item.exposed_ports | default([]) }}"
networks:
- name: "{{ docker_network }}"
aliases: "{{ item.aliases | default(omit) }}"
env:
slurmuser: "{{ slurm_user }}"
image: "{{ item.image }}"
@@ -108,7 +121,11 @@
loop: "{{ slurm_nodes_all }}"
loop_control:
label: "{{slurm_prefix}}-{{ item.machine }}"
tags: [ slurm-config ]
tags: slurm-config
- name: configure host system to integrate into slurm cluster
import_tasks: host-config.yml
when: slurm_hostsystem_cluster_access | default(False)
- name: export facts about slurm cluster to be used by other modules
set_fact:

View File

@@ -9,6 +9,8 @@ ControlMachine=slurm-ctl
AuthType=auth/munge
#CheckpointType=checkpoint/none
CryptoType=crypto/munge
CommunicationParameters=NoAddrCache
SlurmctldParameters=enable_configless
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
@@ -103,8 +105,10 @@ Waittime=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/linear
# SelectType=select/linear
SelectType=select/cons_res
#SelectTypeParameters=
SelectTypeParameters=CR_CORE
#
#
# JOB PRIORITY
@@ -163,8 +167,27 @@ SlurmSchedLogFile={{slurm_log_path_sched}}
#
#
# COMPUTE NODES
NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN
{% for i in alloc_nodes -%}
NodeName={{i}} State=UNKNOWN
NodeName=slurm-exec[1-{{ num_nodes }}] CPUs={{ slurm_exec_node_cores }} {{''
}} RealMemory={{ slurm_exec_node_mem }} {{''
}} CoresPerSocket={{ slurm_exec_node_cores }} State=UNKNOWN
{% for p in partitions | default([]) %}
NodeName={{ p.nodeprefix }}[1-{{ p.num_nodes }}] CPUs={{ p.node_cores }} {{''
}} RealMemory={{ p.node_mem }} {{''
}} CoresPerSocket={{ p.node_cores }} {{''
}} {%- if p.port is defined %} Port={{ p.port}} {% endif %}{{''
}} State={{ p.initstate | default('UNKNOWN') }}
{% endfor %}
{% for i in alloc_nodes -%}
NodeName={{i.name}}
{%- if i.hostname is defined %} NodeHostname={{i.hostname}} {% endif %}
{%- if i.addr is defined %} NodeAddr={{i.addr}} {% endif %}
State=UNKNOWN
{% endfor %}
PartitionName=container Nodes=slurm-exec[1-{{num_nodes}}] {{ ''
}} AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} {{ ''
}} Default=YES MaxTime=INFINITE State=UP
{% for p in partitions | default([]) %}
PartitionName={{ p.name }} Nodes={{ p.nodeprefix }}[1-{{ p.num_nodes }}] {{ ''
}} AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} {{ ''
}} MaxTime=INFINITE State=UP
{% endfor %}
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes={{alloc_nodes | join(',')}} Default=YES MaxTime=INFINITE State=UP
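
For orientation, with the values set in cobald.yml and the slurm role above (10 exec nodes with 3 cores / 5000 MB, the cobald partition with drone prefix, port 16818 and FUTURE state, plus the extra cobald node), the node/partition section renders roughly as below; slurm-submit1 assumes slurm_prefix=slurm, and the host name, NodeHostname and gateway address are placeholders:

NodeName=slurm-exec[1-10] CPUs=3 RealMemory=5000 CoresPerSocket=3 State=UNKNOWN
NodeName=drone[1-10] CPUs=3 RealMemory=4900 CoresPerSocket=3 Port=16818 State=FUTURE
NodeName=slurm-submit1 State=UNKNOWN
NodeName=ed-c7-2 NodeAddr=172.18.0.1 State=UNKNOWN
NodeName=cobald NodeHostname=<cobald container hostname> State=UNKNOWN
PartitionName=container Nodes=slurm-exec[1-10] AllocNodes=slurm-submit1,ed-c7-2,cobald Default=YES MaxTime=INFINITE State=UP
PartitionName=cobald Nodes=drone[1-10] AllocNodes=slurm-submit1,ed-c7-2,cobald MaxTime=INFINITE State=UP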

View File

@@ -10,7 +10,8 @@ slurm_nodes_exec: | # extend range to execute nodes list
{% set slurm_nodes_exec = [] %}
{% for i in range(1, num_nodes+1) -%}
{% set _ = slurm_nodes_exec.extend([
{'machine':'exec%s'|format(i), 'image': 'slurm:slurmd'}]) -%}
{'machine':'exec%s'|format(i), 'image': 'slurm:slurmd',
'aliases':['drone%s'|format(i)]}]) -%}
{%- endfor %}
{{ slurm_nodes_exec }}
slurm_default_mounts: