WIP: cobald container that contains and uses slurm

2021-06-21 19:19:19 +02:00
parent 02e87d7c40
commit e78e184375
17 changed files with 167 additions and 74 deletions

View File

@@ -3,3 +3,6 @@ slurm_user: slurm
 slurm_log_path_ctld: /var/log/slurm/slurmctld.log
 slurm_log_path_d: /var/log/slurm/slurmd.log
 slurm_log_path_sched: /var/log/slurm/slurmsched.log
+slurm_prefix: slurm
+slurm_domain: slurm.local
+docker_network: slurm
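
The three new defaults make names overridable per deployment that were previously hard-coded in the role. A minimal sketch of an inventory-level override (the staging values below are hypothetical):

# group_vars/staging.yml (hypothetical): override the new defaults per environment
slurm_prefix: slurm-staging
slurm_domain: staging.slurm.local
docker_network: slurm-staging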

View File

@@ -1,9 +0,0 @@
-#!/usr/bin/env bash
-set -e
-if [ -f "/etc/munge/munge.key" ] ; then
-    chown munge:munge /etc/munge/munge.key
-    chmod 600 /etc/munge/munge.key
-fi
-exec "$@"

View File

@@ -1,3 +1,3 @@
 - name: reconfigure slurm
-  command:
-    cmd: docker container exec -it slurm-ctl scontrol reconfigure
+  shell:
+    cmd: "docker container exec -it {{slurm_prefix}}-ctl scontrol reconfigure || docker container restart {{slurm_prefix}}-ctl && docker container exec -it {{slurm_prefix}}-ctl scontrol reconfigure"

View File

@@ -5,7 +5,7 @@
 - include_role:
     name: slurm_dockerimage
-  loop:
+  loop: # FIXME: default(omit)!
     - name: slurmctld
       dockerfile: "{{ lookup('file', 'slurmctld.Dockerfile') }}"
       files:
@@ -59,12 +59,14 @@
   loop:
     - slurm.conf
     - cgroup.conf
+  vars:
+    alloc_nodes: "{{ [ slurm_prefix+'-submit1' ] + extra_nodes | default([])}}"
   notify: reconfigure slurm
   tags: [ slurm-config ]
 - name: "create docker network to make service discovery work"
   docker_network:
-    name: slurm
+    name: "{{ docker_network }}"
     state: present
 - name: "create docker volume for shared access between nodes"
@@ -73,7 +75,7 @@
     state: present
 - set_fact:
-    slurm_nodes: # default nodes: controller and submit machine
+    slurm_nodes_std: # default nodes: controller and submit machine
       - machine: ctl
         image: slurm:slurmctld
         exposed_ports: [ "6817:6817/tcp" ]
@@ -81,10 +83,18 @@
         image: slurm:slurmd
         extra_mounts:
           - "/home/{{unpriv_user}}/job3/:/mnt/:rw"
+#      - machine: slurm-cobald
+#        image: slurm-cobald
+#        extra_mounts:
+#          # TODO
+    slurm_nodes_exec: | # extend range to execute nodes list
+      {% set slurm_nodes_exec = slurm_nodes_exec | default([]) %}
+      {% for i in range(1, num_nodes+1) -%}
+      {% set _ = slurm_nodes_exec.extend([
+          {'machine':'exec%s'|format(i), 'image': 'slurm:slurmd'}]) -%}
+      {%- endfor %}
+      {{ slurm_nodes_exec }}
+    slurm_default_mounts:
+      - /container/volumes/slurm/:/etc/slurm/:rw
+      - /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
+      - slurm-shared:/shared/:rw
+    slurm_network: "{{docker_network}}"
   tags: [ slurm-config ]
 # TODO: reserve some address using docker_network_info and assign as aux
@@ -93,16 +103,16 @@
 - name: run slurm docker containers
   docker_container:
-    name: "slurm-{{item.machine}}"
-    hostname: "slurm-{{item.machine}}"
-    domainname: "slurm.local"
-    volumes: "{{default_mounts + ( item.extra_mounts | default([]) ) }}"
+    name: "{{ slurm_prefix }}-{{ item.machine }}"
+    hostname: "{{ slurm_prefix }}-{{ item.machine }}"
+    domainname: "{{ slurm_domain }}"
+    volumes: "{{ slurm_default_mounts + ( item.extra_mounts | default([]) ) }}"
     ports: "{{ item.exposed_ports | default([]) }}"
     networks:
-      - name: "slurm"
+      - name: "{{ slurm_network }}"
     env:
-      slurmuser: "{{slurm_user}}"
-    image: "{{item.image}}"
+      slurmuser: "{{ slurm_user }}"
+    image: "{{ item.image }}"
     state: started
     detach: True
     cleanup: True
@@ -110,15 +120,17 @@
     networks_cli_compatible: True
     interactive: True
   vars:
-    default_mounts:
-      - /container/volumes/slurm/:/etc/slurm/:rw
-      - /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
-      - slurm-shared:/shared/:rw
-    slurm_nodes_all: | # add execute nodes
-      {% for i in range(1, num_nodes+1) -%}
-      {% set _ = slurm_nodes.extend([
-          {'machine':'exec%s'|format(i), 'image': 'slurm:slurmd'}]) -%}
-      {%- endfor %}
-      {{ slurm_nodes }}
-  loop: "{{slurm_nodes_all}}"
+    slurm_nodes_all: "{{ slurm_nodes_exec + slurm_nodes_std }}"
+  loop: "{{ slurm_nodes_all }}"
   loop_control:
     label: "{{slurm_prefix}}-{{ item.machine }}"
   tags: [ slurm-config ]
+- name: set facts to be used by other modules
+  set_fact:
+    slurm:
+      user: "{{slurm_user}}"
+      domain: "{{slurm_domain}}"
+      base_image: "slurm:base"
+      mounts: "{{slurm_default_mounts}}"
+      network: "{{docker_network}}"

View File

@@ -164,5 +164,7 @@ SlurmSchedLogFile={{slurm_log_path_sched}}
 #
 # COMPUTE NODES
 NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN
-NodeName=slurm-submit1 CPUs=1 State=UNKNOWN
-PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes=slurm-submit1 Default=YES MaxTime=INFINITE State=UP
+{% for i in alloc_nodes -%}
+NodeName={{i}} State=UNKNOWN
+{% endfor %}
+PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes={{alloc_nodes | join(',')}} Default=YES MaxTime=INFINITE State=UP
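
For illustration, with num_nodes=2 and the default alloc_nodes built above (['slurm-submit1'] for the default slurm_prefix), the changed template section renders, modulo whitespace, to:

#
# COMPUTE NODES
NodeName=slurm-exec[1-2] CPUs=2 CoresPerSocket=2 State=UNKNOWN
NodeName=slurm-submit1 State=UNKNOWN
PartitionName=debug Nodes=slurm-exec[1-2] AllocNodes=slurm-submit1 Default=YES MaxTime=INFINITE State=UP

Note that the allocation nodes no longer carry the explicit CPUs=1 of the old hard-coded slurm-submit1 line; they fall back to Slurm's defaults.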