WIP: cobald container containing and using slurm

@@ -3,3 +3,6 @@ slurm_user: slurm
 slurm_log_path_ctld: /var/log/slurm/slurmctld.log
 slurm_log_path_d: /var/log/slurm/slurmd.log
 slurm_log_path_sched: /var/log/slurm/slurmsched.log
+slurm_prefix: slurm
+slurm_domain: slurm.local
+docker_network: slurm
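
Being role defaults, the three new variables (slurm_prefix, slurm_domain, docker_network) can be overridden per deployment; a minimal sketch of such an override, assuming a hypothetical group_vars file that is not part of the commit:

# group_vars/slurm.yml -- hypothetical override of the new defaults
slurm_prefix: slurmdev
slurm_domain: dev.slurm.local
docker_network: slurm-dev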

@@ -1,9 +0,0 @@
-#!/usr/bin/env bash
-set -e
-
-if [ -f "/etc/munge/munge.key" ] ; then
-    chown munge:munge /etc/munge/munge.key
-    chmod 600 /etc/munge/munge.key
-fi
-
-exec "$@"

@@ -1,3 +1,3 @@
 - name: reconfigure slurm
-  command:
-    cmd: docker container exec -it slurm-ctl scontrol reconfigure
+  shell:
+    cmd: "docker container exec -it {{slurm_prefix}}-ctl scontrol reconfigure || docker container restart {{slurm_prefix}}-ctl && docker container exec -it {{slurm_prefix}}-ctl scontrol reconfigure"

@@ -5,7 +5,7 @@
 
 - include_role:
     name: slurm_dockerimage
-  loop:
+  loop: # FIXME: default(omit)!
     - name: slurmctld
       dockerfile: "{{ lookup('file', 'slurmctld.Dockerfile') }}"
      files:

@@ -59,12 +59,14 @@
   loop:
     - slurm.conf
     - cgroup.conf
+  vars:
+    alloc_nodes: "{{ [ slurm_prefix+'-submit1' ] + extra_nodes | default([])}}"
   notify: reconfigure slurm
   tags: [ slurm-config ]
 
 - name: "create docker network to make service discovery work"
   docker_network:
-    name: slurm
+    name: "{{ docker_network }}"
     state: present
 
 - name: "create docker volume for shared access between nodes"
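
For clarity: the new alloc_nodes expression prepends the prefixed submit node to whatever extra_nodes contains. A sketch of the evaluated values, assuming the defaults above (extra_nodes is hypothetical and otherwise unset):

# extra_nodes undefined:
alloc_nodes: ['slurm-submit1']
# extra_nodes set to ['slurm-cobald'] (hypothetical):
alloc_nodes: ['slurm-submit1', 'slurm-cobald']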

@@ -73,7 +75,7 @@
     state: present
 
 - set_fact:
-    slurm_nodes: # default nodes: controller and submit machine
+    slurm_nodes_std: # default nodes: controller and submit machine
       - machine: ctl
         image: slurm:slurmctld
         exposed_ports: [ "6817:6817/tcp" ]

@@ -81,10 +83,18 @@
         image: slurm:slurmd
         extra_mounts:
           - "/home/{{unpriv_user}}/job3/:/mnt/:rw"
-#      - machine: slurm-cobald
-#        image: slurm-cobald
-#        extra_mounts:
-#        # TODO
+    slurm_nodes_exec: | # extend range to execute nodes list
+      {% set slurm_nodes_exec = slurm_nodes_exec | default([]) %}
+      {% for i in range(1, num_nodes+1) -%}
+      {% set _ = slurm_nodes_exec.extend([
+        {'machine':'exec%s'|format(i), 'image': 'slurm:slurmd'}]) -%}
+      {%- endfor %}
+      {{ slurm_nodes_exec }}
+    slurm_default_mounts:
+      - /container/volumes/slurm/:/etc/slurm/:rw
+      - /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
+      - slurm-shared:/shared/:rw
+    slurm_network: "{{docker_network}}"
   tags: [ slurm-config ]
 
 # TODO: reserve some address using docker_network_info and assign as aux
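
The slurm_nodes_exec block scalar builds a Python-style list of dicts as a string, which Ansible evaluates back into a list when the fact is used. Assuming num_nodes: 2, the fact would come out roughly as this sketch (not part of the commit):

slurm_nodes_exec:
  - machine: exec1
    image: slurm:slurmd
  - machine: exec2
    image: slurm:slurmd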

@@ -93,16 +103,16 @@
 
 - name: run slurm docker containers
   docker_container:
-    name: "slurm-{{item.machine}}"
-    hostname: "slurm-{{item.machine}}"
-    domainname: "slurm.local"
-    volumes: "{{default_mounts + ( item.extra_mounts | default([]) ) }}"
+    name: "{{ slurm_prefix }}-{{ item.machine }}"
+    hostname: "{{ slurm_prefix }}-{{ item.machine }}"
+    domainname: "{{ slurm_domain }}"
+    volumes: "{{ slurm_default_mounts + ( item.extra_mounts | default([]) ) }}"
     ports: "{{ item.exposed_ports | default([]) }}"
     networks:
-      - name: "slurm"
+      - name: "{{ slurm_network }}"
     env:
-      slurmuser: "{{slurm_user}}"
-    image: "{{item.image}}"
+      slurmuser: "{{ slurm_user }}"
+    image: "{{ item.image }}"
     state: started
     detach: True
     cleanup: True

@@ -110,15 +120,17 @@
     networks_cli_compatible: True
     interactive: True
   vars:
-    default_mounts:
-      - /container/volumes/slurm/:/etc/slurm/:rw
-      - /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
-      - slurm-shared:/shared/:rw
-    slurm_nodes_all: | # add execute nodes
-      {% for i in range(1, num_nodes+1) -%}
-      {% set _ = slurm_nodes.extend([
-        {'machine':'exec%s'|format(i), 'image': 'slurm:slurmd'}]) -%}
-      {%- endfor %}
-      {{ slurm_nodes }}
-  loop: "{{slurm_nodes_all}}"
+    slurm_nodes_all: "{{ slurm_nodes_exec + slurm_nodes_std }}"
+  loop: "{{ slurm_nodes_all }}"
+  loop_control:
+    label: "{{slurm_prefix}}-{{ item.machine }}"
   tags: [ slurm-config ]
+
+- name: set facts to be used by other modules
+  set_fact:
+    slurm:
+      user: "{{slurm_user}}"
+      domain: "{{slurm_domain}}"
+      base_image: "slurm:base"
+      mounts: "{{slurm_default_mounts}}"
+      network: "{{docker_network}}"
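
The exported slurm fact is presumably what the eventual cobald container wiring will consume. A hedged sketch of such a consumer task, with the container and image names assumed from the removed stub above rather than taken from this commit:

- name: run cobald container on the slurm network   # hypothetical sketch
  docker_container:
    name: "{{ slurm_prefix }}-cobald"
    image: slurm-cobald              # assumed image name
    volumes: "{{ slurm.mounts }}"
    networks:
      - name: "{{ slurm.network }}"
    env:
      slurmuser: "{{ slurm.user }}"
    state: started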

@@ -164,5 +164,7 @@ SlurmSchedLogFile={{slurm_log_path_sched}}
 #
 # COMPUTE NODES
 NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN
-NodeName=slurm-submit1 CPUs=1 State=UNKNOWN
-PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes=slurm-submit1 Default=YES MaxTime=INFINITE State=UP
+{% for i in alloc_nodes -%}
+NodeName={{i}} State=UNKNOWN
+{% endfor %}
+PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes={{alloc_nodes | join(',')}} Default=YES MaxTime=INFINITE State=UP
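
Rendered with num_nodes: 2 and the default alloc_nodes (only the submit node), the compute-node section of the template above would come out roughly as:

NodeName=slurm-exec[1-2] CPUs=2 CoresPerSocket=2 State=UNKNOWN
NodeName=slurm-submit1 State=UNKNOWN
PartitionName=debug Nodes=slurm-exec[1-2] AllocNodes=slurm-submit1 Default=YES MaxTime=INFINITE State=UP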