From e979ea4d6e8fc874a53454523fb4893a64f79173 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20M=C3=BCller?=
Date: Thu, 24 Jun 2021 14:07:35 +0200
Subject: [PATCH] fix hostname of cobald slurm node

Make cobald able to run slurm jobs; job submission previously failed
with permission denied.
---
 play.yml                            | 13 ++++++++++++-
 roles/cobald/tasks/main.yml         |  3 ---
 roles/cobald/vars/cobald-slurm.yml  |  2 +-
 roles/slurm/tasks/main.yml          |  4 +++-
 roles/slurm/templates/slurm.conf.j2 |  6 ++++--
 5 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/play.yml b/play.yml
index a7c18ca..7e48ee4 100644
--- a/play.yml
+++ b/play.yml
@@ -81,7 +81,18 @@
         dir: /var/lib/cobald
         num_nodes: "{{slurm_num_nodes}}"
         extra_nodes:
-          - "{{cobald_container_hostname}}" # from cobald_facts, read or generated
+          - name: cobald
+            hostname: "{{cobald_container_hostname}}" # from cobald/facts.yml above
+            # hostname is used as NodeHostname, which is used by slurm's "networking
+            # code" (https://bugs.schedmd.com/show_bug.cgi?id=8615).
+            # It works either way around, but one of NodeName or NodeHostname has
+            # to match the container name (-n flag, not --hostname), since when
+            # submitting tasks to the slurm controller, it matches access
+            # permissions against a reverse lookup of the submitting ip address.
+            # Docker always and unconfigurably resolves the container ip in any
+            # network to containername.netname, where containername is the
+            # container's runtime name (not the supplied hostname!) and netname is
+            # the network name in the host environment. We should run our own dns...
         docker_network: slurm
       when: '"slurm" in group_names'
       tags: slurm
diff --git a/roles/cobald/tasks/main.yml b/roles/cobald/tasks/main.yml
index dfe32ab..dc90e49 100644
--- a/roles/cobald/tasks/main.yml
+++ b/roles/cobald/tasks/main.yml
@@ -118,9 +118,6 @@
     networks:
       - name: "{{cobald_docker_network}}"
     networks_cli_compatible: True
-#    env:
-#      slurmuser: "{{slurm_user}}"
-#    privileged: "{{ container_privileged | bool }}"
     state: started
     detach: True
     cleanup: True
diff --git a/roles/cobald/vars/cobald-slurm.yml b/roles/cobald/vars/cobald-slurm.yml
index cbe6924..ab38387 100644
--- a/roles/cobald/vars/cobald-slurm.yml
+++ b/roles/cobald/vars/cobald-slurm.yml
@@ -1,5 +1,5 @@
 cobald_image_tag: slurm
-cobald_docker_base_image: "slurm:slurmd"
+cobald_docker_base_image: "{{slurm.base_image}}"
 cobald_docker_default_command: False
 cobald_docker_network: "{{slurm.network}}"
 cobald_domainname: "{{slurm.domain}}"
diff --git a/roles/slurm/tasks/main.yml b/roles/slurm/tasks/main.yml
index e0900ba..81a6873 100644
--- a/roles/slurm/tasks/main.yml
+++ b/roles/slurm/tasks/main.yml
@@ -67,7 +67,9 @@
     - slurm.conf
     - cgroup.conf
   vars:
-    alloc_nodes: "{{ [ slurm_prefix+'-submit1' ] + extra_nodes | default([])}}"
+    slurm_alloc_nodes_default:
+      - name: "{{slurm_prefix+'-submit1'}}"
+    alloc_nodes: "{{ slurm_alloc_nodes_default + extra_nodes | default([])}}"
   notify: reconfigure slurm
   tags: [ slurm-config ]
 
diff --git a/roles/slurm/templates/slurm.conf.j2 b/roles/slurm/templates/slurm.conf.j2
index ccda145..2f9e009 100644
--- a/roles/slurm/templates/slurm.conf.j2
+++ b/roles/slurm/templates/slurm.conf.j2
@@ -165,6 +165,8 @@ SlurmSchedLogFile={{slurm_log_path_sched}}
 # COMPUTE NODES
 NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN
 {% for i in alloc_nodes -%}
-NodeName={{i}} State=UNKNOWN
+NodeName={{i.name}}
+  {%- if i.hostname is defined %} NodeHostname={{i.hostname}} {% endif %}
+  State=UNKNOWN
 {% endfor %}
-PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes={{alloc_nodes | join(',')}} Default=YES MaxTime=INFINITE State=UP
+PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} Default=YES MaxTime=INFINITE State=UP
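
For reference, Ansible's template module renders with trim_blocks enabled, so the changed loop in roles/slurm/templates/slurm.conf.j2 should emit one definition per allocation node, adding NodeHostname only for entries that define hostname (i.e. the cobald container). A rough sketch of the rendered COMPUTE NODES section, assuming illustrative values num_nodes=2, slurm_prefix=slurm and cobald_container_hostname=cobald-a1b2c3 (names are placeholders, whitespace approximate):

    # COMPUTE NODES (sketch of rendered output; node names and count assumed for illustration)
    NodeName=slurm-exec[1-2] CPUs=2 CoresPerSocket=2 State=UNKNOWN
    NodeName=slurm-submit1  State=UNKNOWN
    NodeName=cobald NodeHostname=cobald-a1b2c3  State=UNKNOWN
    PartitionName=debug Nodes=slurm-exec[1-2] AllocNodes=slurm-submit1,cobald Default=YES MaxTime=INFINITE State=UP

AllocNodes now joins the name attributes of the alloc_nodes dicts, so "cobald" (the container's runtime name) is what the controller matches against the reverse lookup of the submitting address.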