fix hostname of cobald slurm node

Make cobald able to run slurm jobs; previously this failed with
"permission denied".
This commit is contained in:
2021-06-24 14:07:35 +02:00
parent c7e931f29e
commit e979ea4d6e
5 changed files with 20 additions and 8 deletions

View File

@@ -81,7 +81,18 @@
dir: /var/lib/cobald
num_nodes: "{{slurm_num_nodes}}"
extra_nodes:
- "{{cobald_container_hostname}}" # from cobald_facts, read or generated
- name: cobald
hostname: "{{cobald_container_hostname}}" # from cobald/facts.yml above
# hostname is used as NodeHostname, which is used by Slurm's "networking
# code" (https://bugs.schedmd.com/show_bug.cgi?id=8615).
# It works either way around, but one of NodeName or NodeHostname has
# to match the container name (--name flag, not --hostname), because when
# tasks are submitted to the slurm controller, it checks access
# permissions against a reverse lookup of the submitting IP address.
# Docker always, and unconfigurably, resolves the container IP in any
# network to containername.netname, where containername is the
# container's runtime name (not the supplied hostname!) and netname is
# the network name in the host environment. We should run our own DNS...
docker_network: slurm
when: '"slurm" in group_names'
tags: slurm

View File

@@ -118,9 +118,6 @@
networks:
- name: "{{cobald_docker_network}}"
networks_cli_compatible: True
# env:
# slurmuser: "{{slurm_user}}"
# privileged: "{{ container_privileged | bool }}"
state: started
detach: True
cleanup: True

View File

@@ -1,5 +1,5 @@
cobald_image_tag: slurm
cobald_docker_base_image: "slurm:slurmd"
cobald_docker_base_image: "{{slurm.base_image}}"
cobald_docker_default_command: False
cobald_docker_network: "{{slurm.network}}"
cobald_domainname: "{{slurm.domain}}"

View File

@@ -67,7 +67,9 @@
- slurm.conf
- cgroup.conf
vars:
alloc_nodes: "{{ [ slurm_prefix+'-submit1' ] + extra_nodes | default([])}}"
slurm_alloc_nodes_default:
- name: "{{slurm_prefix+'-submit1'}}"
alloc_nodes: "{{ slurm_alloc_nodes_default + extra_nodes | default([])}}"
notify: reconfigure slurm
tags: [ slurm-config ]

View File

@@ -165,6 +165,8 @@ SlurmSchedLogFile={{slurm_log_path_sched}}
# COMPUTE NODES
NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN
{% for i in alloc_nodes -%}
NodeName={{i}} State=UNKNOWN
NodeName={{i.name}}
{%- if i.hostname is defined %} NodeHostname={{i.hostname}} {% endif %}
State=UNKNOWN
{% endfor %}
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes={{alloc_nodes | join(',')}} Default=YES MaxTime=INFINITE State=UP
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} Default=YES MaxTime=INFINITE State=UP