Compare commits

...

2 Commits

Author SHA1 Message Date
9237d736d8 tags 2021-06-24 14:17:16 +02:00
e979ea4d6e fix hostname of cobald slurm node
made cobald able to run slurm jobs, which previously failed with
permission denied.
2021-06-24 14:07:35 +02:00
5 changed files with 36 additions and 16 deletions

View File

@@ -65,15 +65,20 @@
tags: docker
- name: "get facts from existing cobald instance (i.e. hostname)"
import_role:
include_role:
name: cobald
tasks_from: facts
apply:
tags: slurm, cobald, slurm-config
tags: slurm, cobald, slurm-config
vars:
container_name: cobald
tags: [ slurm, cobald ]
- name: "setup slurm test environment in docker containers"
import_role: name=slurm
include_role:
name: slurm
apply:
tags: slurm
vars:
slurm_user: slurm # or root
slurm_user_accounts:
@@ -81,16 +86,30 @@
dir: /var/lib/cobald
num_nodes: "{{slurm_num_nodes}}"
extra_nodes:
- "{{cobald_container_hostname}}" # from cobald_facts, read or generated
- name: cobald
hostname: "{{cobald_container_hostname}}" # from cobald/facts.yml above
# hostname is used as NodeHostname, which is used by Slurm's "networking
# code" (https://bugs.schedmd.com/show_bug.cgi?id=8615).
# It works either way around, but one of NodeName or NodeHostname has
# to match the container name (-n flag, not --hostname), because when
# tasks are submitted to the Slurm controller, it matches access
# permissions against a reverse lookup of the submitting IP address.
# Docker always and unconfigurably resolves the container IP in any
# network to containername.netname, where containername is the
# container's runtime name (not the hostname supplied!) and netname is
# the network name in the host environment. We should run our own DNS...
docker_network: slurm
when: '"slurm" in group_names'
tags: slurm
tags: slurm, slurm-config
- name: "install cobald"
import_role: name=cobald
include_role:
name: cobald
apply:
tags: cobald
vars:
cobald_slurm: True
container_name: cobald
# docker_network: slurm # overridden by vars/slurm.yml
when: '"cobald" in group_names'
tags: cobald
tags: cobald, influxdb

View File

@@ -118,9 +118,6 @@
networks:
- name: "{{cobald_docker_network}}"
networks_cli_compatible: True
# env:
# slurmuser: "{{slurm_user}}"
# privileged: "{{ container_privileged | bool }}"
state: started
detach: True
cleanup: True

View File

@@ -1,5 +1,5 @@
cobald_image_tag: slurm
cobald_docker_base_image: "slurm:slurmd"
cobald_docker_base_image: "{{slurm.base_image}}"
cobald_docker_default_command: False
cobald_docker_network: "{{slurm.network}}"
cobald_domainname: "{{slurm.domain}}"

View File

@@ -67,9 +67,11 @@
- slurm.conf
- cgroup.conf
vars:
alloc_nodes: "{{ [ slurm_prefix+'-submit1' ] + extra_nodes | default([])}}"
slurm_alloc_nodes_default:
- name: "{{slurm_prefix+'-submit1'}}"
alloc_nodes: "{{ slurm_alloc_nodes_default + extra_nodes | default([])}}"
notify: reconfigure slurm
tags: [ slurm-config ]
tags: slurm-config
- name: "create docker network to make service discovery work"
docker_network:
@@ -108,7 +110,7 @@
loop: "{{ slurm_nodes_all }}"
loop_control:
label: "{{slurm_prefix}}-{{ item.machine }}"
tags: [ slurm-config ]
tags: slurm-config
- name: export facts about slurm cluster to be used by other modules
set_fact:

View File

@@ -165,6 +165,8 @@ SlurmSchedLogFile={{slurm_log_path_sched}}
# COMPUTE NODES
NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN
{% for i in alloc_nodes -%}
NodeName={{i}} State=UNKNOWN
NodeName={{i.name}}
{%- if i.hostname is defined %} NodeHostname={{i.hostname}} {% endif %}
State=UNKNOWN
{% endfor %}
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes={{alloc_nodes | join(',')}} Default=YES MaxTime=INFINITE State=UP
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} Default=YES MaxTime=INFINITE State=UP