WIP: cobald container containing and using slurm

This commit is contained in:
2021-06-21 19:19:19 +02:00
parent 02e87d7c40
commit e78e184375
17 changed files with 167 additions and 74 deletions

View File

@@ -1,6 +1,8 @@
--- ---
- hosts: all - hosts: all
vars_files: ['vars-auth.yml'] vars_files:
- vars-auth.yml
- vars-influx.yml
tasks: tasks:
- name: "install epel repo" # for htop etc. - name: "install epel repo" # for htop etc.
yum: yum:
@@ -61,16 +63,26 @@
- name: "setup docker" - name: "setup docker"
role: docker role: docker
tags: docker tags: docker
- name: "get facts from existing cobald instance (i.e. hostname)"
role: cobald_facts
vars:
container_name: cobald
tags: [ slurm, cobald ]
- name: "setup slurm test environment in docker containers" - name: "setup slurm test environment in docker containers"
role: slurm role: slurm
vars: vars:
slurm_user: slurm # or root slurm_user: slurm # or root
num_nodes: "{{slurm_num_nodes}}" num_nodes: "{{slurm_num_nodes}}"
extra_nodes:
- "{{cobald_container_hostname}}" # from cobald_facts, read or generated
docker_network: slurm
when: '"slurm" in group_names' when: '"slurm" in group_names'
tags: slurm tags: slurm
- name: "install cobald" - name: "install cobald"
role: cobald role: cobald
vars: vars:
docker_network: slurm cobald_slurm: True
container_name: cobald
# docker_network: slurm # overridden by vars/slurm.yml
when: '"cobald" in group_names' when: '"cobald" in group_names'
tags: cobald tags: cobald

View File

@@ -1,7 +1,7 @@
#!/bin/sh #!/bin/sh
[ -f /usr/local/lib/cobaldmodules/setup.py -a \ for i in /usr/local/lib/entrypoints.d/* ; do
-d /usr/local/lib/cobaldmodules/cobaldmodules ] && \ [ -f $i ] && /bin/sh $i || break
pip3 install --no-deps --editable /usr/local/lib/cobaldmodules done
exec "${@:-/bin/bash}" exec "${@:-/bin/bash}"

View File

@@ -0,0 +1,5 @@
#!/bin/sh
# Entrypoint hook: install the cobaldmodules package in editable mode when a
# source tree (setup.py plus the package directory) is present at
# /usr/local/lib/cobaldmodules.
#
# The hook exits 0 when there is nothing to install, so a caller that stops at
# the first failing hook does not skip the remaining ones. A failing pip3 run
# is still reported through the exit status.
# NOTE(review): the cobald container mounts its modules under
# /usr/local/src/cobaldmodules -- confirm /usr/local/lib/cobaldmodules is the
# intended location.
if [ -f /usr/local/lib/cobaldmodules/setup.py ] && \
   [ -d /usr/local/lib/cobaldmodules/cobaldmodules ] ; then
    pip3 install --no-deps --editable /usr/local/lib/cobaldmodules
fi

View File

@@ -1,33 +1,41 @@
- include_vars: cobald-slurm.yml
when: cobald_slurm | default(False)
- file: - file:
path: "/container/{{item}}/cobald/" path: "/container/{{item.name}}/cobald{{item.pfx|default('')}}/"
state: directory state: directory
owner: "{{unpriv_user}}" owner: "{{unpriv_user}}"
group: docker group: docker
loop: loop:
- docker-images - name: docker-images
- volumes pfx: ".{{cobald_image_tag|default('latest')}}"
- name: volumes
- copy: - template:
src: cobald.Dockerfile src: cobald.Dockerfile
dest: /container/docker-images/cobald/Dockerfile dest: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/Dockerfile"
owner: "{{unpriv_user}}" owner: "{{unpriv_user}}"
group: docker group: docker
register: cobald_cp_dockerfile register: cobald_cp_dockerfile
- copy: - copy:
src: cobald-entrypoint.sh src: "{{item}}"
dest: /container/docker-images/cobald/cobald-entrypoint.sh dest: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/{{item}}"
owner: "{{unpriv_user}}" owner: "{{unpriv_user}}"
group: docker group: docker
mode: 0755 mode: 0755
with_items:
- cobald-entrypoint.sh
- init-cobaldmodules.sh
register: cobald_cp_files register: cobald_cp_files
- docker_image: - docker_image:
name: "cobald" name: "cobald"
tag: "{{cobald_image_tag|default('latest')}}"
# pull: False # pull: False
build: build:
pull: False pull: False
path: "/container/docker-images/cobald/" path: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/"
source: build source: build
force_source: "{{cobald_cp_dockerfile.changed or cobald_cp_files.changed}}" force_source: "{{cobald_cp_dockerfile.changed or cobald_cp_files.changed}}"
@@ -39,7 +47,7 @@
when: False when: False
- docker_network: - docker_network:
name: "{{docker_network}}" name: "{{cobald_docker_network}}" # FIXME
state: present state: present
# docker run -v $(pwd)/cobald-config-host:/etc/cobald -v $(pwd)/cobald:/cobald --rm -it cobald bash # docker run -v $(pwd)/cobald-config-host:/etc/cobald -v $(pwd)/cobald:/cobald --rm -it cobald bash
@@ -82,7 +90,7 @@
- name: run pip install - name: run pip install
docker_container: docker_container:
image: cobald image: "cobald:{{cobald_image_tag|default('latest')}}"
name: "cobald-src-{{item.name}}-install" name: "cobald-src-{{item.name}}-install"
volumes: volumes:
- "~{{unpriv_user}}/{{item.name}}-src:/usr/local/src/{{item.name}}:rw" - "~{{unpriv_user}}/{{item.name}}-src:/usr/local/src/{{item.name}}:rw"
@@ -102,29 +110,34 @@
- import_tasks: telegraf.yml - import_tasks: telegraf.yml
- docker_container_info: - name: get cobald hostname
name: cobald include_role:
register: cobald_container_info name: cobald_facts
when: cobald_container_hostname is not defined
- docker_container: - name: run cobald container
name: cobald docker_container:
image: cobald name: "{{ container_name | default('cobald') }}"
hostname: |- image: "cobald:{{cobald_image_tag|default('latest')}}"
{{cobald_container_info.container.Config.Hostname | default('cobald-'+ hostname: "{{cobald_container_hostname}}"
lookup('password', '/dev/null chars=ascii_lowercase length=6')) }} domainname: "{{ cobald_domainname | default('cobald.local')}}"
domainname: cobald.local volumes: "{{default_mounts + cobald_mounts }}"
volumes: networks:
- name: "{{cobald_docker_network}}"
networks_cli_compatible: True
# env:
# slurmuser: "{{slurm_user}}"
# privileged: "{{ container_privileged | bool }}"
state: started
detach: True
cleanup: True
interactive: True
# command: python3 -m cobald.daemon /etc/cobald/config.yaml
vars:
default_mounts: "{{cobald_slurm_mounts | default([])}}"
cobald_mounts:
- "~{{unpriv_user}}/cobald:/etc/cobald" - "~{{unpriv_user}}/cobald:/etc/cobald"
# - /container/volumes/cobald:/etc/cobald:ro # - /container/volumes/cobald:/etc/cobald:ro
- "~{{unpriv_user}}/cobald/modules:/usr/local/src/cobaldmodules" - "~{{unpriv_user}}/cobald/modules:/usr/local/src/cobaldmodules"
- "~{{unpriv_user}}/cobald-src:/usr/local/src/cobald:ro" - "~{{unpriv_user}}/cobald-src:/usr/local/src/cobald:ro"
- "~{{unpriv_user}}/tardis-src:/usr/local/src/tardis:ro" - "~{{unpriv_user}}/tardis-src:/usr/local/src/tardis:ro"
networks:
- name: "{{docker_network}}"
state: started
detach: True
cleanup: True
interactive: True
# command: /bin/bash
# python3 -m cobald.daemon /etc/cobald/config.yaml
command: python3 -m cobald.daemon /etc/cobald/config.yaml

View File

@@ -1,4 +1,4 @@
FROM docker.io/library/centos:7 FROM {{ cobald_docker_base_image | default("docker.io/library/centos:7") }}
RUN yum update -y && \ RUN yum update -y && \
yum install -y python3 git && pip3 install --upgrade pip && \ yum install -y python3 git && pip3 install --upgrade pip && \
@@ -41,11 +41,20 @@ VOLUME /usr/local/src/cobaldmodules
VOLUME /etc/cobald VOLUME /etc/cobald
COPY cobald-entrypoint.sh /usr/local/sbin/docker-entrypoint.sh RUN mkdir -p /usr/local/lib/entrypoints.d/
RUN chmod 755 /usr/local/sbin/docker-entrypoint.sh COPY init-cobaldmodules.sh /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
ENTRYPOINT [ "/usr/local/sbin/docker-entrypoint.sh" ] RUN chmod 755 /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
# Install the init hook that starts the cobald daemon. printf replaces
# `echo -e` because the base image is configurable and not every /bin/sh
# interprets `-e`; the directory is created in case the base image lacks it.
RUN mkdir -p /etc/docker-init.d && \
    printf '#!/bin/sh\npython3 -m cobald.daemon /etc/cobald/config.yaml\n' > /etc/docker-init.d/70-cobald && \
    chmod 755 /etc/docker-init.d/70-cobald
{% if cobald_docker_default_command | default(True) -%}
COPY cobald-entrypoint.sh /usr/local/sbin/cobald-entrypoint.sh
RUN chmod 755 /usr/local/sbin/cobald-entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/cobald-entrypoint.sh" ]
RUN yum -y install iproute &&\ RUN yum -y install iproute &&\
yum clean all && rm -rf /var/cache/yum yum clean all && rm -rf /var/cache/yum
@@ -54,4 +63,6 @@ USER cobald
STOPSIGNAL SIGINT STOPSIGNAL SIGINT
CMD "python3 -m cobald.daemon /etc/cobald/config.yaml" # CMD "python3 -m cobald.daemon /etc/cobald/config.yaml"
# Start cobald through its init hook; the hook is installed as 70-cobald.
CMD /etc/docker-init.d/70-cobald
{%- endif %}

View File

@@ -0,0 +1,12 @@
---
# Variable overrides for building and running cobald on top of the slurm test
# environment. The `slurm` dictionary (base_image, network, domain, mounts) is
# expected to have been published as a fact by the slurm role beforehand.

# Tag of the cobald image built for the slurm setup.
cobald_image_tag: slurm
# Build the cobald image from the slurm base image.
cobald_docker_base_image: "{{ slurm.base_image }}"
# Leave out the ENTRYPOINT/CMD section of the cobald Dockerfile template.
cobald_docker_default_command: false
# Attach the cobald container to the slurm network and domain.
cobald_docker_network: "{{ slurm.network }}"
cobald_domainname: "{{ slurm.domain }}"
# Volumes mounted into the cobald container in addition to its own mounts.
cobald_slurm_mounts: "{{ slurm.mounts }}"
#- /container/volumes/slurm/:/etc/slurm/:rw
##- "{{slurm_cfg_path | mandatory}}:/etc/slurm/:rw"
#- /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
## - "{{slurm_munge_path | mandatory}}:/etc/munge/munge.key:rw"
#- slurm-shared:/shared/:rw
## - "{{slurm_shared_path | mandatory}}:{{slurm_shared_target | default('/shared')}}:rw"

View File

@@ -0,0 +1 @@
# Name of the docker network for the cobald container, taken from docker_network.
cobald_docker_network: "{{ docker_network }}"

View File

@@ -0,0 +1,11 @@
# Determine the hostname of the cobald container and publish it as the fact
# cobald_container_hostname. Nothing is done when that variable is already
# defined.
- when: cobald_container_hostname is not defined
  block:
    - name: inspect the cobald container
      docker_container_info:
        name: "{{ container_name | mandatory }}"
      register: cobald_container_info
    # Reuse the hostname reported for the container; otherwise fall back to
    # "cobald-" followed by six random lowercase letters.
    - name: remember the cobald container hostname
      set_fact:
        cobald_container_hostname: |-
          {{cobald_container_info.container.Config.Hostname | default('cobald-'+
          lookup('password', '/dev/null chars=ascii_lowercase length=6')) }}

View File

@@ -3,3 +3,6 @@ slurm_user: slurm
slurm_log_path_ctld: /var/log/slurm/slurmctld.log slurm_log_path_ctld: /var/log/slurm/slurmctld.log
slurm_log_path_d: /var/log/slurm/slurmd.log slurm_log_path_d: /var/log/slurm/slurmd.log
slurm_log_path_sched: /var/log/slurm/slurmsched.log slurm_log_path_sched: /var/log/slurm/slurmsched.log
slurm_prefix: slurm
slurm_domain: slurm.local
docker_network: slurm

View File

@@ -1,3 +1,3 @@
- name: reconfigure slurm - name: reconfigure slurm
command: shell:
cmd: docker container exec -it slurm-ctl scontrol reconfigure cmd: "docker container exec -it {{slurm_prefix}}-ctl scontrol reconfigure || docker container restart {{slurm_prefix}}-ctl && docker container exec -it {{slurm_prefix}}-ctl scontrol reconfigure"

View File

@@ -5,7 +5,7 @@
- include_role: - include_role:
name: slurm_dockerimage name: slurm_dockerimage
loop: loop: # FIXME: default(omit)!
- name: slurmctld - name: slurmctld
dockerfile: "{{ lookup('file', 'slurmctld.Dockerfile') }}" dockerfile: "{{ lookup('file', 'slurmctld.Dockerfile') }}"
files: files:
@@ -59,12 +59,14 @@
loop: loop:
- slurm.conf - slurm.conf
- cgroup.conf - cgroup.conf
vars:
alloc_nodes: "{{ [ slurm_prefix+'-submit1' ] + extra_nodes | default([])}}"
notify: reconfigure slurm notify: reconfigure slurm
tags: [ slurm-config ] tags: [ slurm-config ]
- name: "create docker network to make service discovery work" - name: "create docker network to make service discovery work"
docker_network: docker_network:
name: slurm name: "{{ docker_network }}"
state: present state: present
- name: "create docker volume for shared access between nodes" - name: "create docker volume for shared access between nodes"
@@ -73,7 +75,7 @@
state: present state: present
- set_fact: - set_fact:
slurm_nodes: # default nodes: controller and submit machine slurm_nodes_std: # default nodes: controller and submit machine
- machine: ctl - machine: ctl
image: slurm:slurmctld image: slurm:slurmctld
exposed_ports: [ "6817:6817/tcp" ] exposed_ports: [ "6817:6817/tcp" ]
@@ -81,10 +83,18 @@
image: slurm:slurmd image: slurm:slurmd
extra_mounts: extra_mounts:
- "/home/{{unpriv_user}}/job3/:/mnt/:rw" - "/home/{{unpriv_user}}/job3/:/mnt/:rw"
# - machine: slurm-cobald slurm_nodes_exec: | # extend range to execute nodes list
# image: slurm-cobald {% set slurm_nodes_exec = slurm_nodes_exec | default([]) %}
# extra_mounts: {% for i in range(1, num_nodes+1) -%}
# # TODO {% set _ = slurm_nodes_exec.extend([
{'machine':'exec%s'|format(i), 'image': 'slurm:slurmd'}]) -%}
{%- endfor %}
{{ slurm_nodes_exec }}
slurm_default_mounts:
- /container/volumes/slurm/:/etc/slurm/:rw
- /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
- slurm-shared:/shared/:rw
slurm_network: "{{docker_network}}"
tags: [ slurm-config ] tags: [ slurm-config ]
# TODO: reserve some address using docker_network_info and assign as aux # TODO: reserve some address using docker_network_info and assign as aux
@@ -93,13 +103,13 @@
- name: run slurm docker containers - name: run slurm docker containers
docker_container: docker_container:
name: "slurm-{{item.machine}}" name: "{{ slurm_prefix }}-{{ item.machine }}"
hostname: "slurm-{{item.machine}}" hostname: "{{ slurm_prefix }}-{{ item.machine }}"
domainname: "slurm.local" domainname: "{{ slurm_domain }}"
volumes: "{{default_mounts + ( item.extra_mounts | default([]) ) }}" volumes: "{{ slurm_default_mounts + ( item.extra_mounts | default([]) ) }}"
ports: "{{ item.exposed_ports | default([]) }}" ports: "{{ item.exposed_ports | default([]) }}"
networks: networks:
- name: "slurm" - name: "{{ slurm_network }}"
env: env:
slurmuser: "{{ slurm_user }}" slurmuser: "{{ slurm_user }}"
image: "{{ item.image }}" image: "{{ item.image }}"
@@ -110,15 +120,17 @@
networks_cli_compatible: True networks_cli_compatible: True
interactive: True interactive: True
vars: vars:
default_mounts: slurm_nodes_all: "{{ slurm_nodes_exec + slurm_nodes_std }}"
- /container/volumes/slurm/:/etc/slurm/:rw
- /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
- slurm-shared:/shared/:rw
slurm_nodes_all: | # add execute nodes
{% for i in range(1, num_nodes+1) -%}
{% set _ = slurm_nodes.extend([
{'machine':'exec%s'|format(i), 'image': 'slurm:slurmd'}]) -%}
{%- endfor %}
{{ slurm_nodes }}
loop: "{{ slurm_nodes_all }}" loop: "{{ slurm_nodes_all }}"
loop_control:
label: "{{slurm_prefix}}-{{ item.machine }}"
tags: [ slurm-config ] tags: [ slurm-config ]
# Publish the slurm settings as a single dictionary fact named `slurm`.
- name: set facts to be used by other modules
  set_fact:
    slurm:
      user: "{{ slurm_user }}"
      domain: "{{ slurm_domain }}"
      base_image: "slurm:base"
      mounts: "{{ slurm_default_mounts }}"
      network: "{{ docker_network }}"

View File

@@ -164,5 +164,7 @@ SlurmSchedLogFile={{slurm_log_path_sched}}
# #
# COMPUTE NODES # COMPUTE NODES
NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN
NodeName=slurm-submit1 CPUs=1 State=UNKNOWN {% for i in alloc_nodes -%}
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes=slurm-submit1 Default=YES MaxTime=INFINITE State=UP NodeName={{i}} State=UNKNOWN
{% endfor %}
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes={{alloc_nodes | join(',')}} Default=YES MaxTime=INFINITE State=UP

View File

@@ -5,5 +5,3 @@ if [ -f "/etc/munge/munge.key" ] ; then
chown munge:munge /etc/munge/munge.key chown munge:munge /etc/munge/munge.key
chmod 600 /etc/munge/munge.key chmod 600 /etc/munge/munge.key
fi fi
exec "$@"

View File

@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Container entrypoint: run the init hooks found in
# /usr/local/lib/entrypoints.d/ in glob order, then replace this shell with
# the requested command.
set -e

for hook in /usr/local/lib/entrypoints.d/* ; do
    # Stop at the first entry that is not a regular file or whose script exits
    # non-zero; the main command below is started either way.
    # Quoting keeps paths containing whitespace or glob characters intact.
    [ -f "$hook" ] && /bin/sh "$hook" || break
done

# Run the given command line, or bash when none was passed.
exec "${@:-/bin/bash}"

View File

@@ -7,10 +7,12 @@ RUN yum install -y epel-release && \
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \ RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
yum clean all && rm -rf /var/cache/yum yum clean all && rm -rf /var/cache/yum
# FIXME RUN mkdir -p /usr/local/lib/entrypoints.d/
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
RUN chown root:root /usr/local/sbin/entrypoint.sh && \ COPY --chown=root:root entry-munge.sh /usr/local/lib/entrypoints.d/10-munge.sh
COPY --chown=root:root entrypoint.sh /usr/local/sbin/entrypoint.sh
RUN chmod 755 /usr/local/lib/entrypoints.d/10-munge.sh && \
chmod 755 /usr/local/sbin/entrypoint.sh chmod 755 /usr/local/sbin/entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ] ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]

View File

@@ -23,6 +23,8 @@
group: root group: root
mode: u=rwx,g=rx,o=rx mode: u=rwx,g=rx,o=rx
loop: "{{ image.files | default([]) }}" loop: "{{ image.files | default([]) }}"
loop_control:
label: "{{ item.dest }}"
register: slurm_cp_files register: slurm_cp_files
- docker_image: - docker_image:

View File

@@ -16,6 +16,7 @@
- file: slurm-base.Dockerfile - file: slurm-base.Dockerfile
perms: u=rw,g=r,o=r perms: u=rw,g=r,o=r
- file: entrypoint.sh - file: entrypoint.sh
- file: entry-munge.sh
- file: docker-init - file: docker-init
- file: start-scripts/10-munge - file: start-scripts/10-munge
when: not (slurm_baseimg_build_chg | default(False)) when: not (slurm_baseimg_build_chg | default(False))