Containerised SLURM: run the slurm and cobald roles in one play, make the
number of execute nodes configurable (slurm_num_nodes) and start the daemons
inside the containers through a small docker-init script that launches the
start scripts found in /etc/docker-init.d.

diff --git a/inv.yml b/inv.yml
index 7b3df00..6297cf2 100644
--- a/inv.yml
+++ b/inv.yml
@@ -19,6 +19,7 @@ all:
     slurm:
       hosts:
         ed-c7-1:
+        ed-c7-2:
     cobald:
       hosts:
         ed-c7-1:
diff --git a/play.yml b/play.yml
index ebf7f8b..b56b6a0 100644
--- a/play.yml
+++ b/play.yml
@@ -53,10 +53,10 @@
         - htcondor-containered
         - htcondor
 
-- hosts: slurm
+- hosts: slurm, cobald
   vars:
     container_privileged: True
-    num_nodes: 3
+    slurm_num_nodes: 10
   roles:
     - name: "setup docker"
       role: docker
@@ -65,15 +65,12 @@
       role: slurm
       vars:
         slurm_user: slurm # or root
+        num_nodes: "{{slurm_num_nodes}}"
+      when: '"slurm" in group_names'
       tags: slurm
-
-- hosts: cobald
-  roles:
-    - name: "setup docker"
-      role: docker
-      tags: docker
     - name: "install cobald"
       role: cobald
       vars:
         docker_network: slurm
+      when: '"cobald" in group_names'
       tags: cobald
diff --git a/roles/slurm/files/docker-init b/roles/slurm/files/docker-init
new file mode 100644
index 0000000..d920d9c
--- /dev/null
+++ b/roles/slurm/files/docker-init
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Start every script in /etc/docker-init.d in the background, forward
+# SIGINT/SIGTERM to them and exit only after all of them have finished.
+
+# Signal handler: stop the children of each start script, then the start
+# script itself.
+function trp_term(){
+    echo "pkill -P $pids"
+    for j in $pids ; do
+        pkill -P "$j"
+        kill -SIGTERM "$j"
+    done
+}
+trap trp_term SIGINT SIGTERM
+pids=""
+for i in /etc/docker-init.d/* ; do
+    # Skip anything that is not a regular file (including the literal
+    # pattern left behind by an unmatched glob) instead of aborting the loop.
+    [ -f "$i" ] || continue
+    "$i" &
+    pids="$pids $!"
+done
+wait $pids
+rc=$?
+# A trapped signal makes the wait above return early; wait again so that the
+# start scripts get the chance to shut down before this script exits.
+wait
+exit $rc
+
+# TODO: call start scripts like "foo.sh start" and "foo.sh stop" to avoid pkill
diff --git a/roles/slurm/files/slurmctld.Dockerfile b/roles/slurm/files/slurmctld.Dockerfile
index efe2591..34175d6 100644
--- a/roles/slurm/files/slurmctld.Dockerfile
+++ b/roles/slurm/files/slurmctld.Dockerfile
@@ -1,4 +1,4 @@
-FROM docker.io/library/centos:7 as base
+FROM docker.io/library/centos:7
 
 RUN yum install -y epel-release && \
     yum install -y slurm && \
@@ -7,6 +7,7 @@ RUN yum install -y epel-release && \
 RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
     yum clean all && rm -rf /var/cache/yum
 
+# FIXME
 COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
 
 RUN chown root:root /usr/local/sbin/entrypoint.sh && \
@@ -23,10 +24,16 @@ RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
 RUN yum install -y slurm-slurmctld && \
     yum clean all && rm -rf /var/cache/yum
 
+COPY docker-init /usr/local/sbin/docker-init
+RUN mkdir /etc/docker-init.d && chmod 755 /usr/local/sbin/docker-init
+# FIXME
+COPY start-scripts/10-munge /etc/docker-init.d/
+COPY start-scripts/20-slurmctld /etc/docker-init.d/
+RUN chmod 755 /etc/docker-init.d/*
+
 ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
 ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
 ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
 
-CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
-    su -s /bin/sh -c "slurmctld -D" ${slurmuser} 2>/dev/null 1>/dev/null & \
-    tail --retry --pid $! -f ${SLURMCTLD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
+# Exec form: avoids wrapping docker-init (and its signal trap) in `/bin/sh -c`.
+CMD ["/usr/local/sbin/docker-init"]
diff --git a/roles/slurm/files/slurmd.Dockerfile b/roles/slurm/files/slurmd.Dockerfile
index e20c91b..1e7883f 100644
--- a/roles/slurm/files/slurmd.Dockerfile
+++ b/roles/slurm/files/slurmd.Dockerfile
@@ -7,6 +7,7 @@ RUN yum install -y epel-release && \
 RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
     yum clean all && rm -rf /var/cache/yum
 
+# FIXME
 COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
 
 RUN chown root:root /usr/local/sbin/entrypoint.sh && \
@@ -23,10 +24,16 @@ RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
 RUN yum install -y slurm-slurmd && \
     yum clean all && rm -rf /var/cache/yum
 
+COPY docker-init /usr/local/sbin/docker-init
+RUN mkdir /etc/docker-init.d && chmod 755 /usr/local/sbin/docker-init
+# FIXME
+COPY start-scripts/10-munge /etc/docker-init.d/
+COPY start-scripts/30-slurmd /etc/docker-init.d/
+RUN chmod 755 /etc/docker-init.d/*
+
 ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
 ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
 ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
 
-CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
-    slurmd -D 2>/dev/null 1>/dev/null & \
-    tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
+# Exec form: avoids wrapping docker-init (and its signal trap) in `/bin/sh -c`.
+CMD ["/usr/local/sbin/docker-init"]
diff --git a/roles/slurm/files/start-scripts/10-munge b/roles/slurm/files/start-scripts/10-munge
new file mode 100644
index 0000000..f38cbaa
--- /dev/null
+++ b/roles/slurm/files/start-scripts/10-munge
@@ -0,0 +1,2 @@
+#!/bin/sh
+exec su -s /bin/sh -c "munged -F" munge
diff --git a/roles/slurm/files/start-scripts/20-slurmctld b/roles/slurm/files/start-scripts/20-slurmctld
new file mode 100644
index 0000000..0a9f8cd
--- /dev/null
+++ b/roles/slurm/files/start-scripts/20-slurmctld
@@ -0,0 +1,4 @@
+#!/bin/sh
+su -s /bin/sh -c "slurmctld -D" ${slurmuser} 2>/dev/null 1>/dev/null &
+tail --retry --pid $! -f ${SLURMCTLD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
+
diff --git a/roles/slurm/files/start-scripts/30-slurmd b/roles/slurm/files/start-scripts/30-slurmd
new file mode 100644
index 0000000..bb2cbf1
--- /dev/null
+++ b/roles/slurm/files/start-scripts/30-slurmd
@@ -0,0 +1,4 @@
+#!/bin/sh
+slurmd -D 2>/dev/null 1>/dev/null &
+tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
+
diff --git a/roles/slurm/tasks/dockerimage.yml b/roles/slurm/tasks/dockerimage.yml
index cf73759..4dd92f2 100644
--- a/roles/slurm/tasks/dockerimage.yml
+++ b/roles/slurm/tasks/dockerimage.yml
@@ -1,31 +1,50 @@
 
 - file:
-    path: "/container/docker-images/{{item}}"
+    path: "/container/docker-images/slurm"
     state: directory
     owner: "{{unpriv_user}}"
     group: docker
 
-- copy:
-    src: "{{item}}.Dockerfile"
-    dest: "/container/docker-images/{{item}}/Dockerfile"
+- copy: # FIXME: template
+    src: "{{image.name}}.Dockerfile"
+    dest: "/container/docker-images/slurm/{{image.name}}.Dockerfile"
     owner: "{{unpriv_user}}"
     group: docker
   register: slurm_cp_dockerfile
 
-- copy:
-    src: "entrypoint.sh"
-    dest: "/container/docker-images/{{item}}/entrypoint.sh"
+- name: copy entrypoint and docker-init
+  copy: # FIXME: swap out
+    src: "{{item}}"
+    dest: "/container/docker-images/slurm/{{item}}"
     owner: root
     group: root
     mode: u=rwx,g=rx,o=rx
+  loop:
+    - entrypoint.sh
+    - docker-init
   register: slurm_cp_entrypt
 
+- name: copy startup scripts
+  copy:
+    src: "start-scripts/"
+    dest: "/container/docker-images/slurm/start-scripts/"
+    owner: root
+    group: root
+    mode: u=rwx,g=rx,o=rx
+  register: slurm_cp_stscrs
+
+- set_fact:
+    slurm_image_files_changed: "{{ (slurm_image_files_changed | default(False))
+      or slurm_cp_entrypt.changed or slurm_cp_stscrs.changed }}"
+
 - docker_image:
-    name: "slurm-{{item}}"
+    name: "slurm"
+    tag: "{{image.name}}"
 #    pull: False
     build:
       pull: False
-      path: "/container/docker-images/{{item}}"
-#      target: "{{item}}" # unsupported on old docker-py versions as in el7
+      path: "/container/docker-images/slurm/"
+      dockerfile: "{{image.name}}.Dockerfile"
+#      target: "{{image.name}}" # unsupported on old docker-py version as in el7
     source: build
-    force_source: "{{slurm_cp_dockerfile.changed or slurm_cp_entrypt.changed}}"
+    force_source: "{{slurm_cp_dockerfile.changed or slurm_image_files_changed}}"
diff --git a/roles/slurm/tasks/main.yml b/roles/slurm/tasks/main.yml
index 563942d..4d0d4b0 100644
--- a/roles/slurm/tasks/main.yml
+++ b/roles/slurm/tasks/main.yml
@@ -5,8 +5,17 @@
 
 - include_tasks: dockerimage.yml
   loop:
-    - slurmctld
-    - slurmd
+    - name: slurmctld
+      start:
+        - 10-munge
+        - 20-slurmctld
+    - name: slurmd
+      start:
+        - 10-munge
+        - 30-slurmd
+  loop_control:
+    loop_var: image
+    label: "{{ image.name }}"
 
 - name: generate munge key
   shell:
@@ -63,13 +72,14 @@
 - set_fact:
     slurm_nodes: # default nodes: controller and submit machine
       - machine: ctl
-        image: slurm-slurmctld
+        image: slurm:slurmctld
+        exposed_ports: [ "6817:6817/tcp" ]
       - machine: submit1
-        image: slurm-slurmd
+        image: slurm:slurmd
         extra_mounts:
           - "/home/{{unpriv_user}}/job3/:/mnt/:rw"
   tags: [ slurm-config ]
 
 # TODO: reserve some address using docker_network_info and assign as aux
 # address to enable slurmctld to get a static address in order to be
 # reachable from slurm running on docker host to enable submitting jobs.
@@ -80,6 +90,7 @@
     hostname: "slurm-{{item.machine}}"
     domainname: "slurm.local"
     volumes: "{{default_mounts + ( item.extra_mounts | default([]) ) }}"
+    ports: "{{ item.exposed_ports | default([]) }}"
     networks:
       - name: "slurm"
     env:
@@ -90,15 +101,16 @@
     cleanup: True
     privileged: "{{ container_privileged | bool }}"
     networks_cli_compatible: True
+    interactive: True
   vars:
     default_mounts:
       - /container/volumes/slurm/:/etc/slurm/:rw
       - /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
       - slurm-shared:/shared/:rw
     slurm_nodes_all: | # add execute nodes
-      {% for i in range(1, 4) -%}
+      {% for i in range(1, (num_nodes | int) + 1) -%}
       {% set _ = slurm_nodes.extend([
-      {'machine':'exec%s'|format(i), 'image': 'slurm-slurmd'}]) -%}
+      {'machine':'exec%s'|format(i), 'image': 'slurm:slurmd'}]) -%}
       {%- endfor %}
       {{ slurm_nodes }}
   loop: "{{slurm_nodes_all}}"