diff --git a/roles/cobald/files/28-sync-container-slurmd b/roles/cobald/files/28-sync-container-slurmd
new file mode 100644
index 0000000..4bd9802
--- /dev/null
+++ b/roles/cobald/files/28-sync-container-slurmd
@@ -0,0 +1,12 @@
+#!/bin/sh
+# Sync the slurmd Singularity image from the read-only build mount into the
+# shared directory when the shared copy is missing or older than the build.
+sif_src=/slurm-singimage/slurmd.sif
+sif_dst=/shared/slurmd.sif
+
+if [ -e "${sif_src}" ] && \
+        { [ ! -e "${sif_dst}" ] || [ "${sif_src}" -nt "${sif_dst}" ] ; } ; then
+    # Copy to a temporary name and rename, so that a drone starting meanwhile
+    # never sees a partially written image.
+    cp "${sif_src}" "${sif_dst}.tmp" && mv -f "${sif_dst}.tmp" "${sif_dst}"
+fi
diff --git a/roles/cobald/files/31-slurmd-configless b/roles/cobald/files/31-slurmd-configless
new file mode 100644
index 0000000..5af6c9c
--- /dev/null
+++ b/roles/cobald/files/31-slurmd-configless
@@ -0,0 +1,7 @@
+#!/bin/sh
+# Start slurmd in the background, pointing it at the controller given in
+# ${slurmctld} (--conf-server) and registering it as node ${nodename}.
+slurmd --conf-server "${slurmctld}" -D -N "${nodename}" >/dev/null 2>&1 &
+# Follow the log files for as long as slurmd runs (--pid); the log paths are
+# left unquoted on purpose so that an unset variable adds no empty argument.
+tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
diff --git a/roles/cobald/files/slurm-slurmd.def b/roles/cobald/files/slurm-slurmd.def
new file mode 100644
index 0000000..33cea00
--- /dev/null
+++ b/roles/cobald/files/slurm-slurmd.def
@@ -0,0 +1,32 @@
+Bootstrap: docker-daemon
+From: slurm:slurmd
+
+%files
+    31-slurmd-configless /etc/docker-init.d/31-slurm-configless
+    /container/volumes/munge/munge.key /etc/munge/munge.key
+
+%post
+    rm /etc/docker-init.d/30-slurmd
+    chmod 755 /etc/docker-init.d/31-slurm-configless
+
+%startscript
+    # usage: <slurmctld host> <node name>; both are exported for the init scripts
+    if [ -z "${1}" ] || [ -z "${2}" ] ; then
+        echo "undefined variables slurmctld or nodename" >&2
+        exit 1
+    fi
+    export slurmctld="${1}"
+    export nodename="${2}"
+    echo ${slurmctld} ${nodename} ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
+    exec /usr/local/sbin/entrypoint.sh /usr/local/sbin/docker-init
+
+%runscript
+    # usage: <slurmctld host> <node name>; both are exported for the init scripts
+    if [ -z "${1}" ] || [ -z "${2}" ] ; then
+        echo "undefined variables slurmctld or nodename" >&2
+        exit 1
+    fi
+    export slurmctld="${1}"
+    export nodename="${2}"
+    echo ${slurmctld} ${nodename} ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
+    exec /usr/local/sbin/entrypoint.sh /usr/local/sbin/docker-init
diff --git a/roles/cobald/files/start-drone b/roles/cobald/files/start-drone
new file mode 100644
index 0000000..21e81c8
--- /dev/null
+++ b/roles/cobald/files/start-drone
@@ -0,0 +1,40 @@
+#!/bin/sh
+#SBATCH -D /shared
+# Batch job that runs a slurmd Singularity instance ("drone") on this node
+# for SLURM_Walltime minutes and then sets the node back to state FUTURE.
+
+# Debug aid: dump the exported environment and the arguments to the job log.
+export
+echo "$@"
+
+# Node name is "drone" followed by the hostname from its first digit onwards.
+nodename=$(hostname | awk '{ print "drone" substr($1,match($1, "([[:digit:]]+)")) }')
+sleep_pid=
+
+# Stop the container and set the node to FUTURE. The traps are cleared
+# first so the cleanup runs only once, although the handler is both called
+# explicitly and registered for EXIT and signals.
+handler_quit(){
+    trap - EXIT INT TERM
+    echo "shutdown container"
+    [ -n "${sleep_pid}" ] && kill "${sleep_pid}" 2>/dev/null
+    singularity instance stop slurm-drone
+    scontrol update NodeName="${nodename}" State=FUTURE
+    exit 0
+}
+
+trap handler_quit EXIT INT TERM
+
+# set -x
+
+echo "starting ${nodename}"
+if singularity instance start --writable-tmpfs /shared/slurmd.sif slurm-drone \
+        slurm-ctl "${nodename}" ; then
+    echo "container started, sleeping $(( 60 * ${SLURM_Walltime} ))"
+    # Sleep in the background and wait for it, so that a trapped signal
+    # interrupts the wait at once instead of only after the sleep has ended.
+    sleep $(( 60 * ${SLURM_Walltime} )) &
+    sleep_pid=$!
+    wait "${sleep_pid}"
+fi
+handler_quit
diff --git a/roles/cobald/tasks/main.yml b/roles/cobald/tasks/main.yml
index 553e376..fec950d 100644
--- a/roles/cobald/tasks/main.yml
+++ b/roles/cobald/tasks/main.yml
@@ -15,6 +15,10 @@
       content: "{{ lookup('file', 'cobald-entrypoint.sh') }}"
     - dest: init-cobaldmodules.sh
       content: "{{ lookup('file', 'init-cobaldmodules.sh') }}"
+    - dest: start-drone
+      content: "{{ lookup('file', 'start-drone') }}"
+    - dest: 28-sync-container-slurmd
+      content: "{{ lookup('file', '28-sync-container-slurmd') }}"
   when: cobald_slurm | default(False)
 
 - name: build generic cobald docker image
@@ -136,6 +140,7 @@
     cobald_mounts:
       - "~{{unpriv_user}}/cobald:/etc/cobald"
 #     - /container/volumes/cobald:/etc/cobald:ro
+      - "/container/docker-images/sing-slurmd/build/:/slurm-singimage/:ro"
       - "~{{unpriv_user}}/cobald/modules:/usr/local/src/cobaldmodules"
       - "~{{unpriv_user}}/cobald-src:/usr/local/src/cobald:ro"
       - "~{{unpriv_user}}/tardis-src:/usr/local/src/tardis:ro"
diff --git a/roles/cobald/tasks/singularity.yml b/roles/cobald/tasks/singularity.yml
index 1dfb3cb..dc26f23 100644
--- a/roles/cobald/tasks/singularity.yml
+++ b/roles/cobald/tasks/singularity.yml
@@ -1,8 +1,47 @@
-- name: "setup singularity"
+- name: setup singularity
   import_role: name="singularity"
   tags: singularity
 
-#- name: copy slurm def file
-# - name: export docker image
-#- name: build container
+- name: make singularity image build directory
+  file:
+    state: directory
+    path: "{{item}}"
+    owner: "{{unpriv_user}}"
+    group: "docker"
+    mode: "0755"
+  loop:
+    - /container/docker-images/sing-slurmd
+    - /container/docker-images/sing-slurmd/cache
+    - /container/docker-images/sing-slurmd/build
+- name: copy slurm singularity container files
+  copy:
+    src: "{{item}}"
+    dest: "/container/docker-images/sing-slurmd/{{item}}"
+    owner: "{{unpriv_user}}"
+    group: "docker"
+  loop:
+    - slurm-slurmd.def
+    - 31-slurmd-configless
+  register: cobald_copy_sing_files
+
+- name: remove old container
+  file:
+    path: /container/docker-images/sing-slurmd/build/slurmd.sif
+    state: absent
+  when: cobald_copy_sing_files.changed
+
+- name: build container
+  shell:
+    chdir: /container/docker-images/sing-slurmd/
+    cmd: SINGULARITY_TMPDIR=/container/docker-images/sing-slurmd/cache
+      singularity build --disable-cache
+      /container/docker-images/sing-slurmd/build/slurmd.sif
+      /container/docker-images/sing-slurmd/slurm-slurmd.def
+    creates: /container/docker-images/sing-slurmd/build/slurmd.sif
+  register: cobald_sing_build
+
+- debug: msg="{{[cobald_sing_build.stdout, cobald_sing_build.stderr]}}"
+  tags: [ never, debug ]
+
+# TODO: trigger copy in cobald container when slurmd.sif rebuilt
 
diff --git a/roles/cobald/templates/cobald.Dockerfile b/roles/cobald/templates/cobald.Dockerfile
index 7ff8689..4db4745 100644
--- a/roles/cobald/templates/cobald.Dockerfile
+++ b/roles/cobald/templates/cobald.Dockerfile
@@ -48,6 +48,10 @@ COPY init-cobaldmodules.sh /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
 
 RUN chmod 755 /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
 
+COPY start-drone /usr/local/bin/start-drone
+COPY 28-sync-container-slurmd /etc/docker-init.d/28-sync-container-slurmd
+RUN chmod 755 /usr/local/bin/start-drone /etc/docker-init.d/28-sync-container-slurmd
+
 RUN echo -e "#!/bin/sh\npython3 -m cobald.daemon /etc/cobald/config.yaml" >> /etc/docker-init.d/70-cobald && chmod 755 /etc/docker-init.d/70-cobald
 
 {% if cobald_docker_default_command | default(True) -%}
diff --git a/roles/slurm/templates/slurm.conf.j2 b/roles/slurm/templates/slurm.conf.j2
index b1feff4..a67a907 100644
--- a/roles/slurm/templates/slurm.conf.j2
+++ b/roles/slurm/templates/slurm.conf.j2
@@ -10,6 +10,7 @@ AuthType=auth/munge
 #CheckpointType=checkpoint/none
 CryptoType=crypto/munge
 CommunicationParameters=NoAddrCache
+SlurmctldParameters=enable_configless
 #DisableRootJobs=NO
 #EnforcePartLimits=NO
 #Epilog=