#!/bin/bash
#SBATCH -D /shared

# Pilot job for a virtual SLURM "drone" node:
#  * derives its node name from the host name,
#  * exposes a read-only cgroup (freezer) view to an inner container,
#  * starts the slurmd container via Singularity,
#  * sleeps for the allotted walltime, then drains and tears everything down.
#
# Inputs (environment): SLURM_Walltime (minutes), TardisDroneUuid.
# NB: was "#!/bin/sh", but the script uses bash features — run under bash.

# Debug aid: dump the exported environment and the invocation arguments.
export
echo "$@"

# Map e.g. host "worker0042" -> virtual node name "drone0042".
nodename=$(hostname | awk '{ print "drone" substr($1, match($1, "([[:digit:]]+)")) }')

SHUTDOWN_DONE=0

# Drain the virtual node, cancel its jobs (soft, then hard after KillWait),
# stop the container and unmount the cgroup views.
# Re-entry safe via SHUTDOWN_DONE: it is installed as the EXIT trap AND called
# explicitly at the end of the script, so it would otherwise run twice.
handler_quit() {
	[ "${SHUTDOWN_DONE}" -ne 0 ] && return
	set -x
	echo "drain container"
	scontrol update NodeName="${nodename}" State=DRAIN Reason="cobald node quit"
	shutdown_jobs=$(squeue -w "${nodename}" --noheader -O jobid)
	# Intentionally unquoted: word-splitting passes each job id separately.
	[ -n "${shutdown_jobs}" ] && scancel ${shutdown_jobs}
	#scancel -w ${nodename}
	# Give jobs up to (KillWait - 2) seconds to terminate gracefully.
	i=$(( $(scontrol show config | grep KillWait | \
		sed 's/^KillWait.*= \([0-9]*\) sec/\1/') - 2 ))
	# Wait while jobs remain AND time is left.
	# (Original used '-o ${i} -lt 1', which never times out while jobs
	# remain and spins forever once i drops below 1.)
	while [ -n "$(squeue -w "${nodename}" --noheader -O jobid)" ] && [ "${i}" -ge 1 ]
	do
		i=$(( i - 1 ))
		sleep 1
	done
	scancel -s KILL -w "${nodename}"	# hard kill all remaining jobs
	echo "shutdown container"
	scontrol update NodeName="${nodename}" State=DOWN Reason=shutdown
	singularity instance stop slurm-drone
	scontrol update NodeName="${nodename}" State=FUTURE
	umount /inner-cgroup/freezer
	umount /inner-cgroup
	SHUTDOWN_DONE=1
	exit 0
}

# set -x
trap handler_quit EXIT

# Build a read-only tmpfs that exposes only the slurm freezer cgroup
# to the container (bind-mounted over its /sys/fs/cgroup).
echo "mounting cgroups"
mkdir /inner-cgroup
mount -t tmpfs none /inner-cgroup
mkdir /inner-cgroup/freezer/
mount --bind /sys/fs/cgroup/freezer/slurm/ /inner-cgroup/freezer/
mount -o remount,ro /inner-cgroup

echo "starting ${nodename}"
scontrol update NodeName="${nodename}" State=RESUME	# revoke last DRAIN
scontrol update NodeName="${nodename}" State=FUTURE
singularity instance start \
	-B /inner-cgroup/:/sys/fs/cgroup/ \
	--writable-tmpfs /shared/slurmd.sif slurm-drone \
	slurm-ctl "${nodename}"
# scontrol update NodeName=${nodename} NodeHostname=${SLURM_JOB_ID}
scontrol update NodeName="${nodename}" NodeHostname="${TardisDroneUuid}"
# NOTE(review): this $? tests the scontrol update above, NOT the
# 'singularity instance start' — confirm whether that is intended.
if [ $? -eq 0 ] ; then
	# Sleep just short of the walltime so the EXIT trap can still drain.
	echo "container started, sleeping $(( 60 * SLURM_Walltime - 2 ))"
	sleep $(( 60 * SLURM_Walltime - 2 ))
fi
handler_quit