From 574d2fcb4e76a0e3e01941edc29081f36326cd7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20M=C3=BCller?=
Date: Mon, 5 Jul 2021 18:25:26 +0200
Subject: [PATCH] drone shutdown

---
Review notes: this patch text was restored from a copy whose line breaks had
been lost, so blank lines and indentation are reconstructed. Two fixes were
applied on top of the original change (wait-loop condition in handler_quit,
exit-status check after the container start); the blob hashes on the "index"
line therefore no longer match the resulting file.

 roles/cobald/files/start-drone | 36 ++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/roles/cobald/files/start-drone b/roles/cobald/files/start-drone
index e55282f..a344d1c 100644
--- a/roles/cobald/files/start-drone
+++ b/roles/cobald/files/start-drone
@@ -4,16 +4,40 @@ export
 echo $@
 nodename=$(hostname | awk '{ print "drone" substr($1,match($1, "([[:digit:]]+)")) }')
 
+# Guard flag: set to 1 once handler_quit has completed, so the second
+# invocation (explicit call plus EXIT trap) returns immediately.
+SHUTDOWN_DONE=0
+
+# Drain the node, cancel its jobs, wait up to KillWait - 2 seconds for them
+# to end, hard-kill the rest, then stop the container and free the node.
 function handler_quit(){
+  [ $SHUTDOWN_DONE -ne 0 ] && return
+  set -x
+  echo "drain container"
+  scontrol update NodeName=${nodename} State=DRAIN Reason="cobald node quit"
+  shutdown_jobs=$(squeue -w ${nodename} --noheader -O jobid)
+  [ -n "${shutdown_jobs}" ] && scancel ${shutdown_jobs}
+  #scancel -w ${nodename}
+  # grace period in seconds, derived from the KillWait value of scontrol
+  i=$(( $(scontrol show config | grep KillWait | \
+      sed 's/^KillWait.*= \([0-9]*\) sec/\1/') - 2 ))
+  # wait while jobs remain AND the grace period has not run out
+  while [ -n "$(squeue -w ${nodename} --noheader -O jobid)" ] && [ "${i}" -ge 1 ]
+  do
+    i=$(( ${i} - 1 ))
+    sleep 1
+  done
+  scancel -s KILL -w ${nodename} # hard kill all remaining jobs
   echo "shutdown container"
+  scontrol update NodeName=${nodename} State=DOWN Reason=shutdown
   singularity instance stop slurm-drone
   scontrol update NodeName=${nodename} State=FUTURE
   umount /inner-cgroup/freezer
   umount /inner-cgroup
+  SHUTDOWN_DONE=1
   exit 0
 }
 
-
 # set -x
 trap handler_quit EXIT
 
@@ -26,12 +50,18 @@ mount --bind /sys/fs/cgroup/freezer/slurm/ /inner-cgroup/freezer/
 mount -o remount,ro /inner-cgroup
 
 echo "starting ${nodename}"
+scontrol update NodeName=${nodename} State=RESUME # revoke last DRAIN
+scontrol update NodeName=${nodename} State=FUTURE
 singularity instance start \
         -B /inner-cgroup/:/sys/fs/cgroup/ \
         --writable-tmpfs /shared/slurmd.sif slurm-drone \
         slurm-ctl ${nodename}
+# scontrol update NodeName=${nodename} NodeHostname=${SLURM_JOB_ID}
+# Only set the hostname when the container start succeeded; this also keeps
+# the exit status tested below tied to "singularity instance start".
+[ $? -eq 0 ] && scontrol update NodeName=${nodename} NodeHostname=${TardisDroneUuid}
 if [ $? -eq 0 ] ; then
-  echo "container started, sleeping $(( 60 * ${SLURM_Walltime}))"
-  sleep $(( 60 * ${SLURM_Walltime} ))
+  echo "container started, sleeping $(( 60 * ${SLURM_Walltime} - 2 ))"
+  sleep $(( 60 * ${SLURM_Walltime} - 2 ))
 fi
 handler_quit