|
|
|
@@ -4,16 +4,34 @@ export
|
|
|
|
|
# Log the arguments this script was started with. Quoted so that arguments
# are printed verbatim instead of being word-split and glob-expanded.
echo "$@"

# Slurm node name of this drone: "drone" followed by the hostname from its
# first digit onwards (if the hostname has no digit, awk's match() yields 0
# and the whole hostname is appended).
nodename=$(hostname | awk '{ print "drone" substr($1,match($1, "([[:digit:]]+)")) }')

# Set to 1 by handler_quit once cleanup has run, so a second call is a no-op.
SHUTDOWN_DONE=0
|
|
|
|
|
|
|
|
|
|
#######################################
# Drain this drone's Slurm node, cancel its jobs, stop the container and
# release the cgroup mounts. Safe to call more than once: the script calls it
# explicitly at its end and it is also registered as the EXIT trap.
# Globals:
#   nodename       (read)          Slurm node name of this drone
#   SHUTDOWN_DONE  (read/written)  set to 1 once cleanup has completed
# Arguments: none
# Returns: the first call does not return (it exits the script with 0);
#          later calls return immediately.
#######################################
function handler_quit(){
  # Cleanup already ran (e.g. re-entered through the EXIT trap): nothing to do.
  [ "${SHUTDOWN_DONE}" -ne 0 ] && return
  set -x

  echo "drain container"
  scontrol update NodeName="${nodename}" State=DRAIN Reason="cobald node quit"

  # Cancel whatever is still queued or running on this node.
  local shutdown_jobs
  shutdown_jobs=$(squeue -w "${nodename}" --noheader -O jobid)
  # shellcheck disable=SC2086 # splitting into one word per job id is intended
  [ -n "${shutdown_jobs}" ] && scancel ${shutdown_jobs}
  #scancel -w ${nodename}

  # Give the jobs up to (KillWait - 2) seconds to leave the queue.
  local kill_wait
  kill_wait=$(scontrol show config | grep KillWait \
    | sed 's/^KillWait.*= \([0-9]*\) sec/\1/')
  case "${kill_wait}" in
    # Unparsable output: fall back to Slurm's documented KillWait default.
    ''|*[!0-9]*) kill_wait=30 ;;
  esac
  local i=$(( 10#${kill_wait} - 2 ))

  # Wait only while jobs remain AND there is time left; the budget check
  # guarantees the loop ends even if the jobs never go away.
  while [ "${i}" -gt 0 ] \
    && [ -n "$(squeue -w "${nodename}" --noheader -O jobid)" ]; do
    i=$(( i - 1 ))
    sleep 1
  done
  scancel -s KILL -w "${nodename}" # hard kill all remaining jobs

  echo "shutdown container"
  scontrol update NodeName="${nodename}" State=DOWN Reason=shutdown
  singularity instance stop slurm-drone
  scontrol update NodeName="${nodename}" State=FUTURE
  umount /inner-cgroup/freezer
  umount /inner-cgroup
  SHUTDOWN_DONE=1
  exit 0
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# set -x

# Run handler_quit whenever this script exits, so the node is drained and the
# container is stopped no matter how the script ends.
trap 'handler_quit' EXIT
|
|
|
|
@@ -26,12 +44,16 @@ mount --bind /sys/fs/cgroup/freezer/slurm/ /inner-cgroup/freezer/
|
|
|
|
|
# Make /inner-cgroup read-only before it is bound into the container as
# /sys/fs/cgroup/.
mount -o remount,ro /inner-cgroup

echo "starting ${nodename}"
scontrol update NodeName="${nodename}" State=RESUME # revoke last DRAIN
scontrol update NodeName="${nodename}" State=FUTURE
singularity instance start \
  -B /inner-cgroup/:/sys/fs/cgroup/ \
  --writable-tmpfs /shared/slurmd.sif slurm-drone \
  slurm-ctl "${nodename}"
# scontrol update NodeName=${nodename} NodeHostname=${SLURM_JOB_ID}

# Only keep the drone alive if registering its hostname with Slurm succeeded.
if scontrol update NodeName="${nodename}" NodeHostname="${TardisDroneUuid}"; then
  # SLURM_Walltime is multiplied by 60 to get seconds; the 2 s subtracted
  # presumably leave a margin for the shutdown below - TODO confirm.
  sleep_seconds=$(( 60 * ${SLURM_Walltime} - 2 ))
  echo "container started, sleeping ${sleep_seconds}"
  sleep "${sleep_seconds}"
fi
handler_quit
|
|
|
|
|