Compare commits

..

2 Commits

Author SHA1 Message Date
574d2fcb4e drone shutdown 2021-07-05 18:25:26 +02:00
2919c98d5f cons res 2021-07-05 18:24:24 +02:00
2 changed files with 28 additions and 4 deletions

View File

@@ -4,16 +4,34 @@ export
# Log the arguments this launcher was invoked with (quoted so that
# whitespace and glob characters in arguments are printed verbatim).
echo "$@"

# Build the Slurm node name: "drone" followed by the part of the hostname
# that starts at its first run of digits (e.g. "node12" -> "drone12").
# If the hostname has no digits, awk's match() returns 0 and the whole
# hostname is appended instead.
nodename=$(hostname | awk '{ print "drone" substr($1,match($1, "([[:digit:]]+)")) }')

# Guard flag read by handler_quit: 0 = shutdown not yet performed.
SHUTDOWN_DONE=0
#######################################
# Drain this drone's Slurm node, cancel its jobs, stop the container and
# release the cgroup bind mounts, then terminate the script.
# Globals:
#   nodename       (read)  Slurm node name of this drone
#   SHUTDOWN_DONE  (read/written) non-zero once the shutdown has run, so a
#                  second invocation (e.g. from the EXIT trap fired by the
#                  final `exit 0`) is a no-op
# Arguments: none
# Outputs:   progress messages on stdout; command trace on stderr (set -x)
# Returns:   does not return on the first call (exits the shell with 0);
#            returns immediately when the shutdown was already performed
#######################################
handler_quit() {
  if [ "${SHUTDOWN_DONE:-0}" -ne 0 ]; then
    return
  fi
  set -x

  local -a pending_jobs
  local kill_wait grace

  echo "drain container"
  scontrol update NodeName="${nodename}" State=DRAIN Reason="cobald node quit"

  # squeue pads every job id with blanks; unquoted expansion into an array
  # is intentional so that word splitting strips the padding.
  # shellcheck disable=SC2207
  pending_jobs=( $(squeue -w "${nodename}" --noheader -O jobid) )
  if (( ${#pending_jobs[@]} > 0 )); then
    scancel "${pending_jobs[@]}"
  fi
  #scancel -w ${nodename}

  # Give the cancelled jobs up to (KillWait - 2) seconds to disappear.
  # Fall back to 30 s when the value cannot be read from the controller,
  # otherwise the arithmetic below would start from a bogus negative count.
  kill_wait=$(scontrol show config \
    | awk -F'[= ]+' '/^KillWait/ { print $2; exit }')
  if ! [[ "${kill_wait}" =~ ^[0-9]+$ ]]; then
    kill_wait=30
  fi
  grace=$(( kill_wait - 2 ))

  # Wait only while jobs are still listed AND the grace period is not used
  # up; either condition ending the loop leads to the hard kill below.
  while (( grace > 0 )) \
    && [[ -n "$(squeue -w "${nodename}" --noheader -O jobid)" ]]; do
    grace=$(( grace - 1 ))
    sleep 1
  done
  scancel -s KILL -w "${nodename}" # hard kill all remaining jobs

  echo "shutdown container"
  scontrol update NodeName="${nodename}" State=DOWN Reason=shutdown
  singularity instance stop slurm-drone
  scontrol update NodeName="${nodename}" State=FUTURE
  umount /inner-cgroup/freezer
  umount /inner-cgroup

  SHUTDOWN_DONE=1
  exit 0
}
# set -x
# Run the shutdown handler whenever the script exits, whatever the path.
trap handler_quit EXIT
@@ -26,12 +44,16 @@ mount --bind /sys/fs/cgroup/freezer/slurm/ /inner-cgroup/freezer/
# Make the bind-mounted cgroup tree read-only before handing it to the
# container.
mount -o remount,ro /inner-cgroup

echo "starting ${nodename}"
scontrol update NodeName="${nodename}" State=RESUME # revoke last DRAIN
scontrol update NodeName="${nodename}" State=FUTURE

# Track success of both the container start and the hostname registration:
# testing only `$?` after the scontrol call would report "container
# started" (and sleep for the whole walltime) even when singularity failed.
start_ok=1
singularity instance start \
  -B /inner-cgroup/:/sys/fs/cgroup/ \
  --writable-tmpfs /shared/slurmd.sif slurm-drone \
  slurm-ctl "${nodename}" || start_ok=0
# scontrol update NodeName=${nodename} NodeHostname=${SLURM_JOB_ID}
# NOTE(review): TardisDroneUuid is expected in the environment; it is not
# set anywhere in the visible part of the script - confirm with the caller.
scontrol update NodeName="${nodename}" NodeHostname="${TardisDroneUuid}" \
  || start_ok=0

if (( start_ok )); then
  # SLURM_Walltime is presumably given in minutes (hence the factor 60);
  # the sleep ends 2 seconds early before the shutdown handler is called.
  nap_seconds=$(( 60 * ${SLURM_Walltime} - 2 ))
  echo "container started, sleeping ${nap_seconds}"
  sleep "${nap_seconds}"
fi
handler_quit

View File

@@ -105,8 +105,10 @@ Waittime=0
#MaxMemPerCPU=0 #MaxMemPerCPU=0
#SchedulerTimeSlice=30 #SchedulerTimeSlice=30
SchedulerType=sched/backfill SchedulerType=sched/backfill
SelectType=select/linear # SelectType=select/linear
SelectType=select/cons_res
#SelectTypeParameters= #SelectTypeParameters=
SelectTypeParameters=CR_CORE
# #
# #
# JOB PRIORITY # JOB PRIORITY