slurm container running when drone started
This commit is contained in:
3
roles/cobald/files/28-sync-container-slurmd
Normal file
3
roles/cobald/files/28-sync-container-slurmd
Normal file
@@ -0,0 +1,3 @@
|
||||
#!/bin/sh
# Sync the slurmd Singularity image into the shared directory.
#
# Copies /slurm-singimage/slurmd.sif to /shared/slurmd.sif when the shared
# copy is missing or older than the source image.
#
# Exit status: 0 when the copy succeeded or nothing had to be done,
# 1 when the source image does not exist, otherwise the status of cp.

src=/slurm-singimage/slurmd.sif
dst=/shared/slurmd.sif

# Without a source image there is nothing to sync; keep the non-zero status
# the plain `[ ... ] && cp` form produced in this situation.
if [ ! -e "${src}" ] ; then
    echo "${src} not found, nothing to sync" >&2
    exit 1
fi

# `-nt` is not specified by POSIX and some /bin/sh implementations evaluate it
# to false when the second file does not exist, so the very first copy has to
# be requested explicitly.
if [ ! -e "${dst}" ] || [ "${src}" -nt "${dst}" ] ; then
    cp "${src}" "${dst}"
fi
|
3
roles/cobald/files/31-slurmd-configless
Normal file
3
roles/cobald/files/31-slurmd-configless
Normal file
@@ -0,0 +1,3 @@
|
||||
#!/bin/sh
# Start slurmd against the configuration server named in ${slurmctld},
# registering as node ${nodename}, then follow its log files.
#
# Environment read: slurmctld, nodename, SLURMD_LOG_PATH, SLURM_SCHED_LOG_PATH.

# slurmd is put in the background with all of its own output discarded.
slurmd --conf-server ${slurmctld} -D -N ${nodename} >/dev/null 2>&1 &
slurmd_pid=$!

# Follow the log files for as long as the slurmd process is alive; the
# variables are left unquoted on purpose so an empty path is simply dropped.
tail --retry --pid ${slurmd_pid} -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
|
30
roles/cobald/files/slurm-slurmd.def
Normal file
30
roles/cobald/files/slurm-slurmd.def
Normal file
@@ -0,0 +1,30 @@
|
||||
Bootstrap: docker-daemon
From: slurm:slurmd

%files
    31-slurmd-configless /etc/docker-init.d/31-slurm-configless
    /container/volumes/munge/munge.key /etc/munge/munge.key

%post
    # Replace the init script shipped as 30-slurmd with the configless
    # variant copied in by %files.
    # NOTE(review): the munge key is copied into the image by %files, so the
    # resulting .sif contains the key; handle the image file accordingly.
    rm /etc/docker-init.d/30-slurmd
    chmod 755 /etc/docker-init.d/31-slurm-configless

%startscript
    # Used by `singularity instance start`.
    # Arguments: $1 = slurmctld host, $2 = node name; both are required.
    # Two separate tests joined with || are used instead of `-o`, which POSIX
    # marks obsolescent and leaves unspecified for five-argument tests.
    if [ -z "${1}" ] || [ -z "${2}" ] ; then
        echo "undefined variables slurmctld or nodename"
        exit 1
    fi
    # Exported so the init scripts started below can read them.
    export slurmctld="${1}"
    export nodename="${2}"
    echo ${slurmctld} ${nodename} ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
    exec /usr/local/sbin/entrypoint.sh /usr/local/sbin/docker-init

%runscript
    # Used by `singularity run`; same contract as %startscript:
    # $1 = slurmctld host, $2 = node name, both required.
    if [ -z "${1}" ] || [ -z "${2}" ] ; then
        echo "undefined variables slurmctld or nodename"
        exit 1
    fi
    export slurmctld="${1}"
    export nodename="${2}"
    echo ${slurmctld} ${nodename} ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
    exec /usr/local/sbin/entrypoint.sh /usr/local/sbin/docker-init
|
25
roles/cobald/files/start-drone
Normal file
25
roles/cobald/files/start-drone
Normal file
@@ -0,0 +1,25 @@
|
||||
#!/bin/sh
#SBATCH -D /shared
# Batch script that runs a slurmd drone inside a Singularity instance.
#
# Starts the instance "slurm-drone" from /shared/slurmd.sif, keeps the job
# alive for 60 * SLURM_Walltime seconds (SLURM_Walltime is treated as
# minutes), then stops the instance and sets the drone node to State=FUTURE.
#
# Environment read: SLURM_Walltime.

# Debug output: exported environment and the script arguments.
export
echo "$@"

# Node name is "drone" followed by the part of the hostname that starts at
# its first run of digits.
nodename=$(hostname | awk '{ print "drone" substr($1,match($1, "([[:digit:]]+)")) }')

# Stop the container, hand the node back to Slurm as FUTURE and exit 0.
# Written in POSIX form: `function name()` is a bash extension and is a
# syntax error in other /bin/sh implementations.
handler_quit() {
    # Clear the EXIT trap first: this function is also called explicitly at
    # the end of the script, and its own `exit 0` would otherwise fire the
    # trap and run the whole shutdown a second time.
    trap - EXIT
    echo "shutdown container"
    singularity instance stop slurm-drone
    scontrol update NodeName="${nodename}" State=FUTURE
    exit 0
}

trap handler_quit EXIT

# set -x

echo "starting ${nodename}"
# Only sleep for the walltime when the instance actually started; on failure
# fall straight through to the shutdown handler.
if singularity instance start --writable-tmpfs /shared/slurmd.sif slurm-drone \
        slurm-ctl "${nodename}" ; then
    echo "container started, sleeping $(( 60 * ${SLURM_Walltime}))"
    sleep $(( 60 * ${SLURM_Walltime} ))
fi
handler_quit
|
@@ -15,6 +15,10 @@
|
||||
content: "{{ lookup('file', 'cobald-entrypoint.sh') }}"
|
||||
- dest: init-cobaldmodules.sh
|
||||
content: "{{ lookup('file', 'init-cobaldmodules.sh') }}"
|
||||
- dest: start-drone
|
||||
content: "{{ lookup('file', 'start-drone') }}"
|
||||
- dest: 28-sync-container-slurmd
|
||||
content: "{{ lookup('file', '28-sync-container-slurmd') }}"
|
||||
when: cobald_slurm | default(False)
|
||||
|
||||
- name: build generic cobald docker image
|
||||
@@ -136,6 +140,7 @@
|
||||
cobald_mounts:
|
||||
- "~{{unpriv_user}}/cobald:/etc/cobald"
|
||||
# - /container/volumes/cobald:/etc/cobald:ro
|
||||
- "/container/docker-images/sing-slurmd/build/:/slurm-singimage/:ro"
|
||||
- "~{{unpriv_user}}/cobald/modules:/usr/local/src/cobaldmodules"
|
||||
- "~{{unpriv_user}}/cobald-src:/usr/local/src/cobald:ro"
|
||||
- "~{{unpriv_user}}/tardis-src:/usr/local/src/tardis:ro"
|
||||
|
@@ -1,8 +1,47 @@
|
||||
- name: "setup singularity"
|
||||
- name: setup singularity
|
||||
import_role: name="singularity"
|
||||
tags: singularity
|
||||
|
||||
#- name: copy slurm def file
|
||||
# - name: export docker image
|
||||
#- name: build container
|
||||
- name: make singularity image build directory
|
||||
file:
|
||||
state: directory
|
||||
path: "{{item}}"
|
||||
owner: "{{unpriv_user}}"
|
||||
group: "docker"
|
||||
mode: "0755"
|
||||
loop:
|
||||
- /container/docker-images/sing-slurmd
|
||||
- /container/docker-images/sing-slurmd/cache
|
||||
- /container/docker-images/sing-slurmd/build
|
||||
|
||||
- name: copy slurm singularity container files
|
||||
copy:
|
||||
src: "{{item}}"
|
||||
dest: "/container/docker-images/sing-slurmd/{{item}}"
|
||||
owner: "{{unpriv_user}}"
|
||||
group: "docker"
|
||||
loop:
|
||||
- slurm-slurmd.def
|
||||
- 31-slurmd-configless
|
||||
register: cobald_copy_sing_files
|
||||
|
||||
- name: remove old container
|
||||
file:
|
||||
path: /container/docker-images/sing-slurmd/build/slurmd.sif
|
||||
state: absent
|
||||
when: cobald_copy_sing_files.changed
|
||||
|
||||
- name: build container
|
||||
shell:
|
||||
chdir: /container/docker-images/sing-slurmd/
|
||||
cmd: SINGULARITY_TMPDIR=/container/docker-images/sing-slurmd/cache
|
||||
singularity build --disable-cache
|
||||
/container/docker-images/sing-slurmd/build/slurmd.sif
|
||||
/container/docker-images/sing-slurmd/slurm-slurmd.def
|
||||
creates: /container/docker-images/sing-slurmd/build/slurmd.sif
|
||||
register: cobald_sing_build
|
||||
|
||||
- debug: msg="{{[cobald_sing_build.stdout, cobald_sing_build.stderr]}}"
|
||||
tags: [ never, debug ]
|
||||
|
||||
# TODO: trigger copy in cobald container when slurmd.sif rebuilt
|
||||
|
@@ -48,6 +48,10 @@ COPY init-cobaldmodules.sh /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
|
||||
|
||||
RUN chmod 755 /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
|
||||
|
||||
COPY start-drone /usr/local/bin/start-drone
|
||||
COPY 28-sync-container-slurmd /etc/docker-init.d/28-sync-container-slurmd
|
||||
RUN chmod 755 /usr/local/bin/start-drone /etc/docker-init.d/28-sync-container-slurmd
|
||||
|
||||
RUN echo -e "#!/bin/sh\npython3 -m cobald.daemon /etc/cobald/config.yaml" >> /etc/docker-init.d/70-cobald && chmod 755 /etc/docker-init.d/70-cobald
|
||||
|
||||
{% if cobald_docker_default_command | default(True) -%}
|
||||
|
@@ -10,6 +10,7 @@ AuthType=auth/munge
|
||||
#CheckpointType=checkpoint/none
|
||||
CryptoType=crypto/munge
|
||||
CommunicationParameters=NoAddrCache
|
||||
SlurmctldParameters=enable_configless
|
||||
#DisableRootJobs=NO
|
||||
#EnforcePartLimits=NO
|
||||
#Epilog=
|
||||
|
Reference in New Issue
Block a user