Compare commits
4 Commits
1a952a4e7a
...
8bc2f717e0
Author | SHA1 | Date | |
---|---|---|---|
8bc2f717e0
|
|||
d88761ca7d
|
|||
3be5025442
|
|||
4c4c4da79d
|
@@ -57,4 +57,4 @@
|
|||||||
container_name: cobald
|
container_name: cobald
|
||||||
# docker_network: slurm # overridden by vars/slurm.yml
|
# docker_network: slurm # overridden by vars/slurm.yml
|
||||||
when: '"cobald" in group_names'
|
when: '"cobald" in group_names'
|
||||||
tags: cobald, influxdb
|
tags: cobald, influxdb, singularity
|
||||||
|
@@ -20,7 +20,7 @@
|
|||||||
tags: htcondor
|
tags: htcondor
|
||||||
|
|
||||||
- name: "setup singularity"
|
- name: "setup singularity"
|
||||||
import_tasks: "singularity.yml"
|
import_role: name="singularity"
|
||||||
tags: singularity
|
tags: singularity
|
||||||
|
|
||||||
- name: "setup docker"
|
- name: "setup docker"
|
||||||
|
3
roles/cobald/files/28-sync-container-slurmd
Normal file
3
roles/cobald/files/28-sync-container-slurmd
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
[ /slurm-singimage/slurmd.sif -nt /shared/slurmd.sif ] && \
|
||||||
|
cp /slurm-singimage/slurmd.sif /shared/slurmd.sif
|
3
roles/cobald/files/31-slurmd-configless
Normal file
3
roles/cobald/files/31-slurmd-configless
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
slurmd --conf-server ${slurmctld} -D -N ${nodename} 2>/dev/null 1>/dev/null &
|
||||||
|
tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
|
30
roles/cobald/files/slurm-slurmd.def
Normal file
30
roles/cobald/files/slurm-slurmd.def
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
Bootstrap: docker-daemon
|
||||||
|
From: slurm:slurmd
|
||||||
|
|
||||||
|
%files
|
||||||
|
31-slurmd-configless /etc/docker-init.d/31-slurm-configless
|
||||||
|
/container/volumes/munge/munge.key /etc/munge/munge.key
|
||||||
|
|
||||||
|
%post
|
||||||
|
rm /etc/docker-init.d/30-slurmd
|
||||||
|
chmod 755 /etc/docker-init.d/31-slurm-configless
|
||||||
|
|
||||||
|
%startscript
|
||||||
|
if [ -z "${1}" -o -z "${2}" ] ; then
|
||||||
|
echo "undefined variables slurmctld or nodename"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
export slurmctld="${1}"
|
||||||
|
export nodename="${2}"
|
||||||
|
echo ${slurmctld} ${nodename} ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
|
||||||
|
exec /usr/local/sbin/entrypoint.sh /usr/local/sbin/docker-init
|
||||||
|
|
||||||
|
%runscript
|
||||||
|
if [ -z "${1}" -o -z "${2}" ] ; then
|
||||||
|
echo "undefined variables slurmctld or nodename"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
export slurmctld="${1}"
|
||||||
|
export nodename="${2}"
|
||||||
|
echo ${slurmctld} ${nodename} ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
|
||||||
|
exec /usr/local/sbin/entrypoint.sh /usr/local/sbin/docker-init
|
25
roles/cobald/files/start-drone
Normal file
25
roles/cobald/files/start-drone
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
#SBATCH -D /shared
|
||||||
|
export
|
||||||
|
echo $@
|
||||||
|
nodename=$(hostname | awk '{ print "drone" substr($1,match($1, "([[:digit:]]+)")) }')
|
||||||
|
|
||||||
|
function handler_quit(){
|
||||||
|
echo "shutdown container"
|
||||||
|
singularity instance stop slurm-drone
|
||||||
|
scontrol update NodeName=${nodename} State=FUTURE
|
||||||
|
exit 0
|
||||||
|
}
|
||||||
|
|
||||||
|
trap handler_quit EXIT
|
||||||
|
|
||||||
|
# set -x
|
||||||
|
|
||||||
|
echo "starting ${nodename}"
|
||||||
|
singularity instance start --writable-tmpfs /shared/slurmd.sif slurm-drone \
|
||||||
|
slurm-ctl ${nodename}
|
||||||
|
if [ $? -eq 0 ] ; then
|
||||||
|
echo "container started, sleeping $(( 60 * ${SLURM_Walltime}))"
|
||||||
|
sleep $(( 60 * ${SLURM_Walltime} ))
|
||||||
|
fi
|
||||||
|
handler_quit
|
@@ -15,6 +15,10 @@
|
|||||||
content: "{{ lookup('file', 'cobald-entrypoint.sh') }}"
|
content: "{{ lookup('file', 'cobald-entrypoint.sh') }}"
|
||||||
- dest: init-cobaldmodules.sh
|
- dest: init-cobaldmodules.sh
|
||||||
content: "{{ lookup('file', 'init-cobaldmodules.sh') }}"
|
content: "{{ lookup('file', 'init-cobaldmodules.sh') }}"
|
||||||
|
- dest: start-drone
|
||||||
|
content: "{{ lookup('file', 'start-drone') }}"
|
||||||
|
- dest: 28-sync-container-slurmd
|
||||||
|
content: "{{ lookup('file', '28-sync-container-slurmd') }}"
|
||||||
when: cobald_slurm | default(False)
|
when: cobald_slurm | default(False)
|
||||||
|
|
||||||
- name: build generic cobald docker image
|
- name: build generic cobald docker image
|
||||||
@@ -109,6 +113,13 @@
|
|||||||
include_tasks: facts.yml
|
include_tasks: facts.yml
|
||||||
when: cobald_container_hostname is not defined
|
when: cobald_container_hostname is not defined
|
||||||
|
|
||||||
|
- name: build singularity container
|
||||||
|
include_tasks:
|
||||||
|
file: singularity.yml
|
||||||
|
apply:
|
||||||
|
tags: singularity
|
||||||
|
tags: singularity
|
||||||
|
|
||||||
- name: run cobald container
|
- name: run cobald container
|
||||||
docker_container:
|
docker_container:
|
||||||
name: "{{ container_name | default('cobald') }}"
|
name: "{{ container_name | default('cobald') }}"
|
||||||
@@ -129,6 +140,7 @@
|
|||||||
cobald_mounts:
|
cobald_mounts:
|
||||||
- "~{{unpriv_user}}/cobald:/etc/cobald"
|
- "~{{unpriv_user}}/cobald:/etc/cobald"
|
||||||
# - /container/volumes/cobald:/etc/cobald:ro
|
# - /container/volumes/cobald:/etc/cobald:ro
|
||||||
|
- "/container/docker-images/sing-slurmd/build/:/slurm-singimage/:ro"
|
||||||
- "~{{unpriv_user}}/cobald/modules:/usr/local/src/cobaldmodules"
|
- "~{{unpriv_user}}/cobald/modules:/usr/local/src/cobaldmodules"
|
||||||
- "~{{unpriv_user}}/cobald-src:/usr/local/src/cobald:ro"
|
- "~{{unpriv_user}}/cobald-src:/usr/local/src/cobald:ro"
|
||||||
- "~{{unpriv_user}}/tardis-src:/usr/local/src/tardis:ro"
|
- "~{{unpriv_user}}/tardis-src:/usr/local/src/tardis:ro"
|
||||||
|
47
roles/cobald/tasks/singularity.yml
Normal file
47
roles/cobald/tasks/singularity.yml
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
- name: setup singularity
|
||||||
|
import_role: name="singularity"
|
||||||
|
tags: singularity
|
||||||
|
|
||||||
|
- name: make singularity image build directory
|
||||||
|
file:
|
||||||
|
state: directory
|
||||||
|
path: "{{item}}"
|
||||||
|
owner: "{{unpriv_user}}"
|
||||||
|
group: "docker"
|
||||||
|
mode: "0755"
|
||||||
|
loop:
|
||||||
|
- /container/docker-images/sing-slurmd
|
||||||
|
- /container/docker-images/sing-slurmd/cache
|
||||||
|
- /container/docker-images/sing-slurmd/build
|
||||||
|
|
||||||
|
- name: copy slurm singularity container files
|
||||||
|
copy:
|
||||||
|
src: "{{item}}"
|
||||||
|
dest: "/container/docker-images/sing-slurmd/{{item}}"
|
||||||
|
owner: "{{unpriv_user}}"
|
||||||
|
group: "docker"
|
||||||
|
loop:
|
||||||
|
- slurm-slurmd.def
|
||||||
|
- 31-slurmd-configless
|
||||||
|
register: cobald_copy_sing_files
|
||||||
|
|
||||||
|
- name: remove old container
|
||||||
|
file:
|
||||||
|
path: /container/docker-images/sing-slurmd/build/slurmd.sif
|
||||||
|
state: absent
|
||||||
|
when: cobald_copy_sing_files.changed
|
||||||
|
|
||||||
|
- name: build container
|
||||||
|
shell:
|
||||||
|
chdir: /container/docker-images/sing-slurmd/
|
||||||
|
cmd: SINGULARITY_TMPDIR=/container/docker-images/sing-slurmd/cache
|
||||||
|
singularity build --disable-cache
|
||||||
|
/container/docker-images/sing-slurmd/build/slurmd.sif
|
||||||
|
/container/docker-images/sing-slurmd/slurm-slurmd.def
|
||||||
|
creates: /container/docker-images/sing-slurmd/build/slurmd.sif
|
||||||
|
register: cobald_sing_build
|
||||||
|
|
||||||
|
- debug: msg="{{[cobald_sing_build.stdout, cobald_sing_build.stderr]}}"
|
||||||
|
tags: [ never, debug ]
|
||||||
|
|
||||||
|
# TODO: trigger copy in cobald container when slurmd.sif rebuilt
|
@@ -48,6 +48,10 @@ COPY init-cobaldmodules.sh /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
|
|||||||
|
|
||||||
RUN chmod 755 /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
|
RUN chmod 755 /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
|
||||||
|
|
||||||
|
COPY start-drone /usr/local/bin/start-drone
|
||||||
|
COPY 28-sync-container-slurmd /etc/docker-init.d/28-sync-container-slurmd
|
||||||
|
RUN chmod 755 /usr/local/bin/start-drone /etc/docker-init.d/28-sync-container-slurmd
|
||||||
|
|
||||||
RUN echo -e "#!/bin/sh\npython3 -m cobald.daemon /etc/cobald/config.yaml" >> /etc/docker-init.d/70-cobald && chmod 755 /etc/docker-init.d/70-cobald
|
RUN echo -e "#!/bin/sh\npython3 -m cobald.daemon /etc/cobald/config.yaml" >> /etc/docker-init.d/70-cobald && chmod 755 /etc/docker-init.d/70-cobald
|
||||||
|
|
||||||
{% if cobald_docker_default_command | default(True) -%}
|
{% if cobald_docker_default_command | default(True) -%}
|
||||||
|
@@ -8,7 +8,7 @@
|
|||||||
value: "15000"
|
value: "15000"
|
||||||
sysctl_file: /etc/sysctl.d/90-max_net_namespaces.conf
|
sysctl_file: /etc/sysctl.d/90-max_net_namespaces.conf
|
||||||
|
|
||||||
- name: "enable user thoto for fakeroot access"
|
- name: "enable user {{unpriv_user}} for fakeroot access"
|
||||||
lineinfile:
|
lineinfile:
|
||||||
line: "{{unpriv_user}}:4294836224:65536"
|
line: "{{unpriv_user}}:4294836224:65536"
|
||||||
dest: "{{item}}"
|
dest: "{{item}}"
|
@@ -9,3 +9,6 @@ RUN chmod 755 /etc/docker-init.d/30-slurmd
|
|||||||
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
|
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
|
||||||
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
|
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
|
||||||
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
|
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
|
||||||
|
|
||||||
|
RUN yum install -y singularity && \
|
||||||
|
yum clean all && rm -rf /var/cache/yum
|
||||||
|
@@ -81,6 +81,14 @@
|
|||||||
- name: "{{ inventory_hostname }}"
|
- name: "{{ inventory_hostname }}"
|
||||||
addr: "{{ slurm_network_data.network.IPAM.Config[0].Gateway }}"
|
addr: "{{ slurm_network_data.network.IPAM.Config[0].Gateway }}"
|
||||||
alloc_nodes: "{{ slurm_alloc_nodes_default + extra_nodes | default([])}}"
|
alloc_nodes: "{{ slurm_alloc_nodes_default + extra_nodes | default([])}}"
|
||||||
|
partitions:
|
||||||
|
- name: cobald
|
||||||
|
nodeprefix: drone
|
||||||
|
num_nodes: 10
|
||||||
|
node_cores: 3
|
||||||
|
node_mem: 4900
|
||||||
|
port: 16818
|
||||||
|
initstate: FUTURE
|
||||||
notify: reconfigure slurm
|
notify: reconfigure slurm
|
||||||
tags: slurm-config
|
tags: slurm-config
|
||||||
|
|
||||||
@@ -105,6 +113,7 @@
|
|||||||
ports: "{{ item.exposed_ports | default([]) }}"
|
ports: "{{ item.exposed_ports | default([]) }}"
|
||||||
networks:
|
networks:
|
||||||
- name: "{{ docker_network }}"
|
- name: "{{ docker_network }}"
|
||||||
|
aliases: "{{ item.aliases | default(omit) }}"
|
||||||
env:
|
env:
|
||||||
slurmuser: "{{ slurm_user }}"
|
slurmuser: "{{ slurm_user }}"
|
||||||
image: "{{ item.image }}"
|
image: "{{ item.image }}"
|
||||||
|
@@ -9,6 +9,8 @@ ControlMachine=slurm-ctl
|
|||||||
AuthType=auth/munge
|
AuthType=auth/munge
|
||||||
#CheckpointType=checkpoint/none
|
#CheckpointType=checkpoint/none
|
||||||
CryptoType=crypto/munge
|
CryptoType=crypto/munge
|
||||||
|
CommunicationParameters=NoAddrCache
|
||||||
|
SlurmctldParameters=enable_configless
|
||||||
#DisableRootJobs=NO
|
#DisableRootJobs=NO
|
||||||
#EnforcePartLimits=NO
|
#EnforcePartLimits=NO
|
||||||
#Epilog=
|
#Epilog=
|
||||||
@@ -166,13 +168,24 @@ SlurmSchedLogFile={{slurm_log_path_sched}}
|
|||||||
NodeName=slurm-exec[1-{{ num_nodes }}] CPUs={{ slurm_exec_node_cores }} {{''
|
NodeName=slurm-exec[1-{{ num_nodes }}] CPUs={{ slurm_exec_node_cores }} {{''
|
||||||
}} RealMemory={{ slurm_exec_node_mem }} {{''
|
}} RealMemory={{ slurm_exec_node_mem }} {{''
|
||||||
}} CoresPerSocket={{ slurm_exec_node_cores }} State=UNKNOWN
|
}} CoresPerSocket={{ slurm_exec_node_cores }} State=UNKNOWN
|
||||||
# RealMemory=5964
|
{% for p in partitions | default([]) %}
|
||||||
|
NodeName={{ p.nodeprefix }}[1-{{ p.num_nodes }}] CPUs={{ p.node_cores }} {{''
|
||||||
|
}} RealMemory={{ p.node_mem }} {{''
|
||||||
|
}} CoresPerSocket={{ p.node_cores }} {{''
|
||||||
|
}} {%- if p.port is defined %} Port={{ p.port}} {% endif %}{{''
|
||||||
|
}} State={{ p.initstate | default('UNKNOWN') }}
|
||||||
|
{% endfor %}
|
||||||
{% for i in alloc_nodes -%}
|
{% for i in alloc_nodes -%}
|
||||||
NodeName={{i.name}}
|
NodeName={{i.name}}
|
||||||
{%- if i.hostname is defined %} NodeHostname={{i.hostname}} {% endif %}
|
{%- if i.hostname is defined %} NodeHostname={{i.hostname}} {% endif %}
|
||||||
{%- if i.addr is defined %} NodeAddr={{i.addr}} {% endif %}
|
{%- if i.addr is defined %} NodeAddr={{i.addr}} {% endif %}
|
||||||
State=UNKNOWN
|
State=UNKNOWN
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] {{ ''
|
PartitionName=container Nodes=slurm-exec[1-{{num_nodes}}] {{ ''
|
||||||
}} AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} {{ ''
|
}} AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} {{ ''
|
||||||
}} Default=YES MaxTime=INFINITE State=UP
|
}} Default=YES MaxTime=INFINITE State=UP
|
||||||
|
{% for p in partitions | default([]) %}
|
||||||
|
PartitionName={{ p.name }} Nodes={{ p.nodeprefix }}[1-{{ p.num_nodes }}] {{ ''
|
||||||
|
}} AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} {{ ''
|
||||||
|
}} MaxTime=INFINITE State=UP
|
||||||
|
{% endfor %}
|
||||||
|
@@ -10,7 +10,8 @@ slurm_nodes_exec: | # extend range to execute nodes list
|
|||||||
{% set slurm_nodes_exec = [] %}
|
{% set slurm_nodes_exec = [] %}
|
||||||
{% for i in range(1, num_nodes+1) -%}
|
{% for i in range(1, num_nodes+1) -%}
|
||||||
{% set _ = slurm_nodes_exec.extend([
|
{% set _ = slurm_nodes_exec.extend([
|
||||||
{'machine':'exec%s'|format(i), 'image': 'slurm:slurmd'}]) -%}
|
{'machine':'exec%s'|format(i), 'image': 'slurm:slurmd',
|
||||||
|
'aliases':['drone%s'|format(i)]}]) -%}
|
||||||
{%- endfor %}
|
{%- endfor %}
|
||||||
{{ slurm_nodes_exec }}
|
{{ slurm_nodes_exec }}
|
||||||
slurm_default_mounts:
|
slurm_default_mounts:
|
||||||
|
Reference in New Issue
Block a user