Compare commits

..

4 Commits

14 changed files with 156 additions and 6 deletions

View File

@@ -57,4 +57,4 @@
container_name: cobald container_name: cobald
# docker_network: slurm # overriden by vars/slurm.yml # docker_network: slurm # overriden by vars/slurm.yml
when: '"cobald" in group_names' when: '"cobald" in group_names'
tags: cobald, influxdb tags: cobald, influxdb, singularity

View File

@@ -20,7 +20,7 @@
tags: htcondor tags: htcondor
- name: "setup singularity" - name: "setup singularity"
import_tasks: "singularity.yml" import_role: name="singularity"
tags: singularity tags: singularity
- name: "setup docker" - name: "setup docker"

View File

@@ -0,0 +1,3 @@
#!/bin/sh
# Sync the slurmd singularity image into the shared volume: copy from the
# (read-only) build mount when the shared copy is missing or out of date.
# NOTE(review): assumes both paths are bind-mounted into this container.
#
# Fix: POSIX leaves `[ a -nt b ]` unspecified when b does not exist (bash
# treats it as true, other /bin/sh implementations may not) — check for a
# missing destination explicitly so the first copy always happens.
if [ ! -e /shared/slurmd.sif ] || [ /slurm-singimage/slurmd.sif -nt /shared/slurmd.sif ]; then
    cp /slurm-singimage/slurmd.sif /shared/slurmd.sif
fi

View File

@@ -0,0 +1,3 @@
#!/bin/sh
# Start slurmd in configless mode (fetches slurm.conf from ${slurmctld})
# registering under ${nodename}; both are exported by the caller.
# slurmd's own stdout/stderr are discarded — its output goes to the log
# files followed below.
slurmd --conf-server ${slurmctld} -D -N ${nodename} 2>/dev/null 1>/dev/null &
# Follow the logs in the foreground: --retry waits for the files to appear,
# --pid $! makes tail exit when slurmd terminates, so this script lives
# exactly as long as slurmd does (suitable as a container init step).
tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}

View File

@@ -0,0 +1,30 @@
# Singularity definition: build a configless-slurmd image on top of the
# locally built docker image slurm:slurmd (taken from the docker daemon,
# not a registry).
Bootstrap: docker-daemon
From: slurm:slurmd
%files
    # NOTE(review): destination drops the "d" (31-slurm-configless vs the
    # 31-slurmd-configless source file) — consistent with %post below, but
    # confirm the rename is intentional.
    31-slurmd-configless /etc/docker-init.d/31-slurm-configless
    /container/volumes/munge/munge.key /etc/munge/munge.key
%post
    # replace the static slurmd init script with the configless variant
    rm /etc/docker-init.d/30-slurmd
    chmod 755 /etc/docker-init.d/31-slurm-configless
%startscript
    # args: $1 = slurmctld host to fetch config from, $2 = node name
    if [ -z "${1}" -o -z "${2}" ] ; then
    echo "undefined variables slurmctld or nodename"
    exit 1
    fi
    export slurmctld="${1}"
    export nodename="${2}"
    echo ${slurmctld} ${nodename} ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
    exec /usr/local/sbin/entrypoint.sh /usr/local/sbin/docker-init
%runscript
    # identical to %startscript so `singularity run` behaves the same as
    # starting an instance
    if [ -z "${1}" -o -z "${2}" ] ; then
    echo "undefined variables slurmctld or nodename"
    exit 1
    fi
    export slurmctld="${1}"
    export nodename="${2}"
    echo ${slurmctld} ${nodename} ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
    exec /usr/local/sbin/entrypoint.sh /usr/local/sbin/docker-init

View File

@@ -0,0 +1,25 @@
#!/bin/sh
#SBATCH -D /shared
# Batch-job wrapper (submitted by cobald/tardis): runs a slurmd "drone" as
# a singularity instance for the duration of the allocation, then returns
# the drone node to FUTURE state.
export
echo "$@"
# Derive the drone node name from the numeric suffix of this exec host's
# hostname (e.g. slurm-exec3 -> drone3).
nodename=$(hostname | awk '{ print "drone" substr($1,match($1, "([[:digit:]]+)")) }')

# Stop the container and mark the node FUTURE again.
# Fix: `function name(){` is a bashism — a syntax error under #!/bin/sh
# (dash); use POSIX function syntax.
handler_quit(){
    # disarm the EXIT trap so the explicit call at the bottom does not run
    # this handler a second time via its own `exit`
    trap - EXIT
    echo "shutdown container"
    singularity instance stop slurm-drone
    scontrol update NodeName=${nodename} State=FUTURE
    exit 0
}
trap handler_quit EXIT
# set -x
echo "starting ${nodename}"
singularity instance start --writable-tmpfs /shared/slurmd.sif slurm-drone \
    slurm-ctl ${nodename}
if [ $? -eq 0 ] ; then
    # NOTE(review): SLURM_Walltime is not a standard slurm variable —
    # presumably exported by the submitter, in minutes; confirm.
    echo "container started, sleeping $(( 60 * ${SLURM_Walltime}))"
    sleep $(( 60 * ${SLURM_Walltime} ))
fi
handler_quit

View File

@@ -15,6 +15,10 @@
content: "{{ lookup('file', 'cobald-entrypoint.sh') }}" content: "{{ lookup('file', 'cobald-entrypoint.sh') }}"
- dest: init-cobaldmodules.sh - dest: init-cobaldmodules.sh
content: "{{ lookup('file', 'init-cobaldmodules.sh') }}" content: "{{ lookup('file', 'init-cobaldmodules.sh') }}"
- dest: start-drone
content: "{{ lookup('file', 'start-drone') }}"
- dest: 28-sync-container-slurmd
content: "{{ lookup('file', '28-sync-container-slurmd') }}"
when: cobald_slurm | default(False) when: cobald_slurm | default(False)
- name: build generic cobald docker image - name: build generic cobald docker image
@@ -109,6 +113,13 @@
include_tasks: facts.yml include_tasks: facts.yml
when: cobald_container_hostname is not defined when: cobald_container_hostname is not defined
- name: build singularity container
include_tasks:
file: singularity.yml
apply:
tags: singularity
tags: singularity
- name: run cobald container - name: run cobald container
docker_container: docker_container:
name: "{{ container_name | default('cobald') }}" name: "{{ container_name | default('cobald') }}"
@@ -129,6 +140,7 @@
cobald_mounts: cobald_mounts:
- "~{{unpriv_user}}/cobald:/etc/cobald" - "~{{unpriv_user}}/cobald:/etc/cobald"
# - /container/volumes/cobald:/etc/cobald:ro # - /container/volumes/cobald:/etc/cobald:ro
- "/container/docker-images/sing-slurmd/build/:/slurm-singimage/:ro"
- "~{{unpriv_user}}/cobald/modules:/usr/local/src/cobaldmodules" - "~{{unpriv_user}}/cobald/modules:/usr/local/src/cobaldmodules"
- "~{{unpriv_user}}/cobald-src:/usr/local/src/cobald:ro" - "~{{unpriv_user}}/cobald-src:/usr/local/src/cobald:ro"
- "~{{unpriv_user}}/tardis-src:/usr/local/src/tardis:ro" - "~{{unpriv_user}}/tardis-src:/usr/local/src/tardis:ro"

View File

@@ -0,0 +1,47 @@
# Build the slurmd singularity image that cobald-started drones run.
# (Indentation reconstructed — the diff renderer stripped it; tokens are
# unchanged from the committed file.)
- name: setup singularity
  import_role: name="singularity"
  tags: singularity
- name: make singularity image build directory
  file:
    state: directory
    path: "{{item}}"
    owner: "{{unpriv_user}}"
    group: "docker"
    mode: "0755"
  loop:
    - /container/docker-images/sing-slurmd
    - /container/docker-images/sing-slurmd/cache
    - /container/docker-images/sing-slurmd/build
- name: copy slurm singularity container files
  copy:
    src: "{{item}}"
    dest: "/container/docker-images/sing-slurmd/{{item}}"
    owner: "{{unpriv_user}}"
    group: "docker"
  loop:
    - slurm-slurmd.def
    - 31-slurmd-configless
  # registered so a change to the inputs invalidates the built image below
  register: cobald_copy_sing_files
# Deleting the .sif defeats the `creates:` guard of the build task, forcing
# a rebuild whenever the definition or init script changed.
- name: remove old container
  file:
    path: /container/docker-images/sing-slurmd/build/slurmd.sif
    state: absent
  when: cobald_copy_sing_files.changed
- name: build container
  shell:
    chdir: /container/docker-images/sing-slurmd/
    # SINGULARITY_TMPDIR keeps build scratch off /tmp (often too small)
    cmd: SINGULARITY_TMPDIR=/container/docker-images/sing-slurmd/cache
      singularity build --disable-cache
      /container/docker-images/sing-slurmd/build/slurmd.sif
      /container/docker-images/sing-slurmd/slurm-slurmd.def
    # idempotence: skip when the image already exists (see removal above)
    creates: /container/docker-images/sing-slurmd/build/slurmd.sif
  register: cobald_sing_build
# only shown when explicitly requested via --tags debug
- debug: msg="{{[cobald_sing_build.stdout, cobald_sing_build.stderr]}}"
  tags: [ never, debug ]
# TODO: trigger copy in cobald container when slurmd.sif rebuilt

View File

@@ -48,6 +48,10 @@ COPY init-cobaldmodules.sh /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
RUN chmod 755 /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh RUN chmod 755 /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
COPY start-drone /usr/local/bin/start-drone
COPY 28-sync-container-slurmd /etc/docker-init.d/28-sync-container-slurmd
RUN chmod 755 /usr/local/bin/start-drone /etc/docker-init.d/28-sync-container-slurmd
RUN echo -e "#!/bin/sh\npython3 -m cobald.daemon /etc/cobald/config.yaml" >> /etc/docker-init.d/70-cobald && chmod 755 /etc/docker-init.d/70-cobald RUN echo -e "#!/bin/sh\npython3 -m cobald.daemon /etc/cobald/config.yaml" >> /etc/docker-init.d/70-cobald && chmod 755 /etc/docker-init.d/70-cobald
{% if cobald_docker_default_command | default(True) -%} {% if cobald_docker_default_command | default(True) -%}

View File

@@ -8,7 +8,7 @@
value: "15000" value: "15000"
sysctl_file: /etc/sysctl.d/90-max_net_namespaces.conf sysctl_file: /etc/sysctl.d/90-max_net_namespaces.conf
- name: "enable user thoto for fakeroot access" - name: "enable user {{unpriv_user}} for fakeroot access"
lineinfile: lineinfile:
line: "{{unpriv_user}}:4294836224:65536" line: "{{unpriv_user}}:4294836224:65536"
dest: "{{item}}" dest: "{{item}}"

View File

@@ -9,3 +9,6 @@ RUN chmod 755 /etc/docker-init.d/30-slurmd
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log" ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log" ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log" ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
RUN yum install -y singularity && \
yum clean all && rm -rf /var/cache/yum

View File

@@ -81,6 +81,14 @@
- name: "{{ inventory_hostname }}" - name: "{{ inventory_hostname }}"
addr: "{{ slurm_network_data.network.IPAM.Config[0].Gateway }}" addr: "{{ slurm_network_data.network.IPAM.Config[0].Gateway }}"
alloc_nodes: "{{ slurm_alloc_nodes_default + extra_nodes | default([])}}" alloc_nodes: "{{ slurm_alloc_nodes_default + extra_nodes | default([])}}"
partitions:
- name: cobald
nodeprefix: drone
num_nodes: 10
node_cores: 3
node_mem: 4900
port: 16818
initstate: FUTURE
notify: reconfigure slurm notify: reconfigure slurm
tags: slurm-config tags: slurm-config
@@ -105,6 +113,7 @@
ports: "{{ item.exposed_ports | default([]) }}" ports: "{{ item.exposed_ports | default([]) }}"
networks: networks:
- name: "{{ docker_network }}" - name: "{{ docker_network }}"
aliases: "{{ item.aliases | default(omit) }}"
env: env:
slurmuser: "{{ slurm_user }}" slurmuser: "{{ slurm_user }}"
image: "{{ item.image }}" image: "{{ item.image }}"

View File

@@ -9,6 +9,8 @@ ControlMachine=slurm-ctl
AuthType=auth/munge AuthType=auth/munge
#CheckpointType=checkpoint/none #CheckpointType=checkpoint/none
CryptoType=crypto/munge CryptoType=crypto/munge
CommunicationParameters=NoAddrCache
SlurmctldParameters=enable_configless
#DisableRootJobs=NO #DisableRootJobs=NO
#EnforcePartLimits=NO #EnforcePartLimits=NO
#Epilog= #Epilog=
@@ -166,13 +168,24 @@ SlurmSchedLogFile={{slurm_log_path_sched}}
NodeName=slurm-exec[1-{{ num_nodes }}] CPUs={{ slurm_exec_node_cores }} {{'' NodeName=slurm-exec[1-{{ num_nodes }}] CPUs={{ slurm_exec_node_cores }} {{''
}} RealMemory={{ slurm_exec_node_mem }} {{'' }} RealMemory={{ slurm_exec_node_mem }} {{''
}} CoresPerSocket={{ slurm_exec_node_cores }} State=UNKNOWN }} CoresPerSocket={{ slurm_exec_node_cores }} State=UNKNOWN
# RealMemory=5964 {% for p in partitions | default([]) %}
NodeName={{ p.nodeprefix }}[1-{{ p.num_nodes }}] CPUs={{ p.node_cores }} {{''
}} RealMemory={{ p.node_mem }} {{''
}} CoresPerSocket={{ p.node_cores }} {{''
}} {%- if p.port is defined %} Port={{ p.port}} {% endif %}{{''
}} State={{ p.initstate | default('UNKNOWN') }}
{% endfor %}
{% for i in alloc_nodes -%} {% for i in alloc_nodes -%}
NodeName={{i.name}} NodeName={{i.name}}
{%- if i.hostname is defined %} NodeHostname={{i.hostname}} {% endif %} {%- if i.hostname is defined %} NodeHostname={{i.hostname}} {% endif %}
{%- if i.addr is defined %} NodeAddr={{i.addr}} {% endif %} {%- if i.addr is defined %} NodeAddr={{i.addr}} {% endif %}
State=UNKNOWN State=UNKNOWN
{% endfor %} {% endfor %}
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] {{ '' PartitionName=container Nodes=slurm-exec[1-{{num_nodes}}] {{ ''
}} AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} {{ '' }} AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} {{ ''
}} Default=YES MaxTime=INFINITE State=UP }} Default=YES MaxTime=INFINITE State=UP
{% for p in partitions | default([]) %}
PartitionName={{ p.name }} Nodes={{ p.nodeprefix }}[1-{{ p.num_nodes }}] {{ ''
}} AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} {{ ''
}} MaxTime=INFINITE State=UP
{% endfor %}

View File

@@ -10,7 +10,8 @@ slurm_nodes_exec: | # extend range to execute nodes list
{% set slurm_nodes_exec = [] %} {% set slurm_nodes_exec = [] %}
{% for i in range(1, num_nodes+1) -%} {% for i in range(1, num_nodes+1) -%}
{% set _ = slurm_nodes_exec.extend([ {% set _ = slurm_nodes_exec.extend([
{'machine':'exec%s'|format(i), 'image': 'slurm:slurmd'}]) -%} {'machine':'exec%s'|format(i), 'image': 'slurm:slurmd',
'aliases':['drone%s'|format(i)]}]) -%}
{%- endfor %} {%- endfor %}
{{ slurm_nodes_exec }} {{ slurm_nodes_exec }}
slurm_default_mounts: slurm_default_mounts: