Compare commits

...

5 Commits

26 changed files with 374 additions and 148 deletions

View File

@@ -19,6 +19,7 @@ all:
slurm: slurm:
hosts: hosts:
ed-c7-1: ed-c7-1:
ed-c7-2:
cobald: cobald:
hosts: hosts:
ed-c7-1: ed-c7-1:

View File

@@ -1,6 +1,8 @@
--- ---
- hosts: all - hosts: all
vars_files: ['vars-auth.yml'] vars_files:
- vars-auth.yml
- vars-influx.yml
tasks: tasks:
- name: "install epel repo" # for htop etc. - name: "install epel repo" # for htop etc.
yum: yum:
@@ -53,27 +55,34 @@
- htcondor-containered - htcondor-containered
- htcondor - htcondor
- hosts: slurm - hosts: slurm, cobald
vars: vars:
container_privileged: True container_privileged: True
num_nodes: 3 slurm_num_nodes: 10
roles: roles:
- name: "setup docker" - name: "setup docker"
role: docker role: docker
tags: docker tags: docker
- name: "get facts from existing cobald instance (i.e. hostname)"
role: cobald_facts
vars:
container_name: cobald
tags: [ slurm, cobald ]
- name: "setup slurm test environment in docker containers" - name: "setup slurm test environment in docker containers"
role: slurm role: slurm
vars: vars:
slurm_user: slurm # or root slurm_user: slurm # or root
num_nodes: "{{slurm_num_nodes}}"
extra_nodes:
- "{{cobald_container_hostname}}" # from cobald_facts, read or generated
docker_network: slurm
when: '"slurm" in group_names'
tags: slurm tags: slurm
- hosts: cobald
roles:
- name: "setup docker"
role: docker
tags: docker
- name: "install cobald" - name: "install cobald"
role: cobald role: cobald
vars: vars:
docker_network: slurm cobald_slurm: True
container_name: cobald
# docker_network: slurm # overriden by vars/slurm.yml
when: '"cobald" in group_names'
tags: cobald tags: cobald

View File

@@ -1,7 +1,7 @@
#!/bin/sh #!/bin/sh
[ -f /usr/local/lib/cobaldmodules/setup.py -a \ for i in /usr/local/lib/entrypoints.d/* ; do
-d /usr/local/lib/cobaldmodules/cobaldmodules ] && \ [ -f $i ] && /bin/sh $i || break
pip3 install --no-deps --editable /usr/local/lib/cobaldmodules done
exec "${@:-/bin/bash}" exec "${@:-/bin/bash}"

View File

@@ -16,7 +16,7 @@
"gnetId": null, "gnetId": null,
"graphTooltip": 0, "graphTooltip": 0,
"id": 1, "id": 1,
"iteration": 1623184036048, "iteration": 1623317629899,
"links": [], "links": [],
"panels": [ "panels": [
{ {
@@ -99,7 +99,7 @@
], ],
"orderByTime": "ASC", "orderByTime": "ASC",
"policy": "default", "policy": "default",
"query": "nodes = from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> keep(columns: [\"tardis_machine_name\", \"_time\"])\n |> sort(columns: [\"_time\"], desc: true)\n |> unique(column: \"tardis_machine_name\")\n// |> yield()\n\noffset = from(bucket: \"batleth\")\n |> range(start: 0, stop: v.timeRangeStart)\n |> filter(fn: (r) => r._field == \"state\")\n |> group(columns: [\"tardis_machine_name\", \"machine_type\"])\n |> reduce(fn: (r, accumulator) => ({\n _value: accumulator._value + (\n if r._value == \"AvailableState\" then 1 \n else if r._value == \"DownState\" then -1 \n else 0)\n }), identity: {_value: 0})\n |> duplicate(column: \"_stop\", as: \"_time\")\n\nnew = from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._field == \"state\")\n// |> filter(fn: (r) => r.tardis_machine_name == \"${machine}\")\n |> group(columns: [\"tardis_machine_name\", \"machine_type\"])\n |> window(every: $__interval)\n |> reduce(fn: (r, accumulator) => ({\n _value: accumulator._value + (\n if r._value == \"AvailableState\" then 1 \n else if r._value == \"DownState\" then -1 \n else 0)\n }), identity: {_value: 0})\n |> duplicate(column: \"_stop\", as: \"_time\")\n\nunion(tables: [offset, new])\n |> window(every: inf)\n |> cumulativeSum()\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> yield()", "query": "f_r = (r, accumulator) => ({\n _value: accumulator._value + (\n if r._value == \"AvailableState\" then 1\n else if r._value == \"DownState\" then -1\n else 0)\n })\n\nnodes = from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> keep(columns: [\"tardis_machine_name\", \"_time\"])\n |> sort(columns: [\"_time\"], desc: true)\n |> unique(column: \"tardis_machine_name\")\n// |> yield()\n\noffset = from(bucket: \"batleth\")\n |> range(start: 0, stop: v.timeRangeStart)\n |> filter(fn: (r) => r._field == \"state\")\n |> group(columns: 
[\"tardis_machine_name\", \"machine_type\"])\n |> reduce(fn: f_r, identity: {_value: 0})\n |> duplicate(column: \"_stop\", as: \"_time\")\n\nnew = from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._field == \"state\")\n// |> filter(fn: (r) => r.tardis_machine_name == \"${machine}\")\n |> group(columns: [\"tardis_machine_name\", \"machine_type\"])\n |> window(every: $__interval)\n |> reduce(fn: f_r, identity: {_value: 0})\n |> duplicate(column: \"_stop\", as: \"_time\")\n\nunion(tables: [offset, new])\n |> window(every: inf)\n |> cumulativeSum()\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> yield()\n",
"refId": "A", "refId": "A",
"resultFormat": "time_series", "resultFormat": "time_series",
"select": [ "select": [
@@ -550,10 +550,9 @@
{ {
"allValue": null, "allValue": null,
"current": { "current": {
"isNone": true,
"selected": false, "selected": false,
"text": "None", "text": "cobald-xvmcqc",
"value": "" "value": "cobald-xvmcqc"
}, },
"datasource": "InfluxDB", "datasource": "InfluxDB",
"definition": "from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> keep(columns: [\"tardis_machine_name\", \"_time\"])\n |> sort(columns: [\"_time\"], desc: true)\n |> unique(column: \"tardis_machine_name\")", "definition": "from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> keep(columns: [\"tardis_machine_name\", \"_time\"])\n |> sort(columns: [\"_time\"], desc: true)\n |> unique(column: \"tardis_machine_name\")",
@@ -586,5 +585,5 @@
"timezone": "", "timezone": "",
"title": "cobald", "title": "cobald",
"uid": "urDuvE6Gk", "uid": "urDuvE6Gk",
"version": 9 "version": 2
} }

View File

@@ -0,0 +1,5 @@
#!/bin/sh
# Entrypoint hook: install the cobaldmodules package in editable mode when
# its source tree is present under /usr/local/lib/cobaldmodules.
#
# Written as an if-statement so that a missing source tree leaves the exit
# status at 0. The entrypoint runs each hook as `/bin/sh $i || break`, so the
# non-zero status of a plain `[ ... ] && ...` chain would stop all later hooks
# from running. A failing pip3 still propagates its non-zero status.
if [ -f /usr/local/lib/cobaldmodules/setup.py ] && \
   [ -d /usr/local/lib/cobaldmodules/cobaldmodules ] ; then
    pip3 install --no-deps --editable /usr/local/lib/cobaldmodules
fi

View File

@@ -1,33 +1,41 @@
- include_vars: cobald-slurm.yml
when: cobald_slurm | default(False)
- file: - file:
path: "/container/{{item}}/cobald/" path: "/container/{{item.name}}/cobald{{item.pfx|default('')}}/"
state: directory state: directory
owner: "{{unpriv_user}}" owner: "{{unpriv_user}}"
group: docker group: docker
loop: loop:
- docker-images - name: docker-images
- volumes pfx: ".{{cobald_image_tag|default('latest')}}"
- name: volumes
- copy: - template:
src: cobald.Dockerfile src: cobald.Dockerfile
dest: /container/docker-images/cobald/Dockerfile dest: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/Dockerfile"
owner: "{{unpriv_user}}" owner: "{{unpriv_user}}"
group: docker group: docker
register: cobald_cp_dockerfile register: cobald_cp_dockerfile
- copy: - copy:
src: cobald-entrypoint.sh src: "{{item}}"
dest: /container/docker-images/cobald/cobald-entrypoint.sh dest: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/{{item}}"
owner: "{{unpriv_user}}" owner: "{{unpriv_user}}"
group: docker group: docker
mode: 0755 mode: 0755
with_items:
- cobald-entrypoint.sh
- init-cobaldmodules.sh
register: cobald_cp_files register: cobald_cp_files
- docker_image: - docker_image:
name: "cobald" name: "cobald"
tag: "{{cobald_image_tag|default('latest')}}"
# pull: False # pull: False
build: build:
pull: False pull: False
path: "/container/docker-images/cobald/" path: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/"
source: build source: build
force_source: "{{cobald_cp_dockerfile.changed or cobald_cp_files.changed}}" force_source: "{{cobald_cp_dockerfile.changed or cobald_cp_files.changed}}"
@@ -39,7 +47,7 @@
when: False when: False
- docker_network: - docker_network:
name: "{{docker_network}}" name: "{{cobald_docker_network}}" # FIXME
state: present state: present
# docker run -v $(pwd)/cobald-config-host:/etc/cobald -v $(pwd)/cobald:/cobald --rm -it cobald bash # docker run -v $(pwd)/cobald-config-host:/etc/cobald -v $(pwd)/cobald:/cobald --rm -it cobald bash
@@ -82,7 +90,7 @@
- name: run pip install - name: run pip install
docker_container: docker_container:
image: cobald image: "cobald:{{cobald_image_tag|default('latest')}}"
name: "cobald-src-{{item.name}}-install" name: "cobald-src-{{item.name}}-install"
volumes: volumes:
- "~{{unpriv_user}}/{{item.name}}-src:/usr/local/src/{{item.name}}:rw" - "~{{unpriv_user}}/{{item.name}}-src:/usr/local/src/{{item.name}}:rw"
@@ -102,29 +110,34 @@
- import_tasks: telegraf.yml - import_tasks: telegraf.yml
- docker_container_info: - name: get cobald hostname
name: cobald include_role:
register: cobald_container_info name: cobald_facts
when: cobald_container_hostname is not defined
- docker_container: - name: run cobald container
name: cobald docker_container:
image: cobald name: "{{ container_name | default('cobald') }}"
hostname: |- image: "cobald:{{cobald_image_tag|default('latest')}}"
{{cobald_container_info.container.Config.Hostname | default('cobald-'+ hostname: "{{cobald_container_hostname}}"
lookup('password', '/dev/null chars=ascii_lowercase length=6')) }} domainname: "{{ cobald_domainname | default('cobald.local')}}"
domainname: cobald.local volumes: "{{default_mounts + cobald_mounts }}"
volumes: networks:
- name: "{{cobald_docker_network}}"
networks_cli_compatible: True
# env:
# slurmuser: "{{slurm_user}}"
# privileged: "{{ container_privileged | bool }}"
state: started
detach: True
cleanup: True
interactive: True
# command: python3 -m cobald.daemon /etc/cobald/config.yaml
vars:
default_mounts: "{{cobald_slurm_mounts | default([])}}"
cobald_mounts:
- "~{{unpriv_user}}/cobald:/etc/cobald" - "~{{unpriv_user}}/cobald:/etc/cobald"
# - /container/volumes/cobald:/etc/cobald:ro # - /container/volumes/cobald:/etc/cobald:ro
- "~{{unpriv_user}}/cobald/modules:/usr/local/src/cobaldmodules" - "~{{unpriv_user}}/cobald/modules:/usr/local/src/cobaldmodules"
- "~{{unpriv_user}}/cobald-src:/usr/local/src/cobald:ro" - "~{{unpriv_user}}/cobald-src:/usr/local/src/cobald:ro"
- "~{{unpriv_user}}/tardis-src:/usr/local/src/tardis:ro" - "~{{unpriv_user}}/tardis-src:/usr/local/src/tardis:ro"
networks:
- name: "{{docker_network}}"
state: started
detach: True
cleanup: True
interactive: True
# command: /bin/bash
# python3 -m cobald.daemon /etc/cobald/config.yaml
command: python3 -m cobald.daemon /etc/cobald/config.yaml

View File

@@ -1,4 +1,4 @@
FROM docker.io/library/centos:7 FROM {{ cobald_docker_base_image | default("docker.io/library/centos:7") }}
RUN yum update -y && \ RUN yum update -y && \
yum install -y python3 git && pip3 install --upgrade pip && \ yum install -y python3 git && pip3 install --upgrade pip && \
@@ -41,11 +41,20 @@ VOLUME /usr/local/src/cobaldmodules
VOLUME /etc/cobald VOLUME /etc/cobald
COPY cobald-entrypoint.sh /usr/local/sbin/docker-entrypoint.sh RUN mkdir -p /usr/local/lib/entrypoints.d/
RUN chmod 755 /usr/local/sbin/docker-entrypoint.sh COPY init-cobaldmodules.sh /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
ENTRYPOINT [ "/usr/local/sbin/docker-entrypoint.sh" ] RUN chmod 755 /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
# Generate the start script that launches the cobald daemon.
# mkdir -p: /etc/docker-init.d is created by the slurm base image but is not
# guaranteed to exist on other base images (e.g. the default centos:7).
RUN mkdir -p /etc/docker-init.d && \
    printf '#!/bin/sh\npython3 -m cobald.daemon /etc/cobald/config.yaml\n' > /etc/docker-init.d/70-cobald && \
    chmod 755 /etc/docker-init.d/70-cobald
{% if cobald_docker_default_command | default(True) -%}
COPY cobald-entrypoint.sh /usr/local/sbin/cobald-entrypoint.sh
RUN chmod 755 /usr/local/sbin/cobald-entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/cobald-entrypoint.sh" ]
RUN yum -y install iproute &&\ RUN yum -y install iproute &&\
yum clean all && rm -rf /var/cache/yum yum clean all && rm -rf /var/cache/yum
@@ -54,4 +63,6 @@ USER cobald
STOPSIGNAL SIGINT STOPSIGNAL SIGINT
CMD "python3 -m cobald.daemon /etc/cobald/config.yaml" # CMD "python3 -m cobald.daemon /etc/cobald/config.yaml"
# Must name the start script generated earlier in this Dockerfile (70-cobald).
CMD /etc/docker-init.d/70-cobald
{%- endif %}

View File

@@ -0,0 +1,12 @@
# Variable overrides for running cobald inside the slurm test environment.
# All slurm.* values are read from the "slurm" fact.
# NOTE(review): presumably this is the file loaded by `include_vars: cobald-slurm.yml`
# when cobald_slurm is true, after the slurm role has set that fact - confirm.

# Image tag; also used as suffix of the build directory (cobald.<tag>).
cobald_image_tag: slurm
# Base image substituted into the FROM line of the Dockerfile template.
cobald_docker_base_image: "{{slurm.base_image}}"
# Disables the template section that adds cobald's own ENTRYPOINT and CMD.
cobald_docker_default_command: False
# Docker network the cobald container is attached to.
cobald_docker_network: "{{slurm.network}}"
# Domain name given to the cobald container.
cobald_domainname: "{{slurm.domain}}"
# Mounts placed in front of cobald's own mounts in the container's volume list.
cobald_slurm_mounts: "{{slurm.mounts}}"
#- /container/volumes/slurm/:/etc/slurm/:rw
##- "{{slurm_cfg_path | mandatory}}:/etc/slurm/:rw"
#- /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
## - "{{slurm_munge_path | mandatory}}:/etc/munge/munge.key:rw"
#- slurm-shared:/shared/:rw
## - "{{slurm_shared_path | mandatory}}:{{slurm_shared_target | default('/shared')}}:rw"

View File

@@ -0,0 +1 @@
# Docker network used by the cobald tasks (network creation and container
# attachment); taken from the generic docker_network variable here.
# NOTE(review): presumably a default that the slurm-specific vars override - confirm.
cobald_docker_network: "{{docker_network}}"

View File

@@ -0,0 +1,11 @@
# Determine the hostname of the cobald container and publish it as the fact
# cobald_container_hostname, unless that variable is already defined.
# NOTE(review): indentation was lost in this view; presumably the trailing
# `when` guards the whole block rather than only set_fact - confirm.
- block:
# Inspect the container named by container_name (mandatory variable).
- docker_container_info:
name: "{{ container_name | mandatory }}"
register: cobald_container_info
# Use the hostname reported for the container; if that value is undefined,
# fall back to "cobald-" followed by six generated lowercase ASCII letters.
- set_fact:
cobald_container_hostname: |-
{{cobald_container_info.container.Config.Hostname | default('cobald-'+
lookup('password', '/dev/null chars=ascii_lowercase length=6')) }}
when: cobald_container_hostname is not defined

View File

@@ -3,3 +3,6 @@ slurm_user: slurm
slurm_log_path_ctld: /var/log/slurm/slurmctld.log slurm_log_path_ctld: /var/log/slurm/slurmctld.log
slurm_log_path_d: /var/log/slurm/slurmd.log slurm_log_path_d: /var/log/slurm/slurmd.log
slurm_log_path_sched: /var/log/slurm/slurmsched.log slurm_log_path_sched: /var/log/slurm/slurmsched.log
slurm_prefix: slurm
slurm_domain: slurm.local
docker_network: slurm

View File

@@ -0,0 +1,18 @@
#!/bin/bash
# Minimal init for the container: start every script in /etc/docker-init.d/
# in the background, forward SIGINT/SIGTERM to them, and wait for all of them.

# PIDs of the start scripts launched below.
pids=()

# Signal handler: terminate the children of each start script, then the
# start script itself.
function trp_term(){
    local pid
    echo pkill -P "${pids[@]}"
    for pid in "${pids[@]}" ; do
        pkill -P "$pid"
        kill -SIGTERM "$pid"
    done
}
trap trp_term SIGINT SIGTERM

for i in /etc/docker-init.d/* ; do
    # Skip anything that is not a regular file (including the unexpanded
    # pattern of an empty directory) instead of dropping the remaining scripts.
    [ -f "$i" ] || continue
    "$i" &
    pids+=("$!")
done

# `wait` returns early (status > 128) when a trapped signal arrives. Wait once
# more in that case so this script does not exit before the start scripts have
# actually terminated.
wait "${pids[@]}"
rc=$?
if [ "$rc" -gt 128 ] ; then
    wait "${pids[@]}"
    rc=$?
fi
exit "$rc"
# TODO: call start scripts like "foo.sh start" and "foo.sh stop" to avoid pkill

View File

@@ -1,32 +1,11 @@
FROM docker.io/library/centos:7 as base FROM slurm:base
RUN yum install -y epel-release && \
yum install -y slurm && \
yum clean all && rm -rf /var/cache/yum
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
yum clean all && rm -rf /var/cache/yum
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
RUN chown root:root /usr/local/sbin/entrypoint.sh && \
chmod 755 /usr/local/sbin/entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
ARG slurmuser=slurm
ENV slurmuser=${slurmuser}
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
slurm-setuser -u $slurmuser -g $slurmuser -y
RUN yum install -y slurm-slurmctld && \ RUN yum install -y slurm-slurmctld && \
yum clean all && rm -rf /var/cache/yum yum clean all && rm -rf /var/cache/yum
COPY start-scripts/20-slurmctld /etc/docker-init.d/20-slurmctld
RUN chmod 755 /etc/docker-init.d/20-slurmctld
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log" ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log" ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log" ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
su -s /bin/sh -c "slurmctld -D" ${slurmuser} 2>/dev/null 1>/dev/null & \
tail --retry --pid $! -f ${SLURMCTLD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'

View File

@@ -1,32 +1,11 @@
FROM docker.io/library/centos:7 FROM slurm:base
RUN yum install -y epel-release && \
yum install -y slurm && \
yum clean all && rm -rf /var/cache/yum
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
yum clean all && rm -rf /var/cache/yum
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
RUN chown root:root /usr/local/sbin/entrypoint.sh && \
chmod 755 /usr/local/sbin/entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
ARG slurmuser=slurm
ENV slurmuser=${slurmuser}
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
slurm-setuser -u $slurmuser -g $slurmuser -y
RUN yum install -y slurm-slurmd && \ RUN yum install -y slurm-slurmd && \
yum clean all && rm -rf /var/cache/yum yum clean all && rm -rf /var/cache/yum
COPY start-scripts/30-slurmd /etc/docker-init.d/30-slurmd
RUN chmod 755 /etc/docker-init.d/30-slurmd
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log" ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log" ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log" ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
slurmd -D 2>/dev/null 1>/dev/null & \
tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'

View File

@@ -0,0 +1,4 @@
#!/bin/sh
# Run slurmctld in the foreground (-D) as $slurmuser, discard its
# stdout/stderr, and put the su process into the background.
su --shell /bin/sh --command "slurmctld -D" $slurmuser >/dev/null 2>&1 &
ctld_pid=$!
# Follow the controller and scheduler logs; --pid makes tail terminate once
# the backgrounded process is gone.
tail --retry --pid $ctld_pid --follow $SLURMCTLD_LOG_PATH $SLURM_SCHED_LOG_PATH

View File

@@ -0,0 +1,4 @@
#!/bin/sh
# Run slurmd in the foreground (-D), discard its stdout/stderr, and put it
# into the background.
slurmd -D >/dev/null 2>&1 &
slurmd_pid=$!
# Follow the node daemon and scheduler logs; --pid makes tail terminate once
# the backgrounded process is gone.
tail --retry --pid $slurmd_pid --follow $SLURMD_LOG_PATH $SLURM_SCHED_LOG_PATH

View File

@@ -1,3 +1,3 @@
- name: reconfigure slurm - name: reconfigure slurm
command: shell:
cmd: docker container exec -it slurm-ctl scontrol reconfigure cmd: "docker container exec -it {{slurm_prefix}}-ctl scontrol reconfigure || docker container restart {{slurm_prefix}}-ctl && docker container exec -it {{slurm_prefix}}-ctl scontrol reconfigure"

View File

@@ -1,31 +1,50 @@
- file: - file:
path: "/container/docker-images/{{item}}" path: "/container/docker-images/slurm"
state: directory state: directory
owner: "{{unpriv_user}}" owner: "{{unpriv_user}}"
group: docker group: docker
- copy: - copy: # FIXME: template
src: "{{item}}.Dockerfile" src: "{{image.name}}.Dockerfile"
dest: "/container/docker-images/{{item}}/Dockerfile" dest: "/container/docker-images/slurm/{{image.name}}.Dockerfile"
owner: "{{unpriv_user}}" owner: "{{unpriv_user}}"
group: docker group: docker
register: slurm_cp_dockerfile register: slurm_cp_dockerfile
- copy: - name: copy entrypoint and docker-init
src: "entrypoint.sh" copy: # FIXME: swap out
dest: "/container/docker-images/{{item}}/entrypoint.sh" src: "{{item}}"
dest: "/container/docker-images/slurm/{{item}}"
owner: root owner: root
group: root group: root
mode: u=rwx,g=rx,o=rx mode: u=rwx,g=rx,o=rx
loop:
- entrypoint.sh
- docker-init
register: slurm_cp_entrypt register: slurm_cp_entrypt
- name: copy startup scripts
copy:
src: "start-scripts/"
dest: "/container/docker-images/slurm/start-scripts/"
owner: root
group: root
mode: u=rwx,g=rx,o=rx
register: slurm_cp_stscrs
- set_fact:
slurm_image_files_changed: "{{ (slurm_image_files_changed | default(False))
or slurm_cp_entrypt.changed or slurm_cp_stscrs.changed }}"
- docker_image: - docker_image:
name: "slurm-{{item}}" name: "slurm"
tag: "{{image.name}}"
# pull: False # pull: False
build: build:
pull: False pull: False
path: "/container/docker-images/{{item}}" path: "/container/docker-images/slurm/"
# target: "{{item}}" # unsupported on old docker-py versions as in el7 dockerfile: "{{image.name}}.Dockerfile"
# target: "{{image.name}}" # unsupported on old docker-py version as in el7
source: build source: build
force_source: "{{slurm_cp_dockerfile.changed or slurm_cp_entrypt.changed}}" force_source: "{{slurm_cp_dockerfile.changed or slurm_image_files_changed}}"

View File

@@ -3,10 +3,22 @@
name: [ slurm, slurm-doc ] name: [ slurm, slurm-doc ]
state: present state: present
- include_tasks: dockerimage.yml - include_role:
loop: name: slurm_dockerimage
- slurmctld loop: # FIXME: default(omit)!
- slurmd - name: slurmctld
dockerfile: "{{ lookup('file', 'slurmctld.Dockerfile') }}"
files:
- dest: start-scripts/20-slurmctld
content: "{{ lookup('file', 'start-scripts/20-slurmctld') }}"
- name: slurmd
dockerfile: "{{ lookup('file', 'slurmd.Dockerfile') }}"
files:
- dest: start-scripts/30-slurmd
content: "{{ lookup('file', 'start-scripts/30-slurmd') }}"
loop_control:
loop_var: image
label: "{{ image.name }}"
- name: generate munge key - name: generate munge key
shell: shell:
@@ -47,12 +59,14 @@
loop: loop:
- slurm.conf - slurm.conf
- cgroup.conf - cgroup.conf
vars:
alloc_nodes: "{{ [ slurm_prefix+'-submit1' ] + extra_nodes | default([])}}"
notify: reconfigure slurm notify: reconfigure slurm
tags: [ slurm-config ] tags: [ slurm-config ]
- name: "create docker network to make service discovery work" - name: "create docker network to make service discovery work"
docker_network: docker_network:
name: slurm name: "{{ docker_network }}"
state: present state: present
- name: "create docker volume for shared access between nodes" - name: "create docker volume for shared access between nodes"
@@ -61,45 +75,62 @@
state: present state: present
- set_fact: - set_fact:
slurm_nodes: # default nodes: controller and submit machine slurm_nodes_std: # default nodes: controller and submit machine
- machine: ctl - machine: ctl
image: slurm-slurmctld image: slurm:slurmctld
exposed_ports: [ "6817:6817/tcp" ]
- machine: submit1 - machine: submit1
image: slurm-slurmd image: slurm:slurmd
extra_mounts: extra_mounts:
- "/home/{{unpriv_user}}/job3/:/mnt/:rw" - "/home/{{unpriv_user}}/job3/:/mnt/:rw"
slurm_nodes_exec: | # extend range to execute nodes list
{% set slurm_nodes_exec = slurm_nodes_exec | default([]) %}
{% for i in range(1, num_nodes+1) -%}
{% set _ = slurm_nodes_exec.extend([
{'machine':'exec%s'|format(i), 'image': 'slurm:slurmd'}]) -%}
{%- endfor %}
{{ slurm_nodes_exec }}
slurm_default_mounts:
- /container/volumes/slurm/:/etc/slurm/:rw
- /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
- slurm-shared:/shared/:rw
slurm_network: "{{docker_network}}"
tags: [ slurm-config ] tags: [ slurm-config ]
# TODO: reserve some address using docker_network_info and assign as aux # TODO: reserve some address using docker_network_info and assign as aux
# address to enable slurmctld to get a static address in order to be # address to enable slurmctld to get a static address in order to be
# reachable from slurm running on docker host to enable submitting jobs. # reachable from slurm running on docker host to enable submitting jobs.
- name: run slurm docker containers - name: run slurm docker containers
docker_container: docker_container:
name: "slurm-{{item.machine}}" name: "{{ slurm_prefix }}-{{ item.machine }}"
hostname: "slurm-{{item.machine}}" hostname: "{{ slurm_prefix }}-{{ item.machine }}"
domainname: "slurm.local" domainname: "{{ slurm_domain }}"
volumes: "{{default_mounts + ( item.extra_mounts | default([]) ) }}" volumes: "{{ slurm_default_mounts + ( item.extra_mounts | default([]) ) }}"
ports: "{{ item.exposed_ports | default([]) }}"
networks: networks:
- name: "slurm" - name: "{{ slurm_network }}"
env: env:
slurmuser: "{{slurm_user}}" slurmuser: "{{ slurm_user }}"
image: "{{item.image}}" image: "{{ item.image }}"
state: started state: started
detach: True detach: True
cleanup: True cleanup: True
privileged: "{{ container_privileged | bool }}" privileged: "{{ container_privileged | bool }}"
networks_cli_compatible: True networks_cli_compatible: True
interactive: True
vars: vars:
default_mounts: slurm_nodes_all: "{{ slurm_nodes_exec + slurm_nodes_std }}"
- /container/volumes/slurm/:/etc/slurm/:rw loop: "{{ slurm_nodes_all }}"
- /container/volumes/munge/munge.key:/etc/munge/munge.key:rw loop_control:
- slurm-shared:/shared/:rw label: "{{slurm_prefix}}-{{ item.machine }}"
slurm_nodes_all: | # add execute nodes
{% for i in range(1, 4) -%}
{% set _ = slurm_nodes.extend([
{'machine':'exec%s'|format(i), 'image': 'slurm-slurmd'}]) -%}
{%- endfor %}
{{ slurm_nodes }}
loop: "{{slurm_nodes_all}}"
tags: [ slurm-config ] tags: [ slurm-config ]
- name: set facts to be used by other modules
set_fact:
slurm:
user: "{{slurm_user}}"
domain: "{{slurm_domain}}"
base_image: "slurm:base"
mounts: "{{slurm_default_mounts}}"
network: "{{docker_network}}"

View File

@@ -164,5 +164,7 @@ SlurmSchedLogFile={{slurm_log_path_sched}}
# #
# COMPUTE NODES # COMPUTE NODES
NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN
NodeName=slurm-submit1 CPUs=1 State=UNKNOWN {% for i in alloc_nodes -%}
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes=slurm-submit1 Default=YES MaxTime=INFINITE State=UP NodeName={{i}} State=UNKNOWN
{% endfor %}
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes={{alloc_nodes | join(',')}} Default=YES MaxTime=INFINITE State=UP

View File

@@ -5,5 +5,3 @@ if [ -f "/etc/munge/munge.key" ] ; then
chown munge:munge /etc/munge/munge.key chown munge:munge /etc/munge/munge.key
chmod 600 /etc/munge/munge.key chmod 600 /etc/munge/munge.key
fi fi
exec "$@"

View File

@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Container entrypoint: run the hooks in /usr/local/lib/entrypoints.d/ in
# glob order, then replace this shell with the requested command
# (default: /bin/bash).
set -e
for hook in /usr/local/lib/entrypoints.d/* ; do
    # Paths are quoted so names containing whitespace or glob characters are
    # passed intact. The loop stops at the first entry that is not a regular
    # file (this includes the unexpanded pattern of an empty directory) or
    # whose hook exits non-zero; `set -e` does not fire inside an &&/|| list.
    [ -f "$hook" ] && /bin/sh "$hook" || break
done
exec "${@:-/bin/bash}"

View File

@@ -0,0 +1,31 @@
# Base image shared by the slurm containers: slurm packages, munge entrypoint
# hook, the slurm service user and the docker-init start-script runner.
FROM docker.io/library/centos:7
RUN yum install -y epel-release && \
    yum install -y slurm && \
    yum clean all && rm -rf /var/cache/yum
# Network / debugging utilities.
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
    yum clean all && rm -rf /var/cache/yum
# Entrypoint plus its hook directory; hooks are run by entrypoint.sh before
# the container command.
RUN mkdir -p /usr/local/lib/entrypoints.d/
COPY --chown=root:root entry-munge.sh /usr/local/lib/entrypoints.d/10-munge.sh
COPY --chown=root:root entrypoint.sh /usr/local/sbin/entrypoint.sh
RUN chmod 755 /usr/local/lib/entrypoints.d/10-munge.sh && \
    chmod 755 /usr/local/sbin/entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
# Service account name; overridable at build time and exported to the
# container environment.
ARG slurmuser=slurm
ENV slurmuser=${slurmuser}
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
    slurm-setuser -u $slurmuser -g $slurmuser -y
# docker-init launches every script found in /etc/docker-init.d.
COPY docker-init /usr/local/sbin/docker-init
RUN mkdir /etc/docker-init.d && chmod 755 /usr/local/sbin/docker-init
COPY start-scripts/10-munge /etc/docker-init.d/10-munge
RUN chmod 755 /etc/docker-init.d/10-munge
# Exec form: docker-init is started directly rather than through a
# `/bin/sh -c` wrapper, so the signals it traps are delivered to it.
CMD [ "/usr/local/sbin/docker-init" ]

View File

@@ -0,0 +1,2 @@
#!/bin/sh
# Replace this shell with munged running in the foreground (-F) as user munge.
exec su --shell /bin/sh --command "munged -F" munge

View File

@@ -0,0 +1,40 @@
# Create the build-context directory of this image plus every sub-directory
# needed by the destinations listed in image.files.
- name: create directories for docker image build
  file:
    path: "/container/docker-images/slurm-{{image.name}}/{{item}}"
    state: directory
    owner: "{{unpriv_user}}"
    group: docker
  # '' yields the context directory itself. image.files is treated as
  # optional, matching the default([]) used by the "copy requisite files" task.
  loop: "{{ [''] + (image.files | default([]) | map(attribute='dest') |
    map('dirname') | unique | select | list) }}"
# Copy "<image.name>.Dockerfile" into the image's build context as "Dockerfile".
# The registered result is used to decide whether the image must be rebuilt.
- name: copy Dockerfile
copy:
src: "{{image.name}}.Dockerfile"
dest: "/container/docker-images/slurm-{{image.name}}/Dockerfile"
owner: "{{unpriv_user}}"
group: docker
register: slurm_cp_dockerfile
# Write each entry of image.files (keys: dest, content) into the build context,
# owned by root and executable; an absent image.files means nothing to copy.
# Only the destination is shown as loop label, not the file content.
- name: copy requisite files
copy:
content: "{{ item.content }}"
dest: "/container/docker-images/slurm-{{image.name}}/{{item.dest}}"
owner: root
group: root
mode: u=rwx,g=rx,o=rx
loop: "{{ image.files | default([]) }}"
loop_control:
label: "{{ item.dest }}"
register: slurm_cp_files
# Build the image slurm:<image.name> from the prepared build context.
- docker_image:
name: "slurm"
tag: "{{image.name}}"
# pull: False
build:
pull: False
path: "/container/docker-images/slurm-{{image.name}}/"
source: build
# Force a rebuild when the Dockerfile or a requisite file changed, or when
# slurm_baseimg_build_chg is true.
force_source: "{{slurm_cp_dockerfile.changed or
slurm_cp_files.changed or
slurm_baseimg_build_chg }}"

View File

@@ -0,0 +1,46 @@
# Create the build context of the base image, including its start-scripts
# sub-directory.
- file:
path: "/container/docker-images/slurm-base/start-scripts"
state: directory
owner: "{{unpriv_user}}"
group: docker
# - name: copy Dockerfile, entrypoint, docker-init and munge startup
# Copy the Dockerfile and helper scripts of the base image into its build
# context. Entries without "perms" get mode u=rwx,g=rx,o=rx.
# Skipped once slurm_baseimg_build_chg is true.
- name: copy slurm base image requisite files
copy: # FIXME: swap out
src: "{{item.file}}"
dest: "/container/docker-images/slurm-base/{{item.file}}"
owner: "{{unpriv_user}}"
group: docker
mode: "{{ item.perms | default('u=rwx,g=rx,o=rx') }}"
loop:
- file: slurm-base.Dockerfile
perms: u=rw,g=r,o=r
- file: entrypoint.sh
- file: entry-munge.sh
- file: docker-init
- file: start-scripts/10-munge
when: not (slurm_baseimg_build_chg | default(False))
register: slurm_baseimg_copy
# Build slurm:base from slurm-base.Dockerfile; a rebuild is forced when the
# preceding copy task reported a change. Skipped once slurm_baseimg_build_chg
# is true.
- name: build base image
docker_image:
name: "slurm"
tag: "base"
# pull: False
build:
pull: False
path: "/container/docker-images/slurm-base/"
dockerfile: "slurm-base.Dockerfile"
# target: "{{image.name}}" # unsupported on old docker-py version as in el7
source: build
force_source: "{{slurm_baseimg_copy.changed}}"
# when: run only once but keep changed state
when: not (slurm_baseimg_build_chg | default(False))
register: slurm_baseimg_build
# Keep slurm_baseimg_build_chg true once any base image build has reported a
# change: the previous value (default False) is OR-ed with this run's result.
- set_fact:
slurm_baseimg_build_chg:
"{{(slurm_baseimg_build_chg | default(False)) or
slurm_baseimg_build.changed}}"
# Continue with the tasks in dockerimage.yml.
- include_tasks: dockerimage.yml