Compare commits

...

25 Commits

SHA1 Message Date
4c63f2a825 fix: slurm host system access 2021-07-06 15:14:33 +02:00
51390bb321 improved file loading 2021-07-06 14:54:18 +02:00
52022a3013 WIP: cobald tardis config 2021-07-05 18:52:41 +02:00
574d2fcb4e drone shutdown 2021-07-05 18:25:26 +02:00
2919c98d5f cons res 2021-07-05 18:24:24 +02:00
f73fef1473 cgroups inside singularity container 2021-07-02 00:47:32 +02:00
8bc2f717e0 slurm container running when drone started 2021-07-01 15:19:35 +02:00
d88761ca7d singularity for cobald 2021-06-30 16:31:06 +02:00
3be5025442 alias making running drones working 2021-06-30 10:17:51 +02:00
4c4c4da79d parametric additional partition 2021-06-29 22:25:09 +02:00
1a952a4e7a option for docker host access to cluster 2021-06-28 17:51:45 +02:00
74a760cf98 screen scrolling 2021-06-28 17:44:39 +02:00
cd7dea8fda fix: variables hostname telegraf/influx 2021-06-28 12:14:31 +02:00
7e767c3716 memory / cpu ressources fix 2021-06-25 12:19:00 +02:00
a61d08d118 restructured playbooks, cleanup 2021-06-25 01:55:14 +02:00
188a9215a9 tags #2 2021-06-24 16:37:46 +02:00
9499ce49ae fix: wrong network 2021-06-24 16:37:10 +02:00
9237d736d8 tags 2021-06-24 14:17:16 +02:00
e979ea4d6e fix hostname of cobald slurm node
makes cobald able to run slurm jobs, which previously
failed with permission denied.
2021-06-24 14:07:35 +02:00
c7e931f29e fix: building base image -> update child images 2021-06-23 14:29:32 +02:00
a73f9ad6ad additional user in slurm base docker image 2021-06-23 14:28:50 +02:00
c35dc25c39 labels, some cleanup 2021-06-22 19:09:52 +02:00
1f4dfe1821 build cobald image from slurm role, separated tags 2021-06-22 16:48:56 +02:00
78850d4636 merged slurm_dockerimage back into slurm role 2021-06-22 00:26:00 +02:00
f83801cb62 removed cobald_facts module 2021-06-21 21:34:24 +02:00
38 changed files with 691 additions and 394 deletions

base.yml (new file)

@@ -0,0 +1,27 @@
---
- hosts: all
  tasks:
    - name: "install epel repo" # for htop etc.
      yum:
        name: epel-release
        state: present
    - name: "install tools"
      yum:
        name: [ vim-enhanced, htop, screen, bind-utils, nmap-ncat, net-tools ]
        state: present
    - name: "screenrc native scrolling in tmux"
      copy:
        content: "termcapinfo xterm* ti@:te@\ntermcapinfo screen* ti@:te@\n"
        dest: "{{item}}"
      with_items:
        - "~{{unpriv_user}}/.screenrc"
        - "~root/.screenrc"
    - name: "install ssh-key"
      authorized_key:
        user: "{{cfg_unpriv_user}}"
        key: "{{cfg_ssh_key}}"
        state: present

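For reference, the playbooks split out in this change can be run individually; a minimal sketch of an invocation, assuming the inventory file is named hosts.yml (the host alias ed-c7-2 is taken from the inventory diff further down):

    # hypothetical invocation -- the inventory file name is an assumption
    ansible-playbook -i hosts.yml base.yml --limit ed-c7-2
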
cobald.yml (new file)

@@ -0,0 +1,61 @@
---
- hosts: slurm, cobald
  vars:
    container_privileged: True
    slurm_num_nodes: 10
  tasks:
    - name: "setup docker"
      import_role: name=docker
      tags: docker
    - name: "get facts from existing cobald instance (i.e. hostname)"
      include_role:
        name: cobald
        tasks_from: facts
        apply:
          tags: slurm, cobald, slurm-config
      tags: slurm, cobald, slurm-config
      vars:
        container_name: cobald
    - name: "setup slurm test environment in docker containers"
      include_role:
        name: slurm
        apply:
          tags: slurm
      vars:
        slurm_user: slurm # or root
        slurm_user_accounts:
          - name: cobald
            dir: /var/lib/cobald
        num_nodes: "{{slurm_num_nodes}}"
        extra_nodes:
          - name: cobald
            hostname: "{{cobald_container_hostname}}" # from cobald/facts.yml above
            # hostname is used as NodeHostname, which is used by slurm's
            # "networking code" (https://bugs.schedmd.com/show_bug.cgi?id=8615).
            # It works either way around, but one of NodeName or NodeHostname
            # has to match the container name (-n flag, not --hostname) since,
            # when submitting tasks to the slurm controller, it matches access
            # permissions against a reverse lookup of the submitting ip
            # address. Docker always and unconfigurably resolves the container
            # ip in any network to containername.netname, where containername
            # is the container's runtime name (not the supplied hostname!) and
            # netname is the network name in the host environment. We should
            # run our own dns...
        docker_network: slurm
        slurm_hostsystem_cluster_access: True
      when: '"slurm" in group_names'
      tags: slurm, cobald, influxdb, slurm-config
      # tags: cobald requires some slurm facts, so the cobald tag is included here
    - name: "install cobald"
      include_role:
        name: cobald
        apply:
          tags: cobald
      vars:
        cobald_slurm: True
        container_name: cobald
        # docker_network: slurm # overridden by vars/slurm.yml
      when: '"cobald" in group_names'
      tags: cobald, influxdb, singularity

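A quick way to see why the container's runtime name matters, sketched under the assumption that the controller container is named slurm-ctl and shares the "slurm" docker network with the cobald container:

    # hypothetical check: what does a reverse lookup of cobald's ip yield
    # from the controller's point of view?
    ip=$(docker inspect -f '{{.NetworkSettings.Networks.slurm.IPAddress}}' cobald)
    docker exec slurm-ctl getent hosts "$ip"
    # expected: "<ip>  cobald.slurm" -- hence NodeName has to be "cobald"
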
htcondor.yml (new file)

@@ -0,0 +1,32 @@
---
- hosts: htcondor
  tasks:
    - name: "install htcondor repo"
      yum:
        name: https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm
        state: present
      tags: htcondor
    - name: "install htcondor software"
      yum:
        name: htcondor-ce
        state: present
      tags: htcondor
    - name: "remove minicondor configuration"
      yum:
        name: minicondor
        state: absent
      tags: htcondor
    - name: "setup singularity"
      import_role: name="singularity"
      tags: singularity
    - name: "setup docker"
      import_role: name=docker
      tags: docker
    - name: "setup htcondor test environment in docker containers"
      import_role: name=docker-htcondor
      tags: htcondor-containered, htcondor


@@ -6,12 +6,14 @@ all:
       ssh_args: -o ControlMaster=auto -o ControlPersist=60s
 #      ansible_host: 192.168.122.139
       unpriv_user: thoto
+      cfg_unpriv_user: thoto
     ed-c7-2:
       ansible_user: root
       ansible_host: ed-c7-2.virt.uller.thoto.net
 #      ansible_host: 192.168.123.60 # + jumphost
       ssh_args: -o ControlMaster=auto -o ControlPersist=60s
       unpriv_user: thoto
+      cfg_unpriv_user: thoto
   children:
     htcondor:
       hosts:


@@ -1,88 +1,10 @@
 ---
-- hosts: all
-  vars_files:
-    - vars-auth.yml
-    - vars-influx.yml
-  tasks:
-    - name: "install epel repo" # for htop etc.
-      yum:
-        name: epel-release
-        state: present
-    - name: "install tools"
-      yum:
-        name: [ vim-enhanced, htop, screen, bind-utils, nmap-ncat, net-tools ]
-        state: present
-    - name: "install ssh-key"
-      authorized_key:
-        user: thoto
-        key: "{{ssh_key}}"
-        state: present
+- name: base setup
+  import_playbook: base.yml
-- hosts: htcondor
-  pre_tasks:
-    - name: "install htcondor repo"
-      yum:
-        name: https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm
-        state: present
-      tags: htcondor
-    - name: "install htcondor software"
-      yum:
-        name: htcondor-ce
-        state: present
-      tags: htcondor
-    - name: "remove minicondor configuration"
-      yum:
-        name: minicondor
-        state: absent
-      tags: htcondor
-    - name: "setup singularity"
-      import_tasks: "singularity.yml"
-      tags: singularity
-  roles:
-    - name: "setup docker"
-      role: docker
-      tags: docker
-    - name: "setup htcondor test environment in docker containers"
-      role: docker-htcondor
-      tags:
-        - htcondor-containered
-        - htcondor
+- name: setup htcondor
+  import_playbook: htcondor.yml
+  when: '"htcondor" in group_names'
-- hosts: slurm, cobald
-  vars:
-    container_privileged: True
-    slurm_num_nodes: 10
-  roles:
-    - name: "setup docker"
-      role: docker
-      tags: docker
-    - name: "get facts from existing cobald instance (i.e. hostname)"
-      role: cobald_facts
-      vars:
-        container_name: cobald
-      tags: [ slurm, cobald ]
-    - name: "setup slurm test environment in docker containers"
-      role: slurm
-      vars:
-        slurm_user: slurm # or root
-        num_nodes: "{{slurm_num_nodes}}"
-        extra_nodes:
-          - "{{cobald_container_hostname}}" # from cobald_facts, read or generated
-        docker_network: slurm
-      when: '"slurm" in group_names'
-      tags: slurm
-    - name: "install cobald"
-      role: cobald
-      vars:
-        cobald_slurm: True
-        container_name: cobald
-        # docker_network: slurm # overridden by vars/slurm.yml
-      when: '"cobald" in group_names'
-      tags: cobald
+- name: setup slurm and cobald
+  import_playbook: cobald.yml


@@ -1,3 +1,4 @@
+cobald_domainname: cobald.local
 influx_admin_user: my-user
 influx_admin_pw: my-password
 influx_org: my-org


@@ -0,0 +1,3 @@
#!/bin/sh
[ /slurm-singimage/slurmd.sif -nt /shared/slurmd.sif ] && \
cp /slurm-singimage/slurmd.sif /shared/slurmd.sif


@@ -0,0 +1,3 @@
#!/bin/sh
slurmd --conf-server ${slurmctld} -D -N ${nodename} 2>/dev/null 1>/dev/null &
tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}


@@ -0,0 +1,11 @@
###
#
# Slurm cgroup support configuration file
#
# See man slurm.conf and man cgroup.conf for further
# information on cgroup configuration parameters
#--
CgroupAutomount=no
ConstrainCores=no
ConstrainRAMSpace=no


@@ -0,0 +1,31 @@
---
pipeline:
  - __type__: cobald.controller.linear.LinearController
    low_utilisation: 0.9
    high_allocation: 0.9
    rate: 0.10
  - !Limiter
    minimum: 3
  - !TelegrafPipelineMonitor
    poll: True
  - !TardisPoolFactory
    configuration: /etc/cobald/tardis.yaml
logging:
  version: 1
  root:
    level: DEBUG
    handlers: [console, file]
  handlers:
    console:
      class: logging.StreamHandler
      formatter: test
      level: DEBUG
      stream: ext://sys.stderr
    file:
      class: logging.handlers.RotatingFileHandler
      formatter: test
      level: WARNING
      filename: /var/log/cobald/cobald-tardis.log
  formatters:
    test:
      format: " %(name)s %(message)s"


@@ -0,0 +1,41 @@
Plugins:
  SqliteRegistry:
    db_file: /tmp/drone_registry.db
  TelegrafMonitoring:
    host: ed-telegraf
    port: 8094
#BatchSystem:
#  adapter: FakeBatchSystem
#  allocation: 1.0
#  utilisation: !PeriodicValue
#    period: 60
#    amplitude: 0.15
#    offset: 0.80
##   phase: 1.
#  phase: 1.6
#  machine_status: Available
BatchSystem:
  adapter: Slurm
  max_age: 0.1
  options:
    partition: cobald
Sites:
  - name: slurmtest
    adapter: Slurm
    quota: 20
slurmtest:
  # executor: ...
  StatusUpdate: 0.1
  MachineTypes:
    - m1.a
  MachineTypeConfiguration:
    m1.a:
      Walltime: 5
      Partition: container
      StartupCommand: /usr/local/bin/start-drone
      # SubmitOptions: ...
  MachineMetaData:
    m1.a:
      Cores: 3 # cores
      Memory: 1 # GB
      Disk: 4 # not passed


@@ -0,0 +1,31 @@
Bootstrap: docker-daemon
From: slurm:slurmd

%files
    31-slurmd-configless /etc/docker-init.d/31-slurm-configless
    /container/volumes/munge/munge.key /etc/munge/munge.key
    cgroup.conf.noautomount /etc/slurm/cgroup.conf

%post
    rm /etc/docker-init.d/30-slurmd
    chmod 755 /etc/docker-init.d/31-slurm-configless

%startscript
    if [ -z "${1}" -o -z "${2}" ] ; then
        echo "undefined variables slurmctld or nodename"
        exit 1
    fi
    export slurmctld="${1}"
    export nodename="${2}"
    echo ${slurmctld} ${nodename} ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
    exec /usr/local/sbin/entrypoint.sh /usr/local/sbin/docker-init

%runscript
    if [ -z "${1}" -o -z "${2}" ] ; then
        echo "undefined variables slurmctld or nodename"
        exit 1
    fi
    export slurmctld="${1}"
    export nodename="${2}"
    echo ${slurmctld} ${nodename} ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
    exec /usr/local/sbin/entrypoint.sh /usr/local/sbin/docker-init

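The %startscript receives the controller name and the node name as positional arguments; a manual start mirroring what start-drone does below (cgroup bind mount omitted, instance and node names are examples):

    # hypothetical manual run of the built image
    singularity instance start --writable-tmpfs slurmd.sif slurm-drone slurm-ctl drone1
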

@@ -0,0 +1,59 @@
#!/bin/sh
#SBATCH -D /shared
export
echo $@
nodename=$(hostname | awk '{ print "drone" substr($1,match($1, "([[:digit:]]+)")) }')
SHUTDOWN_DONE=0
handler_quit(){
    [ $SHUTDOWN_DONE -ne 0 ] && return
    set -x
    echo "drain container"
    scontrol update NodeName=${nodename} State=DRAIN Reason="cobald node quit"
    shutdown_jobs=$(squeue -w ${nodename} --noheader -O jobid)
    [ -n "${shutdown_jobs}" ] && scancel ${shutdown_jobs}
    #scancel -w ${nodename}
    i=$(( $(scontrol show config | grep KillWait | \
        sed 's/^KillWait.*= \([0-9]*\) sec/\1/') - 2 ))
    while [ -n "$(squeue -w ${nodename} --noheader -O jobid)" -o ${i} -lt 1 ]
    do
        i=$(( ${i} - 1 ))
        sleep 1
    done
    scancel -s KILL -w ${nodename} # hard kill all remaining jobs
    echo "shutdown container"
    scontrol update NodeName=${nodename} State=DOWN Reason=shutdown
    singularity instance stop slurm-drone
    scontrol update NodeName=${nodename} State=FUTURE
    umount /inner-cgroup/freezer
    umount /inner-cgroup
    SHUTDOWN_DONE=1
    exit 0
}
# set -x
trap handler_quit EXIT
echo "mounting cgroups"
mkdir /inner-cgroup
mount -t tmpfs none /inner-cgroup
mkdir /inner-cgroup/freezer/
mount --bind /sys/fs/cgroup/freezer/slurm/ /inner-cgroup/freezer/
mount -o remount,ro /inner-cgroup
echo "starting ${nodename}"
scontrol update NodeName=${nodename} State=RESUME # revoke last DRAIN
scontrol update NodeName=${nodename} State=FUTURE
singularity instance start \
    -B /inner-cgroup/:/sys/fs/cgroup/ \
    --writable-tmpfs /shared/slurmd.sif slurm-drone \
    slurm-ctl ${nodename}
# scontrol update NodeName=${nodename} NodeHostname=${SLURM_JOB_ID}
scontrol update NodeName=${nodename} NodeHostname=${TardisDroneUuid}
if [ $? -eq 0 ] ; then
    echo "container started, sleeping $(( 60 * ${SLURM_Walltime} - 2 ))"
    sleep $(( 60 * ${SLURM_Walltime} - 2 ))
fi
handler_quit

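TARDIS submits this script as its StartupCommand (see tardis.yaml above) into the "container" partition, exporting SLURM_Walltime and TardisDroneUuid. A rough manual test, with the partition and walltime taken from the configs above and everything else an assumption:

    # hypothetical manual submission outside of TARDIS
    sbatch -p container --export=ALL,SLURM_Walltime=5,TardisDroneUuid=drone1 \
        /usr/local/bin/start-drone
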

@@ -0,0 +1,34 @@
- file:
    path: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/"
    state: directory
    owner: "{{unpriv_user}}"
    group: docker
- template:
    src: cobald.Dockerfile
    dest: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/Dockerfile"
    owner: "{{unpriv_user}}"
    group: docker
  register: cobald_cp_dockerfile
- copy:
    src: "{{item}}"
    dest: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/{{item}}"
    owner: "{{unpriv_user}}"
    group: docker
    mode: 0755
  with_items:
    - cobald-entrypoint.sh
    - init-cobaldmodules.sh
  register: cobald_cp_files
- docker_image:
    name: "cobald"
    tag: "{{cobald_image_tag|default('latest')}}"
#    pull: False
    build:
      pull: False
      path: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/"
    source: build
    force_source: "{{cobald_cp_dockerfile.changed or cobald_cp_files.changed}}"


@@ -14,11 +14,12 @@
 - name: run grafana
   docker_container:
     name: ed-grafana
-    image: grafana/grafana:7.5.7
+    image: docker.io/grafana/grafana:7.5.7
     hostname: ed-grafana
     domainname: cobald.local
     networks:
-      - name: "{{docker_network}}"
+      - name: "{{cobald_docker_network}}"
+    networks_cli_compatible: True
     published_ports:
       - "3000:3000"
     state: started
@@ -42,7 +43,6 @@
       version: Flux
       additional_secure_json_data:
         token: "{{influx_grafana_token.token}}"
-  register: das
 - community.grafana.grafana_dashboard:
     grafana_url: http://localhost:3000


@@ -1,11 +1,12 @@
 - name: run influxdb in docker container
   docker_container:
     name: ed-influxdb
-    image: influxdb
-    hostname: "{{influx_hostname}}"
-    domainname: "{{influx_domainname}}"
+    image: docker.io/library/influxdb:2.0
+    hostname: "{{cobald_influx_hostname}}"
+    domainname: "{{cobald_domainname}}"
     networks:
-      - name: "{{ docker_network }}"
+      - name: "{{ cobald_docker_network }}"
+    networks_cli_compatible: True
     published_ports:
       - "{{influx_pubport}}:8086"
     volumes:
@@ -21,13 +22,12 @@
     state: started
     detach: True
     cleanup: True
-    networks_cli_compatible: True
-- add_host:
+- name: add ansible connection to influxdb container
+  add_host:
     name: ed-influxdb
     ansible_connection: docker
-#    ansible_docker_extra_args: "-H=ssh://ed-c7-1.virt.magni.thoto.net" # FIXME
-    ansible_docker_extra_args: "-H=ssh://{{ansible_host}}" # FIXME
+    ansible_docker_extra_args: "-H=ssh://{{ansible_host}}"
   changed_when: False
 - name: wait for influx to run
@@ -35,32 +35,33 @@
   changed_when: False
   delegate_to: ed-influxdb
-- name: fetch auth token
+- name: fetch influxdb auth token
   raw: influx auth list --user my-user --hide-headers --json
   register: influx_token_fetch
   changed_when: False
   delegate_to: ed-influxdb
-- name: set admin token
+- name: set influxdb admin token
   set_fact:
     influx_admin_token:
       "{{(influx_token_fetch.stdout | from_json | first).token}}"
-- name: create bucket for cobald
+- name: create influxdb bucket for cobald
   influx_bucket:
     base: "http://localhost:{{influx_pubport}}"
     org: "my-org"
     auth_token: "{{influx_admin_token}}"
     name: "{{influx_bucket}}"
-- name: create dashboard
+- name: create influxdb dashboard
   influx_dashboard:
     base: "http://localhost:{{influx_pubport}}"
     org: "my-org"
     auth_token: "{{influx_admin_token}}"
     data: "{{lookup('file', 'influxdb-dashboard-cobald.json')}}"
+  when: influxdb_dashboard | default(True)
-- name: create token for telegraf
+- name: create influxdb write access token for telegraf
   influx_token:
     base: "http://localhost:{{influx_pubport}}"
     org: "my-org"

@@ -1,62 +1,61 @@
 - include_vars: cobald-slurm.yml
   when: cobald_slurm | default(False)
+  tags: always
-- file:
-    path: "/container/{{item.name}}/cobald{{item.pfx|default('')}}/"
+- name: build cobald:slurm docker image
+  include_role:
+    name: slurm
+    tasks_from: dockerimage
+  vars:
+    slurm_image_prefix: cobald
+    image_name: "{{cobald_image_tag}}"
+    dockerfile: "{{ lookup('template', 'cobald.Dockerfile') }}"
+    files_list:
+      - cobald-entrypoint.sh
+      - init-cobaldmodules.sh
+      - start-drone
+      - 28-sync-container-slurmd
+    files: "
+      {%- set files = [] -%} {%- for i in files_list -%}
+      {%- set files = files.append(
+        { 'dest': i, 'content': lookup('file', i) }) -%}
+      {%- endfor %}{{ files }}"
+  when: cobald_slurm | default(False)
+- name: build generic cobald docker image
+  include_tasks: dockerimage-generic.yml
+  when: not (cobald_slurm | default(False))
+- name: make cobald data volume
+  file:
+    path: "/container/volumes/cobald/"
     state: directory
     owner: "{{unpriv_user}}"
     group: docker
-  loop:
-    - name: docker-images
-      pfx: ".{{cobald_image_tag|default('latest')}}"
-    - name: volumes
-- template:
-    src: cobald.Dockerfile
-    dest: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/Dockerfile"
-    owner: "{{unpriv_user}}"
-    group: docker
-  register: cobald_cp_dockerfile
-- copy:
-    src: "{{item}}"
-    dest: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/{{item}}"
-    owner: "{{unpriv_user}}"
-    group: docker
-    mode: 0755
-  with_items:
-    - cobald-entrypoint.sh
-    - init-cobaldmodules.sh
-  register: cobald_cp_files
-- docker_image:
-    name: "cobald"
-    tag: "{{cobald_image_tag|default('latest')}}"
-#    pull: False
-    build:
-      pull: False
-      path: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/"
-    source: build
-    force_source: "{{cobald_cp_dockerfile.changed or cobald_cp_files.changed}}"
-- copy:
+- name: copy cobald config
+  copy:
     src: cobald-config/
-    dest: /container/volumes/cobald
-    force: False
+    dest: "~{{unpriv_user}}/cobald/"
     owner: "{{unpriv_user}}"
     group: docker
-  when: False
+    mode: "0644"
-- docker_network:
-    name: "{{cobald_docker_network}}" # FIXME
+- name: ensure network for cobald container exists
+  docker_network:
+    name: "{{cobald_docker_network}}"
     state: present
 # docker run -v $(pwd)/cobald-config-host:/etc/cobald -v $(pwd)/cobald:/cobald --rm -it cobald bash
-- yum:
+- name: install git
+  yum:
     name: git
     state: present
-- file:
+- name: make directories for cobald configuration and modules
+  file:
     path: "{{item}}"
     owner: "{{unpriv_user}}"
    group: "{{unpriv_user}}"
@@ -66,19 +65,22 @@
     - "~{{unpriv_user}}/cobald/modules"
     - "~{{unpriv_user}}/cobald"
-- git:
+- name: clone cobald code from git
+  git:
     repo: https://github.com/thoto/cobald
     dest: "~{{unpriv_user}}/cobald-src"
     version: bugfix/mixed_construction_methods
-    update: no
+    update: no # FIXME
   become: yes
   become_user: "{{unpriv_user}}"
   register: cobald_git_pull
-- git:
+- name: clone tardis code from git
+  git:
     repo: https://github.com/MatterMiners/tardis
     dest: "~{{unpriv_user}}/tardis-src"
     version: master
+    update: no # FIXME
   become: yes
   become_user: "{{unpriv_user}}"
   register: tardis_git_pull
@@ -88,7 +90,7 @@
     database: passwd
     key: "{{unpriv_user}}"
-- name: run pip install
+- name: run pip install on cobald and tardis
   docker_container:
     image: "cobald:{{cobald_image_tag|default('latest')}}"
     name: "cobald-src-{{item.name}}-install"
@@ -111,10 +113,16 @@
 - import_tasks: telegraf.yml
 - name: get cobald hostname
-  include_role:
-    name: cobald_facts
+  include_tasks: facts.yml
   when: cobald_container_hostname is not defined
+- name: build singularity container
+  include_tasks:
+    file: singularity.yml
+    apply:
+      tags: singularity
+  tags: singularity
 - name: run cobald container
   docker_container:
     name: "{{ container_name | default('cobald') }}"
@@ -125,9 +133,6 @@
     networks:
       - name: "{{cobald_docker_network}}"
     networks_cli_compatible: True
-#    env:
-#      slurmuser: "{{slurm_user}}"
-#    privileged: "{{ container_privileged | bool }}"
     state: started
     detach: True
     cleanup: True
@@ -138,6 +143,7 @@
     cobald_mounts:
       - "~{{unpriv_user}}/cobald:/etc/cobald"
 #      - /container/volumes/cobald:/etc/cobald:ro
+      - "/container/docker-images/sing-slurmd/build/:/slurm-singimage/:ro"
       - "~{{unpriv_user}}/cobald/modules:/usr/local/src/cobaldmodules"
       - "~{{unpriv_user}}/cobald-src:/usr/local/src/cobald:ro"
       - "~{{unpriv_user}}/tardis-src:/usr/local/src/tardis:ro"


@@ -0,0 +1,48 @@
- name: setup singularity
  import_role: name="singularity"
  tags: singularity
- name: make singularity image build directory
  file:
    state: directory
    path: "{{item}}"
    owner: "{{unpriv_user}}"
    group: "docker"
    mode: "0755"
  loop:
    - /container/docker-images/sing-slurmd
    - /container/docker-images/sing-slurmd/cache
    - /container/docker-images/sing-slurmd/build
- name: copy slurm singularity container files
  copy:
    src: "{{item}}"
    dest: "/container/docker-images/sing-slurmd/{{item}}"
    owner: "{{unpriv_user}}"
    group: "docker"
  loop:
    - slurm-slurmd.def
    - 31-slurmd-configless
    - cgroup.conf.noautomount
  register: cobald_copy_sing_files
- name: remove old container
  file:
    path: /container/docker-images/sing-slurmd/build/slurmd.sif
    state: absent
  when: cobald_copy_sing_files.changed
- name: build container
  shell:
    chdir: /container/docker-images/sing-slurmd/
    cmd: SINGULARITY_TMPDIR=/container/docker-images/sing-slurmd/cache
      singularity build --disable-cache
      /container/docker-images/sing-slurmd/build/slurmd.sif
      /container/docker-images/sing-slurmd/slurm-slurmd.def
    creates: /container/docker-images/sing-slurmd/build/slurmd.sif
  register: cobald_sing_build
- debug: msg="{{[cobald_sing_build.stdout, cobald_sing_build.stderr]}}"
  tags: [ never, debug ]
# TODO: trigger copy in cobald container when slurmd.sif rebuilt


@@ -1,4 +1,5 @@
-- file:
+- name: setup directories for telegraf
+  file:
     path: "/container/{{item}}/telegraf/"
     state: directory
     owner: "{{unpriv_user}}"
@@ -7,14 +8,16 @@
     - docker-images
     - volumes
-- copy:
+- name: copy telegraf Dockerfile
+  copy:
     src: telegraf.Dockerfile
     dest: /container/docker-images/telegraf/Dockerfile
     owner: "{{unpriv_user}}"
     group: docker
   register: cobald_cp_telegraf_dockerfile
-- copy: # telegraf is found in influxdb repo
+- name: copy telegraf repo file
+  copy: # telegraf is found in influxdb repo
     src: influxdb.repo
     dest: /container/docker-images/telegraf/influxdb.repo
     owner: "{{unpriv_user}}"
@@ -29,20 +32,9 @@
     source: build
     force_source: "{{cobald_cp_telegraf_dockerfile.changed}}"
-- set_fact:
-    influx_hostname: "ed-influxdb"
-    influx_domainname: "cobald.local"
-    influx_bucket: batleth
-  tags: influxdb
 - import_tasks: influxdb.yml
   tags: influxdb
-- set_fact:
-#    influx_url: "http://{{influx_hostname}}.{{influx_domainname}}:8086"
-    influx_url: "http://{{influx_hostname}}:8086"
-  tags: influxdb
 - name: generate telegraf config
   template:
     src: telegraf.conf.j2
@@ -51,6 +43,7 @@
     group: docker
   vars:
     influx_token: "{{influx_telegraf_token.token}}"
+    influx_url: "http://{{cobald_influx_hostname}}:8086"
   register: telegraf_config_gen
 - name: run telegraf container
@@ -58,9 +51,10 @@
     name: ed-telegraf
     image: ed-telegraf
     hostname: telegraf
-    domainname: cobald.local
+    domainname: "{{ cobald_domainname }}"
     networks:
-      - name: "{{docker_network | default('bridge') }}"
+      - name: "{{ cobald_docker_network }}"
+        aliases: ["ed-telegraf"]
     volumes:
       - "/container/volumes/telegraf/telegraf.conf:/etc/telegraf/telegraf.conf:ro"
     state: started
@@ -70,4 +64,6 @@
     networks_cli_compatible: True
 - import_tasks: grafana.yml
+  vars:
+    influx_url: "http://{{cobald_influx_hostname}}:8086"
   tags: influxdb


@@ -11,7 +11,8 @@ RUN git clone $REPOCOBALD /usr/local/src/cobald && \
     git clone $REPOTARDIS /usr/local/src/tardis
 RUN mkdir /etc/cobald /var/log/cobald && \
-    useradd -m -d /var/lib/cobald --no-log-init --system cobald && \
+    ( getent passwd cobald > /dev/null || \
+      useradd -m -d /var/lib/cobald --no-log-init --system cobald ) && \
     chown cobald:cobald /var/log/cobald
 #RUN mkdir /cobald && python3 -m venv /cobald && source /cobald/bin/activate &&\
@@ -47,6 +48,10 @@ COPY init-cobaldmodules.sh /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
 RUN chmod 755 /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
+COPY start-drone /usr/local/bin/start-drone
+COPY 28-sync-container-slurmd /etc/docker-init.d/28-sync-container-slurmd
+RUN chmod 755 /usr/local/bin/start-drone /etc/docker-init.d/28-sync-container-slurmd
 RUN echo -e "#!/bin/sh\npython3 -m cobald.daemon /etc/cobald/config.yaml" >> /etc/docker-init.d/70-cobald && chmod 755 /etc/docker-init.d/70-cobald
 {% if cobald_docker_default_command | default(True) -%}


@@ -1 +1,2 @@
 cobald_docker_network: "{{docker_network}}"
+cobald_influx_hostname: "ed-influxdb"


@@ -8,7 +8,7 @@
     value: "15000"
     sysctl_file: /etc/sysctl.d/90-max_net_namespaces.conf
-- name: "enable user thoto for fakeroot access"
+- name: "enable user {{unpriv_user}} for fakeroot access"
   lineinfile:
     line: "{{unpriv_user}}:4294836224:65536"
     dest: "{{item}}"


@@ -1,8 +1,9 @@
+container_privileged: False
 slurm_user: slurm
 slurm_log_path_ctld: /var/log/slurm/slurmctld.log
 slurm_log_path_d: /var/log/slurm/slurmd.log
 slurm_log_path_sched: /var/log/slurm/slurmsched.log
 slurm_prefix: slurm
 slurm_domain: slurm.local
-container_privileged: False
 docker_network: slurm
+slurm_image_prefix: slurm


@@ -28,4 +28,8 @@ RUN mkdir /etc/docker-init.d && chmod 755 /usr/local/sbin/docker-init
 COPY start-scripts/10-munge /etc/docker-init.d/10-munge
 RUN chmod 755 /etc/docker-init.d/10-munge
+ARG moreusers
+RUN function mu { [ -z "$1" ] || useradd -d $2 -m --no-log-init --system $1 ;};\
+    echo "${moreusers}" | tr ',' '\n' | while read i ; do mu $i ; done
 CMD /usr/local/sbin/docker-init

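For illustration only: with the slurm_user_accounts value from cobald.yml above, the role renders the build argument and the RUN loop executes one useradd per comma-separated entry:

    # docker build --build-arg moreusers="cobald /var/lib/cobald" ...
    # the loop above then effectively runs:
    useradd -d /var/lib/cobald -m --no-log-init --system cobald
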

@@ -1,43 +0,0 @@
FROM docker.io/library/centos:7 as base
RUN yum install -y epel-release && \
    yum install -y slurm && \
    yum clean all && rm -rf /var/cache/yum
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
    yum clean all && rm -rf /var/cache/yum
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
RUN chown root:root /usr/local/sbin/entrypoint.sh && \
    chmod 755 /usr/local/sbin/entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
ARG slurmuser=slurm
ENV slurmuser=${slurmuser}
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
    slurm-setuser -u $slurmuser -g $slurmuser -y
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"

FROM base as slurmd
RUN yum install -y slurm-slurmd && \
    yum clean all && rm -rf /var/cache/yum
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
    slurmd -D 2>/dev/null 1>/dev/null & \
    tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'

FROM base as slurmctld
RUN yum install -y slurm-slurmctld && \
    yum clean all && rm -rf /var/cache/yum
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
    su -s /bin/sh -c "slurmctld -D" ${slurmuser} 2>/dev/null 1>/dev/null & \
    tail --retry --pid $! -f ${SLURMCTLD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'


@@ -9,3 +9,6 @@ RUN chmod 755 /etc/docker-init.d/30-slurmd
 ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
 ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
 ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
+RUN yum install -y singularity && \
+    yum clean all && rm -rf /var/cache/yum


@@ -1,50 +1,29 @@
-- file:
-    path: "/container/docker-images/slurm"
-    state: directory
-    owner: "{{unpriv_user}}"
-    group: docker
-- copy: # FIXME: template
-    src: "{{image.name}}.Dockerfile"
-    dest: "/container/docker-images/slurm/{{image.name}}.Dockerfile"
-    owner: "{{unpriv_user}}"
-    group: docker
-  register: slurm_cp_dockerfile
-- name: copy entrypoint and docker-init
-  copy: # FIXME: swap out
-    src: "{{item}}"
-    dest: "/container/docker-images/slurm/{{item}}"
-    owner: root
-    group: root
-    mode: u=rwx,g=rx,o=rx
-  loop:
-    - entrypoint.sh
-    - docker-init
-  register: slurm_cp_entrypt
-- name: copy startup scripts
-  copy:
-    src: "start-scripts/"
-    dest: "/container/docker-images/slurm/start-scripts/"
-    owner: root
-    group: root
-    mode: u=rwx,g=rx,o=rx
-  register: slurm_cp_stscrs
+- name: build slurm base docker image
+  include_tasks: dockerimage_build.yml
+  vars:
+    slurm_image_prefix: "{{slurm_base_image_prefix | default('slurm') }}"
+    image_name: base
+    dockerfile: "{{lookup('file', 'slurm-base.Dockerfile')}}"
+    files:
+      - dest: entrypoint.sh
+        content: "{{ lookup('file', 'entrypoint.sh') }}"
+      - dest: entry-munge.sh
+        content: "{{ lookup('file', 'entry-munge.sh') }}"
+      - dest: docker-init
+        content: "{{ lookup('file', 'docker-init') }}"
+      - dest: start-scripts/10-munge
+        content: "{{ lookup('file', 'start-scripts/10-munge') }}"
+    image_args:
+      moreusers: >-
+        {% for a in slurm_user_accounts | default([]) -%}
+        {{a['name']}} {{a['dir']}}{{loop.last | ternary('',',')}}
+        {%- endfor %}
+  when: not slurm_baseimg_build_chg | default(False)
 - set_fact:
-    slurm_image_files_changed: "{{ (slurm_image_files_changed | default(False))
-      or slurm_cp_entrypt.changed or slurm_cp_stscrs.changed }}"
+    slurm_baseimg_build_chg:
+      "{{(slurm_baseimg_build_chg | default(False)) or
+      slurm_img_build.changed}}"
-- docker_image:
-    name: "slurm"
-    tag: "{{image.name}}"
-#    pull: False
-    build:
-      pull: False
-      path: "/container/docker-images/slurm/"
-      dockerfile: "{{image.name}}.Dockerfile"
-#      target: "{{image.name}}" # unsupported on old docker-py version as in el7
-    source: build
-    force_source: "{{slurm_cp_dockerfile.changed or slurm_image_files_changed}}"
+- name: "build slurm base docker image {{image_name}}"
+  include_tasks: dockerimage_build.yml


@@ -0,0 +1,43 @@
- name: create directories for docker image build
  file:
    path: "/container/docker-images/{{slurm_image_prefix}}-{{image_name}}/{{item}}"
    state: directory
    owner: "{{unpriv_user}}"
    group: docker
  loop: "{{ [''] + (files | map(attribute='dest') | map('dirname') |
    unique | select | list) }}"
- name: "copy Dockerfile {{slurm_image_prefix}}:{{image_name}}"
  copy:
    content: "{{dockerfile}}"
    dest: "/container/docker-images/{{slurm_image_prefix}}-{{image_name}}/Dockerfile"
    owner: "{{unpriv_user}}"
    group: docker
  register: slurm_cp_dockerfile
- name: copy requisite files
  copy:
    content: "{{ item.content }}"
    dest: "/container/docker-images/{{slurm_image_prefix}}-{{image_name}}/{{item.dest}}"
    owner: root
    group: root
    mode: u=rwx,g=rx,o=rx
  loop: "{{ files | default([]) }}"
  loop_control:
    label: "{{ item.dest }}"
  register: slurm_cp_files
- name: "build docker image {{slurm_image_prefix}}:{{image_name}}"
  docker_image:
    name: "{{slurm_image_prefix}}"
    tag: "{{image_name}}"
#    pull: False
    build:
      args: "{{image_args | default(omit)}}"
      pull: False
      path: "/container/docker-images/{{slurm_image_prefix}}-{{image_name}}/"
    source: build
    force_source: "{{slurm_cp_dockerfile.changed or
      slurm_cp_files.changed or
      slurm_baseimg_build_chg | default(False) }}"
  register: slurm_img_build


@@ -0,0 +1,35 @@
# TODO: this does not work quite right since slurm-ctl does not reach the host
# system. sinfo, scontrol etc. work but srun does not!
- name: "get addresses from docker network"
  docker_network_info:
    name: "{{ docker_network }}"
  register: slurm_network_data
- name: link host slurm config
  file:
    path: "/etc/slurm/slurm.conf"
    src: "/container/volumes/slurm/slurm.conf"
    force: True
    state: link
    backup: True
- name: create slurm user
  user:
    name: slurm
    system: True
- name: place entry of slurm-ctl in host /etc/hosts
  lineinfile:
    line: "{{slurm_network_data.network.Containers | dict2items
      | json_query('[?value.Name==`slurm-ctl`].value.IPv4Address') | first
      | ipaddr('address') }}\tslurm-ctl"
    regexp: "^(\\S*)(\\s*)slurm-ctl$"
    path: /etc/hosts
    backup: True
- name: start munge locally
  service:
    name: munge
    enabled: True
    state: started

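A smoke test from the docker host matching the TODO above, assuming munge is running and the slurm.conf symlink is in place:

    # hypothetical check on the docker host
    sinfo -N                    # works: reaches slurmctld via the /etc/hosts entry
    srun -p container hostname  # expected to fail until the TODO above is resolved
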

@@ -3,9 +3,10 @@
     name: [ slurm, slurm-doc ]
     state: present
-- include_role:
-    name: slurm_dockerimage
-  loop: # FIXME: default(omit)!
+- name: build docker images for slurm
+  include_tasks:
+    file: dockerimage.yml
+  loop:
     - name: slurmctld
       dockerfile: "{{ lookup('file', 'slurmctld.Dockerfile') }}"
       files:
@@ -16,6 +17,10 @@
       files:
         - dest: start-scripts/30-slurmd
           content: "{{ lookup('file', 'start-scripts/30-slurmd') }}"
+  vars:
+    image_name: "{{image.name | default(omit) }}"
+    dockerfile: "{{image.dockerfile | default(omit) }}"
+    files: "{{image.files | default(omit) }}"
   loop_control:
     loop_var: image
     label: "{{ image.name }}"
@@ -32,7 +37,8 @@
     group: munge
     mode: u=rw,g=,o=
-- file:
+- name: create munge key directory for containers
+  file:
     path: /container/volumes/munge
     state: directory
     owner: munge
@@ -47,10 +53,18 @@
     src: /etc/munge/munge.key
     dest: /container/volumes/munge/munge.key
-- file:
+- name: make slurm directory
+  file:
     path: /container/volumes/slurm/
     state: directory
+- name: "create docker network to make service discovery work"
+  docker_network:
+    name: "{{ docker_network }}"
+    state: present
+  register: slurm_network_data
+  tags: slurm-config
 - name: upload slurm config
   template:
     force: true
@@ -60,47 +74,29 @@
     - slurm.conf
     - cgroup.conf
   vars:
-    alloc_nodes: "{{ [ slurm_prefix+'-submit1' ] + extra_nodes | default([])}}"
+    slurm_exec_node_cores: 3
+    slurm_exec_node_mem: 5000 # RealMemory=5964
+    slurm_alloc_nodes_default:
+      - name: "{{slurm_prefix+'-submit1'}}"
+      - name: "{{ inventory_hostname }}"
+        addr: "{{ slurm_network_data.network.IPAM.Config[0].Gateway }}"
+    alloc_nodes: "{{ slurm_alloc_nodes_default + extra_nodes | default([])}}"
+    partitions:
+      - name: cobald
+        nodeprefix: drone
+        num_nodes: 10
+        node_cores: 3
+        node_mem: 4900
+        port: 16818
+        initstate: FUTURE
   notify: reconfigure slurm
-  tags: [ slurm-config ]
+  tags: slurm-config
-- name: "create docker network to make service discovery work"
-  docker_network:
-    name: "{{ docker_network }}"
-    state: present
 - name: "create docker volume for shared access between nodes"
   docker_volume:
     name: slurm-shared
     state: present
-- set_fact:
-    slurm_nodes_std: # default nodes: controller and submit machine
-      - machine: ctl
-        image: slurm:slurmctld
-        exposed_ports: [ "6817:6817/tcp" ]
-      - machine: submit1
-        image: slurm:slurmd
-        extra_mounts:
-          - "/home/{{unpriv_user}}/job3/:/mnt/:rw"
-    slurm_nodes_exec: | # extend range to execute nodes list
-      {% set slurm_nodes_exec = slurm_nodes_exec | default([]) %}
-      {% for i in range(1, num_nodes+1) -%}
-      {% set _ = slurm_nodes_exec.extend([
-        {'machine':'exec%s'|format(i), 'image': 'slurm:slurmd'}]) -%}
-      {%- endfor %}
-      {{ slurm_nodes_exec }}
-    slurm_default_mounts:
-      - /container/volumes/slurm/:/etc/slurm/:rw
-      - /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
-      - slurm-shared:/shared/:rw
-    slurm_network: "{{docker_network}}"
-  tags: [ slurm-config ]
+# TODO: reserve some address using docker_network_info and assign as aux
+# address to enable slurmctld to get a static address in order to be
+# reachable from slurm running on docker host to enable submitting jobs.
 - name: run slurm docker containers
   docker_container:
     name: "{{ slurm_prefix }}-{{ item.machine }}"
@@ -109,7 +105,8 @@
     volumes: "{{ slurm_default_mounts + ( item.extra_mounts | default([]) ) }}"
     ports: "{{ item.exposed_ports | default([]) }}"
     networks:
-      - name: "{{ slurm_network }}"
+      - name: "{{ docker_network }}"
+        aliases: "{{ item.aliases | default(omit) }}"
     env:
       slurmuser: "{{ slurm_user }}"
     image: "{{ item.image }}"
@@ -119,14 +116,18 @@
     privileged: "{{ container_privileged | bool }}"
     networks_cli_compatible: True
     interactive: True
-  vars:
+  vars: # see vars/main.yml
     slurm_nodes_all: "{{ slurm_nodes_exec + slurm_nodes_std }}"
   loop: "{{ slurm_nodes_all }}"
   loop_control:
     label: "{{slurm_prefix}}-{{ item.machine }}"
-  tags: [ slurm-config ]
+  tags: slurm-config
-- name: set facts to be used by other modules
+- name: configure host system to integrate into slurm cluster
+  import_tasks: host-config.yml
+  when: slurm_hostsystem_cluster_access | default(False)
+- name: export facts about slurm cluster to be used by other modules
   set_fact:
     slurm:
       user: "{{slurm_user}}"
@@ -134,3 +135,4 @@
       base_image: "slurm:base"
       mounts: "{{slurm_default_mounts}}"
       network: "{{docker_network}}"
+  tags: always


@@ -9,6 +9,8 @@ ControlMachine=slurm-ctl
 AuthType=auth/munge
 #CheckpointType=checkpoint/none
 CryptoType=crypto/munge
+CommunicationParameters=NoAddrCache
+SlurmctldParameters=enable_configless
 #DisableRootJobs=NO
 #EnforcePartLimits=NO
 #Epilog=
@@ -103,8 +105,10 @@ Waittime=0
 #MaxMemPerCPU=0
 #SchedulerTimeSlice=30
 SchedulerType=sched/backfill
-SelectType=select/linear
+# SelectType=select/linear
+SelectType=select/cons_res
 #SelectTypeParameters=
+SelectTypeParameters=CR_CORE
 #
 #
 # JOB PRIORITY
@@ -163,8 +167,27 @@ SlurmSchedLogFile={{slurm_log_path_sched}}
 #
 #
 # COMPUTE NODES
-NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN
-{% for i in alloc_nodes -%}
-NodeName={{i}} State=UNKNOWN
+NodeName=slurm-exec[1-{{ num_nodes }}] CPUs={{ slurm_exec_node_cores }} {{''
+  }} RealMemory={{ slurm_exec_node_mem }} {{''
+  }} CoresPerSocket={{ slurm_exec_node_cores }} State=UNKNOWN
+{% for p in partitions | default([]) %}
+NodeName={{ p.nodeprefix }}[1-{{ p.num_nodes }}] CPUs={{ p.node_cores }} {{''
+  }} RealMemory={{ p.node_mem }} {{''
+  }} CoresPerSocket={{ p.node_cores }} {{''
+  }} {%- if p.port is defined %} Port={{ p.port }} {% endif %}{{''
+  }} State={{ p.initstate | default('UNKNOWN') }}
+{% endfor %}
+{% for i in alloc_nodes -%}
+NodeName={{i.name}}
+{%- if i.hostname is defined %} NodeHostname={{i.hostname}} {% endif %}
+{%- if i.addr is defined %} NodeAddr={{i.addr}} {% endif %}
+State=UNKNOWN
+{% endfor %}
+PartitionName=container Nodes=slurm-exec[1-{{num_nodes}}] {{ ''
+  }} AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} {{ ''
+  }} Default=YES MaxTime=INFINITE State=UP
+{% for p in partitions | default([]) %}
+PartitionName={{ p.name }} Nodes={{ p.nodeprefix }}[1-{{ p.num_nodes }}] {{ ''
+  }} AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} {{ ''
+  }} MaxTime=INFINITE State=UP
 {% endfor %}
-PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes={{alloc_nodes | join(',')}} Default=YES MaxTime=INFINITE State=UP

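With num_nodes=10 and the partition list set in roles/slurm/tasks/main.yml, the template above renders roughly to the following (an illustration; whitespace simplified, and the allocation host names are examples drawn from the defaults):

    NodeName=slurm-exec[1-10] CPUs=3 RealMemory=5000 CoresPerSocket=3 State=UNKNOWN
    NodeName=drone[1-10] CPUs=3 RealMemory=4900 CoresPerSocket=3 Port=16818 State=FUTURE
    PartitionName=container Nodes=slurm-exec[1-10] AllocNodes=slurm-submit1,<docker host>,cobald Default=YES MaxTime=INFINITE State=UP
    PartitionName=cobald Nodes=drone[1-10] AllocNodes=slurm-submit1,<docker host>,cobald MaxTime=INFINITE State=UP
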
roles/slurm/vars/main.yml (new file)

@@ -0,0 +1,21 @@
slurm_nodes_std: # default nodes: controller and submit machine
  - machine: ctl
    image: slurm:slurmctld
    exposed_ports: [ "6817:6817/tcp" ]
  - machine: submit1
    image: slurm:slurmd
    extra_mounts:
      - "/home/{{unpriv_user}}/job3/:/mnt/:rw"
slurm_nodes_exec: | # extend range to execute nodes list
  {% set slurm_nodes_exec = [] %}
  {% for i in range(1, num_nodes+1) -%}
  {% set _ = slurm_nodes_exec.extend([
    {'machine':'exec%s'|format(i), 'image': 'slurm:slurmd',
     'aliases':['drone%s'|format(i)]}]) -%}
  {%- endfor %}
  {{ slurm_nodes_exec }}
slurm_default_mounts:
  - /container/volumes/slurm/:/etc/slurm/:rw
  - /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
  - slurm-shared:/shared/:rw

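For num_nodes=2, the slurm_nodes_exec template above evaluates to the following list (illustration); the droneN aliases let the exec containers answer to the drone node names on the docker network:

    [{'machine': 'exec1', 'image': 'slurm:slurmd', 'aliases': ['drone1']},
     {'machine': 'exec2', 'image': 'slurm:slurmd', 'aliases': ['drone2']}]
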

@@ -1,40 +0,0 @@
- name: create directories for docker image build
  file:
    path: "/container/docker-images/slurm-{{image.name}}/{{item}}"
    state: directory
    owner: "{{unpriv_user}}"
    group: docker
  loop: "{{ [''] + (image.files | map(attribute='dest') | map('dirname') |
    unique | select | list) }}"
- name: copy Dockerfile
  copy:
    src: "{{image.name}}.Dockerfile"
    dest: "/container/docker-images/slurm-{{image.name}}/Dockerfile"
    owner: "{{unpriv_user}}"
    group: docker
  register: slurm_cp_dockerfile
- name: copy requisite files
  copy:
    content: "{{ item.content }}"
    dest: "/container/docker-images/slurm-{{image.name}}/{{item.dest}}"
    owner: root
    group: root
    mode: u=rwx,g=rx,o=rx
  loop: "{{ image.files | default([]) }}"
  loop_control:
    label: "{{ item.dest }}"
  register: slurm_cp_files
- docker_image:
    name: "slurm"
    tag: "{{image.name}}"
#    pull: False
    build:
      pull: False
      path: "/container/docker-images/slurm-{{image.name}}/"
    source: build
    force_source: "{{slurm_cp_dockerfile.changed or
      slurm_cp_files.changed or
      slurm_baseimg_build_chg }}"


@@ -1,46 +0,0 @@
- file:
    path: "/container/docker-images/slurm-base/start-scripts"
    state: directory
    owner: "{{unpriv_user}}"
    group: docker
# - name: copy Dockerfile, entrypoint, docker-init and munge startup
- name: copy slurm base image requisite files
  copy: # FIXME: swap out
    src: "{{item.file}}"
    dest: "/container/docker-images/slurm-base/{{item.file}}"
    owner: "{{unpriv_user}}"
    group: docker
    mode: "{{ item.perms | default('u=rwx,g=rx,o=rx') }}"
  loop:
    - file: slurm-base.Dockerfile
      perms: u=rw,g=r,o=r
    - file: entrypoint.sh
    - file: entry-munge.sh
    - file: docker-init
    - file: start-scripts/10-munge
  when: not (slurm_baseimg_build_chg | default(False))
  register: slurm_baseimg_copy
- name: build base image
  docker_image:
    name: "slurm"
    tag: "base"
#    pull: False
    build:
      pull: False
      path: "/container/docker-images/slurm-base/"
      dockerfile: "slurm-base.Dockerfile"
#      target: "{{image.name}}" # unsupported on old docker-py version as in el7
    source: build
    force_source: "{{slurm_baseimg_copy.changed}}"
  # when: run only once but keep changed state
  when: not (slurm_baseimg_build_chg | default(False))
  register: slurm_baseimg_build
- set_fact:
    slurm_baseimg_build_chg:
      "{{(slurm_baseimg_build_chg | default(False)) or
      slurm_baseimg_build.changed}}"
- include_tasks: dockerimage.yml