Compare commits
6 Commits
8bc2f717e0
...
cobaldtard
Author | SHA1 | Date | |
---|---|---|---|
4c63f2a825
|
|||
51390bb321
|
|||
52022a3013
|
|||
574d2fcb4e
|
|||
2919c98d5f
|
|||
f73fef1473
|
@@ -43,6 +43,7 @@
|
|||||||
# containers runtime name (not hostname supplied!) and netname is
|
# containers runtime name (not hostname supplied!) and netname is
|
||||||
# the network name in host environment. We should run our own dns...
|
# the network name in host environment. We should run our own dns...
|
||||||
docker_network: slurm
|
docker_network: slurm
|
||||||
|
slurm_hostsystem_cluster_access: True
|
||||||
when: '"slurm" in group_names'
|
when: '"slurm" in group_names'
|
||||||
tags: slurm, cobald, influxdb, slurm-config
|
tags: slurm, cobald, influxdb, slurm-config
|
||||||
# tags: cobald requires some slurm facts, so cobald tag is included here
|
# tags: cobald requires some slurm facts, so cobald tag is included here
|
||||||
|
11
roles/cobald/files/cgroup.conf.noautomount
Normal file
11
roles/cobald/files/cgroup.conf.noautomount
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
###
|
||||||
|
#
|
||||||
|
# Slurm cgroup support configuration file
|
||||||
|
#
|
||||||
|
# See man slurm.conf and man cgroup.conf for further
|
||||||
|
# information on cgroup configuration parameters
|
||||||
|
#--
|
||||||
|
CgroupAutomount=no
|
||||||
|
|
||||||
|
ConstrainCores=no
|
||||||
|
ConstrainRAMSpace=no
|
31
roles/cobald/files/cobald-config/config.yaml
Normal file
31
roles/cobald/files/cobald-config/config.yaml
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
---
|
||||||
|
pipeline:
|
||||||
|
- __type__: cobald.controller.linear.LinearController
|
||||||
|
low_utilisation: 0.9
|
||||||
|
high_allocation: 0.9
|
||||||
|
rate: 0.10
|
||||||
|
- !Limiter
|
||||||
|
minimum: 3
|
||||||
|
- !TelegrafPipelineMonitor
|
||||||
|
poll: True
|
||||||
|
- !TardisPoolFactory
|
||||||
|
configuration: /etc/cobald/tardis.yaml
|
||||||
|
logging:
|
||||||
|
version: 1
|
||||||
|
root:
|
||||||
|
level: DEBUG
|
||||||
|
handlers: [console, file]
|
||||||
|
handlers:
|
||||||
|
console:
|
||||||
|
class: logging.StreamHandler
|
||||||
|
formatter: test
|
||||||
|
level: DEBUG
|
||||||
|
stream: ext://sys.stderr
|
||||||
|
file:
|
||||||
|
class: logging.handlers.RotatingFileHandler
|
||||||
|
formatter: test
|
||||||
|
level: WARNING
|
||||||
|
filename: /var/log/cobald/cobald-tardis.log
|
||||||
|
formatters:
|
||||||
|
test:
|
||||||
|
format: " %(name)s %(message)s"
|
41
roles/cobald/files/cobald-config/tardis.yaml
Normal file
41
roles/cobald/files/cobald-config/tardis.yaml
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
Plugins:
|
||||||
|
SqliteRegistry:
|
||||||
|
db_file: /tmp/drone_registry.db
|
||||||
|
TelegrafMonitoring:
|
||||||
|
host: ed-telegraf
|
||||||
|
port: 8094
|
||||||
|
#BatchSystem:
|
||||||
|
# adapter: FakeBatchSystem
|
||||||
|
# allocation: 1.0
|
||||||
|
# utilisation: !PeriodicValue
|
||||||
|
# period: 60
|
||||||
|
# amplitude: 0.15
|
||||||
|
# offset: 0.80
|
||||||
|
## phase: 1.
|
||||||
|
# phase: 1.6
|
||||||
|
# machine_status: Available
|
||||||
|
BatchSystem:
|
||||||
|
adapter: Slurm
|
||||||
|
max_age: 0.1
|
||||||
|
options:
|
||||||
|
partition: cobald
|
||||||
|
Sites:
|
||||||
|
- name: slurmtest
|
||||||
|
adapter: Slurm
|
||||||
|
quota: 20
|
||||||
|
slurmtest:
|
||||||
|
# executor: ...
|
||||||
|
StatusUpdate: 0.1
|
||||||
|
MachineTypes:
|
||||||
|
- m1.a
|
||||||
|
MachineTypeConfiguration:
|
||||||
|
m1.a:
|
||||||
|
Walltime: 5
|
||||||
|
Partition: container
|
||||||
|
StartupCommand: /usr/local/bin/start-drone
|
||||||
|
# SubmitOptions: ...
|
||||||
|
MachineMetaData:
|
||||||
|
m1.a:
|
||||||
|
Cores: 3 # cores
|
||||||
|
Memory: 1 # GB
|
||||||
|
Disk: 4 # not passed
|
@@ -4,6 +4,7 @@ From: slurm:slurmd
|
|||||||
%files
|
%files
|
||||||
31-slurmd-configless /etc/docker-init.d/31-slurm-configless
|
31-slurmd-configless /etc/docker-init.d/31-slurm-configless
|
||||||
/container/volumes/munge/munge.key /etc/munge/munge.key
|
/container/volumes/munge/munge.key /etc/munge/munge.key
|
||||||
|
cgroup.conf.noautomount /etc/slurm/cgroup.conf
|
||||||
|
|
||||||
%post
|
%post
|
||||||
rm /etc/docker-init.d/30-slurmd
|
rm /etc/docker-init.d/30-slurmd
|
||||||
|
@@ -4,22 +4,56 @@ export
|
|||||||
echo $@
|
echo $@
|
||||||
nodename=$(hostname | awk '{ print "drone" substr($1,match($1, "([[:digit:]]+)")) }')
|
nodename=$(hostname | awk '{ print "drone" substr($1,match($1, "([[:digit:]]+)")) }')
|
||||||
|
|
||||||
|
SHUTDOWN_DONE=0
|
||||||
|
|
||||||
function handler_quit(){
|
function handler_quit(){
|
||||||
|
[ $SHUTDOWN_DONE -ne 0 ] && return
|
||||||
|
set -x
|
||||||
|
echo "drain container"
|
||||||
|
scontrol update NodeName=${nodename} State=DRAIN Reason="cobald node quit"
|
||||||
|
shutdown_jobs=$(squeue -w ${nodename} --noheader -O jobid)
|
||||||
|
[ -n "${shutdown_jobs}" ] && scancel ${shutdown_jobs}
|
||||||
|
#scancel -w ${nodename}
|
||||||
|
i=$(( $(scontrol show config | grep KillWait | \
|
||||||
|
sed 's/^KillWait.*= \([0-9]*\) sec/\1/') - 2 ))
|
||||||
|
# Wait for the node's jobs to finish, but give up once the KillWait-derived
# countdown in ${i} is used up so the hard kill below is always reached.
while [ -n "$(squeue -w ${nodename} --noheader -O jobid)" ] && [ "${i}" -ge 1 ]
|
||||||
|
do
|
||||||
|
i=$(( ${i} - 1 ))
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
scancel -s KILL -w ${nodename} # hard kill all remaining jobs
|
||||||
echo "shutdown container"
|
echo "shutdown container"
|
||||||
|
scontrol update NodeName=${nodename} State=DOWN Reason=shutdown
|
||||||
singularity instance stop slurm-drone
|
singularity instance stop slurm-drone
|
||||||
scontrol update NodeName=${nodename} State=FUTURE
|
scontrol update NodeName=${nodename} State=FUTURE
|
||||||
|
umount /inner-cgroup/freezer
|
||||||
|
umount /inner-cgroup
|
||||||
|
SHUTDOWN_DONE=1
|
||||||
exit 0
|
exit 0
|
||||||
}
|
}
|
||||||
|
|
||||||
trap handler_quit EXIT
|
|
||||||
|
|
||||||
# set -x
|
# set -x
|
||||||
|
|
||||||
|
trap handler_quit EXIT
|
||||||
|
|
||||||
|
echo "mounting cgroups"
|
||||||
|
mkdir /inner-cgroup
|
||||||
|
mount -t tmpfs none /inner-cgroup
|
||||||
|
mkdir /inner-cgroup/freezer/
|
||||||
|
mount --bind /sys/fs/cgroup/freezer/slurm/ /inner-cgroup/freezer/
|
||||||
|
mount -o remount,ro /inner-cgroup
|
||||||
|
|
||||||
echo "starting ${nodename}"
|
echo "starting ${nodename}"
|
||||||
singularity instance start --writable-tmpfs /shared/slurmd.sif slurm-drone \
|
scontrol update NodeName=${nodename} State=RESUME # revoke last DRAIN
|
||||||
|
scontrol update NodeName=${nodename} State=FUTURE
|
||||||
|
singularity instance start \
|
||||||
|
-B /inner-cgroup/:/sys/fs/cgroup/ \
|
||||||
|
--writable-tmpfs /shared/slurmd.sif slurm-drone \
|
||||||
slurm-ctl ${nodename}
|
slurm-ctl ${nodename}
|
||||||
|
# scontrol update NodeName=${nodename} NodeHostname=${SLURM_JOB_ID}
|
||||||
|
scontrol update NodeName=${nodename} NodeHostname=${TardisDroneUuid}
|
||||||
if [ $? -eq 0 ] ; then
|
if [ $? -eq 0 ] ; then
|
||||||
echo "container started, sleeping $(( 60 * ${SLURM_Walltime}))"
|
echo "container started, sleeping $(( 60 * ${SLURM_Walltime} - 2 ))"
|
||||||
sleep $(( 60 * ${SLURM_Walltime} ))
|
sleep $(( 60 * ${SLURM_Walltime} - 2 ))
|
||||||
fi
|
fi
|
||||||
handler_quit
|
handler_quit
|
||||||
|
@@ -10,15 +10,16 @@
|
|||||||
slurm_image_prefix: cobald
|
slurm_image_prefix: cobald
|
||||||
image_name: "{{cobald_image_tag}}"
|
image_name: "{{cobald_image_tag}}"
|
||||||
dockerfile: "{{ lookup('template', 'cobald.Dockerfile') }}"
|
dockerfile: "{{ lookup('template', 'cobald.Dockerfile') }}"
|
||||||
files:
|
files_list:
|
||||||
- dest: cobald-entrypoint.sh
|
- cobald-entrypoint.sh
|
||||||
content: "{{ lookup('file', 'cobald-entrypoint.sh') }}"
|
- init-cobaldmodules.sh
|
||||||
- dest: init-cobaldmodules.sh
|
- start-drone
|
||||||
content: "{{ lookup('file', 'init-cobaldmodules.sh') }}"
|
- 28-sync-container-slurmd
|
||||||
- dest: start-drone
|
files: "
|
||||||
content: "{{ lookup('file', 'start-drone') }}"
|
{%- set files = [] -%} {%- for i in files_list -%}
|
||||||
- dest: 28-sync-container-slurmd
|
{%- set files = files.append(
|
||||||
content: "{{ lookup('file', '28-sync-container-slurmd') }}"
|
{ 'dest': i, 'content': lookup('file', i) }) -%}
|
||||||
|
{%- endfor %}{{ files }}"
|
||||||
when: cobald_slurm | default(False)
|
when: cobald_slurm | default(False)
|
||||||
|
|
||||||
- name: build generic cobald docker image
|
- name: build generic cobald docker image
|
||||||
@@ -32,13 +33,14 @@
|
|||||||
owner: "{{unpriv_user}}"
|
owner: "{{unpriv_user}}"
|
||||||
group: docker
|
group: docker
|
||||||
|
|
||||||
- name: copy cobald config (does nothing yet)
|
- name: copy cobald config
|
||||||
copy:
|
copy:
|
||||||
src: cobald-config/
|
src: cobald-config/
|
||||||
dest: /container/volumes/cobald
|
dest: "~{{unpriv_user}}/cobald/"
|
||||||
|
force: False
|
||||||
owner: "{{unpriv_user}}"
|
owner: "{{unpriv_user}}"
|
||||||
group: docker
|
group: docker
|
||||||
when: False
|
mode: "0644"
|
||||||
|
|
||||||
- name: ensure network for cobald container exists
|
- name: ensure network for cobald container exists
|
||||||
docker_network:
|
docker_network:
|
||||||
@@ -68,7 +70,7 @@
|
|||||||
repo: https://github.com/thoto/cobald
|
repo: https://github.com/thoto/cobald
|
||||||
dest: "~{{unpriv_user}}/cobald-src"
|
dest: "~{{unpriv_user}}/cobald-src"
|
||||||
version: bugfix/mixed_construction_methods
|
version: bugfix/mixed_construction_methods
|
||||||
update: no
|
update: no # FIXME
|
||||||
become: yes
|
become: yes
|
||||||
become_user: "{{unpriv_user}}"
|
become_user: "{{unpriv_user}}"
|
||||||
register: cobald_git_pull
|
register: cobald_git_pull
|
||||||
@@ -78,6 +80,7 @@
|
|||||||
repo: https://github.com/MatterMiners/tardis
|
repo: https://github.com/MatterMiners/tardis
|
||||||
dest: "~{{unpriv_user}}/tardis-src"
|
dest: "~{{unpriv_user}}/tardis-src"
|
||||||
version: master
|
version: master
|
||||||
|
update: no # FIXME
|
||||||
become: yes
|
become: yes
|
||||||
become_user: "{{unpriv_user}}"
|
become_user: "{{unpriv_user}}"
|
||||||
register: tardis_git_pull
|
register: tardis_git_pull
|
||||||
|
@@ -23,6 +23,7 @@
|
|||||||
loop:
|
loop:
|
||||||
- slurm-slurmd.def
|
- slurm-slurmd.def
|
||||||
- 31-slurmd-configless
|
- 31-slurmd-configless
|
||||||
|
- cgroup.conf.noautomount
|
||||||
register: cobald_copy_sing_files
|
register: cobald_copy_sing_files
|
||||||
|
|
||||||
- name: remove old container
|
- name: remove old container
|
||||||
|
@@ -1,3 +1,6 @@
|
|||||||
|
# TODO: this does not work quite right since slurm-ctl does not reach the host
|
||||||
|
# system. sinfo, scontrol etc. work but srun does not!
|
||||||
|
|
||||||
- name: "get addresses from docker network"
|
- name: "get addresses from docker network"
|
||||||
docker_network_info:
|
docker_network_info:
|
||||||
name: "{{ docker_network }}"
|
name: "{{ docker_network }}"
|
||||||
@@ -28,4 +31,5 @@
|
|||||||
- name: start munge locally
|
- name: start munge locally
|
||||||
service:
|
service:
|
||||||
name: munge
|
name: munge
|
||||||
|
enabled: True
|
||||||
state: started
|
state: started
|
||||||
|
@@ -92,18 +92,11 @@
|
|||||||
notify: reconfigure slurm
|
notify: reconfigure slurm
|
||||||
tags: slurm-config
|
tags: slurm-config
|
||||||
|
|
||||||
- import_tasks: host-config.yml
|
|
||||||
when: slurm_hostsystem_cluster_access | default(False)
|
|
||||||
|
|
||||||
- name: "create docker volume for shared access between nodes"
|
- name: "create docker volume for shared access between nodes"
|
||||||
docker_volume:
|
docker_volume:
|
||||||
name: slurm-shared
|
name: slurm-shared
|
||||||
state: present
|
state: present
|
||||||
|
|
||||||
# TODO: reserve some address using docker_network_info and assign as aux
|
|
||||||
# address to enable slurmctld to get a static address in order to be
|
|
||||||
# reachable from slurm running on docker host to enable submitting jobs.
|
|
||||||
|
|
||||||
- name: run slurm docker containers
|
- name: run slurm docker containers
|
||||||
docker_container:
|
docker_container:
|
||||||
name: "{{ slurm_prefix }}-{{ item.machine }}"
|
name: "{{ slurm_prefix }}-{{ item.machine }}"
|
||||||
@@ -130,6 +123,10 @@
|
|||||||
label: "{{slurm_prefix}}-{{ item.machine }}"
|
label: "{{slurm_prefix}}-{{ item.machine }}"
|
||||||
tags: slurm-config
|
tags: slurm-config
|
||||||
|
|
||||||
|
- name: configure host system to integrate into slurm cluster
|
||||||
|
import_tasks: host-config.yml
|
||||||
|
when: slurm_hostsystem_cluster_access | default(False)
|
||||||
|
|
||||||
- name: export facts about slurm cluster to be used by other modules
|
- name: export facts about slurm cluster to be used by other modules
|
||||||
set_fact:
|
set_fact:
|
||||||
slurm:
|
slurm:
|
||||||
|
@@ -105,8 +105,10 @@ Waittime=0
|
|||||||
#MaxMemPerCPU=0
|
#MaxMemPerCPU=0
|
||||||
#SchedulerTimeSlice=30
|
#SchedulerTimeSlice=30
|
||||||
SchedulerType=sched/backfill
|
SchedulerType=sched/backfill
|
||||||
SelectType=select/linear
|
# SelectType=select/linear
|
||||||
|
SelectType=select/cons_res
|
||||||
#SelectTypeParameters=
|
#SelectTypeParameters=
|
||||||
|
SelectTypeParameters=CR_CORE
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# JOB PRIORITY
|
# JOB PRIORITY
|
||||||
|
Reference in New Issue
Block a user