Compare commits

...

6 Commits

SHA1 Message Date
4c63f2a825 fix: slurm host system access 2021-07-06 15:14:33 +02:00
51390bb321 improved file loading 2021-07-06 14:54:18 +02:00
52022a3013 WIP: cobald tardis config 2021-07-05 18:52:41 +02:00
574d2fcb4e drone shutdown 2021-07-05 18:25:26 +02:00
2919c98d5f cons res 2021-07-05 18:24:24 +02:00
f73fef1473 cgroups inside singularity container 2021-07-02 00:47:32 +02:00
11 changed files with 152 additions and 26 deletions

View File

@@ -43,6 +43,7 @@
 # containers runtime name (not hostname supplied!) and netname is
 # the network name in host environment. We should run our own dns...
 docker_network: slurm
+slurm_hostsystem_cluster_access: True
 when: '"slurm" in group_names'
 tags: slurm, cobald, influxdb, slurm-config
 # tags: cobald requires some slurm facts, so cobald tag is included here

View File

@@ -0,0 +1,11 @@
###
#
# Slurm cgroup support configuration file
#
# See man slurm.conf and man cgroup.conf for further
# information on cgroup configuration parameters
#--
CgroupAutomount=no
ConstrainCores=no
ConstrainRAMSpace=no

View File

@@ -0,0 +1,31 @@
---
pipeline:
- __type__: cobald.controller.linear.LinearController
low_utilisation: 0.9
high_allocation: 0.9
rate: 0.10
- !Limiter
minimum: 3
- !TelegrafPipelineMonitor
poll: True
- !TardisPoolFactory
configuration: /etc/cobald/tardis.yaml
logging:
version: 1
root:
level: DEBUG
handlers: [console, file]
handlers:
console:
class: logging.StreamHandler
formatter: test
level: DEBUG
stream: ext://sys.stderr
file:
class: logging.handlers.RotatingFileHandler
formatter: test
level: WARNING
filename: /var/log/cobald/cobald-tardis.log
formatters:
test:
format: " %(name)s %(message)s"

View File

@@ -0,0 +1,41 @@
Plugins:
SqliteRegistry:
db_file: /tmp/drone_registry.db
TelegrafMonitoring:
host: ed-telegraf
port: 8094
#BatchSystem:
# adapter: FakeBatchSystem
# allocation: 1.0
# utilisation: !PeriodicValue
# period: 60
# amplitude: 0.15
# offset: 0.80
## phase: 1.
# phase: 1.6
# machine_status: Available
BatchSystem:
adapter: Slurm
max_age: 0.1
options:
partition: cobald
Sites:
- name: slurmtest
adapter: Slurm
quota: 20
slurmtest:
# executor: ...
StatusUpdate: 0.1
MachineTypes:
- m1.a
MachineTypeConfiguration:
m1.a:
Walltime: 5
Partition: container
StartupCommand: /usr/local/bin/start-drone
# SubmitOptions: ...
MachineMetaData:
m1.a:
Cores: 3 # cores
Memory: 1 # GB
Disk: 4 # not passed

View File

@@ -4,6 +4,7 @@ From: slurm:slurmd
 %files
 31-slurmd-configless /etc/docker-init.d/31-slurm-configless
 /container/volumes/munge/munge.key /etc/munge/munge.key
+cgroup.conf.noautomount /etc/slurm/cgroup.conf
 %post
 rm /etc/docker-init.d/30-slurmd

View File

@@ -4,22 +4,56 @@ export
 echo $@
 nodename=$(hostname | awk '{ print "drone" substr($1,match($1, "([[:digit:]]+)")) }')
+SHUTDOWN_DONE=0
 function handler_quit(){
+[ $SHUTDOWN_DONE -ne 0 ] && return
+set -x
+echo "drain container"
+scontrol update NodeName=${nodename} State=DRAIN Reason="cobald node quit"
+shutdown_jobs=$(squeue -w ${nodename} --noheader -O jobid)
+[ -n "${shutdown_jobs}" ] && scancel ${shutdown_jobs}
+#scancel -w ${nodename}
+i=$(( $(scontrol show config | grep KillWait | \
+sed 's/^KillWait.*= \([0-9]*\) sec/\1/') - 2 ))
+while [ -n "$(squeue -w ${nodename} --noheader -O jobid)" -o ${i} -lt 1 ]
+do
+i=$(( ${i} - 1 ))
+sleep 1
+done
+scancel -s KILL -w ${nodename} # hard kill all remaining jobs
 echo "shutdown container"
+scontrol update NodeName=${nodename} State=DOWN Reason=shutdown
 singularity instance stop slurm-drone
 scontrol update NodeName=${nodename} State=FUTURE
+umount /inner-cgroup/freezer
+umount /inner-cgroup
+SHUTDOWN_DONE=1
 exit 0
 }
-trap handler_quit EXIT
 # set -x
+trap handler_quit EXIT
+echo "mounting cgroups"
+mkdir /inner-cgroup
+mount -t tmpfs none /inner-cgroup
+mkdir /inner-cgroup/freezer/
+mount --bind /sys/fs/cgroup/freezer/slurm/ /inner-cgroup/freezer/
+mount -o remount,ro /inner-cgroup
 echo "starting ${nodename}"
-singularity instance start --writable-tmpfs /shared/slurmd.sif slurm-drone \
+scontrol update NodeName=${nodename} State=RESUME # revoke last DRAIN
+scontrol update NodeName=${nodename} State=FUTURE
+singularity instance start \
+-B /inner-cgroup/:/sys/fs/cgroup/ \
+--writable-tmpfs /shared/slurmd.sif slurm-drone \
 slurm-ctl ${nodename}
+# scontrol update NodeName=${nodename} NodeHostname=${SLURM_JOB_ID}
+scontrol update NodeName=${nodename} NodeHostname=${TardisDroneUuid}
 if [ $? -eq 0 ] ; then
-echo "container started, sleeping $(( 60 * ${SLURM_Walltime}))"
+echo "container started, sleeping $(( 60 * ${SLURM_Walltime} - 2 ))"
-sleep $(( 60 * ${SLURM_Walltime} ))
+sleep $(( 60 * ${SLURM_Walltime} - 2 ))
 fi
 handler_quit
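
The timing in this script is tied to the Slurm limits: the drone sleeps two seconds short of its wall time so handler_quit can drain and stop the container before Slurm kills the outer job, and the drain loop waits roughly KillWait seconds for cancelled jobs. A small Python illustration of that arithmetic, assuming Walltime is given in minutes and KillWait uses Slurm's 30 s default:

# Illustration only: timing budget of the start-drone script above.
def drone_sleep_seconds(walltime_minutes):
    # sleep $(( 60 * SLURM_Walltime - 2 )): leave 2 s for the shutdown handler
    return 60 * walltime_minutes - 2

def drain_wait_seconds(kill_wait_seconds=30):
    # i = KillWait - 2: how long handler_quit polls squeue before scancel -s KILL
    return kill_wait_seconds - 2

print(drone_sleep_seconds(5))  # 298, matching MachineTypeConfiguration Walltime: 5
print(drain_wait_seconds())    # 28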

View File

@@ -10,15 +10,16 @@
 slurm_image_prefix: cobald
 image_name: "{{cobald_image_tag}}"
 dockerfile: "{{ lookup('template', 'cobald.Dockerfile') }}"
-files:
-- dest: cobald-entrypoint.sh
-content: "{{ lookup('file', 'cobald-entrypoint.sh') }}"
-- dest: init-cobaldmodules.sh
-content: "{{ lookup('file', 'init-cobaldmodules.sh') }}"
-- dest: start-drone
-content: "{{ lookup('file', 'start-drone') }}"
-- dest: 28-sync-container-slurmd
-content: "{{ lookup('file', '28-sync-container-slurmd') }}"
+files_list:
+- cobald-entrypoint.sh
+- init-cobaldmodules.sh
+- start-drone
+- 28-sync-container-slurmd
+files: "
+{%- set files = [] -%} {%- for i in files_list -%}
+{%- set files = files.append(
+{ 'dest': i, 'content': lookup('file', i) }) -%}
+{%- endfor %}{{ files }}"
 when: cobald_slurm | default(False)
 - name: build generic cobald docker image
@@ -32,13 +33,14 @@
owner: "{{unpriv_user}}" owner: "{{unpriv_user}}"
group: docker group: docker
- name: copy cobald config (does nothing yet) - name: copy cobald config
copy: copy:
src: cobald-config/ src: cobald-config/
dest: /container/volumes/cobald dest: "~{{unpriv_user}}/cobald/"
force: False
owner: "{{unpriv_user}}" owner: "{{unpriv_user}}"
group: docker group: docker
when: False mode: "0644"
- name: ensure network for cobald container exists - name: ensure network for cobald container exists
docker_network: docker_network:
@@ -68,7 +70,7 @@
 repo: https://github.com/thoto/cobald
 dest: "~{{unpriv_user}}/cobald-src"
 version: bugfix/mixed_construction_methods
-update: no
+update: no # FIXME
 become: yes
 become_user: "{{unpriv_user}}"
 register: cobald_git_pull
@@ -78,6 +80,7 @@
 repo: https://github.com/MatterMiners/tardis
 dest: "~{{unpriv_user}}/tardis-src"
 version: master
+update: no # FIXME
 become: yes
 become_user: "{{unpriv_user}}"
 register: tardis_git_pull
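
The files_list/files rewrite in the first hunk above replaces the hard-coded dest/content pairs with a Jinja loop that builds the same structure. Roughly equivalent Python, shown only to illustrate what the template evaluates to; lookup_file stands in for Ansible's lookup('file', ...):

# Illustration only: the data structure the Jinja loop builds from files_list.
files_list = ["cobald-entrypoint.sh", "init-cobaldmodules.sh",
              "start-drone", "28-sync-container-slurmd"]

def lookup_file(name):
    # stand-in for Ansible's lookup('file', name): return the file's content
    with open(name) as handle:
        return handle.read()

files = [{"dest": name, "content": lookup_file(name)} for name in files_list]
# -> [{'dest': 'cobald-entrypoint.sh', 'content': '...'}, ...]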

View File

@@ -23,6 +23,7 @@
 loop:
 - slurm-slurmd.def
 - 31-slurmd-configless
+- cgroup.conf.noautomount
 register: cobald_copy_sing_files
 - name: remove old container

View File

@@ -1,3 +1,6 @@
+# TODO: this does not work quite right since slurm-ctl does not reach the host
+# system. sinfo, scontrol etc. work but srun does not!
 - name: "get addresses from docker network"
 docker_network_info:
 name: "{{ docker_network }}"
@@ -28,4 +31,5 @@
 - name: start munge locally
 service:
 name: munge
+enabled: True
 state: started

View File

@@ -92,18 +92,11 @@
 notify: reconfigure slurm
 tags: slurm-config
-- import_tasks: host-config.yml
-when: slurm_hostsystem_cluster_access | default(False)
 - name: "create docker volume for shared access between nodes"
 docker_volume:
 name: slurm-shared
 state: present
-# TODO: reserve some address using docker_network_info and assign as aux
-# address to enable slurmctld to get a static address in order to be
-# reachable from slurm running on docker host to enable submitting jobs.
 - name: run slurm docker containers
 docker_container:
 name: "{{ slurm_prefix }}-{{ item.machine }}"
@@ -130,6 +123,10 @@
label: "{{slurm_prefix}}-{{ item.machine }}" label: "{{slurm_prefix}}-{{ item.machine }}"
tags: slurm-config tags: slurm-config
- name: configure host system to integrate into slurm cluster
import_tasks: host-config.yml
when: slurm_hostsystem_cluster_access | default(False)
- name: export facts about slurm cluster to be used by other modules - name: export facts about slurm cluster to be used by other modules
set_fact: set_fact:
slurm: slurm:

View File

@@ -105,8 +105,10 @@ Waittime=0
 #MaxMemPerCPU=0
 #SchedulerTimeSlice=30
 SchedulerType=sched/backfill
-SelectType=select/linear
+# SelectType=select/linear
+SelectType=select/cons_res
 #SelectTypeParameters=
+SelectTypeParameters=CR_CORE
 #
 #
 # JOB PRIORITY
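
Switching from select/linear to select/cons_res with CR_CORE makes individual cores the schedulable unit, so several jobs can share one drone instead of each job blocking a whole node. A tiny worked example, assuming the 3-core m1.a drones from MachineMetaData above:

# Illustration only: how many 1-core jobs fit on one 3-core drone.
drone_cores = 3   # MachineMetaData m1.a: Cores
job_cores = 1

jobs_per_drone_linear = 1                            # select/linear: whole node per job
jobs_per_drone_cons_res = drone_cores // job_cores   # select/cons_res + CR_CORE: per core
print(jobs_per_drone_linear, jobs_per_drone_cons_res)  # 1 3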