Compare commits


4 Commits

SHA1        Message                          Date
7e767c3716  memory / cpu ressources fix      2021-06-25 12:19:00 +02:00
a61d08d118  restructured playbooks, cleanup  2021-06-25 01:55:14 +02:00
188a9215a9  tags #2                          2021-06-24 16:37:46 +02:00
9499ce49ae  fix: wrong network               2021-06-24 16:37:10 +02:00
12 changed files with 137 additions and 164 deletions

base.yml (new file, 19 lines)

@@ -0,0 +1,19 @@
---
- hosts: all
  tasks:
    - name: "install epel repo" # for htop etc.
      yum:
        name: epel-release
        state: present
    - name: "install tools"
      yum:
        name: [ vim-enhanced, htop, screen, bind-utils, nmap-ncat, net-tools ]
        state: present
    - name: "install ssh-key"
      authorized_key:
        user: "{{cfg_unpriv_user}}"
        key: "{{cfg_ssh_key}}"
        state: present
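
base.yml takes the target user and key from inventory variables: cfg_unpriv_user is added to the inventory further down in this changeset, while cfg_ssh_key is not shown here and has to come from elsewhere. A minimal sketch of what the playbook expects, assuming a group_vars file and a local public key (both are assumptions, not part of this changeset):

# group_vars/all.yml -- sketch only, not part of this changeset
cfg_unpriv_user: thoto
cfg_ssh_key: "{{ lookup('file', '~/.ssh/id_ed25519.pub') }}"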

cobald.yml (new file, 60 lines)

@@ -0,0 +1,60 @@
---
- hosts: slurm, cobald
  vars:
    container_privileged: True
    slurm_num_nodes: 10
  tasks:
    - name: "setup docker"
      import_role: name=docker
      tags: docker
    - name: "get facts from existing cobald instance (i.e. hostname)"
      include_role:
        name: cobald
        tasks_from: facts
        apply:
          tags: slurm, cobald, slurm-config
      tags: slurm, cobald, slurm-config
      vars:
        container_name: cobald
    - name: "setup slurm test environment in docker containers"
      include_role:
        name: slurm
        apply:
          tags: slurm
      vars:
        slurm_user: slurm # or root
        slurm_user_accounts:
          - name: cobald
            dir: /var/lib/cobald
        num_nodes: "{{slurm_num_nodes}}"
        extra_nodes:
          - name: cobald
            hostname: "{{cobald_container_hostname}}" # from cobald/facts.yml above
            # hostname is used as NodeHostname, which is used by slurm's
            # "networking code" (https://bugs.schedmd.com/show_bug.cgi?id=8615).
            # It works either way around, but one of NodeName or NodeHostname has
            # to match the container name (-n flag, not --hostname), because when
            # tasks are submitted to the slurm controller, it checks access
            # permissions against a reverse lookup of the submitting IP address.
            # Docker always and unconfigurably resolves the container IP in any
            # network to containername.netname, where containername is the
            # container's runtime name (not the supplied hostname!) and netname
            # is the network name in the host environment. We should run our own DNS...
        docker_network: slurm
      when: '"slurm" in group_names'
      tags: slurm, cobald, influxdb, slurm-config
      # the cobald tag requires some slurm facts, so it is included here as well
    - name: "install cobald"
      include_role:
        name: cobald
        apply:
          tags: cobald
      vars:
        cobald_slurm: True
        container_name: cobald
        # docker_network: slurm # overridden by vars/slurm.yml
      when: '"cobald" in group_names'
      tags: cobald, influxdb
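
The long comment on extra_nodes above claims that slurmctld authorises submit hosts via a reverse lookup of the submitting IP, and that Docker answers such lookups with containername.netname. A rough way to check this against the running setup, sketched as ad-hoc tasks run on the docker host (the container name cobald and network name slurm come from the play above; everything else is an assumption, not part of this changeset):

# Sketch only: what does a reverse lookup of the cobald container's IP
# return inside the "slurm" docker network? Per the comment above it
# should be "cobald.slurm", i.e. runtime name + network name.
- name: "look up the cobald container's address from inside the network"
  command: docker exec cobald getent hosts cobald
  register: cobald_fwd
  changed_when: False

- name: "reverse-resolve that address the way slurmctld would"
  command: "docker exec cobald getent hosts {{ cobald_fwd.stdout.split() | first }}"
  register: cobald_rev
  changed_when: False

- debug:
    msg: "{{ cobald_rev.stdout }}"  # expected to contain cobald.slurm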

htcondor.yml (new file, 32 lines)

@@ -0,0 +1,32 @@
---
- hosts: htcondor
  tasks:
    - name: "install htcondor repo"
      yum:
        name: https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm
        state: present
      tags: htcondor
    - name: "install htcondor software"
      yum:
        name: htcondor-ce
        state: present
      tags: htcondor
    - name: "remove minicondor configuration"
      yum:
        name: minicondor
        state: absent
      tags: htcondor
    - name: "setup singularity"
      import_tasks: "singularity.yml"
      tags: singularity
    - name: "setup docker"
      import_role: name=docker
      tags: docker
    - name: "setup htcondor test environment in docker containers"
      import_role: name=docker-htcondor
      tags: htcondor-containered, htcondor


@@ -6,12 +6,14 @@ all:
       ssh_args: -o ControlMaster=auto -o ControlPersist=60s
       # ansible_host: 192.168.122.139
       unpriv_user: thoto
+      cfg_unpriv_user: thoto
     ed-c7-2:
       ansible_user: root
       ansible_host: ed-c7-2.virt.uller.thoto.net
       # ansible_host: 192.168.123.60 # + jumphost
       ssh_args: -o ControlMaster=auto -o ControlPersist=60s
       unpriv_user: thoto
+      cfg_unpriv_user: thoto
   children:
     htcondor:
       hosts:

play.yml (119 changed lines)

@@ -1,115 +1,10 @@
 ---
-- hosts: all
-  vars_files:
-    - vars-auth.yml
-    - vars-influx.yml
-  tasks:
-    - name: "install epel repo" # for htop etc.
-      yum:
-        name: epel-release
-        state: present
-    - name: "install tools"
-      yum:
-        name: [ vim-enhanced, htop, screen, bind-utils, nmap-ncat, net-tools ]
-        state: present
-    - name: "install ssh-key"
-      authorized_key:
-        user: thoto
-        key: "{{ssh_key}}"
-        state: present
-- hosts: htcondor
-  pre_tasks:
-    - name: "install htcondor repo"
-      yum:
-        name: https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm
-        state: present
-      tags: htcondor
-    - name: "install htcondor software "
-      yum:
-        name: htcondor-ce
-        state: present
-      tags: htcondor
-    - name: "remove minicondor configuration"
-      yum:
-        name: minicondor
-        state: absent
-      tags: htcondor
-    - name: "setup singularity"
-      import_tasks: "singularity.yml"
-      tags: singularity
-  roles:
-    - name: "setup docker"
-      role: docker
-      tags: docker
-    - name: "setup htcondor test environment in docker containers"
-      role: docker-htcondor
-      tags:
-        - htcondor-containered
-        - htcondor
-- hosts: slurm, cobald
-  vars:
-    container_privileged: True
-    slurm_num_nodes: 10
-  tasks:
-    - name: "setup docker"
-      import_role: name=docker
-      tags: docker
-    - name: "get facts from existing cobald instance (i.e. hostname)"
-      include_role:
-        name: cobald
-        tasks_from: facts
-        apply:
-          tags: slurm, cobald, slurm-config
-      tags: slurm, cobald, slurm-config
-      vars:
-        container_name: cobald
-    - name: "setup slurm test environment in docker containers"
-      include_role:
-        name: slurm
-        apply:
-          tags: slurm
-      vars:
-        slurm_user: slurm # or root
-        slurm_user_accounts:
-          - name: cobald
-            dir: /var/lib/cobald
-        num_nodes: "{{slurm_num_nodes}}"
-        extra_nodes:
-          - name: cobald
-            hostname: "{{cobald_container_hostname}}" # from cobald/facts.yml above
-            # hostname is used as NodeHostname, which is used slurms "networking
-            # code" (https://bugs.schedmd.com/show_bug.cgi?id=8615).
-            # It works either way around, but one of NodeName or NodeHostname has
-            # to match the container name (-n flag, not --hostname) since when
-            # submitting tasks to the slurm controller, it matches access
-            # permissions against a reverse lookup of the submitting ip address.
-            # Docker always and unconfigureably resolves the container ip in any
-            # network to containername.netname, where containername is the
-            # containers runtime name (not hostname supplied!) and netname is
-            # the network name in host environment. We should run our own dns...
-        docker_network: slurm
-      when: '"slurm" in group_names'
-      tags: slurm, slurm-config
-    - name: "install cobald"
-      include_role:
-        name: cobald
-        apply:
-          tags: cobald
-      vars:
-        cobald_slurm: True
-        container_name: cobald
-        # docker_network: slurm # overriden by vars/slurm.yml
-      when: '"cobald" in group_names'
-      tags: cobald, influxdb
+- name: base setup
+  import_playbook: base.yml
+
+- name: setup htcondor
+  import_playbook: htcondor.yml
+  when: '"htcondor" in group_names'
+
+- name: setup slurm and cobald
+  import_playbook: cobald.yml


@@ -14,11 +14,12 @@
 - name: run grafana
   docker_container:
     name: ed-grafana
-    image: grafana/grafana:7.5.7
+    image: docker.io/grafana/grafana:7.5.7
     hostname: ed-grafana
     domainname: cobald.local
     networks:
-      - name: "{{docker_network}}"
+      - name: "{{cobald_docker_network}}"
+    networks_cli_compatible: True
     published_ports:
       - "3000:3000"
     state: started
@@ -42,7 +43,6 @@
       version: Flux
     additional_secure_json_data:
       token: "{{influx_grafana_token.token}}"
-  register: das
 - community.grafana.grafana_dashboard:
     grafana_url: http://localhost:3000


@@ -1,11 +1,12 @@
 - name: run influxdb in docker container
   docker_container:
     name: ed-influxdb
-    image: influxdb
+    image: docker.io/library/influxdb:2.0
     hostname: "{{influx_hostname}}"
     domainname: "{{influx_domainname}}"
     networks:
-      - name: "{{ docker_network }}"
+      - name: "{{ cobald_docker_network }}"
+    networks_cli_compatible: True
     published_ports:
       - "{{influx_pubport}}:8086"
     volumes:
@@ -21,7 +22,6 @@
     state: started
     detach: True
     cleanup: True
-    networks_cli_compatible: True
 - name: add ansible connection to influxdb container
   add_host:
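
Both the grafana and the influxdb hunks pin networks_cli_compatible: True right next to the networks: list. Without it, older releases of the docker_container module attach the container to the default bridge network in addition to the listed ones, which defeats the per-network name resolution that cobald.yml relies on. A minimal sketch of the intended shape (image name and variable come from the hunks above, everything else is assumed):

- name: run influxdb on the cobald network only (sketch, not part of this changeset)
  docker_container:
    name: ed-influxdb
    image: docker.io/library/influxdb:2.0
    networks:
      - name: "{{ cobald_docker_network }}"
    networks_cli_compatible: True  # behave like docker run --network: attach only the listed networks
    state: started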


@@ -1,5 +1,6 @@
 - include_vars: cobald-slurm.yml
   when: cobald_slurm | default(False)
+  tags: always
 - name: build cobald:slurm docker image
   include_role:


@@ -53,7 +53,7 @@
     hostname: telegraf
     domainname: cobald.local
     networks:
-      - name: "{{docker_network | default('bridge') }}"
+      - name: "{{ cobald_docker_network }}"
     volumes:
       - "/container/volumes/telegraf/telegraf.conf:/etc/telegraf/telegraf.conf:ro"
     state: started


@@ -1,43 +0,0 @@
FROM docker.io/library/centos:7 as base
RUN yum install -y epel-release && \
yum install -y slurm && \
yum clean all && rm -rf /var/cache/yum
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
yum clean all && rm -rf /var/cache/yum
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
RUN chown root:root /usr/local/sbin/entrypoint.sh && \
chmod 755 /usr/local/sbin/entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
ARG slurmuser=slurm
ENV slurmuser=${slurmuser}
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
slurm-setuser -u $slurmuser -g $slurmuser -y
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
FROM base as slurmd
RUN yum install -y slurm-slurmd && \
yum clean all && rm -rf /var/cache/yum
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
slurmd -D 2>/dev/null 1>/dev/null & \
tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
FROM base as slurmctld
RUN yum install -y slurm-slurmctld && \
yum clean all && rm -rf /var/cache/yum
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
su -s /bin/sh -c "slurmctld -D" ${slurmuser} 2>/dev/null 1>/dev/null & \
tail --retry --pid $! -f ${SLURMCTLD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'


@@ -67,6 +67,8 @@
     - slurm.conf
     - cgroup.conf
   vars:
+    slurm_exec_node_cores: 3
+    slurm_exec_node_mem: 5000 # RealMemory=5964
     slurm_alloc_nodes_default:
       - name: "{{slurm_prefix+'-submit1'}}"
     alloc_nodes: "{{ slurm_alloc_nodes_default + extra_nodes | default([])}}"


@@ -163,10 +163,15 @@ SlurmSchedLogFile={{slurm_log_path_sched}}
 #
 #
 # COMPUTE NODES
-NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN
+NodeName=slurm-exec[1-{{ num_nodes }}] CPUs={{ slurm_exec_node_cores }} {{''
+  }} RealMemory={{ slurm_exec_node_mem }} {{''
+  }} CoresPerSocket={{ slurm_exec_node_cores }} State=UNKNOWN
+# RealMemory=5964
 {% for i in alloc_nodes -%}
 NodeName={{i.name}}
 {%- if i.hostname is defined %} NodeHostname={{i.hostname}} {% endif %}
 State=UNKNOWN
 {% endfor %}
-PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} Default=YES MaxTime=INFINITE State=UP
+PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] {{ ''
+  }} AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} {{ ''
+  }} Default=YES MaxTime=INFINITE State=UP
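
For reference, with the values introduced above (slurm_num_nodes: 10, slurm_exec_node_cores: 3, slurm_exec_node_mem: 5000) and the cobald extra node, the reflowed template should render to roughly the following (slurm-submit1 assumes slurm_prefix is "slurm"; the NodeHostname placeholder stands for the gathered cobald_container_hostname fact):

NodeName=slurm-exec[1-10] CPUs=3 RealMemory=5000 CoresPerSocket=3 State=UNKNOWN
# RealMemory=5964
NodeName=slurm-submit1 State=UNKNOWN
NodeName=cobald NodeHostname=<cobald_container_hostname> State=UNKNOWN
PartitionName=debug Nodes=slurm-exec[1-10] AllocNodes=slurm-submit1,cobald Default=YES MaxTime=INFINITE State=UP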