Compare commits

...

48 Commits

Author SHA1 Message Date
4c63f2a825 fix: slurm host system access 2021-07-06 15:14:33 +02:00
51390bb321 improved file loading 2021-07-06 14:54:18 +02:00
52022a3013 WIP: cobald tardis config 2021-07-05 18:52:41 +02:00
574d2fcb4e drone shutdown 2021-07-05 18:25:26 +02:00
2919c98d5f cons res 2021-07-05 18:24:24 +02:00
f73fef1473 cgroups inside singularity container 2021-07-02 00:47:32 +02:00
8bc2f717e0 slurm container running when drone started 2021-07-01 15:19:35 +02:00
d88761ca7d singularity for cobald 2021-06-30 16:31:06 +02:00
3be5025442 alias making running drones working 2021-06-30 10:17:51 +02:00
4c4c4da79d parametric additional partition 2021-06-29 22:25:09 +02:00
1a952a4e7a option for docker host access to cluster 2021-06-28 17:51:45 +02:00
74a760cf98 screen scrolling 2021-06-28 17:44:39 +02:00
cd7dea8fda fix: variables hostname telegraf/influx 2021-06-28 12:14:31 +02:00
7e767c3716 memory / cpu ressources fix 2021-06-25 12:19:00 +02:00
a61d08d118 restructured playbooks, cleanup 2021-06-25 01:55:14 +02:00
188a9215a9 tags #2 2021-06-24 16:37:46 +02:00
9499ce49ae fix: wrong network 2021-06-24 16:37:10 +02:00
9237d736d8 tags 2021-06-24 14:17:16 +02:00
e979ea4d6e fix hostname of cobald slurm node
made cobald be able to run slurm jobs, previously failing with
permission denied.
2021-06-24 14:07:35 +02:00
c7e931f29e fix: building base image -> update child images 2021-06-23 14:29:32 +02:00
a73f9ad6ad additional user in slurm base docker image 2021-06-23 14:28:50 +02:00
c35dc25c39 labels, some cleanup 2021-06-22 19:09:52 +02:00
1f4dfe1821 build cobald image from slurm role, separated tags 2021-06-22 16:48:56 +02:00
78850d4636 merged slurm_dockerimage back into slurm role 2021-06-22 00:26:00 +02:00
f83801cb62 removed cobald_facts module 2021-06-21 21:34:24 +02:00
e78e184375 WIP: cobald container containing and using slurm 2021-06-21 19:19:19 +02:00
02e87d7c40 cleanup, requisite files instead of startupscripts 2021-06-18 12:03:14 +02:00
4450c9bb65 WIP: separate slurm base and docker images 2021-06-17 22:50:40 +02:00
6eb6984d6a new startup for cobald containers 2021-06-17 14:55:34 +02:00
cc43a39ea3 dashboard revision 2021-06-14 10:43:47 +02:00
962d9b5ac9 grafana dashboard updated, wait_for 2021-06-10 10:51:01 +02:00
e81fb5d445 cobald container termination signal 2021-06-09 16:26:14 +02:00
73945b6cb9 shorter hostname for cobald container 2021-06-08 16:07:54 +02:00
089ea914b6 updated dashboards 2021-06-08 12:31:46 +02:00
dd1baa4aef grafana 2021-06-08 12:31:13 +02:00
ea3195a93c minor fixes (entrypoint) and restructuring 2021-06-08 12:28:09 +02:00
aef1499e65 fixed influxdb when container absent and wait_for 2021-06-02 17:04:32 +02:00
c7203f58ff fix: influxdb connection issue 2021-06-01 19:17:04 +02:00
2e0d83cca1 host ed-c7-2, fixed htop install 2021-06-01 18:30:18 +02:00
35882ca1a9 TODO for influx modules 2021-05-25 23:56:19 +02:00
4e7f33338e telegraf + influxdb 2021-05-25 23:47:03 +02:00
ddc6c2bb4d influx modules: fixes, permission match, py2, args 2021-05-25 19:13:57 +02:00
f9e29a4e30 influx bucket module 2021-05-25 15:00:47 +02:00
c26e962898 token module improved 2021-05-25 11:55:30 +02:00
38c117d6fa influxdb2 plugins 2021-05-21 20:28:50 +02:00
ecb9724ee3 generic unpriv_user 2021-05-11 23:56:29 +02:00
4373e0a4a2 cobald development environment 2021-05-11 23:48:49 +02:00
19b71c9933 first cobald tardis 2021-05-10 12:20:27 +02:00
55 changed files with 4972 additions and 232 deletions

1
.gitignore vendored
View File

@@ -1,2 +1,3 @@
.*.swp
*.retry
vars_auth.yml

27
base.yml Normal file
View File

@@ -0,0 +1,27 @@
---
# Baseline setup applied to every host: EPEL repo, admin tooling,
# screen scrollback behaviour and the management SSH key.
- hosts: all
  tasks:
    - name: "install epel repo" # for htop etc.
      yum:
        name: epel-release
        state: present
    - name: "install tools"
      yum:
        name: [ vim-enhanced, htop, screen, bind-utils, nmap-ncat, net-tools ]
        state: present
    - name: "screenrc native scrolling in tmux"
      copy:
        # ti@/te@ disable the alternate screen so native scrollback works
        content: "termcapinfo xterm* ti@:te@\ntermcapinfo screen* ti@:te@\n"
        dest: "{{item}}"
      with_items:
        - "~{{unpriv_user}}/.screenrc"
        - "~root/.screenrc"
    - name: "install ssh-key"
      authorized_key:
        # NOTE(review): this task uses cfg_unpriv_user while the screenrc
        # task above uses unpriv_user — confirm both variables are meant
        # to coexist (both are set per-host in the inventory).
        user: "{{cfg_unpriv_user}}"
        key: "{{cfg_ssh_key}}"
        state: present

61
cobald.yml Normal file
View File

@@ -0,0 +1,61 @@
---
# Deploys the slurm test cluster (in docker containers) and a cobald/tardis
# instance next to it.  Facts from a possibly pre-existing cobald container
# are gathered first because the slurm config needs its hostname.
- hosts: slurm, cobald
  vars:
    container_privileged: true
    slurm_num_nodes: 10
  tasks:
    - name: "setup docker"
      import_role:
        name: docker
      tags: docker
    - name: "get facts from existing cobald instance (i.e. hostname)"
      include_role:
        name: cobald
        tasks_from: facts
        apply:
          tags: slurm, cobald, slurm-config
      # task-level tags decide when the include itself runs; the apply-tags
      # above propagate to the included tasks
      tags: slurm, cobald, slurm-config
      vars:
        container_name: cobald
    - name: "setup slurm test environment in docker containers"
      include_role:
        name: slurm
        apply:
          tags: slurm
      vars:
        slurm_user: slurm # or root
        slurm_user_accounts:
          - name: cobald
            dir: /var/lib/cobald
        num_nodes: "{{slurm_num_nodes}}"
        extra_nodes:
          - name: cobald
            hostname: "{{cobald_container_hostname}}" # from cobald/facts.yml above
            # hostname is used as NodeHostname, which slurm's "networking
            # code" uses (https://bugs.schedmd.com/show_bug.cgi?id=8615).
            # It works either way around, but one of NodeName or NodeHostname
            # has to match the container name (-n flag, not --hostname) since
            # when submitting tasks to the slurm controller, it matches access
            # permissions against a reverse lookup of the submitting ip
            # address.  Docker always and unconfigurably resolves the
            # container ip in any network to containername.netname, where
            # containername is the container's runtime name (not the hostname
            # supplied!) and netname is the network name in the host
            # environment.  We should run our own dns...
        docker_network: slurm
        slurm_hostsystem_cluster_access: true
      when: '"slurm" in group_names'
      tags: slurm, cobald, influxdb, slurm-config
      # tags: cobald requires some slurm facts, so cobald tag is included here
    - name: "install cobald"
      include_role:
        name: cobald
        apply:
          tags: cobald
      vars:
        cobald_slurm: true
        container_name: cobald
        # docker_network: slurm # overridden by vars/slurm.yml
      when: '"cobald" in group_names'
      tags: cobald, influxdb, singularity

32
htcondor.yml Normal file
View File

@@ -0,0 +1,32 @@
---
# HTCondor test setup: repo and packages on the host, singularity + docker,
# then a containerised HTCondor test environment.
- hosts: htcondor
  tasks:
    - name: "install htcondor repo"
      yum:
        name: https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm
        state: present
      tags: htcondor
    - name: "install htcondor software "
      yum:
        name: htcondor-ce
        state: present
      tags: htcondor
    - name: "remove minicondor configuration"
      yum:
        name: minicondor
        state: absent
      tags: htcondor
    - name: "setup singularity"
      import_role:
        name: singularity
      tags: singularity
    - name: "setup docker"
      import_role:
        name: docker
      tags: docker
    - name: "setup htcondor test environment in docker containers"
      import_role:
        name: docker-htcondor
      tags: htcondor-containered, htcondor

4
install.sh Executable file
View File

@@ -0,0 +1,4 @@
# Fetch the (patched) community.grafana collection into the local
# collections path used by ansible.cfg.
mkdir -p ./collections/ansible_collections/community
# git clone --depth=1 -b 1.2.1 https://github.com/ansible-collections/community.grafana.git ./collections/ansible_collections/community/grafana
# NOTE(review): the URL below looks malformed — "ansible_community.grafana"
# has no owner/repo separator, unlike the upstream URL commented out above.
# The 1.2.1-extended branch suggests a personal fork; confirm the intended
# owner and fix the URL.
git clone --depth=1 -b 1.2.1-extended https://github.com/ansible_community.grafana.git ./collections/ansible_collections/community/grafana

13
inv.yml
View File

@@ -6,6 +6,14 @@ all:
ssh_args: -o ControlMaster=auto -o ControlPersist=60s
# ansible_host: 192.168.122.139
unpriv_user: thoto
cfg_unpriv_user: thoto
ed-c7-2:
ansible_user: root
ansible_host: ed-c7-2.virt.uller.thoto.net
# ansible_host: 192.168.123.60 # + jumphost
ssh_args: -o ControlMaster=auto -o ControlPersist=60s
unpriv_user: thoto
cfg_unpriv_user: thoto
children:
htcondor:
hosts:
@@ -13,3 +21,8 @@ all:
slurm:
hosts:
ed-c7-1:
ed-c7-2:
cobald:
hosts:
ed-c7-1:
ed-c7-2:

View File

@@ -1,56 +1,10 @@
---
- hosts: all
tasks:
- name: "install tools"
yum:
name: [ vim-enhanced, htop, screen, bind-utils, nmap-ncat, net-tools ]
state: present
- name: base setup
import_playbook: base.yml
- hosts: htcondor
pre_tasks:
- name: "install htcondor repo"
yum:
name: https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm
state: present
tags: htcondor
- name: setup htcondor
import_playbook: htcondor.yml
when: '"htcondor" in group_names'
- name: "install htcondor software "
yum:
name: htcondor-ce
state: present
tags: htcondor
- name: "remove minicondor configuration"
yum:
name: minicondor
state: absent
tags: htcondor
- name: "setup singularity"
import_tasks: "singularity.yml"
tags: singularity
roles:
- name: "setup docker"
role: docker
tags: docker
- name: "setup htcondor test environment in docker containers"
role: docker-htcondor
tags:
- htcondor-containered
- htcondor
- hosts: slurm
vars:
container_privileged: True
num_nodes: 3
roles:
- name: "setup docker"
role: docker
tags: docker
- name: "setup slurm test environment in docker containers"
role: slurm
vars:
slurm_user: slurm # or root
tags: slurm
- name: setup slurm and cobald
import_playbook: cobald.yml

View File

@@ -0,0 +1,6 @@
# Shared cobald / InfluxDB settings.
cobald_domainname: cobald.local
# NOTE(review): placeholder credentials — override (e.g. via vault) before
# any non-test deployment.
influx_admin_user: my-user
influx_admin_pw: my-password
influx_org: my-org
# host port the containerised InfluxDB is published on
influx_pubport: 28086
influx_bucket: batleth

View File

@@ -0,0 +1,3 @@
#!/bin/sh
# Refresh the shared slurmd singularity image, but only when the freshly
# built image is newer (-nt) than the copy already in /shared.
[ /slurm-singimage/slurmd.sif -nt /shared/slurmd.sif ] && \
cp /slurm-singimage/slurmd.sif /shared/slurmd.sif

View File

@@ -0,0 +1,3 @@
#!/bin/sh
# Start slurmd in configless mode in the background, then follow its log
# files; tail exits when slurmd (PID $!) terminates (--pid), which keeps
# this script alive as the foreground process while slurmd runs.
# Assumes slurmctld, nodename, SLURMD_LOG_PATH and SLURM_SCHED_LOG_PATH
# are exported by the caller (see the Singularity %startscript).
slurmd --conf-server ${slurmctld} -D -N ${nodename} 2>/dev/null 1>/dev/null &
tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}

View File

@@ -0,0 +1,11 @@
###
#
# Slurm cgroup support configuration file
#
# See man slurm.conf and man cgroup.conf for further
# information on cgroup configuration parameters
#--
CgroupAutomount=no
ConstrainCores=no
ConstrainRAMSpace=no

View File

@@ -0,0 +1,31 @@
---
# cobald pipeline: linear controller -> lower bound of 3 drones ->
# telegraf monitoring -> tardis pool factory.
pipeline:
  - __type__: cobald.controller.linear.LinearController
    low_utilisation: 0.9
    high_allocation: 0.9
    rate: 0.10
  - !Limiter
    minimum: 3
  - !TelegrafPipelineMonitor
    poll: true
  - !TardisPoolFactory
    configuration: /etc/cobald/tardis.yaml
# standard Python logging dictConfig (schema version 1)
logging:
  version: 1
  root:
    level: DEBUG
    handlers: [console, file]
  handlers:
    console:
      class: logging.StreamHandler
      formatter: test
      level: DEBUG
      stream: ext://sys.stderr
    file:
      class: logging.handlers.RotatingFileHandler
      formatter: test
      level: WARNING
      filename: /var/log/cobald/cobald-tardis.log
  formatters:
    test:
      format: " %(name)s %(message)s"

View File

@@ -0,0 +1,41 @@
# TARDIS configuration for the containerised slurm test setup.
Plugins:
  SqliteRegistry:
    db_file: /tmp/drone_registry.db
  TelegrafMonitoring:
    host: ed-telegraf
    port: 8094
# previously used simulation backend, kept for reference:
#BatchSystem:
#  adapter: FakeBatchSystem
#  allocation: 1.0
#  utilisation: !PeriodicValue
#    period: 60
#    amplitude: 0.15
#    offset: 0.80
##   phase: 1.
#  phase: 1.6
#  machine_status: Available
BatchSystem:
  adapter: Slurm
  max_age: 0.1
  options:
    partition: cobald
Sites:
  - name: slurmtest
    adapter: Slurm
    quota: 20
slurmtest:
  # executor: ...
  StatusUpdate: 0.1
  MachineTypes:
    - m1.a
  MachineTypeConfiguration:
    m1.a:
      Walltime: 5
      Partition: container
      StartupCommand: /usr/local/bin/start-drone
      # SubmitOptions: ...
  MachineMetaData:
    m1.a:
      Cores: 3 # cores
      Memory: 1 # GB
      Disk: 4 # not passed

View File

@@ -0,0 +1,7 @@
#!/bin/sh
# Run every entrypoint fragment in /usr/local/lib/entrypoints.d in glob
# order, then exec the requested command (default: interactive bash).
# NOTE(review): `[ -f $i ] && /bin/sh $i || break` aborts the whole loop
# both when an entry is not a regular file and when a fragment exits
# non-zero — confirm that abort-on-failure is intended rather than
# continue-on-failure.  $i is unquoted, so paths containing whitespace
# would word-split.
for i in /usr/local/lib/entrypoints.d/* ; do
[ -f $i ] && /bin/sh $i || break
done
exec "${@:-/bin/bash}"

View File

@@ -0,0 +1,589 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": 1,
"iteration": 1623317629899,
"links": [],
"panels": [
{
"collapsed": false,
"datasource": null,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 4,
"panels": [],
"title": "Row title",
"type": "row"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "InfluxDB",
"fieldConfig": {
"defaults": {
"unit": "none"
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 1
},
"hiddenSeries": false,
"id": 2,
"interval": "1s",
"legend": {
"avg": false,
"current": false,
"max": true,
"min": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"maxDataPoints": 200,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.7",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"null"
],
"type": "fill"
}
],
"orderByTime": "ASC",
"policy": "default",
"query": "f_r = (r, accumulator) => ({\n _value: accumulator._value + (\n if r._value == \"AvailableState\" then 1\n else if r._value == \"DownState\" then -1\n else 0)\n })\n\nnodes = from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> keep(columns: [\"tardis_machine_name\", \"_time\"])\n |> sort(columns: [\"_time\"], desc: true)\n |> unique(column: \"tardis_machine_name\")\n// |> yield()\n\noffset = from(bucket: \"batleth\")\n |> range(start: 0, stop: v.timeRangeStart)\n |> filter(fn: (r) => r._field == \"state\")\n |> group(columns: [\"tardis_machine_name\", \"machine_type\"])\n |> reduce(fn: f_r, identity: {_value: 0})\n |> duplicate(column: \"_stop\", as: \"_time\")\n\nnew = from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._field == \"state\")\n// |> filter(fn: (r) => r.tardis_machine_name == \"${machine}\")\n |> group(columns: [\"tardis_machine_name\", \"machine_type\"])\n |> window(every: $__interval)\n |> reduce(fn: f_r, identity: {_value: 0})\n |> duplicate(column: \"_stop\", as: \"_time\")\n\nunion(tables: [offset, new])\n |> window(every: inf)\n |> cumulativeSum()\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> yield()\n",
"refId": "A",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"value"
],
"type": "field"
},
{
"params": [],
"type": "mean"
}
]
],
"tags": []
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "nodes running",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "none",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "InfluxDB",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 1
},
"hiddenSeries": false,
"id": 8,
"interval": "1s",
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"maxDataPoints": null,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.7",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"null"
],
"type": "fill"
}
],
"orderByTime": "ASC",
"policy": "default",
"query": "from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"tardis_pipeline\")\n |> filter(fn: (r) => r._field == \"demand\" or r._field == \"supply\")\n |> drop(columns: [\"host\"])\n |> aggregateWindow(every: $__interval, fn: mean)\n |> yield()",
"queryType": "randomWalk",
"refId": "A",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"value"
],
"type": "field"
},
{
"params": [],
"type": "mean"
}
]
],
"tags": []
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "pipeline demand/supply (mean)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:115",
"decimals": null,
"format": "short",
"label": "cpus",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:116",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "InfluxDB",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 10
},
"hiddenSeries": false,
"id": 6,
"interval": "1s",
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.7",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"null"
],
"type": "fill"
}
],
"orderByTime": "ASC",
"policy": "default",
"query": "f = (r, accumulator) => ({\n _value: accumulator._value + (if r._value == \"AvailableState\" then 1 else if r._value == \"DownState\" then -1 else 0)\n })\n\nfrom(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._field == \"state\")\n |> group(columns: [\"tardis_machine_name\"])\n |> window(every: $__interval)\n |> reduce(fn: f, identity: {_value: 0})\n |> duplicate(column: \"_stop\", as: \"_time\")\n |> window(every: inf, timeColumn: \"_time\")\n |> yield()",
"queryType": "randomWalk",
"refId": "A",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"value"
],
"type": "field"
},
{
"params": [],
"type": "mean"
}
]
],
"tags": []
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "node fluctuation",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "InfluxDB",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 10
},
"hiddenSeries": false,
"id": 10,
"interval": "1s",
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"maxDataPoints": null,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.7",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"null"
],
"type": "fill"
}
],
"orderByTime": "ASC",
"policy": "default",
"query": "from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"tardis_pipeline\")\n |> filter(fn: (r) => r._field == \"utilisation\" or r._field == \"allocation\")\n |> keep(columns: [\"_time\", \"_measurement\", \"_field\", \"_value\", \"tardis_machine_name\"])\n |> aggregateWindow(every: $__interval, fn: mean)\n |> yield()",
"queryType": "randomWalk",
"refId": "A",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"value"
],
"type": "field"
},
{
"params": [],
"type": "mean"
}
]
],
"tags": []
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "pipeline (utilization/allocation)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": "5s",
"schemaVersion": 27,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allValue": null,
"current": {
"selected": false,
"text": "cobald-xvmcqc",
"value": "cobald-xvmcqc"
},
"datasource": "InfluxDB",
"definition": "from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> keep(columns: [\"tardis_machine_name\", \"_time\"])\n |> sort(columns: [\"_time\"], desc: true)\n |> unique(column: \"tardis_machine_name\")",
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "machine",
"options": [],
"query": "from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> keep(columns: [\"tardis_machine_name\", \"_time\"])\n |> sort(columns: [\"_time\"], desc: true)\n |> unique(column: \"tardis_machine_name\")",
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "cobald",
"uid": "urDuvE6Gk",
"version": 2
}

View File

@@ -0,0 +1,794 @@
{
"meta": {
"version": "1",
"type": "dashboard",
"name": "cobald-Template",
"description": "template created from dashboard: cobald"
},
"content": {
"data": {
"type": "dashboard",
"attributes": {
"name": "cobald",
"description": ""
},
"relationships": {
"label": {
"data": []
},
"cell": {
"data": [
{
"type": "cell",
"id": "07900a722c363000"
},
{
"type": "cell",
"id": "07900a7236f63000"
},
{
"type": "cell",
"id": "07900a723cf63000"
},
{
"type": "cell",
"id": "07900a7243f63000"
},
{
"type": "cell",
"id": "079e694f29581000"
},
{
"type": "cell",
"id": "079e6e037c181000"
}
]
},
"variable": {
"data": []
}
}
},
"included": [
{
"id": "07900a722c363000",
"type": "cell",
"attributes": {
"x": 4,
"y": 0,
"w": 4,
"h": 4
},
"relationships": {
"view": {
"data": {
"type": "view",
"id": "07900a722c363000"
}
}
}
},
{
"id": "07900a7236f63000",
"type": "cell",
"attributes": {
"x": 0,
"y": 0,
"w": 4,
"h": 4
},
"relationships": {
"view": {
"data": {
"type": "view",
"id": "07900a7236f63000"
}
}
}
},
{
"id": "07900a723cf63000",
"type": "cell",
"attributes": {
"x": 4,
"y": 4,
"w": 4,
"h": 4
},
"relationships": {
"view": {
"data": {
"type": "view",
"id": "07900a723cf63000"
}
}
}
},
{
"id": "07900a7243f63000",
"type": "cell",
"attributes": {
"x": 0,
"y": 4,
"w": 4,
"h": 4
},
"relationships": {
"view": {
"data": {
"type": "view",
"id": "07900a7243f63000"
}
}
}
},
{
"id": "079e694f29581000",
"type": "cell",
"attributes": {
"x": 8,
"y": 0,
"w": 4,
"h": 4
},
"relationships": {
"view": {
"data": {
"type": "view",
"id": "079e694f29581000"
}
}
}
},
{
"id": "079e6e037c181000",
"type": "cell",
"attributes": {
"x": 8,
"y": 4,
"w": 4,
"h": 4
},
"relationships": {
"view": {
"data": {
"type": "view",
"id": "079e6e037c181000"
}
}
}
},
{
"type": "view",
"id": "07900a722c363000",
"attributes": {
"name": "nodes running",
"properties": {
"shape": "chronograf-v2",
"queries": [
{
"text": "from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._field == \"state\")\n |> group()\n |> window(every: 10s)\n |> reduce(fn: (r, accumulator) => ({\n _value: accumulator._value + (\n if r._value == \"AvailableState\" then 1 \n else if r._value == \"DownState\" then -1 \n else 0)\n }), identity: {_value: 0})\n |> duplicate(column: \"_stop\", as: \"_time\")\n |> window(every: inf) //, timeColumn: \"_time\")\n |> cumulativeSum()\n// |> reduce(fn: (r, accumulator) => ({r with x: r._value * 2}), identity: {x:0})\n// |> map(fn: (r) => ({r with vnew: r._value*2}))\n// |> integral(unit: 10s, timeColumn: \"_stop\")\n// |> window(every: inf) //, timeColumn: \"_stop\")\n |> yield()",
"editMode": "advanced",
"name": "",
"builderConfig": {
"buckets": [],
"tags": [
{
"key": "_measurement",
"values": [],
"aggregateFunctionType": "filter"
}
],
"functions": [
{
"name": "mean"
}
],
"aggregateWindow": {
"period": "auto",
"fillValues": false
}
}
}
],
"axes": {
"x": {
"bounds": [
"",
""
],
"label": "",
"prefix": "",
"suffix": "",
"base": "10",
"scale": "linear"
},
"y": {
"bounds": [
"",
""
],
"label": "",
"prefix": "",
"suffix": "",
"base": "10",
"scale": "linear"
}
},
"type": "xy",
"legend": {},
"geom": "line",
"colors": [
{
"id": "9b960932-18d9-4f57-80ba-24998a06613d",
"type": "scale",
"hex": "#31C0F6",
"name": "Nineteen Eighty Four",
"value": 0
},
{
"id": "c1742651-0d5e-4148-b9c0-92beb642417a",
"type": "scale",
"hex": "#A500A5",
"name": "Nineteen Eighty Four",
"value": 0
},
{
"id": "756fa8cb-9d9c-4e45-9a4f-f2b106b0216a",
"type": "scale",
"hex": "#FF7E27",
"name": "Nineteen Eighty Four",
"value": 0
}
],
"note": "",
"showNoteWhenEmpty": false,
"xColumn": "_time",
"generateXAxisTicks": [],
"xTotalTicks": 0,
"xTickStart": 0,
"xTickStep": 0,
"yColumn": "_value",
"generateYAxisTicks": [],
"yTotalTicks": 0,
"yTickStart": 0,
"yTickStep": 0,
"shadeBelow": false,
"position": "overlaid",
"timeFormat": "",
"hoverDimension": "auto",
"legendColorizeRows": true,
"legendOpacity": 1,
"legendOrientationThreshold": 100000000
}
}
},
{
"type": "view",
"id": "07900a7236f63000",
"attributes": {
"name": "Name this Cell",
"properties": {
"shape": "chronograf-v2",
"queries": [
{
"text": "from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r[\"tardis_machine_name\"] == \"678162c190d5\")\n |> window(every: 10s)\n |> count()\n |> duplicate(column: \"_stop\", as: \"_time\")\n |> window(every: inf)\n |> yield()",
"editMode": "advanced",
"name": "",
"builderConfig": {
"buckets": [],
"tags": [
{
"key": "_measurement",
"values": [],
"aggregateFunctionType": "filter"
}
],
"functions": [
{
"name": "mean"
}
],
"aggregateWindow": {
"period": "auto",
"fillValues": false
}
}
}
],
"axes": {
"x": {
"bounds": [
"",
""
],
"label": "",
"prefix": "",
"suffix": "",
"base": "10",
"scale": "linear"
},
"y": {
"bounds": [
"",
""
],
"label": "",
"prefix": "",
"suffix": "",
"base": "10",
"scale": "linear"
}
},
"type": "xy",
"legend": {},
"geom": "line",
"colors": [
{
"id": "2566435b-7ee0-4222-8ac0-b7f14ab783d9",
"type": "scale",
"hex": "#31C0F6",
"name": "Nineteen Eighty Four",
"value": 0
},
{
"id": "9263bcff-35a0-4025-bacd-68a1bef54784",
"type": "scale",
"hex": "#A500A5",
"name": "Nineteen Eighty Four",
"value": 0
},
{
"id": "2f04bd8c-2203-4be6-bc34-c25720d24379",
"type": "scale",
"hex": "#FF7E27",
"name": "Nineteen Eighty Four",
"value": 0
}
],
"note": "",
"showNoteWhenEmpty": false,
"xColumn": "_time",
"generateXAxisTicks": [],
"xTotalTicks": 0,
"xTickStart": 0,
"xTickStep": 0,
"yColumn": "_value",
"generateYAxisTicks": [],
"yTotalTicks": 0,
"yTickStart": 0,
"yTickStep": 0,
"shadeBelow": false,
"position": "overlaid",
"timeFormat": "",
"hoverDimension": "auto",
"legendColorizeRows": true,
"legendOpacity": 1,
"legendOrientationThreshold": 100000000
}
}
},
{
"type": "view",
"id": "07900a723cf63000",
"attributes": {
"name": "node fluctuation",
"properties": {
"shape": "chronograf-v2",
"queries": [
{
"text": "f = (r, accumulator) => ({\n _value: accumulator._value + (if r._value == \"AvailableState\" then 1 else if r._value == \"DownState\" then -1 else 0)\n })\n\nfrom(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._field == \"state\")\n |> group()\n |> window(every: 10s)\n |> reduce(fn: f, identity: {_value: 0})\n |> duplicate(column: \"_stop\", as: \"_time\")\n |> window(every: inf, timeColumn: \"_time\")\n |> yield()",
"editMode": "advanced",
"name": "",
"builderConfig": {
"buckets": [],
"tags": [
{
"key": "_measurement",
"values": [],
"aggregateFunctionType": "filter"
}
],
"functions": [
{
"name": "mean"
}
],
"aggregateWindow": {
"period": "auto",
"fillValues": false
}
}
}
],
"axes": {
"x": {
"bounds": [
"",
""
],
"label": "",
"prefix": "",
"suffix": "",
"base": "10",
"scale": "linear"
},
"y": {
"bounds": [
"",
""
],
"label": "",
"prefix": "",
"suffix": "",
"base": "10",
"scale": "linear"
}
},
"type": "xy",
"legend": {},
"geom": "line",
"colors": [
{
"id": "9b960932-18d9-4f57-80ba-24998a06613d",
"type": "scale",
"hex": "#31C0F6",
"name": "Nineteen Eighty Four",
"value": 0
},
{
"id": "c1742651-0d5e-4148-b9c0-92beb642417a",
"type": "scale",
"hex": "#A500A5",
"name": "Nineteen Eighty Four",
"value": 0
},
{
"id": "756fa8cb-9d9c-4e45-9a4f-f2b106b0216a",
"type": "scale",
"hex": "#FF7E27",
"name": "Nineteen Eighty Four",
"value": 0
}
],
"note": "",
"showNoteWhenEmpty": false,
"xColumn": "_time",
"generateXAxisTicks": [],
"xTotalTicks": 0,
"xTickStart": 0,
"xTickStep": 0,
"yColumn": "_value",
"generateYAxisTicks": [],
"yTotalTicks": 0,
"yTickStart": 0,
"yTickStep": 0,
"shadeBelow": false,
"position": "overlaid",
"timeFormat": "",
"hoverDimension": "auto",
"legendColorizeRows": true,
"legendOpacity": 1,
"legendOrientationThreshold": 100000000
}
}
},
{
"type": "view",
"id": "07900a7243f63000",
"attributes": {
"name": "states",
"properties": {
"shape": "chronograf-v2",
"queries": [
{
"text": "from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._field == \"state\")\n |> group(columns: [\"_value\"], mode: \"by\")\n |> duplicate(column: \"_value\", as: \"state\")\n |> window(every: 10s)\n |> count(column: \"state\")\n |> rename(columns: {\"_value\": \"_field\", \"state\": \"_value\"})\n |> group(columns: [\"_field\"])\n |> yield()",
"editMode": "advanced",
"name": "",
"builderConfig": {
"buckets": [],
"tags": [
{
"key": "_measurement",
"values": [],
"aggregateFunctionType": "filter"
}
],
"functions": [
{
"name": "mean"
}
],
"aggregateWindow": {
"period": "auto",
"fillValues": false
}
}
}
],
"axes": {
"x": {
"bounds": [
"",
""
],
"label": "",
"prefix": "",
"suffix": "",
"base": "10",
"scale": "linear"
},
"y": {
"bounds": [
"",
""
],
"label": "",
"prefix": "",
"suffix": "",
"base": "10",
"scale": "linear"
}
},
"type": "xy",
"legend": {},
"geom": "line",
"colors": [
{
"id": "9b960932-18d9-4f57-80ba-24998a06613d",
"type": "scale",
"hex": "#31C0F6",
"name": "Nineteen Eighty Four",
"value": 0
},
{
"id": "c1742651-0d5e-4148-b9c0-92beb642417a",
"type": "scale",
"hex": "#A500A5",
"name": "Nineteen Eighty Four",
"value": 0
},
{
"id": "756fa8cb-9d9c-4e45-9a4f-f2b106b0216a",
"type": "scale",
"hex": "#FF7E27",
"name": "Nineteen Eighty Four",
"value": 0
}
],
"note": "",
"showNoteWhenEmpty": false,
"xColumn": "_stop",
"generateXAxisTicks": [],
"xTotalTicks": 0,
"xTickStart": 0,
"xTickStep": 0,
"yColumn": "_value",
"generateYAxisTicks": [],
"yTotalTicks": 0,
"yTickStart": 0,
"yTickStep": 0,
"shadeBelow": false,
"position": "overlaid",
"timeFormat": "",
"hoverDimension": "auto",
"legendColorizeRows": true,
"legendOpacity": 1,
"legendOrientationThreshold": 100000000
}
}
},
{
"type": "view",
"id": "079e694f29581000",
"attributes": {
"name": "pipeline (demand / supply)",
"properties": {
"shape": "chronograf-v2",
"queries": [
{
"text": "from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"tardis_pipeline\")\n |> filter(fn: (r) => r._field == \"demand\" or r._field == \"supply\")\n// |> filter(fn: (r) => r._field == \"state\")\n// |> group()\n// |> window(every: 10s)\n// |> duplicate(column: \"_stop\", as: \"_time\")\n// |> window(every: inf) //, timeColumn: \"_time\")\n// |> cumulativeSum()\n |> yield()",
"editMode": "advanced",
"name": "",
"builderConfig": {
"buckets": [],
"tags": [
{
"key": "_measurement",
"values": [],
"aggregateFunctionType": "filter"
}
],
"functions": [
{
"name": "mean"
}
],
"aggregateWindow": {
"period": "auto",
"fillValues": false
}
}
}
],
"axes": {
"x": {
"bounds": [
"",
""
],
"label": "",
"prefix": "",
"suffix": "",
"base": "10",
"scale": "linear"
},
"y": {
"bounds": [
"",
""
],
"label": "",
"prefix": "",
"suffix": "",
"base": "10",
"scale": "linear"
}
},
"type": "xy",
"legend": {},
"geom": "line",
"colors": [
{
"id": "4ef29481-ecf3-4a09-b0f5-e34e8d3e50b5",
"type": "scale",
"hex": "#31C0F6",
"name": "Nineteen Eighty Four",
"value": 0
},
{
"id": "719de04a-f70f-4c54-a1bb-982a9d13dbae",
"type": "scale",
"hex": "#A500A5",
"name": "Nineteen Eighty Four",
"value": 0
},
{
"id": "8d5c1f25-3801-4cdd-ad40-b8e2e78342a0",
"type": "scale",
"hex": "#FF7E27",
"name": "Nineteen Eighty Four",
"value": 0
}
],
"note": "",
"showNoteWhenEmpty": false,
"xColumn": "_time",
"generateXAxisTicks": [],
"xTotalTicks": 0,
"xTickStart": 0,
"xTickStep": 0,
"yColumn": "_value",
"generateYAxisTicks": [],
"yTotalTicks": 0,
"yTickStart": 0,
"yTickStep": 0,
"shadeBelow": false,
"position": "overlaid",
"timeFormat": "",
"hoverDimension": "auto",
"legendColorizeRows": true,
"legendOpacity": 1,
"legendOrientationThreshold": 100000000
}
}
},
{
"type": "view",
"id": "079e6e037c181000",
"attributes": {
"name": "pipeline (Utilization / Allocation)",
"properties": {
"shape": "chronograf-v2",
"queries": [
{
"text": "from(bucket: \"batleth\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"tardis_pipeline\")\n |> filter(fn: (r) => r._field == \"utilisation\" or r._field == \"allocation\")\n// |> filter(fn: (r) => r._field == \"state\")\n// |> group()\n// |> window(every: 10s)\n// |> duplicate(column: \"_stop\", as: \"_time\")\n// |> window(every: inf) //, timeColumn: \"_time\")\n// |> cumulativeSum()\n |> yield()",
"editMode": "advanced",
"name": "",
"builderConfig": {
"buckets": [],
"tags": [
{
"key": "_measurement",
"values": [],
"aggregateFunctionType": "filter"
}
],
"functions": [
{
"name": "mean"
}
],
"aggregateWindow": {
"period": "auto",
"fillValues": false
}
}
}
],
"axes": {
"x": {
"bounds": [
"",
""
],
"label": "",
"prefix": "",
"suffix": "",
"base": "10",
"scale": "linear"
},
"y": {
"bounds": [
"",
""
],
"label": "",
"prefix": "",
"suffix": "",
"base": "10",
"scale": "linear"
}
},
"type": "xy",
"legend": {},
"geom": "line",
"colors": [
{
"id": "4ef29481-ecf3-4a09-b0f5-e34e8d3e50b5",
"type": "scale",
"hex": "#31C0F6",
"name": "Nineteen Eighty Four",
"value": 0
},
{
"id": "719de04a-f70f-4c54-a1bb-982a9d13dbae",
"type": "scale",
"hex": "#A500A5",
"name": "Nineteen Eighty Four",
"value": 0
},
{
"id": "8d5c1f25-3801-4cdd-ad40-b8e2e78342a0",
"type": "scale",
"hex": "#FF7E27",
"name": "Nineteen Eighty Four",
"value": 0
}
],
"note": "",
"showNoteWhenEmpty": false,
"xColumn": "_time",
"generateXAxisTicks": [],
"xTotalTicks": 0,
"xTickStart": 0,
"xTickStep": 0,
"yColumn": "_value",
"generateYAxisTicks": [],
"yTotalTicks": 0,
"yTickStart": 0,
"yTickStep": 0,
"shadeBelow": false,
"position": "overlaid",
"timeFormat": "",
"hoverDimension": "auto",
"legendColorizeRows": true,
"legendOpacity": 1,
"legendOrientationThreshold": 100000000
}
}
}
]
},
"labels": []
}

View File

@@ -0,0 +1,6 @@
# Yum repository definition for InfluxData packages (provides influxdb and
# telegraf). GPG checking is enabled against the vendor key.
[influxdb]
name = InfluxDB Repository - RHEL \$releasever
# NOTE(review): the `\$` escapes are only correct if this file passes through a
# shell heredoc or similar expansion; a file deployed verbatim needs plain
# `$releasever`/`$basearch` for yum to substitute them — verify the deployment.
baseurl = https://repos.influxdata.com/rhel/\$releasever/\$basearch/stable
enabled = 1
gpgcheck = 1
gpgkey = https://repos.influxdata.com/influxdb.key

View File

@@ -0,0 +1,5 @@
#!/bin/sh
# Install the cobald modules package in editable mode, but only when both the
# setup.py and the package directory are present (i.e. the volume is mounted).
# Uses two [ ] tests joined by && instead of the obsolescent `-a` operator
# (POSIX marks -a/-o inside test as unreliable with complex operands).
[ -f /usr/local/lib/cobaldmodules/setup.py ] && \
    [ -d /usr/local/lib/cobaldmodules/cobaldmodules ] && \
    pip3 install --no-deps --editable /usr/local/lib/cobaldmodules

View File

@@ -0,0 +1,31 @@
# Singularity definition: builds a "configless" slurmd drone image on top of
# the locally built slurm:slurmd image taken from the local docker daemon.
Bootstrap: docker-daemon
From: slurm:slurmd

%files
    # configless slurmd start script replaces the stock one (removed in %post)
    31-slurmd-configless /etc/docker-init.d/31-slurm-configless
    # munge key shared with the host cluster so the drone can authenticate
    /container/volumes/munge/munge.key /etc/munge/munge.key
    # cgroup config that does not try to mount cgroups itself (done by caller)
    cgroup.conf.noautomount /etc/slurm/cgroup.conf

%post
    # drop the original slurmd init script; only the configless variant runs
    rm /etc/docker-init.d/30-slurmd
    chmod 755 /etc/docker-init.d/31-slurm-configless

%startscript
    # args: $1 = slurmctld hostname, $2 = node name this drone registers as
    if [ -z "${1}" -o -z "${2}" ] ; then
        echo "undefined variables slurmctld or nodename"
        exit 1
    fi
    export slurmctld="${1}"
    export nodename="${2}"
    echo ${slurmctld} ${nodename} ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
    exec /usr/local/sbin/entrypoint.sh /usr/local/sbin/docker-init

%runscript
    # intentionally identical to %startscript: validate args, hand over to
    # the container init which starts munge + configless slurmd
    if [ -z "${1}" -o -z "${2}" ] ; then
        echo "undefined variables slurmctld or nodename"
        exit 1
    fi
    export slurmctld="${1}"
    export nodename="${2}"
    echo ${slurmctld} ${nodename} ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}
    exec /usr/local/sbin/entrypoint.sh /usr/local/sbin/docker-init

View File

@@ -0,0 +1,59 @@
#!/bin/sh
#SBATCH -D /shared
# Drone launcher, submitted as a slurm batch job: starts a containerised
# slurmd (singularity instance) bound to a private cgroup tree, then sleeps
# for the drone's walltime and tears everything down cleanly on exit.
export
echo $@
# derive the drone node name from the numeric suffix of the host name
nodename=$(hostname | awk '{ print "drone" substr($1,match($1, "([[:digit:]]+)")) }')
SHUTDOWN_DONE=0
# POSIX function syntax (the original used the bash-only `function` keyword
# under a /bin/sh shebang)
handler_quit(){
    [ $SHUTDOWN_DONE -ne 0 ] && return
    set -x
    echo "drain container"
    scontrol update NodeName=${nodename} State=DRAIN Reason="cobald node quit"
    shutdown_jobs=$(squeue -w ${nodename} --noheader -O jobid)
    [ -n "${shutdown_jobs}" ] && scancel ${shutdown_jobs}
    #scancel -w ${nodename}
    # grace period: the cluster's KillWait minus two seconds
    i=$(( $(scontrol show config | grep KillWait | \
        sed 's/^KillWait.*= \([0-9]*\) sec/\1/') - 2 ))
    # wait while jobs are still running AND grace time remains.
    # FIX: the original condition `-n "$(squeue ...)" -o ${i} -lt 1` spun
    # forever once i dropped below 1 (the -lt test stayed true); the intended
    # semantics are a bounded wait, i.e. an AND with time remaining.
    while [ -n "$(squeue -w ${nodename} --noheader -O jobid)" ] && [ ${i} -gt 0 ]
    do
        i=$(( ${i} - 1 ))
        sleep 1
    done
    scancel -s KILL -w ${nodename} # hard kill all remaining jobs
    echo "shutdown container"
    scontrol update NodeName=${nodename} State=DOWN Reason=shutdown
    singularity instance stop slurm-drone
    scontrol update NodeName=${nodename} State=FUTURE
    umount /inner-cgroup/freezer
    umount /inner-cgroup
    SHUTDOWN_DONE=1
    exit 0
}
# set -x
trap handler_quit EXIT
echo "mounting cgroups"
# build a private, read-only cgroup view exposing only slurm's freezer tree
mkdir /inner-cgroup
mount -t tmpfs none /inner-cgroup
mkdir /inner-cgroup/freezer/
mount --bind /sys/fs/cgroup/freezer/slurm/ /inner-cgroup/freezer/
mount -o remount,ro /inner-cgroup
echo "starting ${nodename}"
scontrol update NodeName=${nodename} State=RESUME # revoke last DRAIN
scontrol update NodeName=${nodename} State=FUTURE
singularity instance start \
    -B /inner-cgroup/:/sys/fs/cgroup/ \
    --writable-tmpfs /shared/slurmd.sif slurm-drone \
    slurm-ctl ${nodename}
# scontrol update NodeName=${nodename} NodeHostname=${SLURM_JOB_ID}
scontrol update NodeName=${nodename} NodeHostname=${TardisDroneUuid}
if [ $? -eq 0 ] ; then
    # sleep slightly less than the walltime so cleanup can run inside the job
    echo "container started, sleeping $(( 60 * ${SLURM_Walltime} - 2 ))"
    sleep $(( 60 * ${SLURM_Walltime} - 2 ))
fi
handler_quit

View File

@@ -0,0 +1,8 @@
# Telegraf agent image: installs telegraf from the InfluxData yum repository.
FROM centos:7
COPY influxdb.repo /etc/yum.repos.d/influxdb.repo
RUN yum -y install telegraf &&\
    yum clean all && rm -rf /var/cache/yum
# Exec form so telegraf runs as PID 1 and receives SIGTERM directly on
# `docker stop` (the original shell form wrapped it in `/bin/sh -c`, which
# does not forward signals).
CMD ["telegraf"]

View File

@@ -0,0 +1,11 @@
# Tests
ANSIBLE_LIBRARY=. ansible -m influx_bucket -a "base='http://192.168.122.140:28086' org='my-org' auth_token='87-fEnSlQldFi1T_CLHsrHxH-T9VKey-qzUbVH6tmR2QzL4oZzbUPwzS1wzOoIkyfmyGbRv75yLjYfztxziivw==' name='bucky' description='test 123'" localhost -vvv
ANSIBLE_LIBRARY=. ansible -m influx_token -a "base='http://192.168.122.140:28086' org='my-org' auth_token='87-fEnSlQldFi1T_CLHsrHxH-T9VKey-qzUbVH6tmR2QzL4oZzbUPwzS1wzOoIkyfmyGbRv75yLjYfztxziivw==' key='foo' description='test 123' permissions=\"{{'[{\\\"action\\\": \\\"write\\\",\\\"resource\\\": {\\\"type\\\": \\\"buckets\\\"} }]'|from_json}}\"" localhost -vvv
ANSIBLE_LIBRARY=. ansible -m influx_dashboard -create -a "base='http://192.168.122.140:28086' org='my-org' token='2Mji-PvTzgn2oie5p36pJ-vxqWCnxczMWGrnYz2nUHj6Q6XvdIGiLPmK4DjX16KGhOjxQ5dWymDusE8qJrhFFg==' data='{{lookup(\"file\", \"../files/influxdb-dashboard-cobald.json\")}}'" localhost -vvv
Missing: a lot of test cases, e.g. missing tokens and invalid data such as `bucket` instead of `buckets` ...
# TODO
* tests
* state (present/absent)
* `module_utils/urls.py` (https://github.com/ansible/ansible/blob/devel/lib/ansible/module_utils/urls.py)
* see module notes

View File

@@ -0,0 +1,161 @@
#!/usr/bin/env python
import requests
from ansible.module_utils.basic import AnsibleModule
DOCUMENTATION = r'''
---
module: influx2_bucket
short_description: create bucket in influxdb2
description: create bucket in influxdb2
notes:
- just works with influxdb version 2
- does not remove buckets
- no way to configure data retention
options:
base:
description: URL for path, e.g. `https://localhost:8086`
type: str
required: True
org:
description: influxdb2 organisation
type: str
required: True
auth_token:
description: influxdb2 authentication token
type: str
required: True
name:
description: name of the bucket
type: str
required: True
force:
description: force creation even if bucket already exists
(adds a new one)
type: bool
required: False
default: False
author:
- Thorsten M. (@thoto)
'''
EXAMPLES = r'''
- name: "fetch auth token"
raw: influx auth list --user my-user --hide-headers | cut -f 3
register: influx_token_fetch
delegate_to: ed-influxdb-2
- name: "create bucket"
influx_bucket:
base: "http://localhost:8086"
org: "my-org"
token: "{{influx_token_fetch.stdout_lines[0]}}"
name: "bucky"
'''
def get_org_id(base, org_name, h):
    """Resolve an influxdb2 organisation name to its id.

    :param base: API base URL, e.g. ``http://localhost:8086``
    :param org_name: organisation name to look up
    :param h: headers dict carrying the ``Authorization`` token
    :returns: the organisation id string
    :raises ValueError: if no organisation with that name exists
    :raises requests.HTTPError: on an API error response
    """
    # Filter server-side via query parameters: a body passed to GET via
    # ``json=`` is ignored by the API (and may be dropped by proxies).
    ro = requests.get("{base}/api/v2/orgs".format(base=base), headers=h,
                      params={"org": org_name})
    ro.raise_for_status()
    # keep the client-side name filter as a safety net
    org_id = [o["id"] for o in ro.json()["orgs"] if o["name"] == org_name]
    if not org_id:
        # explicit error instead of an opaque IndexError
        raise ValueError("organisation {0!r} not found".format(org_name))
    return org_id[0]
class Bucket:
    """Idempotent helper around the influxdb2 ``/api/v2/buckets`` endpoint.

    ``check()`` decides whether a create or a description update is needed and
    stores the pending action as a closure in ``self.f``; ``run()`` executes it.
    """

    def __init__(self, base, h, org, description, name):
        """
        :param base: API base URL, e.g. ``http://localhost:8086``
        :param h: headers dict carrying the ``Authorization`` token
        :param org: organisation name (resolved to an id immediately)
        :param description: desired bucket description
        :param name: bucket name
        """
        self.base = base
        self.h = h
        self.org_id = get_org_id(base, org, h)
        self.description = description
        self.name = name
        self.result = None  # existing/created bucket dict, once known
        self.f = None       # pending action closure set by check()

    def check(self):
        """Return True when a change (create or update) is required.

        Also prepares ``self.f`` with the action to perform and caches the
        existing bucket (if any) in ``self.result``.
        """
        ra = requests.get("{base}/api/v2/buckets".format(
            base=self.base),
            params={"orgID": self.org_id, "name": self.name},
            headers=self.h)
        if ra.status_code == 404:
            # orgID + name -> 404 on empty set. Just name -> 200 but buckets=[]
            x = []
        else:
            ra.raise_for_status()
            x = [i for i in ra.json()["buckets"]
                 if self.name == i["name"] and i["orgID"] == self.org_id]
        # name+org are unique, so at most one bucket can match
        assert(len(x) == 1 or len(x) == 0)
        update = None
        if len(x) == 0:  # create
            self.result = None
            self.f = lambda: self._create({
                "orgID": self.org_id,
                "description": self.description if self.description else None,
                "name": self.name,
                "retentionRules": []
            })
        else:
            self.result = x[0]
            if self.description == x[0].get("description", ""):
                return False  # everything matches -> no change needed
            else:
                # bucket exists but its description differs -> patch it
                self.result = x[0]
                update = {"id": x[0]["id"],
                          "description": self.description}
                self.f = lambda: self._update(**update)
        return True

    def run(self):
        """Execute the pending action, running check() first if needed."""
        if not self.f:
            self.check()
        self.f()

    def _update(self, id, description):
        # PATCH only changes the description; other bucket fields are kept
        ra = requests.patch(
            "{base}/api/v2/buckets/{id}".format(
                base=self.base, id=id),
            headers=self.h, json={"description": description})
        ra.raise_for_status()
        return ra

    def _create(self, data):
        # POST creates the bucket; the API response (incl. new id) is cached
        ra = requests.post("{base}/api/v2/buckets".format(base=self.base),
                           headers=self.h, json=data)
        ra.raise_for_status()
        self.result = ra.json()
        return ra
if __name__ == "__main__":
    # Ansible module entry point: ensure an influxdb2 bucket exists with the
    # requested description; returns bucket_id and changed.
    result = dict(changed=False, message="")
    module = AnsibleModule(
        argument_spec=dict(
            base=dict(type="str", required=True),
            org=dict(type="str", required=True),
            auth_token=dict(type="str", required=True),
            name=dict(type="str", required=True),
            description=dict(type="str", default=""),
            force=dict(type="bool", default=False),
        ),
        supports_check_mode=True
    )
    h = {"Authorization": "Token {token}".format(
        token=module.params["auth_token"])}
    b = Bucket(module.params["base"], h, org=module.params["org"],
               description=module.params["description"],
               name=module.params["name"])
    changed = b.check()
    if b.result:
        # bucket already exists: report its id even in check mode
        result['bucket_id'] = b.result["id"]
    result['changed'] = changed
    if module.check_mode:
        module.exit_json(**result)
    if changed or module.params["force"]:
        # force re-runs the action even when check() saw no difference
        b.run()
        result['bucket_id'] = b.result["id"]
    module.exit_json(**result)

View File

@@ -0,0 +1,147 @@
#!/usr/bin/env python
import json
import requests
from ansible.module_utils.basic import AnsibleModule
DOCUMENTATION = r'''
---
module: influx2_dashboard
short_description: create dashboard in influxdb2
description: create dashboard in influxdb2
notes:
- just works with influxdb version 2
- does not create dashboard description
- does not update dashboards
- just creates a dashboard if it does not exist.
options:
base:
description: URL for path, e.g. `https://localhost:8086`
type: str
required: True
org:
description: influxdb2 organisation
type: str
required: True
auth_token:
description: influxdb2 authentication token
type: str
required: True
data:
description: exported dashboard json file
type: json
required: True
force:
description: force creation even if dashboard already exists
(adds a new one)
type: bool
required: False
default: False
author:
- Thorsten M. (@thoto)
'''
EXAMPLES = r'''
- name: "fetch auth token"
raw: influx auth list --user my-user --hide-headers | cut -f 3
register: influx_token_fetch
delegate_to: ed-influxdb-2
- name: "create dashboard"
influx_dashboard:
base: "http://localhost:8086"
org: "my-org"
auth_token: "{{influx_token_fetch.stdout_lines[0]}}"
data: "{{lookup('file', 'influxdb-dashboard-cobald.json')}}"
'''
def parse_dashboard_template(data):
    """Parse an exported influxdb2 dashboard template.

    :param data: JSON string of an exported dashboard document
    :returns: dict with the dashboard ``name``, a ``cells`` mapping of
        view-id -> cell attributes and a ``views`` mapping of id -> view
        attributes, both taken from the template's ``included`` items.
    """
    document = json.loads(data)
    content = document["content"]
    cells = {}
    views = {}
    for item in content["included"]:
        if item["type"] == "cell":
            view_id = item["relationships"]["view"]["data"]["id"]
            cells[view_id] = item["attributes"]
        elif item["type"] == "view":
            views[item["id"]] = item["attributes"]
    return {
        'name': content["data"]["attributes"]["name"],
        'cells': cells,
        'views': views,
    }
def get_auth(base, org_name, token):
    """Build the authentication context used by check()/create().

    :param base: API base URL, e.g. ``http://localhost:8086``
    :param org_name: organisation name to resolve
    :param token: influxdb2 API token
    :returns: dict with ``org_id``, ``org_name``, ``uid`` (current user id)
        and ``h`` (headers dict for later requests)
    :raises ValueError: if the organisation does not exist
    :raises requests.HTTPError: on an API error response
    """
    h = {"Authorization": "Token {token}".format(token=token)}
    rm = requests.get("{base}/api/v2/me".format(base=base), headers=h)
    rm.raise_for_status()
    # me_name = rm.json()["name"]
    me = rm.json()["id"]
    # Filter via query parameters: a body passed to GET via ``json=`` is
    # ignored by the API (and may be dropped by proxies).
    ro = requests.get("{base}/api/v2/orgs".format(base=base), headers=h,
                      params={"userID": me})
    ro.raise_for_status()
    org_id = [o["id"] for o in ro.json()["orgs"] if o["name"] == org_name]
    if not org_id:
        # explicit error instead of an opaque IndexError
        raise ValueError("organisation {0!r} not found".format(org_name))
    return {"org_id": org_id[0], "org_name": org_name, "uid": me, "h": h}
def check(base, auth, dashboard):
    """Check whether the dashboard needs to be created.

    :param base: API base URL
    :param auth: auth context from :func:`get_auth`
    :param dashboard: parsed template from :func:`parse_dashboard_template`
    :returns: tuple ``(changed, matches)`` -- ``changed`` is True when no
        dashboard with the same name exists in the organisation yet,
        ``matches`` is the list of existing dashboards with that name.
    """
    h = auth["h"]
    rd = requests.get("{base}/api/v2/dashboards".format(base=base), headers=h)
    rd.raise_for_status()
    x = [i for i in rd.json()["dashboards"]
         if i["name"] == dashboard["name"] and i["orgID"] == auth["org_id"]]
    return len(x) == 0, x
def create(base, auth, dashboard):
    """Create a dashboard with all its cells and views via the API.

    :param base: API base URL
    :param auth: auth context from :func:`get_auth`
    :param dashboard: parsed template from :func:`parse_dashboard_template`
    :raises requests.HTTPError: if any of the API calls fails
    """
    h = auth["h"]
    # create dashboard
    rd = requests.post("{base}/api/v2/dashboards".format(base=base), headers=h,
                       json={"orgID": auth["org_id"],
                             "name": dashboard["name"]})
    rd.raise_for_status()
    dash_id = rd.json()["id"]
    # dashboard["cells"] maps the template's view id -> cell attributes
    for k, v in dashboard["cells"].items():
        # create cells in dashboard
        rc = requests.post(
            "{base}/api/v2/dashboards/{dash_id}/cells".format(
                base=base, dash_id=dash_id),
            headers=h, json=v)
        rc.raise_for_status()
        cell_id = rc.json()["id"]
        # create view of cell in dashboard (PATCH on the new cell's view)
        rv = requests.patch(
            "{base}/api/v2/dashboards/{dash_id}/cells/{cell_id}/view".format(
                base=base, dash_id=dash_id, cell_id=cell_id),
            headers=h, json=dashboard["views"][k])
        rv.raise_for_status()
if __name__ == "__main__":
    # Ansible module entry point: create the dashboard from an exported
    # template unless one with the same name already exists.
    result = dict(changed=False, message="")
    module = AnsibleModule(
        argument_spec=dict(
            base=dict(type="str", required=True),
            org=dict(type="str", required=True),
            auth_token=dict(type="str", required=True),
            data=dict(type="json", required=True),
            force=dict(type="bool", default=False),
        ),
        supports_check_mode=True
    )
    dashboard = parse_dashboard_template(module.params["data"])
    auth = get_auth(module.params["base"], module.params["org"],
                    module.params["auth_token"])
    changed, x = check(module.params["base"], auth, dashboard)
    result['changed'] = changed
    if module.check_mode:
        module.exit_json(**result)
    if changed or module.params["force"]:
        # force creates another dashboard even if one already exists
        create(module.params["base"], auth, dashboard)
    module.exit_json(**result)

View File

@@ -0,0 +1,232 @@
#!/usr/bin/env python
import requests
from ansible.module_utils.basic import AnsibleModule
DOCUMENTATION = r'''
---
module: influx2_token
short_description: generate token via influxdb2 api
description: generate token via influxdb2 api
notes:
- just works with influxdb version 2
- needs token to authenticate against API (use
`influx auth list --user my-user --hide-headers | cut -f 3`
- tokens may not be removed
- permissions can not be updated. a new token is created and the old
one is not removed.
options:
base:
description: URL for path, e.g. `https://localhost:8086`
type: str
required: True
org:
description: influxdb2 organisation
type: str
required: True
auth_token:
description: influxdb2 authentication token
type: str
required: True
key:
description: some key used to identify token. This is put into
the tokens description
type: str
required: True
description:
description: textual description for token. key gets appended
type: str
required: False
permissions:
description: list of permissions, each dict(action, resource)
type: list
required: True
force:
description: force creation even if dashboard already exists
(adds a new one)
type: bool
required: False
default: False
author:
- Thorsten M. (@thoto)
'''
EXAMPLES = r'''
- name: "fetch auth token"
raw: influx auth list --user my-user --hide-headers | cut -f 3
register: influx_token_fetch
delegate_to: ed-influxdb-2
- name: "create dashboard"
influx_token:
base: "http://localhost:8086"
org: "my-org"
auth_token: "{{influx_token_fetch.stdout_lines[0]}}"
key: "foo123"
description: "token for foo key"
permissions:
- action: "write"
resource:
type: "buckets"
register: ed-influx-token
- debug: msg="Token: {{ed-influx-token.token}}"
'''
def get_org_id(base, org_name, h):
    """Resolve an influxdb2 organisation name to its id.

    :param base: API base URL, e.g. ``http://localhost:8086``
    :param org_name: organisation name to look up
    :param h: headers dict carrying the ``Authorization`` token
    :returns: the organisation id string
    :raises ValueError: if no organisation with that name exists
    :raises requests.HTTPError: on an API error response
    """
    # Filter server-side via query parameters: a body passed to GET via
    # ``json=`` is ignored by the API (and may be dropped by proxies).
    ro = requests.get("{base}/api/v2/orgs".format(base=base), headers=h,
                      params={"org": org_name})
    ro.raise_for_status()
    # keep the client-side name filter as a safety net
    org_id = [o["id"] for o in ro.json()["orgs"] if o["name"] == org_name]
    if not org_id:
        # explicit error instead of an opaque IndexError
        raise ValueError("organisation {0!r} not found".format(org_name))
    return org_id[0]
def marker(key):
    """Return the marker string embedded in a token's description.

    The marker identifies tokens managed by this module so check() can find
    them again on later runs.
    """
    return "__ANSIBLE_TOKEN_{0}__".format(key)
def _filter_perms(perms):
for r in perms:
r["resource"] = {k: v for k, v in r["resource"].items() if v}
return perms
class Token:
    """Idempotent helper around ``/api/v2/authorizations``.

    Tokens created by this module are recognised on later runs via a marker
    string embedded in their description (see :func:`marker`). ``check()``
    decides whether a create or a description update is needed and stores the
    pending action in ``self.f``; ``run()`` executes it.
    """

    def __init__(self, base, h, data):
        """
        :param base: API base URL
        :param h: headers dict carrying the ``Authorization`` token
        :param data: dict with ``key``, ``org_id``, ``perms``, ``description``
        """
        self.base = base
        self.h = h
        self.marker = marker(data["key"])
        self.org_id = data["org_id"]
        self.perms = _filter_perms(data["perms"])
        # marker is appended so the token can be found again later
        self.description = data["description"]+" "+self.marker
        self.result_token = None  # matching/created authorization, once known
        self.f = None             # pending action closure set by check()

    def check(self):
        """Return True when a change (create or update) is required."""
        ra = requests.get("{base}/api/v2/authorizations".format(
            base=self.base),
            params={"orgID": self.org_id},
            headers=self.h)
        ra.raise_for_status()
        update = None
        for i in ra.json()["authorizations"]:
            # only consider tokens carrying our marker in this organisation
            if self.marker not in i["description"] \
                    or i["orgID"] != self.org_id:
                continue
            if self._match_perms(self.perms, i["permissions"]):
                self.result_token = i
                if self.description == i["description"]:
                    return False  # everything matches -> no change needed
                else:
                    # same permissions, stale description -> patch it
                    update = {"auth_id": i["id"],
                              "description": self.description}
        # TODO: may remove token because permissions do not match?
        if update:
            self.f = lambda: self._update(**update)
        else:
            # no marked token with matching permissions -> create a new one
            self.result_token = None
            self.f = lambda: self._create({
                "orgID": self.org_id,
                "description": self.description,
                "permissions": self.perms
            })
        return True

    def run(self):
        """Execute the pending action, running check() first if needed."""
        if not self.f:
            self.check()
        self.f()

    def _match_perms(self, pa, pb):
        """Return True when permission lists pa and pb are equivalent sets."""
        def g(match, lst):
            # Find the index in lst of the first permission matching `match`;
            # raise ValueError when none matches.
            for idx, i in enumerate(lst):
                if i['action'] != match['action']:
                    continue
                # NOTE(review): this accepts a candidate as soon as ONE
                # resource key matches (mismatching keys are `continue`d,
                # a matching one returns immediately) -- verify whether all
                # keys were meant to match instead.
                for k, v in match['resource'].items():
                    if k not in i['resource'] or i['resource'][k] != v:
                        continue
                    else:  # first best match
                        return idx
            else:
                raise ValueError
        # consume matches out of a copy of pb so duplicates are counted
        b = [b.copy() for b in pb]
        for i in pa:
            try:
                b.pop(g(i, b))
            except ValueError:
                return False  # permission i not present in b
        if b:  # not empty
            return False  # some permissions in b not in a
        return True

    def _update(self, auth_id, description):
        # PATCH only changes the description; permissions cannot be updated
        ra = requests.patch(
            "{base}/api/v2/authorizations/{auth_id}".format(
                base=self.base, auth_id=auth_id),
            headers=self.h, json={"description": description})
        ra.raise_for_status()
        return ra

    def _create(self, data):
        # POST creates the token; the response (incl. secret) is cached
        ra = requests.post("{base}/api/v2/authorizations".format(
            base=self.base),
            headers=self.h, json=data)
        ra.raise_for_status()
        self.result_token = ra.json()
        return ra
if __name__ == "__main__":
    # Ansible module entry point: ensure an influxdb2 API token with the given
    # permissions exists; returns the token secret and changed.
    result = dict(changed=False, message="")
    module = AnsibleModule(
        argument_spec=dict(
            base=dict(type="str", required=True),
            org=dict(type="str", required=True),
            auth_token=dict(type="str", required=True),
            key=dict(type="str", required=True),
            description=dict(type="str", default=""),
            # nested spec validates each permission's action and resource type
            permissions=dict(type="list", elements='dict', options=dict(
                action=dict(type='str', choices=['read', 'write'],
                            required=True),
                resource=dict(type='dict', options=dict(
                    id=dict(type='str'),
                    name=dict(type='str'),
                    org=dict(type='str'),
                    orgID=dict(type='str'),
                    type=dict(type='str', required=True, choices=[
                        "authorizations", "buckets", "dashboards", "orgs",
                        "sources", "tasks", "telegrafs", "users", "variables",
                        "scrapers", "secrets", "labels", "views", "documents",
                        "notificationRules", "notificationEndpoints", "checks",
                        "dbrp", "flows", "annotations", "functions"]),
                ), required=True),
            ), required=True),
            force=dict(type="bool", default=False),
        ),
        supports_check_mode=True
    )
    h = {"Authorization": "Token {token}".format(
        token=module.params["auth_token"])}
    t = Token(module.params["base"], h, {
        "org_id": get_org_id(module.params["base"], module.params["org"], h),
        "key": module.params["key"],
        "perms": module.params["permissions"],
        "description": module.params["description"]})
    changed = t.check()
    if t.result_token:
        # an existing matching token was found: return its secret
        result['token'] = t.result_token["token"]
    result['changed'] = changed
    if module.check_mode:
        module.exit_json(**result)
    if changed or module.params["force"]:
        # force re-runs the action even when check() saw no difference
        t.run()
        result['token'] = t.result_token["token"]
    module.exit_json(**result)

View File

@@ -0,0 +1,34 @@
# Build the generic (non-slurm) cobald docker image from the templated
# Dockerfile plus its entrypoint scripts.
- file:
    path: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/"
    state: directory
    owner: "{{unpriv_user}}"
    group: docker
- template:
    src: cobald.Dockerfile
    dest: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/Dockerfile"
    owner: "{{unpriv_user}}"
    group: docker
  register: cobald_cp_dockerfile
- copy:
    src: "{{item}}"
    dest: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/{{item}}"
    owner: "{{unpriv_user}}"
    group: docker
    mode: 0755
  with_items:
    - cobald-entrypoint.sh
    - init-cobaldmodules.sh
  register: cobald_cp_files
- docker_image:
    name: "cobald"
    tag: "{{cobald_image_tag|default('latest')}}"
    # pull: False
    build:
      pull: False
      path: "/container/docker-images/cobald.{{cobald_image_tag|default('latest')}}/"
    source: build
    # rebuild only when the Dockerfile or any copied script changed
    force_source: "{{cobald_cp_dockerfile.changed or cobald_cp_files.changed}}"

View File

@@ -0,0 +1,11 @@
# Determine the cobald container hostname: reuse the hostname of an already
# running container, otherwise generate a random "cobald-xxxxxx" name.
- block:
    - docker_container_info:
        name: "{{ container_name | mandatory }}"
      register: cobald_container_info
    - set_fact:
        cobald_container_hostname: |-
          {{cobald_container_info.container.Config.Hostname | default('cobald-'+
          lookup('password', '/dev/null chars=ascii_lowercase length=6')) }}
  when: cobald_container_hostname is not defined

View File

@@ -0,0 +1,54 @@
# Deploy grafana in docker and wire it to influxdb2 (Flux datasource) plus
# the pre-built dashboard.
- name: create influx token for grafana
  influx_token:
    base: "http://localhost:{{influx_pubport}}"
    org: "my-org"
    auth_token: "{{influx_admin_token}}"
    description: grafana read access
    key: grafana
    permissions:
      # read-only: grafana only queries buckets
      - action: read
        resource:
          type: buckets
  register: influx_grafana_token
- name: run grafana
  docker_container:
    name: ed-grafana
    image: docker.io/grafana/grafana:7.5.7
    hostname: ed-grafana
    domainname: cobald.local
    networks:
      - name: "{{cobald_docker_network}}"
    networks_cli_compatible: True
    published_ports:
      - "3000:3000"
    state: started
    detach: True
    cleanup: True
# grafana needs a moment before its HTTP API answers
- wait_for:
    host: localhost
    port: 3000
- community.grafana.grafana_datasource:
    grafana_url: http://localhost:3000
    grafana_user: admin
    grafana_password: admin
    name: InfluxDB
    ds_type: influxdb
    ds_url: "{{influx_url}}"
    additional_json_data:
      defaultBucket: "{{influx_bucket}}"
      organization: "{{influx_org}}"
      version: Flux
    additional_secure_json_data:
      token: "{{influx_grafana_token.token}}"
- community.grafana.grafana_dashboard:
    grafana_url: http://localhost:3000
    grafana_user: admin
    grafana_password: admin
    state: present
    commit_message: updated by ansible
    overwrite: yes
    json_data: "{{lookup('file', 'grafana-dashboard.json')|from_json}}"

View File

@@ -0,0 +1,76 @@
# Deploy influxdb2 in docker, bootstrap it, and create the cobald bucket,
# dashboard and telegraf write token.
- name: run influxdb in docker container
  docker_container:
    name: ed-influxdb
    image: docker.io/library/influxdb:2.0
    hostname: "{{cobald_influx_hostname}}"
    domainname: "{{cobald_domainname}}"
    networks:
      - name: "{{ cobald_docker_network }}"
    networks_cli_compatible: True
    published_ports:
      - "{{influx_pubport}}:8086"
    volumes:
      - "ed-influxdb-data:/var/lib/influxdb2"
      - "ed-influxdb-config:/etc/influxdb2"
      - "/container/volumes/influxdb-backup/:/backup"
    env:
      # first-boot auto-setup of user/org/bucket (no-op on later runs)
      DOCKER_INFLUXDB_INIT_MODE: setup
      DOCKER_INFLUXDB_INIT_USERNAME: "{{influx_admin_user}}"
      DOCKER_INFLUXDB_INIT_PASSWORD: "{{influx_admin_pw}}"
      DOCKER_INFLUXDB_INIT_ORG: "{{influx_org}}"
      DOCKER_INFLUXDB_INIT_BUCKET: my-bucket
    state: started
    detach: True
    cleanup: True
# make the container reachable as an inventory host via the docker connection
- name: add ansible connection to influxdb container
  add_host:
    name: ed-influxdb
    ansible_connection: docker
    ansible_docker_extra_args: "-H=ssh://{{ansible_host}}"
  changed_when: False
- name: wait for influx to run
  raw: until curl http://localhost:8086 ; do sleep 1 ; done
  changed_when: False
  delegate_to: ed-influxdb
- name: fetch influxdb auth token
  raw: influx auth list --user my-user --hide-headers --json
  register: influx_token_fetch
  changed_when: False
  delegate_to: ed-influxdb
- name: set influxdb admin token
  set_fact:
    influx_admin_token:
      "{{(influx_token_fetch.stdout | from_json | first).token}}"
- name: create influxdb bucket for cobald
  influx_bucket:
    base: "http://localhost:{{influx_pubport}}"
    org: "my-org"
    auth_token: "{{influx_admin_token}}"
    name: "{{influx_bucket}}"
- name: create influxdb dashboard
  influx_dashboard:
    base: "http://localhost:{{influx_pubport}}"
    org: "my-org"
    auth_token: "{{influx_admin_token}}"
    data: "{{lookup('file', 'influxdb-dashboard-cobald.json')}}"
  when: influxdb_dashboard | default(True)
- name: create influxdb write access token for telegraf
  influx_token:
    base: "http://localhost:{{influx_pubport}}"
    org: "my-org"
    auth_token: "{{influx_admin_token}}"
    description: cobald tardis telegraf monitoring plugin
    key: telegraf_cobaldtardis
    permissions:
      # write-only access limited to the cobald bucket
      - action: write
        resource:
          type: buckets
          name: "{{influx_bucket}}"
  register: influx_telegraf_token

149
roles/cobald/tasks/main.yml Normal file
View File

@@ -0,0 +1,149 @@
# Main task list of the cobald role: build the image (slurm-enabled or
# generic), prepare config/source trees, set up monitoring and the optional
# singularity drone image, then start the cobald container.
- include_vars: cobald-slurm.yml
  when: cobald_slurm | default(False)
  tags: always
- name: build cobald:slurm docker image
  include_role:
    name: slurm
    tasks_from: dockerimage
  vars:
    slurm_image_prefix: cobald
    image_name: "{{cobald_image_tag}}"
    dockerfile: "{{ lookup('template', 'cobald.Dockerfile') }}"
    files_list:
      - cobald-entrypoint.sh
      - init-cobaldmodules.sh
      - start-drone
      - 28-sync-container-slurmd
    # build the files parameter (list of {dest, content}) from files_list
    files: "
      {%- set files = [] -%} {%- for i in files_list -%}
      {%- set files = files.append(
      { 'dest': i, 'content': lookup('file', i) }) -%}
      {%- endfor %}{{ files }}"
  when: cobald_slurm | default(False)
- name: build generic cobald docker image
  include_tasks: dockerimage-generic.yml
  when: not (cobald_slurm | default(False))
- name: make cobald data volume
  file:
    path: "/container/volumes/cobald/"
    state: directory
    owner: "{{unpriv_user}}"
    group: docker
- name: copy cobald config
  copy:
    src: cobald-config/
    dest: "~{{unpriv_user}}/cobald/"
    # do not overwrite a locally modified configuration
    force: False
    owner: "{{unpriv_user}}"
    group: docker
    mode: "0644"
- name: ensure network for cobald container exists
  docker_network:
    name: "{{cobald_docker_network}}"
    state: present
# docker run -v $(pwd)/cobald-config-host:/etc/cobald -v $(pwd)/cobald:/cobald --rm -it cobald bash
- name: install git
  yum:
    name: git
    state: present
- name: make directories for cobald configuration and modules
  file:
    path: "{{item}}"
    owner: "{{unpriv_user}}"
    group: "{{unpriv_user}}"
    mode: "a=rx,u=rwx"
    state: directory
  with_items:
    - "~{{unpriv_user}}/cobald/modules"
    - "~{{unpriv_user}}/cobald"
- name: clone cobald code from git
  git:
    repo: https://github.com/thoto/cobald
    dest: "~{{unpriv_user}}/cobald-src"
    version: bugfix/mixed_construction_methods
    update: no # FIXME
  become: yes
  become_user: "{{unpriv_user}}"
  register: cobald_git_pull
- name: clone tardis code from git
  git:
    repo: https://github.com/MatterMiners/tardis
    dest: "~{{unpriv_user}}/tardis-src"
    version: master
    update: no # FIXME
  become: yes
  become_user: "{{unpriv_user}}"
  register: tardis_git_pull
- name: "get unpriv_user {{unpriv_user}} uid and gid"
  getent:
    database: passwd
    key: "{{unpriv_user}}"
# one short-lived container per source tree, running pip install as the
# unprivileged host user so the .egg-info stays owned by that user
- name: run pip install on cobald and tardis
  docker_container:
    image: "cobald:{{cobald_image_tag|default('latest')}}"
    name: "cobald-src-{{item.name}}-install"
    volumes:
      - "~{{unpriv_user}}/{{item.name}}-src:/usr/local/src/{{item.name}}:rw"
    state: started
    detach: False
    cleanup: True
    user: "{{getent_passwd[unpriv_user][1]}}:{{getent_passwd[unpriv_user][2]}}"
    entrypoint: ""
    command: |
      bash -c 'HOME=/tmp pip3 install --editable /usr/local/src/{{item.name}}'
  with_items:
    - name: cobald
      run: "{{cobald_git_pull.changed}}"
    - name: tardis
      run: "{{tardis_git_pull.changed}}"
  when: item.run
- import_tasks: telegraf.yml
- name: get cobald hostname
  include_tasks: facts.yml
  when: cobald_container_hostname is not defined
- name: build singularity container
  include_tasks:
    file: singularity.yml
    apply:
      tags: singularity
  tags: singularity
- name: run cobald container
  docker_container:
    name: "{{ container_name | default('cobald') }}"
    image: "cobald:{{cobald_image_tag|default('latest')}}"
    hostname: "{{cobald_container_hostname}}"
    domainname: "{{ cobald_domainname | default('cobald.local')}}"
    volumes: "{{default_mounts + cobald_mounts }}"
    networks:
      - name: "{{cobald_docker_network}}"
    networks_cli_compatible: True
    state: started
    detach: True
    cleanup: True
    interactive: True
    # command: python3 -m cobald.daemon /etc/cobald/config.yaml
  vars:
    default_mounts: "{{cobald_slurm_mounts | default([])}}"
    cobald_mounts:
      - "~{{unpriv_user}}/cobald:/etc/cobald"
      # - /container/volumes/cobald:/etc/cobald:ro
      - "/container/docker-images/sing-slurmd/build/:/slurm-singimage/:ro"
      - "~{{unpriv_user}}/cobald/modules:/usr/local/src/cobaldmodules"
      - "~{{unpriv_user}}/cobald-src:/usr/local/src/cobald:ro"
      - "~{{unpriv_user}}/tardis-src:/usr/local/src/tardis:ro"

View File

@@ -0,0 +1,48 @@
# Build the singularity image (slurmd.sif) used as the drone container.
- name: setup singularity
  import_role: name="singularity"
  tags: singularity
- name: make singularity image build directory
  file:
    state: directory
    path: "{{item}}"
    owner: "{{unpriv_user}}"
    group: "docker"
    mode: "0755"
  loop:
    - /container/docker-images/sing-slurmd
    - /container/docker-images/sing-slurmd/cache
    - /container/docker-images/sing-slurmd/build
- name: copy slurm singularity container files
  copy:
    src: "{{item}}"
    dest: "/container/docker-images/sing-slurmd/{{item}}"
    owner: "{{unpriv_user}}"
    group: "docker"
  loop:
    - slurm-slurmd.def
    - 31-slurmd-configless
    - cgroup.conf.noautomount
  register: cobald_copy_sing_files
# force a rebuild by removing the image whenever an input file changed
- name: remove old container
  file:
    path: /container/docker-images/sing-slurmd/build/slurmd.sif
    state: absent
  when: cobald_copy_sing_files.changed
- name: build container
  shell:
    chdir: /container/docker-images/sing-slurmd/
    cmd: SINGULARITY_TMPDIR=/container/docker-images/sing-slurmd/cache
      singularity build --disable-cache
      /container/docker-images/sing-slurmd/build/slurmd.sif
      /container/docker-images/sing-slurmd/slurm-slurmd.def
    creates: /container/docker-images/sing-slurmd/build/slurmd.sif
  register: cobald_sing_build
- debug: msg="{{[cobald_sing_build.stdout, cobald_sing_build.stderr]}}"
  tags: [ never, debug ]
# TODO: trigger copy in cobald container when slurmd.sif rebuilt

View File

@@ -0,0 +1,69 @@
# Build and run the telegraf container, feeding metrics into influxdb2.
- name: setup directories for telegraf
  file:
    path: "/container/{{item}}/telegraf/"
    state: directory
    owner: "{{unpriv_user}}"
    group: docker
  loop:
    - docker-images
    - volumes
- name: copy telegraf Dockerfile
  copy:
    src: telegraf.Dockerfile
    dest: /container/docker-images/telegraf/Dockerfile
    owner: "{{unpriv_user}}"
    group: docker
  register: cobald_cp_telegraf_dockerfile
- name: copy telegraf repo file
  copy: # telegraf is found in influxdb repo
    src: influxdb.repo
    dest: /container/docker-images/telegraf/influxdb.repo
    owner: "{{unpriv_user}}"
    group: docker
- name: docker image for telegraf
  docker_image:
    name: "ed-telegraf"
    build:
      pull: False
      path: "/container/docker-images/telegraf/"
    source: build
    force_source: "{{cobald_cp_telegraf_dockerfile.changed}}"
# provides influx_telegraf_token used in the config template below
- import_tasks: influxdb.yml
  tags: influxdb
- name: generate telegraf config
  template:
    src: telegraf.conf.j2
    dest: /container/volumes/telegraf/telegraf.conf
    owner: "{{unpriv_user}}"
    group: docker
  vars:
    influx_token: "{{influx_telegraf_token.token}}"
    influx_url: "http://{{cobald_influx_hostname}}:8086"
  register: telegraf_config_gen
- name: run telegraf container
  docker_container:
    name: ed-telegraf
    image: ed-telegraf
    hostname: telegraf
    domainname: "{{ cobald_domainname }}"
    networks:
      - name: "{{ cobald_docker_network }}"
        aliases: ["ed-telegraf"]
    volumes:
      - "/container/volumes/telegraf/telegraf.conf:/etc/telegraf/telegraf.conf:ro"
    state: started
    # restart with the new config whenever the template changed
    recreate: "{{ telegraf_config_gen.changed | default(False) | bool }}"
    detach: True
    # cleanup: True
    networks_cli_compatible: True
- import_tasks: grafana.yml
  vars:
    influx_url: "http://{{cobald_influx_hostname}}:8086"
  tags: influxdb

View File

@@ -0,0 +1,73 @@
# Jinja-templated Dockerfile for the cobald/tardis image. The base image and
# the default-command section are controlled by role variables.
FROM {{ cobald_docker_base_image | default("docker.io/library/centos:7") }}
RUN yum update -y && \
    yum install -y python3 git && pip3 install --upgrade pip && \
    yum clean all && rm -rf /var/cache/yum
ARG REPOCOBALD=https://github.com/MatterMiners/cobald
ARG REPOTARDIS=https://github.com/MatterMiners/tardis
RUN git clone $REPOCOBALD /usr/local/src/cobald && \
    git clone $REPOTARDIS /usr/local/src/tardis
RUN mkdir /etc/cobald /var/log/cobald && \
    ( getent passwd cobald > /dev/null || \
      useradd -m -d /var/lib/cobald --no-log-init --system cobald ) && \
    chown cobald:cobald /var/log/cobald
#RUN mkdir /cobald && python3 -m venv /cobald && source /cobald/bin/activate &&\
#    pip3 install --upgrade pip && pip3 install cobald
# cobaldmodules: a volume-mounted source tree symlinked into a pip-installable
# editable package location (see init-cobaldmodules.sh)
RUN mkdir /usr/local/src/cobaldmodules /usr/local/lib/cobaldmodules && \
    ln -s /usr/local/src/cobaldmodules/setup.py \
        /usr/local/lib/cobaldmodules/setup.py && \
    ln -s /usr/local/src/cobaldmodules/cobaldmodules \
        /usr/local/lib/cobaldmodules/cobaldmodules && \
    chown cobald:cobald /usr/local/lib/cobaldmodules
RUN pip3 install --editable /usr/local/src/cobald && \
    pip3 install --editable /usr/local/src/cobald[contrib]
RUN pip3 install --editable /usr/local/src/tardis && \
    pip3 install --editable /usr/local/src/tardis[contrib]
ENV PYTHONPATH=/usr/local/src/cobaldmodules
# pip3 install --editable .
# pip3 install --editable .[contrib]
# pip3 install --upgrade --editable /etc/cobald/modules/
# su cobald -c "python3 -m cobald.daemon /etc/cobald/config.yaml"
VOLUME /usr/local/src/cobaldmodules
VOLUME /etc/cobald
RUN mkdir -p /usr/local/lib/entrypoints.d/
COPY init-cobaldmodules.sh /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
RUN chmod 755 /usr/local/lib/entrypoints.d/50-init-cobaldmodules.sh
COPY start-drone /usr/local/bin/start-drone
COPY 28-sync-container-slurmd /etc/docker-init.d/28-sync-container-slurmd
RUN chmod 755 /usr/local/bin/start-drone /etc/docker-init.d/28-sync-container-slurmd
RUN echo -e "#!/bin/sh\npython3 -m cobald.daemon /etc/cobald/config.yaml" >> /etc/docker-init.d/70-cobald && chmod 755 /etc/docker-init.d/70-cobald
{% if cobald_docker_default_command | default(True) -%}
COPY cobald-entrypoint.sh /usr/local/sbin/cobald-entrypoint.sh
RUN chmod 755 /usr/local/sbin/cobald-entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/cobald-entrypoint.sh" ]
RUN yum -y install iproute &&\
    yum clean all && rm -rf /var/cache/yum
USER cobald
STOPSIGNAL SIGINT
# CMD "python3 -m cobald.daemon /etc/cobald/config.yaml"
# FIX: the start script generated above is 70-cobald; the original CMD pointed
# at the nonexistent 60-cobald and the container exited immediately.
CMD /etc/docker-init.d/70-cobald
{%- endif %}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
# Defaults wiring the cobald image/containers to the slurm role's exported
# "slurm" fact (user, base_image, mounts, network, domain).
cobald_image_tag: slurm
cobald_docker_base_image: "{{slurm.base_image}}"
# the slurm variant ships its own docker-init based command, so the
# Dockerfile's default ENTRYPOINT/CMD section is skipped
cobald_docker_default_command: False
cobald_docker_network: "{{slurm.network}}"
cobald_domainname: "{{slurm.domain}}"
cobald_slurm_mounts: "{{slurm.mounts}}"
#- /container/volumes/slurm/:/etc/slurm/:rw
##- "{{slurm_cfg_path | mandatory}}:/etc/slurm/:rw"
#- /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
## - "{{slurm_munge_path | mandatory}}:/etc/munge/munge.key:rw"
#- slurm-shared:/shared/:rw
## - "{{slurm_shared_path | mandatory}}:{{slurm_shared_target | default('/shared')}}:rw"

View File

@@ -0,0 +1,2 @@
# docker network the cobald container joins (shared with the slurm cluster)
cobald_docker_network: "{{docker_network}}"
# hostname under which the influxdb instance is reachable for monitoring
cobald_influx_hostname: "ed-influxdb"

View File

@@ -8,7 +8,7 @@
value: "15000"
sysctl_file: /etc/sysctl.d/90-max_net_namespaces.conf
- name: "enable user thoto for fakeroot access"
- name: "enable user {{unpriv_user}} for fakeroot access"
lineinfile:
line: "{{unpriv_user}}:4294836224:65536"
dest: "{{item}}"

View File

@@ -1,5 +1,9 @@
container_privileged: False
slurm_user: slurm
slurm_log_path_ctld: /var/log/slurm/slurmctld.log
slurm_log_path_d: /var/log/slurm/slurmd.log
slurm_log_path_sched: /var/log/slurm/slurmsched.log
slurm_prefix: slurm
slurm_domain: slurm.local
container_privileged: False
docker_network: slurm
slurm_image_prefix: slurm

View File

@@ -0,0 +1,18 @@
#!/bin/bash
# Minimal container init: start every script in /etc/docker-init.d/ in the
# background and wait for them; on SIGINT/SIGTERM forward termination to the
# started scripts and their children so services shut down cleanly.
function trp_term(){
	echo pkill -P $pids
	for j in $pids ; do  # word splitting of $pids is intentional
		pkill -P "$j"    # terminate the script's children first
		kill -SIGTERM "$j"
	done
}
trap trp_term SIGINT SIGTERM
pids=""
for i in /etc/docker-init.d/* ; do
	# Skip non-files (also covers the unexpanded glob of an empty directory).
	# FIX: this was `break`, which silently dropped all remaining start
	# scripts as soon as e.g. a subdirectory appeared in the listing.
	[ ! -f "$i" ] && continue
	"$i" &
	pids="$pids $!"
done
wait $pids
# TODO: call start scripts like "foo.sh start" and "foo.sh stop" to avoid pkill

View File

@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Entrypoint fragment: fix ownership and permissions of the munge key.
# The key is bind-mounted from the host (see slurm_default_mounts) and may
# arrive with the wrong owner/mode; munged refuses keys that are not 600.
set -e
if [ -f "/etc/munge/munge.key" ] ; then
chown munge:munge /etc/munge/munge.key
chmod 600 /etc/munge/munge.key
fi

View File

@@ -1,9 +1,8 @@
#!/usr/bin/env bash
set -e
if [ -f "/etc/munge/munge.key" ] ; then
chown munge:munge /etc/munge/munge.key
chmod 600 /etc/munge/munge.key
fi
for i in /usr/local/lib/entrypoints.d/* ; do
[ -f $i ] && /bin/sh $i || break
done
exec "$@"
exec "${@:-/bin/bash}"

View File

@@ -0,0 +1,35 @@
# Shared base image for all slurm containers: CentOS 7 + slurm from EPEL.
FROM docker.io/library/centos:7
RUN yum install -y epel-release && \
yum install -y slurm && \
yum clean all && rm -rf /var/cache/yum
# debugging / network inspection helpers
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
yum clean all && rm -rf /var/cache/yum
# entrypoints.d fragments are run by entrypoint.sh on every container start;
# 10-munge.sh fixes munge key ownership/permissions
RUN mkdir -p /usr/local/lib/entrypoints.d/
COPY --chown=root:root entry-munge.sh /usr/local/lib/entrypoints.d/10-munge.sh
COPY --chown=root:root entrypoint.sh /usr/local/sbin/entrypoint.sh
RUN chmod 755 /usr/local/lib/entrypoints.d/10-munge.sh && \
chmod 755 /usr/local/sbin/entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
# slurm service account; the name is also exported to the environment so the
# docker-init start scripts can reference ${slurmuser}
ARG slurmuser=slurm
ENV slurmuser=${slurmuser}
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
slurm-setuser -u $slurmuser -g $slurmuser -y
# docker-init runs all scripts in /etc/docker-init.d/ (glob order, so the
# munge daemon starts before the slurm daemons added by child images)
COPY docker-init /usr/local/sbin/docker-init
RUN mkdir /etc/docker-init.d && chmod 755 /usr/local/sbin/docker-init
COPY start-scripts/10-munge /etc/docker-init.d/10-munge
RUN chmod 755 /etc/docker-init.d/10-munge
# extra accounts: comma-separated "name homedir" pairs in $moreusers; the
# unquoted $i deliberately splits into the two useradd arguments
ARG moreusers
RUN function mu { [ -z "$1" ] || useradd -d $2 -m --no-log-init --system $1 ;};\
echo "${moreusers}" | tr ',' '\n' | while read i ; do mu $i ; done
CMD /usr/local/sbin/docker-init

View File

@@ -1,43 +0,0 @@
FROM docker.io/library/centos:7 as base
RUN yum install -y epel-release && \
yum install -y slurm && \
yum clean all && rm -rf /var/cache/yum
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
yum clean all && rm -rf /var/cache/yum
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
RUN chown root:root /usr/local/sbin/entrypoint.sh && \
chmod 755 /usr/local/sbin/entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
ARG slurmuser=slurm
ENV slurmuser=${slurmuser}
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
slurm-setuser -u $slurmuser -g $slurmuser -y
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
FROM base as slurmd
RUN yum install -y slurm-slurmd && \
yum clean all && rm -rf /var/cache/yum
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
slurmd -D 2>/dev/null 1>/dev/null & \
tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
FROM base as slurmctld
RUN yum install -y slurm-slurmctld && \
yum clean all && rm -rf /var/cache/yum
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
su -s /bin/sh -c "slurmctld -D" ${slurmuser} 2>/dev/null 1>/dev/null & \
tail --retry --pid $! -f ${SLURMCTLD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'

View File

@@ -1,32 +1,11 @@
FROM docker.io/library/centos:7 as base
RUN yum install -y epel-release && \
yum install -y slurm && \
yum clean all && rm -rf /var/cache/yum
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
yum clean all && rm -rf /var/cache/yum
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
RUN chown root:root /usr/local/sbin/entrypoint.sh && \
chmod 755 /usr/local/sbin/entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
ARG slurmuser=slurm
ENV slurmuser=${slurmuser}
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
slurm-setuser -u $slurmuser -g $slurmuser -y
FROM slurm:base
RUN yum install -y slurm-slurmctld && \
yum clean all && rm -rf /var/cache/yum
COPY start-scripts/20-slurmctld /etc/docker-init.d/20-slurmctld
RUN chmod 755 /etc/docker-init.d/20-slurmctld
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
su -s /bin/sh -c "slurmctld -D" ${slurmuser} 2>/dev/null 1>/dev/null & \
tail --retry --pid $! -f ${SLURMCTLD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'

View File

@@ -1,32 +1,14 @@
FROM docker.io/library/centos:7
RUN yum install -y epel-release && \
yum install -y slurm && \
yum clean all && rm -rf /var/cache/yum
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
yum clean all && rm -rf /var/cache/yum
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
RUN chown root:root /usr/local/sbin/entrypoint.sh && \
chmod 755 /usr/local/sbin/entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
ARG slurmuser=slurm
ENV slurmuser=${slurmuser}
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
slurm-setuser -u $slurmuser -g $slurmuser -y
FROM slurm:base
RUN yum install -y slurm-slurmd && \
yum clean all && rm -rf /var/cache/yum
COPY start-scripts/30-slurmd /etc/docker-init.d/30-slurmd
RUN chmod 755 /etc/docker-init.d/30-slurmd
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
slurmd -D 2>/dev/null 1>/dev/null & \
tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
RUN yum install -y singularity && \
yum clean all && rm -rf /var/cache/yum

View File

@@ -0,0 +1,2 @@
#!/bin/sh
# docker-init start script: run the munge authentication daemon in the
# foreground as the munge user; exec keeps it as this script's process.
exec su -s /bin/sh -c "munged -F" munge

View File

@@ -0,0 +1,4 @@
#!/bin/sh
# docker-init start script: run slurmctld as ${slurmuser}. Its direct output
# is discarded; instead the log files are streamed so they appear in the
# container log. tail exits once slurmctld ($!) terminates (--pid).
su -s /bin/sh -c "slurmctld -D" ${slurmuser} 2>/dev/null 1>/dev/null &
tail --retry --pid $! -f ${SLURMCTLD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}

View File

@@ -0,0 +1,4 @@
#!/bin/sh
# docker-init start script: run slurmd (as root, required for slurmd). Its
# direct output is discarded; the log files are streamed instead so they
# appear in the container log. tail exits once slurmd ($!) terminates.
slurmd -D 2>/dev/null 1>/dev/null &
tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH}

View File

@@ -1,3 +1,3 @@
- name: reconfigure slurm
command:
cmd: docker container exec -it slurm-ctl scontrol reconfigure
shell:
cmd: "docker container exec -it {{slurm_prefix}}-ctl scontrol reconfigure || docker container restart {{slurm_prefix}}-ctl && docker container exec -it {{slurm_prefix}}-ctl scontrol reconfigure"

View File

@@ -1,31 +1,29 @@
- file:
path: "/container/docker-images/{{item}}"
state: directory
owner: "{{unpriv_user}}"
group: docker
- name: build slurm base docker image
include_tasks: dockerimage_build.yml
vars:
slurm_image_prefix: "{{slurm_base_image_prefix | default('slurm') }}"
image_name: base
dockerfile: "{{lookup('file', 'slurm-base.Dockerfile')}}"
files:
- dest: entrypoint.sh
content: "{{ lookup('file', 'entrypoint.sh') }}"
- dest: entry-munge.sh
content: "{{ lookup('file', 'entry-munge.sh') }}"
- dest: docker-init
content: "{{ lookup('file', 'docker-init') }}"
- dest: start-scripts/10-munge
content: "{{ lookup('file', 'start-scripts/10-munge') }}"
image_args:
moreusers: >-
{% for a in slurm_user_accounts | default([]) -%}
{{a['name']}} {{a['dir']}}{{loop.last | ternary('',',')}}
{%- endfor %}
when: not slurm_baseimg_build_chg | default(False)
- copy:
src: "{{item}}.Dockerfile"
dest: "/container/docker-images/{{item}}/Dockerfile"
owner: "{{unpriv_user}}"
group: docker
register: slurm_cp_dockerfile
- copy:
src: "entrypoint.sh"
dest: "/container/docker-images/{{item}}/entrypoint.sh"
owner: root
group: root
mode: u=rwx,g=rx,o=rx
register: slurm_cp_entrypt
- docker_image:
name: "slurm-{{item}}"
# pull: False
build:
pull: False
path: "/container/docker-images/{{item}}"
# target: "{{item}}" # unsupported on old docker-py versions as in el7
source: build
force_source: "{{slurm_cp_dockerfile.changed or slurm_cp_entrypt.changed}}"
- set_fact:
slurm_baseimg_build_chg:
"{{(slurm_baseimg_build_chg | default(False)) or
slurm_img_build.changed}}"
- name: "build slurm base docker image {{image_name}}"
include_tasks: dockerimage_build.yml

View File

@@ -0,0 +1,43 @@
# Generic, parameterised docker image build for the slurm role.
# Expects: slurm_image_prefix, image_name, dockerfile (file content);
# optionally files (list of {dest, content}) and image_args (build args).
- name: create directories for docker image build
  file:
    path: "/container/docker-images/{{slurm_image_prefix}}-{{image_name}}/{{item}}"
    state: directory
    owner: "{{unpriv_user}}"
    group: docker
  # '' yields the build-context root itself; the rest are the unique,
  # non-empty parent directories of the requisite files.
  # FIX: default([]) keeps this task working when no `files` are passed,
  # matching the "copy requisite files" task below.
  loop: "{{ [''] + (files | default([]) | map(attribute='dest') |
            map('dirname') | unique | select | list) }}"
- name: "copy Dockerfile {{slurm_image_prefix}}:{{image_name}}"
  copy:
    content: "{{dockerfile}}"
    dest: "/container/docker-images/{{slurm_image_prefix}}-{{image_name}}/Dockerfile"
    owner: "{{unpriv_user}}"
    group: docker
  register: slurm_cp_dockerfile
- name: copy requisite files
  copy:
    content: "{{ item.content }}"
    dest: "/container/docker-images/{{slurm_image_prefix}}-{{image_name}}/{{item.dest}}"
    owner: root
    group: root
    mode: u=rwx,g=rx,o=rx
  loop: "{{ files | default([]) }}"
  loop_control:
    label: "{{ item.dest }}"
  register: slurm_cp_files
- name: "build docker image {{slurm_image_prefix}}:{{image_name}}"
  docker_image:
    name: "{{slurm_image_prefix}}"
    tag: "{{image_name}}"
#    pull: False
    build:
      args: "{{image_args | default(omit)}}"
      pull: False
      path: "/container/docker-images/{{slurm_image_prefix}}-{{image_name}}/"
    source: build
    # rebuild when any input changed or when the base image was rebuilt
    force_source: "{{slurm_cp_dockerfile.changed or
                     slurm_cp_files.changed or
                     slurm_baseimg_build_chg | default(False) }}"
  register: slurm_img_build

View File

@@ -0,0 +1,35 @@
# Integrate the docker host itself into the containerised slurm cluster so
# jobs can be submitted from the host.
# TODO: this does not work quite right since slurm-ctl does not reach the host
# system. sinfo, scontrol etc. work but srun does not!
- name: "get addresses from docker network"
  docker_network_info:
    name: "{{ docker_network }}"
  register: slurm_network_data
- name: link host slurm config
  # host-side slurm tools share the config written for the containers
  file:
    path: "/etc/slurm/slurm.conf"
    src: "/container/volumes/slurm/slurm.conf"
    force: True
    state: link
    backup: True
- name: create slurm user
  user:
    name: slurm
    system: True
- name: place entry of slurm-ctl in host /etc/hosts
  # look up the controller container's address on the docker network and pin
  # it in /etc/hosts (json_query needs jmespath, ipaddr needs netaddr —
  # NOTE(review): confirm both python libs are present on the target)
  lineinfile:
    line: "{{slurm_network_data.network.Containers | dict2items
           | json_query('[?value.Name==`slurm-ctl`].value.IPv4Address') | first
           | ipaddr('address') }}\tslurm-ctl"
    regexp: "^(\\S*)(\\s*)slurm-ctl$"
    path: /etc/hosts
    backup: True
- name: start munge locally
  # the host munged must use the same key as the containers to authenticate
  service:
    name: munge
    enabled: True
    state: started

View File

@@ -3,10 +3,27 @@
name: [ slurm, slurm-doc ]
state: present
- include_tasks: dockerimage.yml
- name: build docker images for slurm
include_tasks:
file: dockerimage.yml
loop:
- slurmctld
- slurmd
- name: slurmctld
dockerfile: "{{ lookup('file', 'slurmctld.Dockerfile') }}"
files:
- dest: start-scripts/20-slurmctld
content: "{{ lookup('file', 'start-scripts/20-slurmctld') }}"
- name: slurmd
dockerfile: "{{ lookup('file', 'slurmd.Dockerfile') }}"
files:
- dest: start-scripts/30-slurmd
content: "{{ lookup('file', 'start-scripts/30-slurmd') }}"
vars:
image_name: "{{image.name | default(omit) }}"
dockerfile: "{{image.dockerfile | default(omit) }}"
files: "{{image.files | default(omit) }}"
loop_control:
loop_var: image
label: "{{ image.name }}"
- name: generate munge key
shell:
@@ -20,7 +37,8 @@
group: munge
mode: u=rw,g=,o=
- file:
- name: create munge key directory for containers
file:
path: /container/volumes/munge
state: directory
owner: munge
@@ -35,10 +53,18 @@
src: /etc/munge/munge.key
dest: /container/volumes/munge/munge.key
- file:
- name: make slurm directory
file:
path: /container/volumes/slurm/
state: directory
- name: "create docker network to make service discovery work"
docker_network:
name: "{{ docker_network }}"
state: present
register: slurm_network_data
tags: slurm-config
- name: upload slurm config
template:
force: true
@@ -47,59 +73,66 @@
loop:
- slurm.conf
- cgroup.conf
vars:
slurm_exec_node_cores: 3
slurm_exec_node_mem: 5000 # RealMemory=5964
slurm_alloc_nodes_default:
- name: "{{slurm_prefix+'-submit1'}}"
- name: "{{ inventory_hostname }}"
addr: "{{ slurm_network_data.network.IPAM.Config[0].Gateway }}"
alloc_nodes: "{{ slurm_alloc_nodes_default + extra_nodes | default([])}}"
partitions:
- name: cobald
nodeprefix: drone
num_nodes: 10
node_cores: 3
node_mem: 4900
port: 16818
initstate: FUTURE
notify: reconfigure slurm
tags: [ slurm-config ]
- name: "create docker network to make service discovery work"
docker_network:
name: slurm
state: present
tags: slurm-config
- name: "create docker volume for shared access between nodes"
docker_volume:
name: slurm-shared
state: present
- set_fact:
slurm_nodes: # default nodes: controller and submit machine
- machine: ctl
image: slurm-slurmctld
- machine: submit1
image: slurm-slurmd
extra_mounts:
- "/home/{{unpriv_user}}/job3/:/mnt/:rw"
tags: [ slurm-config ]
# TODO: reserve some address using docker_network_info and assign as aux
# address to enable slurmctld to get a static address in order to be
# reachable from slurm running on docker host to enable submitting jobs.
- name: run slurm docker containers
docker_container:
name: "slurm-{{item.machine}}"
hostname: "slurm-{{item.machine}}"
domainname: "slurm.local"
volumes: "{{default_mounts + ( item.extra_mounts | default([]) ) }}"
name: "{{ slurm_prefix }}-{{ item.machine }}"
hostname: "{{ slurm_prefix }}-{{ item.machine }}"
domainname: "{{ slurm_domain }}"
volumes: "{{ slurm_default_mounts + ( item.extra_mounts | default([]) ) }}"
ports: "{{ item.exposed_ports | default([]) }}"
networks:
- name: "slurm"
- name: "{{ docker_network }}"
aliases: "{{ item.aliases | default(omit) }}"
env:
slurmuser: "{{slurm_user}}"
image: "{{item.image}}"
slurmuser: "{{ slurm_user }}"
image: "{{ item.image }}"
state: started
detach: True
cleanup: True
privileged: "{{ container_privileged | bool }}"
networks_cli_compatible: True
vars:
default_mounts:
- /container/volumes/slurm/:/etc/slurm/:rw
- /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
- slurm-shared:/shared/:rw
slurm_nodes_all: | # add execute nodes
{% for i in range(1, 4) -%}
{% set _ = slurm_nodes.extend([
{'machine':'exec%s'|format(i), 'image': 'slurm-slurmd'}]) -%}
{%- endfor %}
{{ slurm_nodes }}
loop: "{{slurm_nodes_all}}"
tags: [ slurm-config ]
interactive: True
vars: # see vars/main.yml
slurm_nodes_all: "{{ slurm_nodes_exec + slurm_nodes_std }}"
loop: "{{ slurm_nodes_all }}"
loop_control:
label: "{{slurm_prefix}}-{{ item.machine }}"
tags: slurm-config
- name: configure host system to integrate into slurm cluster
import_tasks: host-config.yml
when: slurm_hostsystem_cluster_access | default(False)
- name: export facts about slurm cluster to be used by other modules
set_fact:
slurm:
user: "{{slurm_user}}"
domain: "{{slurm_domain}}"
base_image: "slurm:base"
mounts: "{{slurm_default_mounts}}"
network: "{{docker_network}}"
tags: always

View File

@@ -9,6 +9,8 @@ ControlMachine=slurm-ctl
AuthType=auth/munge
#CheckpointType=checkpoint/none
CryptoType=crypto/munge
CommunicationParameters=NoAddrCache
SlurmctldParameters=enable_configless
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
@@ -103,8 +105,10 @@ Waittime=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/linear
# SelectType=select/linear
SelectType=select/cons_res
#SelectTypeParameters=
SelectTypeParameters=CR_CORE
#
#
# JOB PRIORITY
@@ -163,6 +167,27 @@ SlurmSchedLogFile={{slurm_log_path_sched}}
#
#
# COMPUTE NODES
NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN
NodeName=slurm-submit1 CPUs=1 State=UNKNOWN
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes=slurm-submit1 Default=YES MaxTime=INFINITE State=UP
NodeName=slurm-exec[1-{{ num_nodes }}] CPUs={{ slurm_exec_node_cores }} {{''
}} RealMemory={{ slurm_exec_node_mem }} {{''
}} CoresPerSocket={{ slurm_exec_node_cores }} State=UNKNOWN
{% for p in partitions | default([]) %}
NodeName={{ p.nodeprefix }}[1-{{ p.num_nodes }}] CPUs={{ p.node_cores }} {{''
}} RealMemory={{ p.node_mem }} {{''
}} CoresPerSocket={{ p.node_cores }} {{''
}} {%- if p.port is defined %} Port={{ p.port}} {% endif %}{{''
}} State={{ p.initstate | default('UNKNOWN') }}
{% endfor %}
{% for i in alloc_nodes -%}
NodeName={{i.name}}
{%- if i.hostname is defined %} NodeHostname={{i.hostname}} {% endif %}
{%- if i.addr is defined %} NodeAddr={{i.addr}} {% endif %}
State=UNKNOWN
{% endfor %}
PartitionName=container Nodes=slurm-exec[1-{{num_nodes}}] {{ ''
}} AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} {{ ''
}} Default=YES MaxTime=INFINITE State=UP
{% for p in partitions | default([]) %}
PartitionName={{ p.name }} Nodes={{ p.nodeprefix }}[1-{{ p.num_nodes }}] {{ ''
}} AllocNodes={{alloc_nodes |map(attribute='name') | join(',')}} {{ ''
}} MaxTime=INFINITE State=UP
{% endfor %}

21
roles/slurm/vars/main.yml Normal file
View File

@@ -0,0 +1,21 @@
# Static node layout for the containerised slurm cluster (role-internal).
slurm_nodes_std: # default nodes: controller and submit machine
  - machine: ctl
    image: slurm:slurmctld
    # expose slurmctld so the docker host can reach the controller
    exposed_ports: [ "6817:6817/tcp" ]
  - machine: submit1
    image: slurm:slurmd
    extra_mounts:
      - "/home/{{unpriv_user}}/job3/:/mnt/:rw"
# Lazily evaluated Jinja expression (templated where the variable is used, so
# it may depend on num_nodes): yields one slurmd container per execute node,
# each with a "droneN" network alias.
slurm_nodes_exec: | # extend range to execute nodes list
  {% set slurm_nodes_exec = [] %}
  {% for i in range(1, num_nodes+1) -%}
  {% set _ = slurm_nodes_exec.extend([
  {'machine':'exec%s'|format(i), 'image': 'slurm:slurmd',
  'aliases':['drone%s'|format(i)]}]) -%}
  {%- endfor %}
  {{ slurm_nodes_exec }}
# Mounts shared by every slurm container: config, munge key, shared volume.
slurm_default_mounts:
  - /container/volumes/slurm/:/etc/slurm/:rw
  - /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
  - slurm-shared:/shared/:rw