Compare commits

4 Commits

SHA1 Message Date
fdd4bd6bf0 copy plugin just for 2.9 2021-04-30 17:47:54 +02:00
f7dd3bcf02 run slurmctld as user
Notice: also running slurmd on the execute nodes as an unprivileged user
makes no sense because it breaks sbatch. Furthermore, another change is
necessary to run MPI jobs (just tried MpiDefault=none). I don't consider
running slurmd as root a good idea, but there seems to be no other choice
at the moment. (A quick check of the resulting setup is sketched right
after the changed-files summary below.)
2021-04-30 17:15:57 +02:00
f2cb9b2c6b fixed log (includes log output from tasks now) 2021-04-30 16:47:31 +02:00
38a5b89de9 minor fixes 2021-04-29 12:19:33 +02:00
22 changed files with 1597 additions and 51 deletions
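As a quick check for the slurmctld-as-user change from f7dd3bcf02 (a minimal sketch; the container names slurm-ctld, slurm-exec1 and slurm-submit1 are assumptions taken from the node names in slurm.conf, adjust to whatever docker ps reports):

  # slurmctld should now run as the unprivileged slurm user ...
  docker exec slurm-ctld ps -eo user,comm | grep slurmctld
  # ... while slurmd on the execute nodes stays root
  docker exec slurm-exec1 ps -eo user,comm | grep slurmd
  # and sbatch must still work from the submit container
  docker exec slurm-submit1 sbatch --wrap 'hostname'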

inv.yml
View File

@@ -1,12 +1,15 @@
all:
hosts:
centos7VM:
ed-c7-1:
ansible_user: root
ansible_host: localhost
ansible_port: 19322
ansible_host: ed-c7-1.virt.magni.thoto.net
ssh_args: -o ControlMaster=auto -o ControlPersist=60s
unpriv_user: centos7 # gains privileges to use docker
# ansible_host: 192.168.122.139
unpriv_user: thoto
children:
htcondor:
hosts:
ed-c7-1:
slurm:
hosts:
centos7VM
ed-c7-1:

View File

@@ -1,15 +1,46 @@
---
- hosts: all
tasks:
- name: "install epel-release"
yum:
name: [ epel-release ] # necessary to find htop => separate step
state: present
- name: "install tools"
yum:
name: [ vim-enhanced, htop, screen, tmux, bind-utils, nmap-ncat, net-tools ]
name: [ vim-enhanced, htop, screen, bind-utils, nmap-ncat, net-tools ]
state: present
- hosts: htcondor
pre_tasks:
- name: "install htcondor repo"
yum:
name: https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm
state: present
tags: htcondor
- name: "install htcondor software "
yum:
name: htcondor-ce
state: present
tags: htcondor
- name: "remove minicondor configuration"
yum:
name: minicondor
state: absent
tags: htcondor
- name: "setup singularity"
import_tasks: "singularity.yml"
tags: singularity
roles:
- name: "setup docker"
role: docker
tags: docker
- name: "setup htcondor test environment in docker containers"
role: docker-htcondor
tags:
- htcondor-containered
- htcondor
- hosts: slurm
vars:
container_privileged: True
@@ -20,4 +51,6 @@
tags: docker
- name: "setup slurm test environment in docker containers"
role: slurm
vars:
slurm_user: slurm # or root
tags: slurm

View File

@@ -0,0 +1,3 @@
use ROLE:centralmanager
# ALLOW_WRITE defaults to * -> the following has no effect yet
ALLOW_WRITE_COLLECTOR=$(ALLOW_WRITE) condor-exec.htc.local condor-sub.htc.local
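To see what the collector actually ends up allowing here, the knobs can be inspected in the running cm container (a sketch; condor-cm is the container name used by the run task further down):

  docker exec condor-cm condor_config_val -v ALLOW_WRITE ALLOW_WRITE_COLLECTOR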

View File

@@ -0,0 +1,92 @@
######################################################################
##
## condor_config
##
## This is the global configuration file for condor. This is where
## you define where the local config file is. Any settings
## made here may potentially be overridden in the local configuration
## file. KEEP THAT IN MIND! To double-check that a variable is
## getting set from the configuration file that you expect, use
## condor_config_val -v <variable name>
##
## condor_config.annotated is a more detailed sample config file
##
## Unless otherwise specified, settings that are commented out show
## the defaults that are used if you don't define a value. Settings
## that are defined here MUST BE DEFINED since they have no default
## value.
##
######################################################################
## Where have you installed the bin, sbin and lib condor directories?
RELEASE_DIR = /usr
## Where is the local condor directory for each host? This is where the local config file(s), logs and
## spool/execute directories are located. this is the default for Linux and Unix systems.
LOCAL_DIR = /var
## Where is the machine-specific local config file for each host?
# LOCAL_CONFIG_FILE = /etc/condor/condor_config.local
LOCAL_CONFIG_FILE = /etc/condor/condor_config_$(HOSTNAME).local
## If your configuration is on a shared file system, then this might be a better default
#LOCAL_CONFIG_FILE = $(RELEASE_DIR)/etc/$(HOSTNAME).local
## If the local config file is not present, is it an error? (WARNING: This is a potential security issue.)
REQUIRE_LOCAL_CONFIG_FILE = false
## The normal way to do configuration with RPMs is to read all of the
## files in a given directory that don't match a regex as configuration files.
## Config files are read in lexicographic order.
LOCAL_CONFIG_DIR = /etc/condor/config.d
#LOCAL_CONFIG_DIR_EXCLUDE_REGEXP = ^((\..*)|(.*~)|(#.*)|(.*\.rpmsave)|(.*\.rpmnew))$
##
## Do NOT use host-based security by default.
##
## This was the default for the 8.8 series (and earlier), but it is
## intrinsically insecure. To make the 9.0 series secure by default, we
## commented it out.
##
## You should seriously consider improving your security configuration.
##
## To continue to use your old security configuration, knowing that it is
## insecure, add the line 'use SECURITY : HOST_BASED' to your local
## configuration directory. Don't just uncomment the final line in this
## comment block; changes in this file may be lost during your next upgrade.
## The following shell command will make the change on most Linux systems.
##
## echo 'use SECURITY : HOST_BASED' >> $(condor_config_val LOCAL_CONFIG_DIR)/00-insecure.config
##
## To expand your condor pool beyond a single host, set ALLOW_WRITE to match all of the hosts
#ALLOW_WRITE = *.cs.wisc.edu
## FLOCK_FROM defines the machines that grant access to your pool via flocking. (i.e. these machines can join your pool).
#FLOCK_FROM =
## FLOCK_TO defines the central managers that your schedd will advertise itself to (i.e. these pools will give matches to your schedd).
#FLOCK_TO = condor.cs.wisc.edu, cm.example.edu
##--------------------------------------------------------------------
## Values set by the rpm patch script:
##--------------------------------------------------------------------
## For Unix machines, the path and file name of the file containing
## the pool password for password authentication.
#SEC_PASSWORD_FILE = $(LOCAL_DIR)/lib/condor/pool_password
## Pathnames
RUN = $(LOCAL_DIR)/run/condor
LOG = $(LOCAL_DIR)/log/condor
LOCK = $(LOCAL_DIR)/lock/condor
SPOOL = $(LOCAL_DIR)/lib/condor/spool
EXECUTE = $(LOCAL_DIR)/lib/condor/execute
BIN = $(RELEASE_DIR)/bin
LIB = $(RELEASE_DIR)/lib64/condor
INCLUDE = $(RELEASE_DIR)/include/condor
SBIN = $(RELEASE_DIR)/sbin
LIBEXEC = $(RELEASE_DIR)/libexec/condor
SHARE = $(RELEASE_DIR)/share/condor
PROCD_ADDRESS = $(RUN)/procd_pipe
JAVA_CLASSPATH_DEFAULT = $(SHARE) .
## Install the minicondor package to run HTCondor on a single node
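Which of these files and directories are actually picked up can be double-checked from inside a container, e.g. (a sketch, reusing the condor-cm container name):

  docker exec condor-cm condor_config_val -config          # config files in use
  docker exec condor-cm condor_config_val -v LOCAL_CONFIG_DIR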

View File

@@ -0,0 +1,64 @@
HostKey _INSERT_HOST_KEY_
AuthorizedKeysFile _INSERT_AUTHORIZED_KEYS_FILE_
# The following option is not supported by all recent versions of OpenSSH,
# so instead we rely on injection of the shell setup command in the authorized
# keys file.
#ForceCommand _INSERT_FORCE_COMMAND_
# as a convenience to users, allow remote setting of environment
# since sshd is running as the job uid, there isn't really a security concern
AcceptEnv *
Subsystem sftp /usr/libexec/openssh/sftp-server
# prevent sshd from freaking out about reading files from inside
# a tmp-like directory
StrictModes no
# Rejection by tcp wrappers is not logged at INFO or VERBOSE log levels,
# so to make diagnosis of problems easier, we use DEBUG.
LogLevel DEBUG
X11Forwarding yes
# By default, OpenSSH uses the ipv6 loopback even if ipv6 is disabled in the
# kernel. This forces OpenSSH to use the "local" network instead
X11UseLocalhost no
# The following settings are recommended for good security.
# In particular, only FIPS 140-2 algorithms are used.
# URLs for extra information re FIPS security compliance:
# https://rhel7stig.readthedocs.io/en/latest/
# https://www.stigviewer.com/stig/red_hat_enterprise_linux_7/
# https://people.redhat.com/swells/scap-security-guide/tables/table-rhel7-stig-manual.html
#-FIPS High Severity
Protocol 2
PermitEmptyPasswords no
#-FIPS Medium Severity
# Note: Ciphers and MACs below will be incompatible with RHEL5 or earlier.
Ciphers aes128-ctr,aes192-ctr,aes256-ctr
MACs hmac-sha2-256,hmac-sha2-512
PermitRootLogin no
PermitUserEnvironment no
GSSAPIAuthentication no
KerberosAuthentication no
HostbasedAuthentication no
IgnoreRhosts yes
IgnoreUserKnownHosts yes
PrintLastLog yes
UsePrivilegeSeparation sandbox
Compression delayed
#-Recommended for security, but left out ssh_to_job config
# because they provide minimal value and are likely to annoy
# users or generate needless warnings in the ssh_to_job setting.
#
# ClientAliveInterval 600 # Note: condor_submit -i sets TMOUT
# ClientAliveCountMax 0
# banner=/etc/issue # Set to your warning banner
# StrictModes yes # Can't set due to tmp-like directory
# RhostsRSAAuthentication no # Obsolete Protocol version 1 option
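This sshd_config is what condor_ssh_to_job starts on the execute side; a hedged usage sketch from the submit container (assumes a running job with id 1.0):

  docker exec -it condor-sub condor_ssh_to_job 1.0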

View File

@@ -0,0 +1,26 @@
##
## Default security settings
##
## Host-based security was the default for the 8.8 series (and earlier).
##
## Host-based security assumes that all users on a machine are trusted.
## For example, if host-based security trusts that a given machine can
## run jobs, then any user who can start a process on that machine can
## start a startd that can "steal" jobs from the system.
#
## To help make HTCondor secure by default, we removed host-based security
## from the default configuration file
## (/etc/condor/condor_config).
##
## New installations of HTCondor should be made using the get_htcondor tool,
## which can automatically establish IDTOKENS-based security across a multi-
## node pool. For existing installations, we recommend you
## consider improving your security configuration.
##
## To continue to use your old security configuration,
## comment out the 'recommended' line below, and uncomment the
## 'host_based' line.
##
# use security : host_based
use security : recommended_v9_0
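Whether the recommended_v9_0 template and the generated IDTOKENS are in effect can be verified roughly like this (a sketch):

  docker exec condor-cm condor_config_val -v SEC_DEFAULT_AUTHENTICATION_METHODS
  docker exec condor-cm condor_token_list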

View File

@@ -0,0 +1 @@
CONDOR_HOST = condor-cm.htc.local

File diff suppressed because it is too large

View File

@@ -0,0 +1 @@
use ROLE:execute

View File

@@ -0,0 +1 @@
use ROLE:submit

View File

@@ -0,0 +1,11 @@
FROM docker.io/library/centos:7
RUN yum install -y https://research.cs.wisc.edu/htcondor/repo/8.9/htcondor-release-current.el7.noarch.rpm && \
yum install --nogpgcheck -y condor && \
yum install -y less && \
yum clean all
RUN yum install -y iproute bind-utils nmap-ncat net-tools && \
yum clean all
CMD bash -c 'cat <({ condor_master -f & tail --retry --pid $! -f /var/log/condor/MasterLog & })'
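The Ansible role below builds this image via docker_image; for debugging it can also be built and entered by hand (a sketch, using the path the role copies the Dockerfile to):

  docker build -t htcondor /container/docker-images/htcondor
  docker run --rm -it --entrypoint bash htcondor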

View File

@@ -0,0 +1,142 @@
- name: "htcondor docker image"
file:
path: "/container/docker-images/htcondor"
state: directory
owner: "{{unpriv_user}}"
group: docker
mode: "u=rwx,g=rwx,o=rx"
- copy:
dest: "/container/docker-images/htcondor/Dockerfile"
src: "htcondor.Dockerfile"
owner: "{{unpriv_user}}"
group: docker
register: cp_dockerfile
- docker_image:
name: "htcondor"
# pull: False
build:
pull: False
path: "/container/docker-images/htcondor"
source: build
force_source: "{{cp_dockerfile.changed}}"
- name: "copy htcondor container configuration"
copy:
src: "conf/{{item}}/"
dest: "/container/volumes/{{item}}/"
owner: "{{unpriv_user}}"
group: docker
mode: "u=rwx,g=rwx"
with_items: [ "cm", "exec", "sub", "common"]
- name: "check if pool shared secret exists"
stat:
path: "/container/volumes/common/passwords.d/POOL"
register: pool_pw
- block:
- name: "create temporary password store"
tempfile:
state: directory
register: pool_pw_tmp
- name: "generate pool password"
copy:
dest: "{{pool_pw_tmp.path}}/poolpw"
content: "{{lookup('password','/dev/null')}}"
no_log: True
- name: "install pool password"
docker_container:
name: "condor-common"
image: htcondor
state: started
volumes:
- "/container/volumes/common/:/etc/condor/:rw"
- "{{pool_pw_tmp.path}}:/tmp/poolpw:ro"
detach: False
cleanup: True
command: "condor_store_cred add -c -i /tmp/poolpw/poolpw"
- name: "remove tokens since pool password (cert) changed"
file:
path: "/container/volumes/{{item}}/tokens.d/condor@htc.local"
state: absent
with_items: [ "cm", "exec", "sub" ]
always:
- name: "remove temporary password store"
file:
path: "{{pool_pw_tmp.path}}"
state: absent
when: pool_pw_tmp is defined and pool_pw_tmp.path
when: not pool_pw.stat.exists
- name: "sync common files to individual containers"
copy_2:
remote_src: True
force: True
directory_mode: preserve
mode: preserve
src: "/container/volumes/common/"
dest: "/container/volumes/{{item}}/"
with_items: [ "cm", "exec", "sub"]
- name: "collect tokens to generate"
stat:
path: "/container/volumes/{{item}}/tokens.d/condor@htc.local"
with_items: [ "cm", "exec", "sub" ]
register: tokens_state
- name: "generate tokens"
docker_container:
name: "condor-common"
image: htcondor
state: started
volumes:
- "/container/volumes/{{item}}/:/etc/condor/:rw"
detach: False
cleanup: True
command: "condor_token_create -identity condor@$(domainname) -token /etc/condor/tokens.d/condor@htc.local"
with_items: "{{tokens_state.results | rejectattr('stat.exists') | map(attribute='item') | list }}"
- name: "create docker network to make service discovery work"
docker_network:
name: condor
state: present
# TODO: reserve an address using docker_network_info and assign it as an aux
# address so that cm gets a static address and is reachable from the htcondor
# instance running on the docker host, which would allow submitting jobs from there.
- name: "run htcondor containers"
docker_container:
name: "condor-{{item}}"
hostname: "condor-{{item}}"
domainname: "htc.local"
image: htcondor
state: started
detach: True
cleanup: True
networks_cli_compatible: True
networks:
- name: "condor"
aliases: [ "condor-{{item}}.htc.local" ]
volumes:
- "/container/volumes/{{item}}/:/etc/condor/:rw"
with_items: [ "cm", "exec", "sub"]
# auto_remove: True
# mounts:
# src: /container/volumes/cm/
# dest: /etc/condor/
#- add_host:
# hostname: foo
# ansible_connection: docker_api
# docker_host: ssh://ed-c7-1.virt.magni.thoto.net
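Once these tasks have run, the pool can be sanity-checked from the docker host (a sketch; container names come from the run task above):

  docker exec condor-cm condor_status -any   # cm, exec and sub daemons visible?
  docker exec condor-sub condor_q            # schedd on the submit node reachable?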

View File

@@ -8,6 +8,41 @@
name: [ "docker-ce", "python-docker-py" ] # latter for ansible modules
state: present
- name: "partition container image disk"
parted:
device: /dev/vdb
number: 1
state: present
# fs_type: xfs
- filesystem:
dev: /dev/vdb1
fstype: xfs
opts: "-L image-store"
- mount:
path: "/container"
src: "/dev/vdb1"
fstype: xfs
opts: "noatime"
state: mounted
- file:
path: "/container/docker"
state: directory
owner: root
group: root
mode: "u=rwx,g=x,o=x"
- name: "link docker configuration to new container partition"
file:
path: "/var/lib/docker"
src: "/container/docker"
state: link
owner: root
group: root
mode: "u=rwx,g=x,o=x"
- name: "enable docker service in systemd"
service:
name: docker
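A rough way to verify the new image-store layout (a sketch, to be run on the VM itself, ed-c7-1):

  lsblk /dev/vdb            # partition vdb1 present?
  findmnt /container        # mounted as xfs with noatime?
  ls -ld /var/lib/docker    # symlink pointing into /container/docker?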

View File

@@ -1 +1,5 @@
container_privileged: False
slurm_user: slurm
slurm_log_path_ctld: /var/log/slurm/slurmctld.log
slurm_log_path_d: /var/log/slurm/slurmd.log
slurm_log_path_sched: /var/log/slurm/slurmsched.log

View File

@@ -1,8 +1,9 @@
#!/usr/bin/env bash
set -e
chown munge:munge /etc/munge/munge.key
chmod 600 /etc/munge/munge.key
if [ -f "/etc/munge/munge.key" ] ; then
chown munge:munge /etc/munge/munge.key
chmod 600 /etc/munge/munge.key
fi
exec "$@"

View File

@@ -2,25 +2,42 @@ FROM docker.io/library/centos:7 as base
RUN yum install -y epel-release && \
yum install -y slurm && \
yum clean all
yum clean all && rm -rf /var/cache/yum
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
yum clean all
yum clean all && rm -rf /var/cache/yum
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
RUN chown root:root /usr/local/sbin/entrypoint.sh && chmod 755 /usr/local/sbin/entrypoint.sh
RUN chown root:root /usr/local/sbin/entrypoint.sh && \
chmod 755 /usr/local/sbin/entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
ARG slurmuser=slurm
ENV slurmuser=${slurmuser}
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
slurm-setuser -u $slurmuser -g $slurmuser -y
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
FROM base as slurmd
RUN yum install -y slurm-slurmd && \
yum clean all
yum clean all && rm -rf /var/cache/yum
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
slurmd -D 2>/dev/null 1>/dev/null & \
tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
FROM base as slurmctld
RUN yum install -y slurm-slurmctld && \
yum clean all
yum clean all && rm -rf /var/cache/yum
# CMD bash -c 'cat <({ condor_master -f & tail --retry --pid $! -f /var/log/condor/MasterLog & })'
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
su -s /bin/sh -c "slurmctld -D" ${slurmuser} 2>/dev/null 1>/dev/null & \
tail --retry --pid $! -f ${SLURMCTLD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'
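The build task further down leaves the target option commented out because the docker-py shipped with el7 does not support it, but with a current docker CLI the two stages of this Dockerfile could be built directly (a sketch, run from the image directory):

  docker build --target slurmd    --build-arg slurmuser=slurm -t slurm-slurmd .
  docker build --target slurmctld --build-arg slurmuser=slurm -t slurm-slurmctld .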

View File

@@ -1,20 +1,32 @@
FROM docker.io/library/centos:7 as base
RUN yum install -y epel-release && \
yum install -y slurm slurm-slurmctld && \
yum clean all
yum install -y slurm && \
yum clean all && rm -rf /var/cache/yum
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
yum clean all
RUN yum install -y slurm-slurmctld && \
yum clean all
yum clean all && rm -rf /var/cache/yum
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
RUN chown root:root /usr/local/sbin/entrypoint.sh && chmod 755 /usr/local/sbin/entrypoint.sh
RUN chown root:root /usr/local/sbin/entrypoint.sh && \
chmod 755 /usr/local/sbin/entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & slurmctld -D & })'
# ... & tail --retry --pid $! -f /var/log/condor/MasterLog & })'
ARG slurmuser=slurm
ENV slurmuser=${slurmuser}
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
slurm-setuser -u $slurmuser -g $slurmuser -y
RUN yum install -y slurm-slurmctld && \
yum clean all && rm -rf /var/cache/yum
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
su -s /bin/sh -c "slurmctld -D" ${slurmuser} 2>/dev/null 1>/dev/null & \
tail --retry --pid $! -f ${SLURMCTLD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'

View File

@@ -1,17 +1,32 @@
FROM docker.io/library/centos:7
RUN yum install -y epel-release && \
yum install -y slurm slurm-slurmd && \
yum clean all
yum install -y slurm && \
yum clean all && rm -rf /var/cache/yum
RUN yum install -y less iproute bind-utils nmap-ncat net-tools && \
yum clean all
yum clean all && rm -rf /var/cache/yum
COPY entrypoint.sh /usr/local/sbin/entrypoint.sh
RUN chown root:root /usr/local/sbin/entrypoint.sh && chmod 755 /usr/local/sbin/entrypoint.sh
RUN chown root:root /usr/local/sbin/entrypoint.sh && \
chmod 755 /usr/local/sbin/entrypoint.sh
ENTRYPOINT [ "/usr/local/sbin/entrypoint.sh" ]
# CMD bash -c 'cat <({ condor_master -f & tail --retry --pid $! -f /var/log/condor/MasterLog & })'
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & slurmd -D & })'
ARG slurmuser=slurm
ENV slurmuser=${slurmuser}
RUN useradd -d /var/lib/slurm -m --no-log-init --system $slurmuser &&\
slurm-setuser -u $slurmuser -g $slurmuser -y
RUN yum install -y slurm-slurmd && \
yum clean all && rm -rf /var/cache/yum
ENV SLURMCTLD_LOG_PATH="/var/log/slurm/slurmctld.log"
ENV SLURMD_LOG_PATH="/var/log/slurm/slurmd.log"
ENV SLURM_SCHED_LOG_PATH="/var/log/slurm/slurmsched.log"
CMD bash -c 'cat <({ su -s /bin/sh -c "munged -F" munge & \
slurmd -D 2>/dev/null 1>/dev/null & \
tail --retry --pid $! -f ${SLURMD_LOG_PATH} ${SLURM_SCHED_LOG_PATH} & })'

View File

@@ -1,19 +1,19 @@
- file:
path: "/home/centos7/docker-images/{{item}}"
path: "/container/docker-images/{{item}}"
state: directory
owner: "{{unpriv_user}}"
group: docker
- copy:
src: "{{item}}.Dockerfile"
dest: "/home/centos7/docker-images/{{item}}/Dockerfile"
dest: "/container/docker-images/{{item}}/Dockerfile"
owner: "{{unpriv_user}}"
group: docker
register: slurm_cp_dockerfile
- copy:
src: "entrypoint.sh"
dest: "/home/centos7/docker-images/{{item}}/entrypoint.sh"
dest: "/container/docker-images/{{item}}/entrypoint.sh"
owner: root
group: root
mode: u=rwx,g=rx,o=rx
@@ -24,7 +24,7 @@
# pull: False
build:
pull: False
path: "/home/centos7/docker-images/{{item}}"
path: "/container/docker-images/{{item}}"
# target: "{{item}}" # unsupported on old docker-py versions as in el7
source: build
force_source: "{{slurm_cp_dockerfile.changed or slurm_cp_entrypt.changed}}"

View File

@@ -21,7 +21,7 @@
mode: u=rw,g=,o=
- file:
path: /home/centos7/volumes/munge
path: /container/volumes/munge
state: directory
owner: munge
group: munge
@@ -33,17 +33,17 @@
force: true
mode: preserve
src: /etc/munge/munge.key
dest: /home/centos7/volumes/munge/munge.key
dest: /container/volumes/munge/munge.key
- file:
path: /home/centos7/volumes/slurm/
path: /container/volumes/slurm/
state: directory
- name: upload slurm config
template:
force: true
src: "{{item}}.j2"
dest: "/home/centos7/volumes/slurm/{{item}}"
dest: "/container/volumes/slurm/{{item}}"
loop:
- slurm.conf
- cgroup.conf
@@ -82,6 +82,8 @@
volumes: "{{default_mounts + ( item.extra_mounts | default([]) ) }}"
networks:
- name: "slurm"
env:
slurmuser: "{{slurm_user}}"
image: "{{item.image}}"
state: started
detach: True
@@ -90,8 +92,8 @@
networks_cli_compatible: True
vars:
default_mounts:
- /home/centos7/volumes/slurm/:/etc/slurm/:rw
- /home/centos7/volumes/munge/munge.key:/etc/munge/munge.key:rw
- /container/volumes/slurm/:/etc/slurm/:rw
- /container/volumes/munge/munge.key:/etc/munge/munge.key:rw
- slurm-shared:/shared/:rw
slurm_nodes_all: | # add execute nodes
{% for i in range(1, 4) -%}
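With the munge key and the slurmuser environment variable in place, a rough end-to-end check of the containers looks like this (a sketch; the container names slurm-exec1 and slurm-submit1 are assumptions matching the node names):

  # munge credentials must be accepted across containers
  docker exec slurm-exec1 munge -n | docker exec -i slurm-submit1 unmunge
  # the slurmuser env var should be visible inside each container
  docker exec slurm-exec1 env | grep slurmuser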

View File

@@ -32,6 +32,7 @@ CryptoType=crypto/munge
#MaxStepCount=40000
#MaxTasksPerNode=128
MpiDefault=pmix
# when running slurmd as user change to: MpiDefault=none
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
@@ -57,8 +58,9 @@ SlurmctldPort=6817
SlurmdPidFile=/var/run/slurm/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurm/d
SlurmUser=root
#SlurmdUser=root
SlurmUser={{slurm_user}}
SlurmdUser=root
# SlurmdUser=slurm -> sbatch does not work
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurm/ctld
@@ -130,7 +132,7 @@ AccountingStorageType=accounting_storage/none
#AccountingStorageUser=
AccountingStoreJobComment=YES
ClusterName=cluster
#DebugFlags=
#DebugFlags=Steps,TraceJobs
#JobCompHost=
JobCompLoc=/tmp/jobcomp
#JobCompPass=
@@ -141,10 +143,10 @@ JobCompType=jobcomp/filetxt
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=verbose
#SlurmctldLogFile=
SlurmctldLogFile={{slurm_log_path_ctld}}
SlurmdDebug=verbose
#SlurmdLogFile=
#SlurmSchedLogFile=
SlurmdLogFile={{slurm_log_path_d}}
SlurmSchedLogFile={{slurm_log_path_sched}}
#SlurmSchedLogLevel=
#
#
@@ -163,4 +165,4 @@ SlurmdDebug=verbose
# COMPUTE NODES
NodeName=slurm-exec[1-{{num_nodes}}] CPUs=2 CoresPerSocket=2 State=UNKNOWN
NodeName=slurm-submit1 CPUs=1 State=UNKNOWN
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] Default=YES MaxTime=INFINITE State=UP
PartitionName=debug Nodes=slurm-exec[1-{{num_nodes}}] AllocNodes=slurm-submit1 Default=YES MaxTime=INFINITE State=UP
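After the AllocNodes restriction, allocations may only be requested from slurm-submit1; the effective settings can be confirmed from inside the containers (a sketch, container names assumed as above):

  docker exec slurm-ctld scontrol show config | grep -iE 'slurmuser|slurmduser|mpidefault'
  docker exec slurm-submit1 scontrol show partition debug
  docker exec slurm-submit1 sinfo -Nl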