From dfaff0931b4f39d8f9dff5b75a40d836aff05a8a Mon Sep 17 00:00:00 2001
From: XaverStiensmeier
Date: Mon, 1 Jul 2024 11:39:32 +0200
Subject: [PATCH] updated documentation

---
 bibigrid.yml                                | 21 +++++++++++++--------
 .../markdown/features/configuration.md      | 18 +++++++++++-------
 2 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/bibigrid.yml b/bibigrid.yml
index c2c59cbb..765cab6d 100644
--- a/bibigrid.yml
+++ b/bibigrid.yml
@@ -7,9 +7,9 @@
   cloud: openstack # name of clouds.yaml cloud-specification key (which is value to top level key clouds)
 
   # -- BEGIN: GENERAL CLUSTER INFORMATION --
-  # sshTimeout: 5 # Number of ssh connection attempts with 2^attempt seconds in between (2^sshTimeout-1 is the max time before returning with an error)
+  # sshTimeout: 5 # number of attempts to connect to instances during startup, with a delay in between
   # cloudScheduling:
-  #   sshTimeout: 42 # like the sshTimeout during startup but during the on demand scheduling
+  #   sshTimeout: 42 # like sshTimeout, but applied during on demand scheduling on the running cluster
 
   ## sshPublicKeyFiles listed here will be added to access the cluster. A temporary key is created by bibigrid itself.
   #sshPublicKeyFiles:
@@ -32,7 +32,11 @@
   #   - [...]
 
   ## Uncomment if you don't want assign a public ip to the master; for internal cluster (Tuebingen).
-  #useMasterWithPublicIp: False # defaults True if False no public-ip (floating-ip) will be allocated
+  # useMasterWithPublicIp: False # defaults to True; if False, no public-ip (floating-ip) will be allocated
+  # gateway: # if you want to use a gateway for cluster creation.
+  #   ip: # IP of gateway to use
+  #   portFunction: 30000 + oct4 # variables are called: oct1.oct2.oct3.oct4
+
   # deleteTmpKeypairAfter: False
   # dontUploadCredentials: False
 
@@ -45,7 +49,7 @@
 
   useMasterAsCompute: True
 
-  #waitForServices: # existing service name that runs after an instance is launched. BiBiGrid's playbook will wait until service is "stopped" to avoid issues
+  # waitForServices: # existing service name that runs after an instance is launched. BiBiGrid's playbook will wait until the service is "stopped" to avoid issues
   #  - de.NBI_Bielefeld_environment.service # uncomment for cloud site Bielefeld
 
   # master configuration
@@ -59,7 +63,7 @@
   #   fallbackOnOtherImage: False # if True, most similar image by name will be picked. A regex can also be given instead.
 
   # worker configuration
-  #workerInstances:
+  # workerInstances:
   #  - type: # existing type/flavor on your cloud. See launch instance>flavor for options
   #    image: # same as master. Consider using regex to prevent image updates from breaking your running cluster
   #    count: # any number of workers you would like to create with set type, image combination
@@ -89,9 +93,6 @@
   # Depends on cloud site and project
   subnet: # existing subnet on your cloud. See https://openstack.cebitec.uni-bielefeld.de/project/networks/
   # or network:
-  # gateway: # if you want to use a gateway for create.
-  #   ip: # IP of gateway to use
-  #   portFunction: 30000 + oct4 # variables are called: oct1.oct2.oct3.oct4
 
   # Uncomment if no full DNS service for started instances is available.
   # Currently, the case in Berlin, DKFZ, Heidelberg and Tuebingen.
@@ -99,4 +100,8 @@
 
   #features: # list
 
+  # elastic_scheduling: # for large or slow clusters, increasing these timeouts might be necessary to avoid failures
+  #   SuspendTimeout: 30 # after SuspendTimeout seconds, slurm allows powering up the node again
+  #   ResumeTimeout: 900 # if a node doesn't start in ResumeTimeout seconds, the start is considered failed.
+
   #- [next configurations]
diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md
index 0bfc4f18..0b86fcd2 100644
--- a/documentation/markdown/features/configuration.md
+++ b/documentation/markdown/features/configuration.md
@@ -67,7 +67,7 @@ Attempts have a pause of `2^(attempts+2)` seconds in between.
 Default value is 5
 ```yaml
 cloudScheduling:
-  sshTimeout: 4
+  sshTimeout: 5
 ```
 
 #### masterMounts (optional:False)
@@ -150,16 +150,19 @@ If `False`, the cluster will start without the job scheduling system slurm.
 For nearly all cases the default value is what you need. Default is `True`.
 
 ##### slurmConf (optional)
-`slurmConf` contains variable fields in the `slurm.conf`. The most common use is to increase the `SuspendTime`
-and the `ResumeTimeout` like:
+`slurmConf` contains variable fields in the `slurm.conf`. The most common use is to increase the `SuspendTime`,
+`SuspendTimeout`, and `ResumeTimeout` like:
 
 ```yaml
 elastic_scheduling:
   SuspendTime: 1800
+  SuspendTimeout: 60
   ResumeTimeout: 1800
 ```
 
-Please only use if necessary. On Demand Scheduling improves resource availability for all users.
+Increasing the `SuspendTime` should only be done with consideration for other users.
+On Demand Scheduling improves resource availability for all users.
+If some nodes need to be active during the entire cluster lifetime, [onDemand](#workerinstances) might be the better approach.
 
 ###### Defaults
 ```yaml
@@ -169,8 +172,9 @@ slurmConf:
   db_password: changeme
   munge_key: # automatically generated via id_generation.generate_munge_key
   elastic_scheduling:
-    SuspendTime: 900 # if a node doesn't start in SuspendTime seconds, the start is considered failed. See https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeProgram
-    ResumeTimeout: 900 # if a node is not used for ResumeTimeout seconds, it will shut down
+    SuspendTime: 900 # if a node is not used for SuspendTime seconds, it will shut down
+    SuspendTimeout: 30 # after SuspendTimeout seconds, slurm allows powering up the powered-down node again
+    ResumeTimeout: 900 # if a node doesn't start in ResumeTimeout seconds, the start is considered failed. See https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeProgram
     TreeWidth: 128 # https://slurm.schedmd.com/slurm.conf.html#OPT_TreeWidth
 ```
 
@@ -254,7 +258,7 @@ workerInstance:
 - `type` sets the instance's hardware configuration.
 - `image` sets the bootable operating system to be installed on the instance.
 - `count` sets how many workers of that `type` `image` combination are in this work group
-- `onDemand` defines whether nodes in the worker group are scheduled on demand (True) or are started permanently (False). Please only use if necessary. On Demand Scheduling improves resource availability for all users. This option only works on the master cloud for now.
+- `onDemand` defines whether nodes in the worker group are scheduled on demand (True) or are started permanently (False). Please only use it if necessary. On Demand Scheduling improves resource availability for all users. This option only works for single-cloud setups for now.
 - `partitions` allow you to force Slurm to schedule to a group of nodes (partitions) ([more](https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION))
 - `features` allow you to force Slurm to schedule a job only on nodes that meet certain `bool` constraints.
  This can be helpful when only certain nodes can access a specific resource - like a database ([more](https://slurm.schedmd.com/slurm.conf.html#OPT_Features)).
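The gateway block that this patch documents in bibigrid.yml is easiest to read with concrete values filled in. A minimal sketch follows; the IP address is a placeholder from the documentation range (192.0.2.0/24), and only the `30000 + oct4` port function itself comes from the template:

```yaml
# hypothetical example values, not site defaults
gateway:
  ip: 192.0.2.10                 # public IP of the gateway host (placeholder)
  portFunction: 30000 + oct4     # ssh port on the gateway, derived from the instance's last IP octet
```

With this port function, an instance whose private IP ends in .42 would be reached through port 30042 on the gateway.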
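For the `slurmConf` change in configuration.md, a minimal sketch of an override for a slow cloud, where nodes take a long time to boot, might look like the following. The numbers are illustrative only, not recommendations, and a larger `SuspendTime` should only be set with consideration for other users:

```yaml
slurmConf:
  elastic_scheduling:
    SuspendTime: 3600     # keep idle workers up for an hour before powering them down (illustrative)
    SuspendTimeout: 60    # wait 60 seconds before a powered-down node may be resumed again
    ResumeTimeout: 1800   # only consider a node start failed after 30 minutes
```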
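And for the `onDemand` flag described in the workerInstances bullet list, a sketch of a worker group that stays up for the whole cluster lifetime; the flavor and image names are placeholders for whatever your cloud actually offers:

```yaml
workerInstances:
  - type: de.NBI small            # placeholder flavor name; check your cloud's flavor list
    image: Ubuntu 22.04 LTS.*     # regex, so image updates don't break the running cluster
    count: 2
    onDemand: False               # start these workers permanently instead of powering them up per job
```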