Skip to content

Commit

Permalink
Merge branch 'dev' into additional-partitions
Browse files Browse the repository at this point in the history
  • Loading branch information
XaverStiensmeier authored Jun 4, 2024
2 parents 1e26cc7 + 9ad7cab commit b6184e8
Show file tree
Hide file tree
Showing 15 changed files with 403 additions and 36 deletions.
15 changes: 10 additions & 5 deletions bibigrid/core/actions/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,13 @@ def initialize_instances(self):
"gateway": configuration.get("gateway", {}), "timeout": self.ssh_timeout}
if configuration.get("masterInstance"):
self.master_ip = configuration["floating_ip"]
ssh_data["commands"] = self.ssh_add_public_key_commands + ssh_handler.ANSIBLE_SETUP
wait_for_service_command, wait_for_service_message = ssh_handler.a_c.WAIT_FOR_SERVICES
wait_for_services_commands = [
(wait_for_service_command.format(service=service), wait_for_service_message.format(service=service))
for service in configuration.get("waitForServices", [])]
print(wait_for_services_commands)
ssh_data["commands"] = (
wait_for_services_commands + self.ssh_add_public_key_commands + ssh_handler.ANSIBLE_SETUP)
ssh_data["filepaths"] = [(ssh_data["private_key"], ssh_handler.PRIVATE_KEY_FILE)]
ssh_handler.execute_ssh(ssh_data, self.log)
elif configuration.get("vpnInstance"):
Expand Down Expand Up @@ -314,7 +320,6 @@ def prepare_configurations(self):
@return:
"""
for configuration, provider in zip(self.configurations, self.providers):
configuration["cloud_identifier"] = provider.cloud_specification["identifier"]
if not configuration.get("network"):
self.log.debug("No network found. Getting network by subnet.")
configuration["network"] = provider.get_network_id_by_subnet(configuration["subnet"])
Expand Down Expand Up @@ -355,9 +360,9 @@ def upload_data(self):
self.log.debug(f"Starting playbook with {ansible_start}.")
commands = [ssh_handler.get_ac_command(self.providers, AC_NAME.format(
cluster_id=self.cluster_id))] + ssh_handler.ANSIBLE_START
ssh_data = {"floating_ip": self.master_ip, "private_key": KEY_FOLDER + self.key_name,
"username": self.ssh_user, "commands": commands, "filepaths": FILEPATHS,
"gateway": self.configurations[0].get("gateway", {}), "timeout": self.ssh_timeout}
ssh_data = {"floating_ip": self.master_ip, "private_key": KEY_FOLDER + self.key_name, "username": self.ssh_user,
"commands": commands, "filepaths": FILEPATHS, "gateway": self.configurations[0].get("gateway", {}),
"timeout": self.ssh_timeout}
ssh_handler.execute_ssh(ssh_data=ssh_data, log=self.log)

def start_start_server_threads(self):
Expand Down
3 changes: 3 additions & 0 deletions bibigrid/core/utility/ansible_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@
"Execute ansible playbook. Be patient.")

# ansible setup
WAIT_FOR_SERVICES = (
"while [[ $(systemctl is-active {service}) == 'active' ]]; do echo 'Waiting for service {service}'; sleep 2; done",
"Waiting for service {service}.")
UPDATE = ("sudo apt-get update", "Update apt repository lists.")
PYTHON3_PIP = "sudo apt-get install -y python3-pip", "Install python3 pip using apt."
ANSIBLE_PASSLIB = ("sudo pip install ansible==6.6 passlib", "Install Ansible and Passlib using pip.")
4 changes: 3 additions & 1 deletion bibigrid/core/utility/handler/configuration_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,5 +149,7 @@ def get_cloud_specifications(configurations, log):
for configuration in configurations:
cloud = configuration.get(CLOUD_CONFIGURATION_KEY)
if cloud:
cloud_specifications.append(get_cloud_specification(cloud, clouds, clouds_public, log)) # might be None
cloud_specification = get_cloud_specification(cloud, clouds, clouds_public, log)
cloud_specifications.append(cloud_specification) # might be None if not found
configuration["cloud_identifier"] = cloud_specification["identifier"]
return cloud_specifications
10 changes: 3 additions & 7 deletions bibigrid/core/utility/validate_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import os

from bibigrid.core.utility import validate_schema
from bibigrid.core.utility import image_selection
from bibigrid.core.utility.handler import configuration_handler
from bibigrid.models.exceptions import ImageNotActiveException
Expand Down Expand Up @@ -201,13 +202,8 @@ def validate(self):
@return:
"""
success = bool(self.providers)
self.log.info("Validating config file...")
# success = check_provider_data(
# configuration_handler.get_list_by_key(self.configurations, "cloud"),
# len(self.configurations)) and success
# if not success:
# LOG.warning("Providers not set correctly in configuration file. Check log for more detail.")
# return success
success = validate_schema.validate_configurations(self.configurations, self.log) and success

checks = [("master/vpn", self.check_master_vpn_worker), ("servergroup", self.check_server_group),
("instances", self.check_instances), ("volumes", self.check_volumes), ("network", self.check_network),
("quotas", self.check_quotas), ("sshPublicKeyFiles", self.check_ssh_public_key_files),
Expand Down
60 changes: 60 additions & 0 deletions bibigrid/core/utility/validate_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""
Handles the schema validation for BiBiGrid's configuration yml.
"""

from schema import Schema, Optional, Or, SchemaError

# Define the schema for the configuration file
# Layout of one entry of 'workerInstances'; used by the master and by all other configurations.
_WORKER_INSTANCE_SCHEMA = {
    'type': str,
    'image': str,
    Optional('count'): int,
    Optional('onDemand'): bool,
    Optional('partitions'): [str],
    Optional('features'): [str],
}

# Layout of one entry of 'userRoles'.
_USER_ROLE_SCHEMA = {
    'hosts': [str],
    'roles': [{'name': str, Optional('tags'): [str]}],
}

# Layout of 'slurmConf'; every key is optional.
_SLURM_CONF_SCHEMA = {
    Optional('db'): str,
    Optional('db_user'): str,
    Optional('db_password'): str,
    Optional('munge_key'): str,
    Optional('elastic_scheduling'): {
        Optional('SuspendTime'): int,
        Optional('ResumeTimeout'): int,
        Optional('TreeWidth'): int,
    },
}

# Schema of the first configuration; it is the only one that requires 'masterInstance'.
master_schema = Schema({
    # required general keys
    'infrastructure': str,
    'cloud': str,
    'sshUser': str,
    Or('subnet', 'network'): str,
    'cloud_identifier': str,
    # optional cluster-wide settings
    Optional('sshPublicKeyFiles'): [str],
    Optional('sshTimeout'): int,
    Optional('cloudScheduling'): {Optional('sshTimeout'): int},
    Optional('autoMount'): bool,
    Optional('masterMounts'): [str],
    Optional('nfsShares'): [str],
    Optional('userRoles'): [_USER_ROLE_SCHEMA],
    Optional('localFS'): bool,
    Optional('localDNSlookup'): bool,
    Optional('slurm'): bool,
    Optional('slurmConf'): _SLURM_CONF_SCHEMA,
    Optional('zabbix'): bool,
    Optional('nfs'): bool,
    Optional('ide'): bool,
    Optional('useMasterAsCompute'): bool,
    Optional('useMasterWithPublicIp'): bool,
    Optional('waitForServices'): [str],
    Optional('gateway'): {'ip': str, 'portFunction': str},
    Optional('fallbackOnOtherImage'): bool,
    Optional('localDNSLookup'): bool,
    Optional('features'): [str],
    # instances
    'workerInstances': [_WORKER_INSTANCE_SCHEMA],
    'masterInstance': {
        'type': str,
        'image': str,
        Optional('partitions'): [str],
        Optional('features'): [str],
    },
    Optional('vpngtw'): {'type': str, 'image': str},
})

# Schema of every configuration after the first; requires 'vpnInstance' and has no 'masterInstance' key.
other_schema = Schema({
    'infrastructure': str,
    'cloud': str,
    'sshUser': str,
    Or('subnet', 'network'): str,
    'cloud_identifier': str,
    Optional('waitForServices'): [str],
    Optional('features'): [str],
    'workerInstances': [_WORKER_INSTANCE_SCHEMA],
    'vpnInstance': {'type': str, 'image': str},
})


def _warn_on_deprecated_keys(configuration, log):
    """
    Logs a warning if the configuration still contains the deprecated keys 'region' or 'availabilityZone'.
    @param configuration: a single configuration; anything that is not a dict is ignored here
    @param log: logger the warning is written to
    @return:
    """
    if isinstance(configuration, dict) and (configuration.get("region") or configuration.get("availabilityZone")):
        log.warning(
            "Keys 'region' and 'availabilityZone' are deprecated! Check will return False if you use one of them. "
            "Just remove them. They are no longer required.")


def validate_configurations(configurations, log):
    """
    Validates every configuration against its schema.
    The first configuration is validated against master_schema, all following ones against other_schema.
    Validation stops at the first configuration that violates its schema.
    @param configurations: list of configurations (dicts); the first one is treated as the master configuration
    @param log: logger used for info, debug and warning messages
    @return: True if all configurations are valid, False if none were given or one of them is invalid
    """
    log.info("Validating config file schema...")
    if not configurations:
        # Guard: indexing an empty list would raise an IndexError that the handler below does not catch.
        log.warning("No configuration found. Schema validation failed.")
        return False
    configuration = None
    try:
        for position, configuration in enumerate(configurations):
            _warn_on_deprecated_keys(configuration, log)
            if position == 0:
                master_schema.validate(configuration)
                log.debug(f"Master configuration '{configuration['cloud_identifier']}' valid.")
            else:
                other_schema.validate(configuration)
                log.debug(f"Configuration '{configuration['cloud_identifier']}' schema valid.")
        log.debug("Entire configuration schema valid.")
        return True
    except SchemaError as err:
        # The offending entry might not be a dict (e.g. an empty list item), so only use .get on dicts.
        identifier = "No identifier found"
        if isinstance(configuration, dict):
            identifier = configuration.get("cloud_identifier", identifier)
        log.warning(f"Configuration '{identifier}' invalid. See error: {err}.")
        return False
22 changes: 12 additions & 10 deletions documentation/markdown/features/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,9 @@ userRoles: # see ansible_hosts for all options
- hosts:
- "master"
roles: # roles placed in resources/playbook/roles_user
- name: "resistance_nextflow"
- name: "resistance_nextflow"
tags:
- resistance_nextflow
# varsFiles: # (optional)
# - file1
```
Expand All @@ -135,8 +137,8 @@ If `True`, master will store DNS information for his workers. Default is `False`
If `False`, the cluster will start without the job scheduling system slurm.
For nearly all cases the default value is what you need. Default is `True`.

##### SlurmConf (optional)
`SlurmConf` contains variable fields in the `slurm.conf`. The most common use is to increase the `SuspendTime`
##### slurmConf (optional)
`slurmConf` contains variable fields in the `slurm.conf`. The most common use is to increase the `SuspendTime`
and the `ResumeTimeout` like:

```yaml
Expand Down Expand Up @@ -181,13 +183,6 @@ If `False`, master will no longer help workers to process jobs. Default is `True

If `False`, master will not be created with an attached floating ip. Default is `True`.

#### waitForServices (optional):

Expects a list of services to wait for.
This is required if your provider has any post-launch services interfering with the package manager. If not set,
seemingly random errors can occur when the service interrupts ansible's execution. Services are
listed on [de.NBI Wiki](https://cloud.denbi.de/wiki/) at `Computer Center Specific` (not yet).

#### gateway (optional)
In order to save valuable floating ips, BiBiGrid can also make use of a gateway to create the cluster.
For more information on how to set up a gateway, how gateways work and why they save floating ips please continue reading [here](https://cloud.denbi.de/wiki/Tutorials/SaveFloatingIPs/).
Expand All @@ -208,6 +203,13 @@ Using gateway also automatically sets [useMasterWithPublicIp](#usemasterwithpubl

### Local

#### waitForServices (optional):

Expects a list of service names. BiBiGrid waits until each listed service is no longer active before it runs its setup commands.
This is required if your provider runs post-launch services that interfere with the package manager. If not set,
seemingly random errors can occur when such a service interrupts Ansible's execution. Services will be
listed on the [de.NBI Wiki](https://cloud.denbi.de/wiki/) under `Computer Center Specific` (not available yet).

#### infrastructure (required)

`infrastructure` sets the used provider implementation for this configuration. Currently only `openstack` is available.
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ uvicorn~=0.23.2
fastapi~=0.101.0
pydantic~=2.1.1
keystoneauth1~=5.1.0
filelock~=3.13.1
filelock~=3.13.1
schema~=0.7.7
12 changes: 5 additions & 7 deletions resources/playbook/roles/bibigrid/files/slurm/fail.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ process_string() {
}

mkdir -p worker_logs
mkdir -p worker_logs/fail
mkdir -p worker_logs/fail/out
mkdir -p worker_logs/fail/err

Expand All @@ -34,16 +35,13 @@ function log {

log "Fail-Script started"

# $1 is in slurm node format for example: bibigrid-worker0-cid-[0-1],bibigrid-worker1-cid-0 and needs no converting
scontrol update NodeName="$1" state=RESUME reason=FailedStartup # no sudo needed cause executed by slurm user

hosts=$(scontrol show hostnames "$1")

echo "Hosts $hosts used"
log "Hosts $hosts used"

# delete servers
python3 /usr/local/bin/delete_server.py "${hosts}"
# $1 is in slurm node format for example: bibigrid-worker0-cid-[0-1],bibigrid-worker1-cid-0 and needs no converting
scontrol update NodeName="$1" state=POWER_DOWN reason=FailedStartup # no sudo needed cause executed by slurm user

echo "Finished delete_server.py execution."
log "Nodes $1 set to POWER_DOWN."

exit $?
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
owner: root
group: root
mode: 0644
remote_src: true
with_items: "{{ collected_files.files }}"
- name: Remove collected files
file:
Expand Down
82 changes: 82 additions & 0 deletions resources/tests/schema/error_master_missing.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# See https://cloud.denbi.de/wiki/Tutorials/BiBiGrid/ (after update)
# First configuration will be used for general cluster information and must include the master.
# All other configurations must not include another master, but exactly one vpnInstance instead (keys like master).

- infrastructure: openstack # former mode.
cloud: some_cloud #credentials # name of clouds.yaml entry

sshTimeout: 6

# customAnsibleCfg: True # If True, changes in ansible.cfg are kept. Default False.
# customSlurmTemplate: True # If True, changes in slurm.j2 are kept. Default False.

cloudScheduling:
sshTimeout: 42
# -- BEGIN: GENERAL CLUSTER INFORMATION --
# deleteTmpKeypairAfter: True
# dontUploadCredentials: True
## sshPublicKeyFiles listed here will be added to access the cluster. A temporary key is created by bibigrid itself.
# - [key one]
## Volumes and snapshots that will be mounted to master
#autoMount: True
#masterMounts:
# - test

#nfsShares:
# - test2

## Ansible (Galaxy) roles can be added for execution
#userRoles:
# - hosts:
# - "master"
# roles:
# - name: "resistance_nextflow"

## Uncomment if you don't want to assign a public ip to the master; for internal cluster (Tuebingen).
# useMasterWithPublicIp: False

# Other keys
#localFS: False
#localDNSlookup: False
zabbix: True
nfs: True
ide: True

useMasterAsCompute: True
waitForServices:
- some.service

fallbackOnOtherImage: True


# master configuration
# -- END: GENERAL CLUSTER INFORMATION --

workerInstances:
- type: de.NBI small + ephemeral
image: ^Ubuntu 22\.04 LTS \(.*\)$
count: 2
#partitions:
# - ephemeral
- type: de.NBI small
image: ^Ubuntu 22\.04 LTS \(.*\)$
count: 1
#onDemand: False
# worker configuration

# Depends on cloud image
sshUser: ubuntu

# Depends on cloud site and project
# network: bibiserv-external
# network: bibiserv_test2_network
subnet: subnet
#gateway:
# ip: 129.70.51.103
# portFunction: "30000 + oct4"

# Uncomment if no full DNS service for started instances is available.
# Currently the case in Berlin, DKFZ, Heidelberg and Tuebingen.
#localDNSLookup: yes

#- [next configurations]
Loading

0 comments on commit b6184e8

Please sign in to comment.