From c1862239b24ab3af9c35db09337af78ee6f88040 Mon Sep 17 00:00:00 2001 From: XaverStiensmeier Date: Thu, 4 Jul 2024 15:44:16 +0200 Subject: [PATCH] updated timeouts. updated tests --- bibigrid.yaml | 2 +- bibigrid/core/utility/ansible_configurator.py | 2 +- .../markdown/features/configuration.md | 4 +- resources/defaults/slurm/slurm.j2 | 2 +- tests/test_ansible_configurator.py | 44 +++++++++---------- tests/test_create.py | 4 +- 6 files changed, 29 insertions(+), 29 deletions(-) diff --git a/bibigrid.yaml b/bibigrid.yaml index 88c3f0e3..8982a810 100644 --- a/bibigrid.yaml +++ b/bibigrid.yaml @@ -101,7 +101,7 @@ #features: # list # elastic_scheduling: # for large or slow clusters increasing these timeouts might be necessary to avoid failures - # SuspendTimeout: 30 # after SuspendTimeout seconds, slurm allows to power up the node again + # SuspendTimeout: 60 # after SuspendTimeout seconds, slurm allows to power up the node again # ResumeTimeout: 1200 # if a node doesn't start in ResumeTimeout seconds, the start is considered failed. 
#- [next configurations] diff --git a/bibigrid/core/utility/ansible_configurator.py b/bibigrid/core/utility/ansible_configurator.py index d88592f8..875ea00c 100644 --- a/bibigrid/core/utility/ansible_configurator.py +++ b/bibigrid/core/utility/ansible_configurator.py @@ -28,7 +28,7 @@ "server_name": "bibigrid", "admin_password": "bibigrid"} SLURM_CONF = {"db": "slurm", "db_user": "slurm", "db_password": "changeme", "munge_key": id_generation.generate_munge_key(), - "elastic_scheduling": {"SuspendTime": 3600, "ResumeTimeout": 1200, "SuspendTimeout": 30, + "elastic_scheduling": {"SuspendTime": 3600, "ResumeTimeout": 1200, "SuspendTimeout": 60, "TreeWidth": 128}} CLOUD_SCHEDULING = {"sshTimeout": 5} diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md index c4987907..60345c0c 100644 --- a/documentation/markdown/features/configuration.md +++ b/documentation/markdown/features/configuration.md @@ -156,7 +156,7 @@ For nearly all cases the default value is what you need. Default is `True`. ```yaml elastic_scheduling: SuspendTime: 1800 - SuspendTimeout: 60 + SuspendTimeout: 90 ResumeTimeout: 1800 ``` @@ -173,7 +173,7 @@ slurmConf: munge_key: # automatically generated via id_generation.generate_munge_key elastic_scheduling: SuspendTime: 900 # if a node is not used for SuspendTime seconds, it will shut down - SuspendTimeout: 30 # after SuspendTimeout seconds, slurm allows to power up the powered down node again + SuspendTimeout: 60 # after SuspendTimeout seconds, slurm allows to power up the powered down node again ResumeTimeout: 900 # if a node doesn't start in ResumeTimeout seconds, the start is considered failed. 
See https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeProgram TreeWidth: 128 # https://slurm.schedmd.com/slurm.conf.html#OPT_TreeWidth ``` diff --git a/resources/defaults/slurm/slurm.j2 b/resources/defaults/slurm/slurm.j2 index c90a2fd6..abd8078b 100644 --- a/resources/defaults/slurm/slurm.j2 +++ b/resources/defaults/slurm/slurm.j2 @@ -101,7 +101,7 @@ ResumeTimeout= {{ slurm_conf.elastic_scheduling.ResumeTimeout }} SuspendProgram=/opt/slurm/terminate.sh # Suspend time's default is 1 hour (3600 seconds) SuspendTime= {{ slurm_conf.elastic_scheduling.SuspendTime }} -# SuspendTimeout's default is 30 seconds +# SuspendTimeout's default is 60 seconds SuspendTimeout={{ slurm_conf.elastic_scheduling.SuspendTimeout }} # Excludes {{ hostvars[groups.master.0].name }} from suspend SuspendExcNodes={{ exclude_groups | join(',') }} diff --git a/tests/test_ansible_configurator.py b/tests/test_ansible_configurator.py index 53bcd47e..4b465e65 100644 --- a/tests/test_ansible_configurator.py +++ b/tests/test_ansible_configurator.py @@ -99,14 +99,14 @@ def test_generate_common_configuration_false(self): default_user = "ubuntu" ssh_user = "test" configuration = [{}] - common_configuration_yaml = {'bibigrid_version': version.__version__, 'cloud_scheduling': {'sshTimeout': 4}, + common_configuration_yaml = {'bibigrid_version': version.__version__, 'cloud_scheduling': {'sshTimeout': 5}, 'cluster_cidrs': cidrs, 'cluster_id': cluster_id, 'default_user': default_user, 'dns_server_list': ['8.8.8.8'], 'enable_ide': False, 'enable_nfs': False, 'enable_slurm': False, 'enable_zabbix': False, 'local_dns_lookup': False, 'local_fs': False, 'slurm': True, 'slurm_conf': {'db': 'slurm', 'db_password': 'changeme', 'db_user': 'slurm', - 'elastic_scheduling': {'ResumeTimeout': 900, 'SuspendTime': 3600, - 'TreeWidth': 128}, + 'elastic_scheduling': {'ResumeTimeout': 1200, 'SuspendTime': 3600, + 'SuspendTimeout': 60, 'TreeWidth': 128}, 'munge_key': 'TO_BE_FILLED'}, 'ssh_user': ssh_user, 
'use_master_as_compute': True} generated_common_configuration = ansible_configurator.generate_common_configuration_yaml(cidrs, configuration, @@ -124,7 +124,7 @@ def test_generate_common_configuration_true(self): ssh_user = "test" configuration = [ {elem: "True" for elem in ["localFS", "localDNSlookup", "useMasterAsCompute", "slurm", "zabbix", "ide"]}] - common_configuration_yaml = {'bibigrid_version': version.__version__, 'cloud_scheduling': {'sshTimeout': 4}, + common_configuration_yaml = {'bibigrid_version': version.__version__, 'cloud_scheduling': {'sshTimeout': 5}, 'cluster_cidrs': cidrs, 'cluster_id': cluster_id, 'default_user': default_user, 'dns_server_list': ['8.8.8.8'], 'enable_ide': 'True', 'enable_nfs': False, 'enable_slurm': 'True', 'enable_zabbix': 'True', @@ -132,8 +132,8 @@ def test_generate_common_configuration_true(self): 'workspace': '${HOME}'}, 'local_dns_lookup': 'True', 'local_fs': 'True', 'slurm': 'True', 'slurm_conf': {'db': 'slurm', 'db_password': 'changeme', 'db_user': 'slurm', - 'elastic_scheduling': {'ResumeTimeout': 900, 'SuspendTime': 3600, - 'TreeWidth': 128}, + 'elastic_scheduling': {'ResumeTimeout': 1200, 'SuspendTime': 3600, + 'SuspendTimeout': 60, 'TreeWidth': 128}, 'munge_key': 'TO_BE_FILLED'}, 'ssh_user': ssh_user, 'use_master_as_compute': 'True', 'zabbix_conf': {'admin_password': 'bibigrid', 'db': 'zabbix', @@ -152,16 +152,16 @@ def test_generate_common_configuration_nfs_shares(self): cluster_id = "21" default_user = "ubuntu" ssh_user = "test" - common_configuration_yaml = {'bibigrid_version': version.__version__, 'cloud_scheduling': {'sshTimeout': 4}, + common_configuration_yaml = {'bibigrid_version': version.__version__, 'cloud_scheduling': {'sshTimeout': 5}, 'cluster_cidrs': cidrs, 'cluster_id': cluster_id, 'default_user': default_user, 'dns_server_list': ['8.8.8.8'], 'enable_ide': False, 'enable_nfs': 'True', 'enable_slurm': False, 'enable_zabbix': False, 'ext_nfs_mounts': [], 'local_dns_lookup': False, 'local_fs': False, - 
'nfs_mounts': [{'dst': '//vil/mil', 'src': '//vil/mil'}, - {'dst': '//vol/spool', 'src': '//vol/spool'}], 'slurm': True, + 'nfs_mounts': [{'dst': '/vil/mil', 'src': '/vil/mil'}, + {'dst': '/vol/spool', 'src': '/vol/spool'}], 'slurm': True, 'slurm_conf': {'db': 'slurm', 'db_password': 'changeme', 'db_user': 'slurm', - 'elastic_scheduling': {'ResumeTimeout': 900, 'SuspendTime': 3600, - 'TreeWidth': 128}, + 'elastic_scheduling': {'ResumeTimeout': 1200, 'SuspendTime': 3600, + 'SuspendTimeout': 60, 'TreeWidth': 128}, 'munge_key': 'TO_BE_FILLED'}, 'ssh_user': ssh_user, 'use_master_as_compute': True} generated_common_configuration = ansible_configurator.generate_common_configuration_yaml(cidrs, configuration, @@ -177,15 +177,15 @@ def test_generate_common_configuration_nfs(self): cluster_id = "21" default_user = "ubuntu" ssh_user = "test" - common_configuration_yaml = {'bibigrid_version': version.__version__, 'cloud_scheduling': {'sshTimeout': 4}, + common_configuration_yaml = {'bibigrid_version': version.__version__, 'cloud_scheduling': {'sshTimeout': 5}, 'cluster_cidrs': cidrs, 'cluster_id': cluster_id, 'default_user': default_user, 'dns_server_list': ['8.8.8.8'], 'enable_ide': False, 'enable_nfs': 'True', 'enable_slurm': False, 'enable_zabbix': False, 'ext_nfs_mounts': [], 'local_dns_lookup': False, 'local_fs': False, - 'nfs_mounts': [{'dst': '//vol/spool', 'src': '//vol/spool'}], 'slurm': True, + 'nfs_mounts': [{'dst': '/vol/spool', 'src': '/vol/spool'}], 'slurm': True, 'slurm_conf': {'db': 'slurm', 'db_password': 'changeme', 'db_user': 'slurm', - 'elastic_scheduling': {'ResumeTimeout': 900, 'SuspendTime': 3600, - 'TreeWidth': 128}, + 'elastic_scheduling': {'ResumeTimeout': 1200, 'SuspendTime': 3600, + 'SuspendTimeout': 60, 'TreeWidth': 128}, 'munge_key': 'TO_BE_FILLED'}, 'ssh_user': ssh_user, 'use_master_as_compute': True} generated_common_configuration = ansible_configurator.generate_common_configuration_yaml(cidrs, configuration, @@ -201,16 +201,16 @@ def 
test_generate_common_configuration_ext_nfs_shares(self): cluster_id = "21" default_user = "ubuntu" ssh_user = "test" - common_configuration_yaml = {'bibigrid_version': version.__version__, 'cloud_scheduling': {'sshTimeout': 4}, + common_configuration_yaml = {'bibigrid_version': version.__version__, 'cloud_scheduling': {'sshTimeout': 5}, 'cluster_cidrs': cidrs, 'cluster_id': cluster_id, 'default_user': default_user, 'dns_server_list': ['8.8.8.8'], 'enable_ide': False, 'enable_nfs': 'True', 'enable_slurm': False, 'enable_zabbix': False, 'ext_nfs_mounts': [{'dst': '/vil/mil', 'src': '/vil/mil'}], 'local_dns_lookup': False, 'local_fs': False, - 'nfs_mounts': [{'dst': '//vol/spool', 'src': '//vol/spool'}], 'slurm': True, + 'nfs_mounts': [{'dst': '/vol/spool', 'src': '/vol/spool'}], 'slurm': True, 'slurm_conf': {'db': 'slurm', 'db_password': 'changeme', 'db_user': 'slurm', - 'elastic_scheduling': {'ResumeTimeout': 900, 'SuspendTime': 3600, - 'TreeWidth': 128}, + 'elastic_scheduling': {'ResumeTimeout': 1200, 'SuspendTime': 3600, + 'SuspendTimeout': 60, 'TreeWidth': 128}, 'munge_key': 'YryJVnqgg24Ksf8zXQtbct3nuXrMSi9N'}, 'ssh_user': ssh_user, 'use_master_as_compute': True} generated_common_configuration = ansible_configurator.generate_common_configuration_yaml(cidrs, configuration, @@ -226,7 +226,7 @@ def test_generate_common_configuration_ide(self): cluster_id = "21" default_user = "ubuntu" ssh_user = "test" - common_configuration_yaml = {'bibigrid_version': version.__version__, 'cloud_scheduling': {'sshTimeout': 4}, + common_configuration_yaml = {'bibigrid_version': version.__version__, 'cloud_scheduling': {'sshTimeout': 5}, 'cluster_cidrs': cidrs, 'cluster_id': cluster_id, 'default_user': default_user, 'dns_server_list': ['8.8.8.8'], 'enable_ide': 'Some1', 'enable_nfs': False, 'enable_slurm': False, 'enable_zabbix': False, @@ -234,8 +234,8 @@ def test_generate_common_configuration_ide(self): 'port_start': 8181, 'workspace': '${HOME}'}, 'local_dns_lookup': False, 
'local_fs': False, 'slurm': True, 'slurm_conf': {'db': 'slurm', 'db_password': 'changeme', 'db_user': 'slurm', - 'elastic_scheduling': {'ResumeTimeout': 900, 'SuspendTime': 3600, - 'TreeWidth': 128}, + 'elastic_scheduling': {'ResumeTimeout': 1200, 'SuspendTime': 3600, + 'SuspendTimeout': 60, 'TreeWidth': 128}, 'munge_key': 'b7nks3Ur3kanyPAEBxfSC9ypfSHFnWJL'}, 'ssh_user': ssh_user, 'use_master_as_compute': True} generated_common_configuration = ansible_configurator.generate_common_configuration_yaml(cidrs, configuration, diff --git a/tests/test_create.py b/tests/test_create.py index 612681d2..63dd119d 100644 --- a/tests/test_create.py +++ b/tests/test_create.py @@ -123,7 +123,7 @@ def test_initialize_master(self, mock_execute_ssh): 'username': creator.ssh_user, 'commands': creator.ssh_add_public_key_commands + ssh_handler.ANSIBLE_SETUP, 'filepaths': [(create.KEY_FOLDER + creator.key_name, '.ssh/id_ecdsa')], - 'gateway': {}, 'timeout': 4} + 'gateway': {}, 'timeout': 5} mock_execute_ssh.assert_called_with(ssh_data, startup.LOG) def test_prepare_volumes_none(self): @@ -211,7 +211,7 @@ def test_upload_playbooks(self, mock_execute_ssh, mock_ac_ssh, mock_configure_an 'username': creator.ssh_user, 'commands': [mock_ac_ssh()] + ssh_handler.ANSIBLE_START, 'filepaths': create.FILEPATHS, - 'gateway': {}, 'timeout': 4} + 'gateway': {}, 'timeout': 5} mock_execute_ssh.assert_called_with(ssh_data=ssh_data, log=startup.LOG) @patch.object(create.Create, "generate_keypair")