Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Azure] Allow resource group specifiation for Azure instance provisioning #3764

Merged
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
06fe6c7
Allow resource group specifiation for Azure instance provisioning
landscapepainter Jul 19, 2024
b536c62
Add 'use_external_resource_group' under provider config
landscapepainter Jul 20, 2024
1b95339
nit
landscapepainter Jul 24, 2024
5b6814f
attached resources deletion
landscapepainter Jul 26, 2024
4b64e47
support deployment removal when terminating
landscapepainter Jul 28, 2024
e6416b2
nit
landscapepainter Jul 28, 2024
afe8898
delete RoleAssignment when terminating
landscapepainter Jul 28, 2024
076fb0d
update ARM config template
landscapepainter Jul 28, 2024
cdca77f
nit
landscapepainter Jul 28, 2024
e8ce9f8
nit
landscapepainter Jul 30, 2024
ee77e4e
delete role assignment with guid
landscapepainter Jul 30, 2024
15b4d75
update role assignment removal logic
landscapepainter Jul 31, 2024
d97f1ba
Separate resource group region and VM, attached resources
landscapepainter Jul 31, 2024
75822e5
Merge branch 'master' into azure-instance-resource-group-specification
landscapepainter Aug 4, 2024
ce40c8c
nit
landscapepainter Aug 5, 2024
e117896
nit
landscapepainter Aug 5, 2024
9a6fa10
nit
landscapepainter Aug 5, 2024
8a8a381
nit
landscapepainter Aug 5, 2024
109f25b
add error handling for deletion
landscapepainter Aug 10, 2024
bfded7f
Merge branch 'master' into azure-instance-resource-group-specification
landscapepainter Aug 10, 2024
d24abdc
format
landscapepainter Aug 10, 2024
cc6d503
deployment naming update
landscapepainter Aug 12, 2024
e02533b
test
landscapepainter Aug 17, 2024
c8bde4e
nit
landscapepainter Aug 17, 2024
d7f3f83
update deployment constant names
landscapepainter Aug 17, 2024
03a8dc0
update open_ports to wait for the nsg creation corresponding to the V…
landscapepainter Aug 17, 2024
42e8a10
format
landscapepainter Aug 17, 2024
a5f64b7
Merge branch 'master' into azure-instance-resource-group-specification
landscapepainter Aug 17, 2024
8375f85
nit
landscapepainter Aug 17, 2024
3679832
Merge branch 'master' into azure-instance-resource-group-specification
landscapepainter Aug 27, 2024
cda4f10
format
landscapepainter Aug 27, 2024
e41f221
Merge branch 'master' into azure-instance-resource-group-specification
landscapepainter Oct 26, 2024
d11ef46
update docstring
landscapepainter Oct 26, 2024
e5f1e09
add back deleted snippet
landscapepainter Oct 26, 2024
4af53e0
format
landscapepainter Oct 26, 2024
eb56fd5
Merge branch 'master' into azure-instance-resource-group-specification
landscapepainter Oct 26, 2024
91fb7a8
Merge branch 'master' into azure-instance-resource-group-specification
landscapepainter Oct 27, 2024
fb2c1ee
delete nic with retries
landscapepainter Oct 29, 2024
40605da
error handle update
landscapepainter Oct 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions sky/adaptors/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ def get_client(name: str,
from azure.mgmt import authorization
return authorization.AuthorizationManagementClient(
credential, subscription_id)
elif name == 'msi':
from azure.mgmt import msi
return msi.ManagedServiceIdentityClient(credential, subscription_id)
elif name == 'graph':
import msgraph
return msgraph.GraphServiceClient(credential)
Expand Down
11 changes: 10 additions & 1 deletion sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from sky import clouds
from sky import exceptions
from sky import sky_logging
from sky import skypilot_config
from sky.adaptors import azure
from sky.clouds import service_catalog
from sky.clouds.utils import azure_utils
Expand Down Expand Up @@ -355,6 +356,13 @@ def make_deploy_resources_variables(
need_nvidia_driver_extension = (acc_dict is not None and
'A10' in acc_dict)

# Determine resource group for deploying the instance.
resource_group_name = skypilot_config.get_nested(
('azure', 'resource_group_vm'), None)
use_external_resource_group = resource_group_name is not None
if resource_group_name is None:
resource_group_name = f'{cluster_name.name_on_cloud}-{region_name}'

# Setup commands to eliminate the banner and restart sshd.
# This script will modify /etc/ssh/sshd_config and add a bash script
# into .bashrc. The bash script will restart sshd if it has not been
Expand Down Expand Up @@ -411,7 +419,8 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
'disk_tier': Azure._get_disk_type(disk_tier),
'cloud_init_setup_commands': cloud_init_setup_commands,
'azure_subscription_id': self.get_project_id(dryrun),
'resource_group': f'{cluster_name.name_on_cloud}-{region_name}',
'resource_group': resource_group_name,
'use_external_resource_group': use_external_resource_group,
}

# Setting disk performance tier for high disk tier.
Expand Down
8 changes: 7 additions & 1 deletion sky/provision/azure/azure-config-template.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@
"description": "Subnet parameters."
}
},
"location": {
"type": "string",
"metadata": {
"description": "Location of where the resources are allocated."
}
},
"nsgName": {
"type": "string",
"metadata": {
Expand All @@ -23,7 +29,7 @@
},
"variables": {
"contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
"location": "[resourceGroup().location]",
"location": "[parameters('location')]",
"msiName": "[concat('sky-', parameters('clusterId'), '-msi')]",
"roleAssignmentName": "[concat('sky-', parameters('clusterId'), '-ra')]",
"nsgName": "[parameters('nsgName')]",
Expand Down
110 changes: 65 additions & 45 deletions sky/provision/azure/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,12 @@
from sky import sky_logging
from sky.adaptors import azure
from sky.provision import common
from sky.provision import constants
from sky.utils import common_utils

logger = sky_logging.init_logger(__name__)

UNIQUE_ID_LEN = 4
_DEPLOYMENT_NAME = 'skypilot-config'
_LEGACY_DEPLOYMENT_NAME = 'ray-config'
_RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT = 480 # 8 minutes
_CLUSTER_ID = '{cluster_name_on_cloud}-{unique_id}'

Expand Down Expand Up @@ -82,46 +81,55 @@ def bootstrap_instances(
in provider_config), 'Provider config must include location field'
params = {'location': provider_config['location']}

assert ('use_external_resource_group'
in provider_config), ('Provider config must include '
'use_external_resource_group field')
use_external_resource_group = provider_config['use_external_resource_group']

if 'tags' in provider_config:
params['tags'] = provider_config['tags']

logger.info(f'Creating/Updating resource group: {resource_group}')
rg_create_or_update = get_azure_sdk_function(
client=resource_client.resource_groups,
function_name='create_or_update')
rg_creation_start = time.time()
retry = 0
while (time.time() - rg_creation_start <
_RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT):
try:
rg_create_or_update(resource_group_name=resource_group,
parameters=params)
break
except azure.exceptions().ResourceExistsError as e:
if 'ResourceGroupBeingDeleted' in str(e):
if retry % 5 == 0:
logger.info(
f'Azure resource group {resource_group} of a recent '
f'terminated cluster {cluster_name_on_cloud} is being '
'deleted. It can only be provisioned after it is fully '
'deleted. Waiting...')
time.sleep(1)
retry += 1
continue
raise
except azure.exceptions().ClientAuthenticationError as e:
# When resource group is user specified, it already exists in certain
# region.
if not use_external_resource_group:
logger.info(f'Creating/Updating resource group: {resource_group}')
rg_create_or_update = get_azure_sdk_function(
client=resource_client.resource_groups,
function_name='create_or_update')
rg_creation_start = time.time()
retry = 0
while (time.time() - rg_creation_start <
_RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT):
try:
rg_create_or_update(resource_group_name=resource_group,
parameters=params)
break
except azure.exceptions().ResourceExistsError as e:
if 'ResourceGroupBeingDeleted' in str(e):
if retry % 5 == 0:
logger.info(
f'Azure resource group {resource_group} of a '
'recent terminated cluster '
f'{cluster_name_on_cloud} is being deleted. It can'
' only be provisioned after it is fully deleted. '
'Waiting...')
time.sleep(1)
retry += 1
continue
raise
except azure.exceptions().ClientAuthenticationError as e:
message = (
'Failed to authenticate with Azure. Please check your '
'Azure credentials. Error: '
f'{common_utils.format_exception(e)}').replace('\n', ' ')
logger.error(message)
raise exceptions.NoClusterLaunchedError(message) from e
else:
message = (
'Failed to authenticate with Azure. Please check your Azure '
f'credentials. Error: {common_utils.format_exception(e)}'
).replace('\n', ' ')
f'Timed out waiting for resource group {resource_group} to be '
'deleted.')
logger.error(message)
raise exceptions.NoClusterLaunchedError(message) from e
else:
message = (
f'Timed out waiting for resource group {resource_group} to be '
'deleted.')
logger.error(message)
raise TimeoutError(message)
raise TimeoutError(message)

# load the template file
current_path = Path(__file__).parent
Expand Down Expand Up @@ -155,6 +163,9 @@ def bootstrap_instances(
'nsgName': {
'value': nsg_name
},
'location': {
'value': params['location']
}
},
}
}
Expand All @@ -164,11 +175,22 @@ def bootstrap_instances(
get_deployment = get_azure_sdk_function(client=resource_client.deployments,
function_name='get')
deployment_exists = False
for deployment_name in [_DEPLOYMENT_NAME, _LEGACY_DEPLOYMENT_NAME]:
if use_external_resource_group:
deployment_name = (
constants.EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME.format(
cluster_name_on_cloud=cluster_name_on_cloud))
deployment_list = [deployment_name]
else:
deployment_name = constants.DEPLOYMENT_NAME
deployment_list = [
constants.DEPLOYMENT_NAME, constants.LEGACY_DEPLOYMENT_NAME
]

for deploy_name in deployment_list:
try:
deployment = get_deployment(resource_group_name=resource_group,
deployment_name=deployment_name)
logger.info(f'Deployment {deployment_name!r} already exists. '
deployment_name=deploy_name)
logger.info(f'Deployment {deploy_name!r} already exists. '
'Skipping deployment creation.')

outputs = deployment.properties.outputs
Expand All @@ -179,22 +201,20 @@ def bootstrap_instances(
deployment_exists = False

if not deployment_exists:
logger.info(f'Creating/Updating deployment: {_DEPLOYMENT_NAME}')
logger.info(f'Creating/Updating deployment: {deployment_name}')
create_or_update = get_azure_sdk_function(
client=resource_client.deployments,
function_name='create_or_update')
# TODO (skypilot): this takes a long time (> 40 seconds) to run.
outputs = create_or_update(
resource_group_name=resource_group,
deployment_name=_DEPLOYMENT_NAME,
deployment_name=deployment_name,
parameters=parameters,
).result().properties.outputs

nsg_id = outputs['nsg']['value']

# append output resource ids to be used with vm creation
provider_config['msi'] = outputs['msi']['value']
provider_config['nsg'] = nsg_id
provider_config['nsg'] = outputs['nsg']['value']
provider_config['subnet'] = outputs['subnet']['value']

return config
Loading
Loading