From aecb2de328cdff1e973380572c2c136683262206 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Sat, 11 Nov 2023 22:57:42 -0800 Subject: [PATCH] GCP: Support for custom VPC. (#2764) * GCP: Support for custom VPC. * yapf * format.sh * Address comments. * Update docs * format * comment --- .../cloud-setup/cloud-permissions/gcp.rst | 54 ++++++++++++- docs/source/reference/config.rst | 39 +++++++-- sky/backends/backend_utils.py | 19 ++++- sky/backends/cloud_vm_ray_backend.py | 37 +++++++-- sky/skylet/providers/gcp/config.py | 80 +++++++++++++++---- sky/skylet/providers/gcp/constants.py | 11 +++ sky/templates/aws-ray.yml.j2 | 7 +- sky/templates/gcp-ray.yml.j2 | 4 + sky/utils/schemas.py | 7 ++ tests/test_config.py | 7 ++ 10 files changed, 224 insertions(+), 41 deletions(-) diff --git a/docs/source/cloud-setup/cloud-permissions/gcp.rst b/docs/source/cloud-setup/cloud-permissions/gcp.rst index fa75750e711..82ad6249aa4 100644 --- a/docs/source/cloud-setup/cloud-permissions/gcp.rst +++ b/docs/source/cloud-setup/cloud-permissions/gcp.rst @@ -66,7 +66,7 @@ User compute.firewalls.create compute.firewalls.delete compute.firewalls.get - compute.instances.create + compute.instances.create compute.instances.delete compute.instances.get compute.instances.list @@ -148,8 +148,8 @@ User .. note:: - The user created with the above minimal permissions will not be able to create service accounts to be assigned to SkyPilot instances. - + The user created with the above minimal permissions will not be able to create service accounts to be assigned to SkyPilot instances. + The admin needs to follow the :ref:`instruction below ` to create a service account to be shared by all users in the project. @@ -182,3 +182,51 @@ Service Account :align: center :alt: Set Service Account Role + +.. _gcp-minimum-firewall-rules: + +Firewall Rules +~~~~~~~~~~~~~~~~~~~ + +By default, users do not need to set up any special firewall rules to start +using SkyPilot. If the default VPC does not satisfy the minimal required rules, +a new VPC ``skypilot-vpc`` with sufficient rules will be automatically created +and used. + +However, if you manually set up and instruct SkyPilot to use a VPC (see +:ref:`here `), ensure it has the following required firewall rules: + +.. code-block:: python + + # Allow internal connections between SkyPilot VMs: + # + # controller -> head node of a cluster + # head node of a cluster <-> worker node(s) of a cluster + # + # NOTE: these ports are more relaxed than absolute minimum, but the + # sourceRanges restrict the traffic to internal IPs. + { + "direction": "INGRESS", + "allowed": [ + {"IPProtocol": "tcp", "ports": ["0-65535"]}, + {"IPProtocol": "udp", "ports": ["0-65535"]}, + ], + "sourceRanges": ["10.128.0.0/9"], + }, + + # Allow SSH connections from user machine(s) + # + # NOTE: This can be satisfied using the following relaxed sourceRanges + # (0.0.0.0/0), but you can customize it if you want to restrict to certain + # known public IPs (useful when using internal VPN or proxy solutions). + { + "direction": "INGRESS", + "allowed": [ + {"IPProtocol": "tcp", "ports": ["22"]}, + ], + "sourceRanges": ["0.0.0.0/0"], + }, + +You can inspect and manage firewall rules at +``https://console.cloud.google.com/net-security/firewall-manager/firewall-policies/list?project=`` +or using any of GCP's SDKs. diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index 5c0a86ec66b..e21487d612d 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -57,7 +57,7 @@ Available fields and semantics: # with this name (provisioner automatically looks for such regions). # Regions without a VPC with this name will not be used to launch nodes. # - # Default: None (use the default VPC in each region). + # Default: null (use the default VPC in each region). vpc_name: skypilot-vpc # Should instances be assigned private IPs only? (optional) @@ -88,7 +88,7 @@ Available fields and semantics: # and any SkyPilot nodes. (This option is not used between SkyPilot nodes, # since they are behind the proxy / may not have such a proxy set up.) # - # Optional; default: None. + # Optional; default: null. ### Format 1 ### # A string; the same proxy command is used for all regions. ssh_proxy_command: ssh -W %h:%p -i ~/.ssh/sky-key -o StrictHostKeyChecking=no ec2-user@ @@ -103,6 +103,24 @@ Available fields and semantics: # Advanced GCP configurations (optional). # Apply to all new instances but not existing ones. gcp: + # VPC to use (optional). + # + # Default: null, which implies the following behavior. First, the VPC named + # 'default' is checked against minimal recommended firewall rules for + # SkyPilot to function. If it satisfies these rules, this VPC is used. + # Otherwise, a new VPC named 'skypilot-vpc' is automatically created with + # the minimal recommended firewall rules and will be used. + # + # If this field is set, SkyPilot will use the VPC with this name. Useful for + # when users want to manually set up a VPC and precisely control its + # firewall rules. If no region restrictions are given, SkyPilot only + # provisions in regions for which a subnet of this VPC exists. Errors are + # thrown if VPC with this name is not found. The VPC does not get modified + # in any way, except when opening ports (e.g., via `resources.ports`) in + # which case new firewall rules permitting public traffic to those ports + # will be added. + vpc_name: skypilot-vpc + # Reserved capacity (optional). # # The specific reservation to be considered when provisioning clusters on GCP. @@ -117,15 +135,24 @@ Available fields and semantics: # Advanced Kubernetes configurations (optional). kubernetes: # The networking mode for accessing SSH jump pod (optional). - # This must be either: 'nodeport' or 'portforward'. If not specified, defaults to 'portforward'. # - # nodeport: Exposes the jump pod SSH service on a static port number on each Node, allowing external access to using :. Using this mode requires opening multiple ports on nodes in the Kubernetes cluster. - # portforward: Uses `kubectl port-forward` to create a tunnel and directly access the jump pod SSH service in the Kubernetes cluster. Does not require opening ports the cluster nodes and is more secure. 'portforward' is used as default if 'networking' is not specified. + # This must be either: 'nodeport' or 'portforward'. If not specified, + # defaults to 'portforward'. + # + # nodeport: Exposes the jump pod SSH service on a static port number on each + # Node, allowing external access to using :. Using this + # mode requires opening multiple ports on nodes in the Kubernetes cluster. + # + # portforward: Uses `kubectl port-forward` to create a tunnel and directly + # access the jump pod SSH service in the Kubernetes cluster. Does not + # require opening ports the cluster nodes and is more secure. 'portforward' + # is used as default if 'networking' is not specified. networking: portforward # Advanced OCI configurations (optional). oci: - # A dict mapping region names to region-specific configurations, or `default` for the default configuration. + # A dict mapping region names to region-specific configurations, or + # `default` for the default configuration. default: # The OCID of the profile to use for launching instances (optional). oci_config_profile: DEFAULT diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index fa54505dd20..c7ea38bf071 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1023,8 +1023,8 @@ def write_cluster_config( 'SKYPILOT_USER', '')), # AWS only: - 'vpc_name': skypilot_config.get_nested(('aws', 'vpc_name'), - None), + 'aws_vpc_name': skypilot_config.get_nested(('aws', 'vpc_name'), + None), 'use_internal_ips': skypilot_config.get_nested( ('aws', 'use_internal_ips'), False), # Not exactly AWS only, but we only test it's supported on AWS @@ -1038,6 +1038,8 @@ def write_cluster_config( 'resource_group': f'{cluster_name}-{region_name}', # GCP only: + 'gcp_vpc_name': skypilot_config.get_nested(('gcp', 'vpc_name'), + None), 'gcp_project_id': gcp_project_id, 'specific_reservations': filtered_specific_reservations, 'num_specific_reserved_workers': num_specific_reserved_workers, @@ -1126,10 +1128,21 @@ def write_cluster_config( user_file_dir = os.path.expanduser(f'{SKY_USER_FILE_PATH}/') + # We do not import the module under sky.skylet.providers globally as we + # need to avoid importing ray module (extras like skypilot[aws] has + # removed the Ray dependency). # pylint: disable=import-outside-toplevel from sky.skylet.providers.gcp import config as gcp_config config = common_utils.read_yaml(os.path.expanduser(config_dict['ray'])) - vpc_name = gcp_config.get_usable_vpc(config) + vpc_name = None + try: + vpc_name = gcp_config.get_usable_vpc(config) + except RuntimeError as e: + # Launching a TPU and encountering a bootstrap-phase error, no point + # in failover unless: + # TODO(zongheng): handle failover when multi-resource is added. + with ux_utils.print_exception_no_traceback(): + raise e scripts = [] for template_name in ('gcp-tpu-create.sh.j2', 'gcp-tpu-delete.sh.j2'): diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index b1832d6581f..d8b93bcd51a 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -664,8 +664,6 @@ def _update_blocklist_on_gcp_error( self, launchable_resources: 'resources_lib.Resources', region: 'clouds.Region', zones: Optional[List['clouds.Zone']], stdout: str, stderr: str): - - del region # unused style = colorama.Style assert zones and len(zones) == 1, zones zone = zones[0] @@ -673,7 +671,12 @@ def _update_blocklist_on_gcp_error( exception_list = [s for s in splits if s.startswith('Exception: ')] httperror_str = [ s for s in splits - if s.startswith('googleapiclient.errors.HttpError: ') + # GCP API errors + if s.startswith('googleapiclient.errors.HttpError: ') or + # 'SKYPILOT_ERROR_NO_NODES_LAUNCHED': skypilot's changes to the + # underlying provisioner provider; for errors prior to provisioning + # like VPC setup. + 'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' in s ] if len(exception_list) == 1: # Parse structured response {'errors': [...]}. @@ -756,9 +759,21 @@ def _update_blocklist_on_gcp_error( else: assert False, error elif len(httperror_str) >= 1: - logger.info(f'Got {httperror_str[0]}') - if ('Requested disk size cannot be smaller than the image size' - in httperror_str[0]): + messages = '\n\t'.join(httperror_str) + logger.warning( + f'Got error(s):\n\t{style.DIM}{messages}{style.RESET_ALL}') + if ('SKYPILOT_ERROR_NO_NODES_LAUNCHED: No VPC with name ' + in stderr): + # User has specified a VPC that does not exist. On GCP, VPC is + # global. So we skip the entire cloud. + self._blocked_resources.add( + launchable_resources.copy(region=None, zone=None)) + elif ('SKYPILOT_ERROR_NO_NODES_LAUNCHED: No subnet for region ' + in stderr): + self._blocked_resources.add( + launchable_resources.copy(region=region.name, zone=None)) + elif ('Requested disk size cannot be smaller than the image size' + in httperror_str[0]): logger.info('Skipping all regions due to disk size issue.') self._blocked_resources.add( launchable_resources.copy(region=None, zone=None)) @@ -773,7 +788,6 @@ def _update_blocklist_on_gcp_error( f'Details: {httperror_str[0]}') self._blocked_resources.add( launchable_resources.copy(region=None, zone=None)) - else: # Parse HttpError for unauthorized regions. Example: # googleapiclient.errors.HttpError: None: + """Logs an message then raises a specific RuntimeError to trigger failover. + + Mainly used for handling VPC/subnet errors before nodes are launched. + """ + # NOTE: keep. The backend looks for this to know no nodes are launched. + prefix = "SKYPILOT_ERROR_NO_NODES_LAUNCHED: " + raise RuntimeError(prefix + error) + + def get_node_type(node: dict) -> GCPNodeType: """Returns node type based on the keys in ``node``. @@ -753,28 +763,56 @@ def _create_rules(config, compute, rules, VPC_NAME, PROJ_ID): wait_for_compute_global_operation(config["provider"]["project_id"], op, compute) -def get_usable_vpc(config): +def get_usable_vpc(config) -> str: """Return a usable VPC. + If config['provider']['vpc_name'] is set, return the VPC with the name + (errors out if not found). When this field is set, no firewall rules + checking or overrides will take place; it is the user's responsibility to + properly set up the VPC. + If not found, create a new one with sufficient firewall rules. + + Raises: + RuntimeError: if the user has specified a VPC name but the VPC is not found. """ _, _, compute, _ = construct_clients_from_provider_config(config["provider"]) - - # For backward compatibility, reuse the VPC if the VM is launched. - resource = GCPCompute( - compute, - config["provider"]["project_id"], - config["provider"]["availability_zone"], - config["cluster_name"], - ) - node = resource._list_instances(label_filters=None, status_filter=None) - if len(node) > 0: - netInterfaces = node[0].get("networkInterfaces", []) - if len(netInterfaces) > 0: - vpc_name = netInterfaces[0]["network"].split("/")[-1] - return vpc_name - - vpcnets_all = _list_vpcnets(config, compute) + specific_vpc_to_use = config["provider"].get("vpc_name", None) + if specific_vpc_to_use is None: + # For backward compatibility, reuse the VPC if the VM is launched. + resource = GCPCompute( + compute, + config["provider"]["project_id"], + config["provider"]["availability_zone"], + config["cluster_name"], + ) + node = resource._list_instances(label_filters=None, status_filter=None) + if len(node) > 0: + netInterfaces = node[0].get("networkInterfaces", []) + if len(netInterfaces) > 0: + vpc_name = netInterfaces[0]["network"].split("/")[-1] + return vpc_name + + vpcnets_all = _list_vpcnets(config, compute) + else: + vpcnets_all = _list_vpcnets( + config, compute, filter=f"name={specific_vpc_to_use}" + ) + # On GCP, VPC names are unique, so it'd be 0 or 1 VPC found. + assert ( + len(vpcnets_all) <= 1 + ), f"{len(vpcnets_all)} VPCs found with the same name {specific_vpc_to_use}" + if len(vpcnets_all) == 1: + # Skip checking any firewall rules if the user has specified a VPC. + logger.info(f"Using user-specified VPC {specific_vpc_to_use!r}.") + return specific_vpc_to_use + else: + # VPC with this name not found. Error out and let SkyPilot failover. + _skypilot_log_error_and_exit_for_failover( + f"No VPC with name {specific_vpc_to_use!r} is found. " + "To fix: specify a correct VPC name." + ) + # Should not reach here. usable_vpc_name = None for vpc in vpcnets_all: @@ -827,6 +865,14 @@ def _configure_subnet(config, compute): # SkyPilot: make sure there's a usable VPC usable_vpc_name = get_usable_vpc(config) subnets = _list_subnets(config, compute, filter=f'(name="{usable_vpc_name}")') + if not subnets: + # This can happen when e.g., region A is specified but the VPC has no + # subnet in region A. + _skypilot_log_error_and_exit_for_failover( + f"No subnet for region {config['provider']['region']} found (VPC {usable_vpc_name!r}). " + f"Check the subnets of VPC {usable_vpc_name!r} at https://console.cloud.google.com/networking/networks" + ) + default_subnet = subnets[0] default_interfaces = [ diff --git a/sky/skylet/providers/gcp/constants.py b/sky/skylet/providers/gcp/constants.py index c6f02e8af9b..97d9d14ad1c 100644 --- a/sky/skylet/providers/gcp/constants.py +++ b/sky/skylet/providers/gcp/constants.py @@ -9,6 +9,7 @@ "mtu": 1460, "routingConfig": {"routingMode": "GLOBAL"}, } + # Required firewall rules for SkyPilot to work. FIREWALL_RULES_REQUIRED = [ # Allow internal connections between GCP VMs for Ray multi-node cluster. @@ -29,9 +30,12 @@ "ports": ["22"], } ], + # TODO(skypilot): some users reported that this should be relaxed (e.g., + # allowlisting only certain IPs to have ssh access). "sourceRanges": ["0.0.0.0/0"], }, ] + # Template when creating firewall rules for a new VPC. FIREWALL_RULES_TEMPLATE = [ { @@ -61,6 +65,8 @@ "ports": ["22"], } ], + # TODO(skypilot): some users reported that this should be relaxed (e.g., + # allowlisting only certain IPs to have ssh access). "sourceRanges": ["0.0.0.0/0"], }, { @@ -84,6 +90,11 @@ VM_MINIMAL_PERMISSIONS = [ "compute.disks.create", "compute.disks.list", + # TODO(skypilot): some users reported that firewalls changes + # (create/delete/update) should be removed if VPC/firewalls are separately + # set up. It is undesirable for a normal account to have these permissions. + # Note that if these permissions are removed, opening ports (e.g., via + # `resources.ports`) would fail. "compute.firewalls.create", "compute.firewalls.delete", "compute.firewalls.get", diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index bbd96d30a2c..1494a2c7060 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -36,10 +36,9 @@ provider: security_group: # AWS config file must include security group name GroupName: {{security_group}} -{% if vpc_name is not none %} - # NOTE: This is a new field added by SkyPilot and parsed by our own - # AWSNodeProvider. - vpc_name: {{vpc_name}} +{% if aws_vpc_name is not none %} + # NOTE: This is a new field added by SkyPilot to force use a specific VPC. + vpc_name: {{aws_vpc_name}} {% endif %} {%- if docker_login_config is not none %} # We put docker login config in provider section because ray's schema disabled diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 2ecb915f55e..c63340392ab 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -28,6 +28,10 @@ provider: cache_stopped_nodes: True # The GCP project ID. project_id: {{gcp_project_id}} +{% if gcp_vpc_name is not none %} + # NOTE: This is a new field added by SkyPilot to force use a specific VPC. + vpc_name: {{gcp_vpc_name}} +{% endif %} # The firewall rule name for customized firewall rules. Only enabled # if we have ports requirement. {% if firewall_rule is not none %} diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 20453c49e70..0b7a1cc4223 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -349,6 +349,13 @@ def get_config_schema(): 'minItems': 1, 'maxItems': 1, }, + 'vpc_name': { + 'oneOf': [{ + 'type': 'string', + }, { + 'type': 'null', + }], + }, } }, 'kubernetes': { diff --git a/tests/test_config.py b/tests/test_config.py index 52ff1767648..dfe64f77f06 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -37,6 +37,10 @@ def _create_config_file(config_file_path: pathlib.Path) -> None: vpc_name: {VPC_NAME} use_internal_ips: true ssh_proxy_command: {PROXY_COMMAND} + + gcp: + vpc_name: {VPC_NAME} + kubernetes: networking: {NODEPORT_MODE_NAME} """)) @@ -167,6 +171,7 @@ def test_config_get_set_nested(monkeypatch, tmp_path) -> None: assert skypilot_config.get_nested(('aws', 'use_internal_ips'), None) assert skypilot_config.get_nested(('aws', 'ssh_proxy_command'), None) == PROXY_COMMAND + assert skypilot_config.get_nested(('gcp', 'vpc_name'), None) == VPC_NAME assert skypilot_config.get_nested(('kubernetes', 'networking'), None) == NODEPORT_MODE_NAME # Check set_nested() will copy the config dict and return a new dict @@ -189,6 +194,7 @@ def test_config_get_set_nested(monkeypatch, tmp_path) -> None: assert skypilot_config.get_nested(('aws', 'use_internal_ips'), None) assert skypilot_config.get_nested( ('aws', 'ssh_proxy_command'), None) is None + assert skypilot_config.get_nested(('gcp', 'vpc_name'), None) == VPC_NAME # Check config with only partial keys still works new_config3 = copy.copy(new_config2) @@ -223,3 +229,4 @@ def test_config_with_env(monkeypatch, tmp_path) -> None: assert skypilot_config.get_nested(('aws', 'use_internal_ips'), None) assert skypilot_config.get_nested(('aws', 'ssh_proxy_command'), None) == PROXY_COMMAND + assert skypilot_config.get_nested(('gcp', 'vpc_name'), None) == VPC_NAME