Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Core] Remove backward compatibility code for 0.6.0 & 0.7.0 #4175

Merged
merged 7 commits into the base branch
Oct 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 10 additions & 133 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,8 @@ class SSHConfigHelper(object):

ssh_conf_path = '~/.ssh/config'
ssh_conf_lock_path = os.path.expanduser('~/.sky/ssh_config.lock')
ssh_conf_per_cluster_lock_path = os.path.expanduser(
'~/.sky/ssh_config_{}.lock')
ssh_cluster_path = SKY_USER_FILE_PATH + '/ssh/{}'

@classmethod
Expand Down Expand Up @@ -486,12 +488,6 @@ def add_cluster(

config_path = os.path.expanduser(cls.ssh_conf_path)

# For backward compatibility: before #2706, we wrote the config of SkyPilot clusters
# directly in ~/.ssh/config. For these clusters, we remove the config in ~/.ssh/config
# and write/overwrite the config in ~/.sky/ssh/<cluster_name> instead.
cls._remove_stale_cluster_config_for_backward_compatibility(
cluster_name, ip, auth_config, docker_user)

if not os.path.exists(config_path):
config = ['\n']
with open(config_path,
Expand Down Expand Up @@ -560,139 +556,20 @@ def add_cluster(
f.write(codegen)

@classmethod
def _remove_stale_cluster_config_for_backward_compatibility(
    cls,
    cluster_name: str,
    ip: str,
    auth_config: Dict[str, str],
    docker_user: Optional[str] = None,
):
    """Remove authentication information for cluster from local SSH config.

    Before PR #2706, SkyPilot wrote cluster SSH entries directly into
    ``~/.ssh/config``; this scrubs such a stale entry (and its
    ``Include`` line) by matching the cluster's HostName/User/ProxyCommand.

    If no existing host matching the provided specification is found, then
    nothing is removed.

    Args:
        cluster_name: Name of the cluster whose entry should be scrubbed.
        ip: Head node's IP address.
        auth_config: read_yaml(handle.cluster_yaml)['auth']
        docker_user: If not None, use this user to ssh into the docker
    """
    username = auth_config['ssh_user']
    config_path = os.path.expanduser(cls.ssh_conf_path)
    cluster_config_path = os.path.expanduser(
        cls.ssh_cluster_path.format(cluster_name))
    # Nothing to clean up if the user has no ~/.ssh/config at all.
    if not os.path.exists(config_path):
        return

    with open(config_path, 'r', encoding='utf-8') as f:
        config = f.readlines()

    # Index of the 'Host <name>' line of the stale entry, if found.
    start_line_idx = None

    # Scan the config for the cluster name.
    for i, line in enumerate(config):
        next_line = config[i + 1] if i + 1 < len(config) else ''
        # Match on consecutive 'HostName'/'User' lines. With a docker
        # user, the entry points at localhost (tunneled via ProxyCommand)
        # rather than the head node IP.
        if docker_user is None:
            found = (line.strip() == f'HostName {ip}' and
                     next_line.strip() == f'User {username}')
        else:
            found = (line.strip() == 'HostName localhost' and
                     next_line.strip() == f'User {docker_user}')
        if found:
            # Find the line starting with ProxyCommand and contains the ip
            # to disambiguate entries that share HostName/User.
            found = False
            for idx in range(i, len(config)):
                # Stop if we reach an empty line, which means a new host
                if not config[idx].strip():
                    break
                if config[idx].strip().startswith('ProxyCommand'):
                    proxy_command_line = config[idx].strip()
                    if proxy_command_line.endswith(f'@{ip}'):
                        found = True
                        break
            if found:
                # The 'Host' directive is the line just above 'HostName'.
                start_line_idx = i - 1
                break

    if start_line_idx is not None:
        # Scan for end of previous config.
        # Walk backwards to the blank line separating this entry from the
        # previous one.
        cursor = start_line_idx
        while cursor > 0 and len(config[cursor].strip()) > 0:
            cursor -= 1
        prev_end_line_idx = cursor

        # Scan for end of the cluster config.
        # The entry ends at the next comment or 'Host ' directive; if
        # neither is found, it runs to end-of-file (end_line_idx stays
        # None).
        end_line_idx = None
        cursor = start_line_idx + 1
        start_line_idx -= 1  # remove auto-generated comment
        while cursor < len(config):
            if config[cursor].strip().startswith(
                    '# ') or config[cursor].strip().startswith('Host '):
                end_line_idx = cursor
                break
            cursor += 1

        # Remove sky-generated config and update the file.
        # Replace the stale span with a single blank separator line, or
        # drop it entirely when the entry was the last one in the file.
        config[prev_end_line_idx:end_line_idx] = [
            '\n'
        ] if end_line_idx is not None else []
        with open(config_path, 'w', encoding='utf-8') as f:
            f.write(''.join(config).strip())
            f.write('\n' * 2)

    # Delete include statement if it exists in the config.
    sky_autogen_comment = ('# Added by sky (use `sky stop/down '
                           f'{cluster_name}` to remove)')
    # Re-read: the file may have been rewritten above.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = f.readlines()

    for i, line in enumerate(config):
        config_str = line.strip()
        if f'Include {cluster_config_path}' in config_str:
            with open(config_path, 'w', encoding='utf-8') as f:
                # Drop the trailing blank line following the Include, if
                # any, before deleting the Include itself (delete the
                # later index first so earlier indices stay valid).
                if i < len(config) - 1 and config[i + 1] == '\n':
                    del config[i + 1]
                # Delete Include string
                del config[i]
                # Delete Sky Autogen Comment
                if i > 0 and sky_autogen_comment in config[i - 1].strip():
                    del config[i - 1]
                f.write(''.join(config))
            break
        # Include directives only appear before the first Host entry, so
        # stop scanning once any Host-related line is reached.
        if 'Host' in config_str:
            break

@classmethod
# TODO: We can remove this after 0.6.0 and have a lock only per cluster.
@timeline.FileLockEvent(ssh_conf_lock_path)
def remove_cluster(
cls,
cluster_name: str,
ip: str,
auth_config: Dict[str, str],
docker_user: Optional[str] = None,
):
def remove_cluster(cls, cluster_name: str):
"""Remove authentication information for cluster from ~/.sky/ssh/<cluster_name>.

For backward compatibility also remove the config from ~/.ssh/config if it exists.

If no existing host matching the provided specification is found, then
nothing is removed.

Args:
ip: Head node's IP address.
auth_config: read_yaml(handle.cluster_yaml)['auth']
docker_user: If not None, use this user to ssh into the docker
cluster_name: Cluster name.
"""
cluster_config_path = os.path.expanduser(
cls.ssh_cluster_path.format(cluster_name))
common_utils.remove_file_if_exists(cluster_config_path)

# Ensures backward compatibility: before #2706, we wrote the config of SkyPilot clusters
# directly in ~/.ssh/config. For these clusters, we should clean up the config.
# TODO: Remove this after 0.6.0
cls._remove_stale_cluster_config_for_backward_compatibility(
cluster_name, ip, auth_config, docker_user)
with timeline.FileLockEvent(
cls.ssh_conf_per_cluster_lock_path.format(cluster_name)):
cluster_config_path = os.path.expanduser(
cls.ssh_cluster_path.format(cluster_name))
common_utils.remove_file_if_exists(cluster_config_path)


def _replace_yaml_dicts(
Expand Down Expand Up @@ -867,7 +744,7 @@ def write_cluster_config(
labels = skypilot_config.get_nested((str(cloud).lower(), 'labels'), {})
# Deprecated: instance_tags have been replaced by labels. For backward
# compatibility, we support them and the schema allows them only if
# `labels` are not specified. This should be removed after 0.7.0.
# `labels` are not specified. This should be removed after 0.8.0.
labels = skypilot_config.get_nested((str(cloud).lower(), 'instance_tags'),
labels)
# labels is a dict, which is guaranteed by the type check in
Expand Down
103 changes: 4 additions & 99 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2118,13 +2118,8 @@ def __init__(
stable_internal_external_ips: Optional[List[Tuple[str,
str]]] = None,
stable_ssh_ports: Optional[List[int]] = None,
cluster_info: Optional[provision_common.ClusterInfo] = None,
# The following 2 fields are deprecated. SkyPilot new provisioner
# API handles the TPU node creation/deletion.
# Backward compatibility for TPU nodes created before #2943.
# TODO (zhwu): Remove this after 0.6.0.
tpu_create_script: Optional[str] = None,
tpu_delete_script: Optional[str] = None) -> None:
cluster_info: Optional[provision_common.ClusterInfo] = None
) -> None:
self._version = self._VERSION
self.cluster_name = cluster_name
self.cluster_name_on_cloud = cluster_name_on_cloud
Expand All @@ -2139,12 +2134,6 @@ def __init__(
self.launched_nodes = launched_nodes
self.launched_resources = launched_resources
self.docker_user: Optional[str] = None
# Deprecated. SkyPilot new provisioner API handles the TPU node
# creation/deletion.
# Backward compatibility for TPU nodes created before #2943.
# TODO (zhwu): Remove this after 0.6.0.
self.tpu_create_script = tpu_create_script
self.tpu_delete_script = tpu_delete_script

def __repr__(self):
return (f'ResourceHandle('
Expand All @@ -2160,10 +2149,7 @@ def __repr__(self):
f'\n\tlaunched_resources={self.launched_nodes}x '
f'{self.launched_resources}, '
f'\n\tdocker_user={self.docker_user},'
f'\n\tssh_user={self.ssh_user},'
# TODO (zhwu): Remove this after 0.6.0.
f'\n\ttpu_create_script={self.tpu_create_script}, '
f'\n\ttpu_delete_script={self.tpu_delete_script})')
f'\n\tssh_user={self.ssh_user}')

def get_cluster_name(self):
return self.cluster_name
Expand All @@ -2176,26 +2162,6 @@ def _use_internal_ips(self):
return common_utils.read_yaml(self.cluster_yaml).get(
'provider', {}).get('use_internal_ips', False)

def _update_cluster_region(self):
    """Update the region in handle.launched_resources.

    Backfills the region field for clusters launched before region info
    was recorded, by reading it from the cluster's provider YAML.

    This is for backward compatibility to handle the clusters launched
    long before. We should remove this after 0.6.0.
    """
    # Region already recorded; nothing to backfill.
    if self.launched_resources.region is not None:
        return

    config = common_utils.read_yaml(self.cluster_yaml)
    provider = config['provider']
    cloud = self.launched_resources.cloud
    # Azure spells the region key 'location'; GCP/AWS use 'region'.
    if cloud.is_same_cloud(clouds.Azure()):
        region = provider['location']
    elif cloud.is_same_cloud(clouds.GCP()) or cloud.is_same_cloud(
            clouds.AWS()):
        region = provider['region']

    # NOTE(review): if the cloud is none of Azure/GCP/AWS, `region` is
    # unbound here and this raises UnboundLocalError — presumably only
    # these three clouds predate region recording; confirm.
    self.launched_resources = self.launched_resources.copy(region=region)

def update_ssh_ports(self, max_attempts: int = 1) -> None:
"""Fetches and sets the SSH ports for the cluster nodes.

Expand Down Expand Up @@ -2567,8 +2533,6 @@ def __setstate__(self, state):
if version < 4:
self.update_ssh_ports()

self._update_cluster_region()

if version < 8:
try:
self._update_cluster_info()
Expand Down Expand Up @@ -2649,8 +2613,6 @@ def check_resources_fit_cluster(
if record is not None:
usage_lib.messages.usage.update_cluster_status(record['status'])

# Backward compatibility: the old launched_resources without region info
# was handled by ResourceHandle._update_cluster_region.
assert launched_resources.region is not None, handle

mismatch_str = (f'To fix: specify a new cluster name, or down the '
Expand Down Expand Up @@ -4081,55 +4043,9 @@ def post_teardown_cleanup(self,
* Removing ssh configs for the cluster;
* Updating the local state of the cluster;
* Removing the terminated cluster's scripts and ray yaml files.

Raises:
RuntimeError: If it fails to delete the TPU.
"""
log_path = os.path.join(os.path.expanduser(self.log_dir),
'teardown.log')
log_abs_path = os.path.abspath(log_path)
cluster_name_on_cloud = handle.cluster_name_on_cloud

# Backward compatibility for TPU nodes created before #2943. Any TPU
# node launched before that PR have the delete script generated (and do
# not have the tpu_node config set in its cluster yaml), so we have to
# call the deletion script to clean up the TPU node.
# For TPU nodes launched after the PR, deletion is done in SkyPilot's
# new GCP provisioner API.
# TODO (zhwu): Remove this after 0.6.0.
if (handle.tpu_delete_script is not None and
os.path.exists(handle.tpu_delete_script)):
# Only call the deletion script if the cluster config does not
# contain TPU node config. Otherwise, the deletion should
# already be handled by the new provisioner.
config = common_utils.read_yaml(handle.cluster_yaml)
tpu_node_config = config['provider'].get('tpu_node')
if tpu_node_config is None:
with rich_utils.safe_status(
ux_utils.spinner_message('Terminating TPU')):
tpu_rc, tpu_stdout, tpu_stderr = log_lib.run_with_log(
['bash', handle.tpu_delete_script],
log_abs_path,
stream_logs=False,
require_outputs=True)
if tpu_rc != 0:
if _TPU_NOT_FOUND_ERROR in tpu_stderr:
logger.info('TPU not found. '
'It should have been deleted already.')
elif purge:
logger.warning(
_TEARDOWN_PURGE_WARNING.format(
reason='stopping/terminating TPU',
details=tpu_stderr))
else:
raise RuntimeError(
_TEARDOWN_FAILURE_MESSAGE.format(
extra_reason='It is caused by TPU failure.',
cluster_name=common_utils.cluster_name_in_hint(
handle.cluster_name, cluster_name_on_cloud),
stdout=tpu_stdout,
stderr=tpu_stderr))

if (terminate and handle.launched_resources.is_image_managed is True):
# Delete the image when terminating a "cloned" cluster, i.e.,
# whose image is created by SkyPilot (--clone-disk-from)
Expand Down Expand Up @@ -4174,11 +4090,7 @@ def post_teardown_cleanup(self,
# The cluster file must exist because the cluster_yaml will only
# be removed after the cluster entry in the database is removed.
config = common_utils.read_yaml(handle.cluster_yaml)
auth_config = config['auth']
backend_utils.SSHConfigHelper.remove_cluster(handle.cluster_name,
handle.head_ip,
auth_config,
handle.docker_user)
backend_utils.SSHConfigHelper.remove_cluster(handle.cluster_name)

global_user_state.remove_cluster(handle.cluster_name,
terminate=terminate)
Expand All @@ -4187,13 +4099,6 @@ def post_teardown_cleanup(self,
# This function could be directly called from status refresh,
# where we need to cleanup the cluster profile.
metadata_utils.remove_cluster_metadata(handle.cluster_name)
# Clean up TPU creation/deletion scripts
# Backward compatibility for TPU nodes created before #2943.
# TODO (zhwu): Remove this after 0.6.0.
if handle.tpu_delete_script is not None:
assert handle.tpu_create_script is not None
common_utils.remove_file_if_exists(handle.tpu_create_script)
common_utils.remove_file_if_exists(handle.tpu_delete_script)

# Clean up generated config
# No try-except is needed since Ray will fail to teardown the
Expand Down
7 changes: 0 additions & 7 deletions sky/provision/gcp/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,13 +632,6 @@ def cleanup_ports(
del ports # Unused.
assert provider_config is not None, cluster_name_on_cloud
project_id = provider_config['project_id']
if 'ports' in provider_config:
# Backward compatibility for old provider config.
# TODO(tian): remove this after 2 minor releases, 0.6.0.
for port in provider_config['ports']:
firewall_rule_name = f'user-ports-{cluster_name_on_cloud}-{port}'
instance_utils.GCPComputeInstance.delete_firewall_rule(
project_id, firewall_rule_name)
if 'firewall_rule' in provider_config:
firewall_rule_name = provider_config['firewall_rule']
instance_utils.GCPComputeInstance.delete_firewall_rule(
Expand Down
2 changes: 0 additions & 2 deletions sky/serve/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -572,8 +572,6 @@ def status(
'controller_port': (Optional[int]) controller port,
'load_balancer_port': (Optional[int]) load balancer port,
'policy': (Optional[str]) load balancer policy description,
'requested_resources': (sky.Resources) requested resources
for replica (deprecated),
'requested_resources_str': (str) str representation of
requested resources,
'replica_info': (List[Dict[str, Any]]) replica information,
Expand Down
Loading
Loading