Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[UX] sky launch --fast #4159

Merged
merged 11 commits into from
Oct 31, 2024
2 changes: 1 addition & 1 deletion sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2705,7 +2705,7 @@ def _provision(
(e.g., cluster name invalid) or a region/zone throwing
resource unavailability.
exceptions.CommandError: any ssh command error.
RuntimeErorr: raised when 'rsync' is not installed.
RuntimeError: raised when 'rsync' is not installed.
# TODO(zhwu): complete the list of exceptions.
"""
# FIXME: ray up for Azure with different cluster_names will overwrite
Expand Down
13 changes: 12 additions & 1 deletion sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,7 @@ def _launch_with_confirm(
retry_until_up: bool = False,
no_setup: bool = False,
clone_disk_from: Optional[str] = None,
fast: bool = False,
):
"""Launch a cluster with a Task."""
if cluster is None:
Expand Down Expand Up @@ -619,6 +620,7 @@ def _launch_with_confirm(
retry_until_up=retry_until_up,
no_setup=no_setup,
clone_disk_from=clone_disk_from,
fast=fast,
)


Expand Down Expand Up @@ -1040,6 +1042,13 @@ def cli():
help=('[Experimental] Clone disk from an existing cluster to launch '
'a new one. This is useful when the new cluster needs to have '
'the same data on the boot disk as an existing cluster.'))
@click.option(
'--fast',
is_flag=True,
default=False,
required=False,
help=('[Experimental] If the cluster is already up and available, skip '
'provisioning and setup steps.'))
@usage_lib.entrypoint
def launch(
entrypoint: Tuple[str, ...],
Expand Down Expand Up @@ -1071,6 +1080,7 @@ def launch(
yes: bool,
no_setup: bool,
clone_disk_from: Optional[str],
fast: bool,
):
"""Launch a cluster or task.

Expand Down Expand Up @@ -1139,7 +1149,8 @@ def launch(
down=down,
retry_until_up=retry_until_up,
no_setup=no_setup,
clone_disk_from=clone_disk_from)
clone_disk_from=clone_disk_from,
fast=fast)


@cli.command(cls=_DocumentedCodeCommand)
Expand Down
37 changes: 35 additions & 2 deletions sky/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from sky import admin_policy
from sky import backends
from sky import clouds
from sky import exceptions
from sky import global_user_state
from sky import optimizer
from sky import sky_logging
Expand Down Expand Up @@ -216,7 +217,8 @@ def _execute(
'(after all jobs finish).'
f'{colorama.Style.RESET_ALL}')
idle_minutes_to_autostop = 1
stages.remove(Stage.DOWN)
if Stage.DOWN in stages:
stages.remove(Stage.DOWN)
if idle_minutes_to_autostop >= 0:
requested_features.add(
clouds.CloudImplementationFeatures.AUTO_TERMINATE)
Expand Down Expand Up @@ -355,6 +357,7 @@ def launch(
detach_run: bool = False,
no_setup: bool = False,
clone_disk_from: Optional[str] = None,
fast: bool = False,
# Internal only:
# pylint: disable=invalid-name
_is_launched_by_jobs_controller: bool = False,
Expand Down Expand Up @@ -409,6 +412,8 @@ def launch(
clone_disk_from: [Experimental] if set, clone the disk from the
specified cluster. This is useful to migrate the cluster to a
different availability zone or region.
fast: [Experimental] If the cluster is already up and available,
skip provisioning and setup steps.

Example:
.. code-block:: python
Expand Down Expand Up @@ -452,15 +457,43 @@ def launch(
controller_utils.check_cluster_name_not_controller(
cluster_name, operation_str='sky.launch')

handle = None
stages = None
# Check if cluster exists and we are doing fast provisioning
if fast and cluster_name is not None:
maybe_handle = global_user_state.get_handle_from_cluster_name(
cluster_name)
if maybe_handle is not None:
try:
# This will throw if the cluster is not available
backend_utils.check_cluster_available(
cluster_name,
operation='executing tasks',
check_cloud_vm_ray_backend=False,
dryrun=dryrun)
handle = maybe_handle
# Get all stages
stages = [
Stage.SYNC_WORKDIR,
Stage.SYNC_FILE_MOUNTS,
Stage.PRE_EXEC,
Stage.EXEC,
Stage.DOWN,
]
except exceptions.ClusterNotUpError:
# Proceed with normal provisioning
pass

return _execute(
entrypoint=entrypoint,
dryrun=dryrun,
down=down,
stream_logs=stream_logs,
handle=None,
handle=handle,
backend=backend,
retry_until_up=retry_until_up,
optimize_target=optimize_target,
stages=stages,
cluster_name=cluster_name,
detach_setup=detach_setup,
detach_run=detach_run,
Expand Down
85 changes: 85 additions & 0 deletions tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,69 @@ def test_minimal(generic_cloud: str):
run_one_test(test)


# ---------- Test fast launch ----------
def test_launch_fast(generic_cloud: str):
    """Smoke-test that `sky launch --fast` reuses a cluster that is already up.

    The cluster is created by a first `--fast` launch, then the same YAML is
    launched a second time with `--fast`. The output of the second launch is
    grepped to confirm that neither the provisioning nor the setup messages
    show up, while the job itself still reports SUCCEEDED.

    Args:
        generic_cloud: Name of the cloud passed to `--cloud` on first launch.
    """
    cluster = _get_cluster_name()

    # Initial launch: creates the cluster and goes through the regular launch
    # output validation.
    create_cmd = (
        f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {cluster} --cloud {generic_cloud} --fast tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}'
    )

    # Relaunch on the existing cluster. The captured output `$s` is echoed for
    # debugging and then checked piece by piece.
    relaunch_cmd = (
        f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {cluster} --fast tests/test_yamls/minimal.yaml) && '
        ' echo "$s" && '
        # The provisioning banner must be absent: cluster was not re-launched.
        '! echo "$s" | grep -A 1 "Launching on" | grep "is up." && '
        # The setup banner must be absent: setup was not re-run.
        '! echo "$s" | grep -A 1 "Running setup on" | grep "running setup" && '
        # The task itself must still have run to completion.
        'echo "$s" | grep -A 1 "task run finish" | grep "Job finished (status: SUCCEEDED)"'
    )

    commands = [
        create_cmd,
        f'sky logs {cluster} 1 --status',
        relaunch_cmd,
        f'sky logs {cluster} 2 --status',
        f'sky status -r {cluster} | grep UP',
    ]
    teardown = f'sky down -y {cluster}'

    test = Test(
        'test_launch_fast',
        commands,
        teardown,
        timeout=_get_timeout(generic_cloud),
    )
    run_one_test(test)


# The excluded clouds below mirror test_autostop; see that test for the
# explanation of each exclusion.
@pytest.mark.no_fluidstack
@pytest.mark.no_lambda_cloud
@pytest.mark.no_ibm
@pytest.mark.no_kubernetes
def test_launch_fast_with_autostop(generic_cloud: str):
    """Smoke-test `sky launch --fast` against a cluster that has autostopped.

    The cluster is launched with `--fast -i 1`, the test sleeps long enough
    for autostop to kick in, checks that the cluster shows as STOPPED, and
    then launches again with `--fast`. The second launch is run through the
    full launch output validation because a re-launch is expected.

    Args:
        generic_cloud: Name of the cloud passed to `--cloud` on first launch.
    """
    cluster = _get_cluster_name()

    # Azure takes ~ 7m15s (435s) to autostop a VM, so 600 is used there to
    # make sure the VM is really stopped; other clouds get 250.
    if generic_cloud == 'azure':
        autostop_timeout = 600
    else:
        autostop_timeout = 250

    # First launch: create the cluster with a short autostop.
    create_cmd = (
        f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {cluster} --cloud {generic_cloud} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}'
    )
    # After the sleep the refreshed status must report the cluster STOPPED.
    assert_stopped_cmd = (
        f's=$(sky status {cluster} --refresh); echo "$s"; echo; echo; echo "$s" | grep {cluster} | grep STOPPED'
    )
    # Second launch: full output validation, the cluster should re-launch.
    relaunch_cmd = (
        f'unset SKYPILOT_DEBUG; s=$(sky launch -y -c {cluster} --fast -i 1 tests/test_yamls/minimal.yaml) && {_VALIDATE_LAUNCH_OUTPUT}'
    )

    commands = [
        create_cmd,
        f'sky logs {cluster} 1 --status',
        f'sky status -r {cluster} | grep UP',
        f'sleep {autostop_timeout}',
        assert_stopped_cmd,
        relaunch_cmd,
        f'sky logs {cluster} 2 --status',
        f'sky status -r {cluster} | grep UP',
    ]
    teardown = f'sky down -y {cluster}'

    test = Test(
        'test_launch_fast_with_autostop',
        commands,
        teardown,
        # Budget the sleep on top of the regular per-cloud timeout.
        timeout=_get_timeout(generic_cloud) + autostop_timeout,
    )
    run_one_test(test)


# ---------- Test region ----------
@pytest.mark.aws
def test_aws_region():
Expand Down Expand Up @@ -4376,6 +4439,28 @@ def test_core_api_sky_launch_exec():
sky.down(name)


# The `sky launch` CLI performs some additional checks to make sure the
# cluster is up/restarted. The core API does not have those checks, so this
# test makes sure `fast=True` still works when called through the API.
def test_core_api_sky_launch_fast(generic_cloud: str):
    """Call `sky.launch(..., fast=True)` twice on the same cluster.

    The first call launches the cluster with a one-minute autostop. After a
    120 second sleep the same task is launched again with `fast=True`; the
    test passes if neither call raises. The cluster is torn down in all cases.

    Args:
        generic_cloud: Cloud name resolved through `sky.clouds.CLOUD_REGISTRY`.
    """
    name = _get_cluster_name()
    cloud = sky.clouds.CLOUD_REGISTRY.from_str(generic_cloud)
    # Both launches use exactly the same keyword arguments.
    launch_options = {
        'cluster_name': name,
        'idle_minutes_to_autostop': 1,
        'fast': True,
    }
    try:
        task = sky.Task(run="whoami")
        task = task.set_resources(sky.Resources(cloud=cloud))
        sky.launch(task, **launch_options)
        # Sleep to let the cluster autostop
        time.sleep(120)
        # Second launch on the same cluster - should work with fast=True
        sky.launch(task, **launch_options)
    finally:
        # Always clean up, even when one of the launches failed.
        sky.down(name)


# ---------- Testing Storage ----------
class TestStorageWithCredentials:
"""Storage tests which require credentials and network connection"""
Expand Down
Loading