Skip to content

Commit

Permalink
[k8s] Increase ssh timeout when calling uptime (#2785)
Browse files Browse the repository at this point in the history
* update timeout

* lint

* readability
  • Loading branch information
romilbhardwaj authored Nov 16, 2023
1 parent 0b52a31 commit ef8839a
Showing 1 changed file with 28 additions and 0 deletions.
28 changes: 28 additions & 0 deletions sky/skylet/providers/kubernetes/node_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

# Retry budget for tagging operations.
# NOTE(review): the retry loop itself is outside this view — confirm usage.
MAX_TAG_RETRIES = 3
# Pause before retrying a tag operation — presumably in seconds; TODO confirm.
DELAY_BEFORE_TAG_RETRY = 0.5
# Timeout (seconds) forced onto the `uptime` SSH command used to check
# cluster liveness; replaces the shorter per-call timeout chosen by Ray.
UPTIME_SSH_TIMEOUT = 10

# Label key string; presumably identifies the Ray component (e.g. head or
# worker) a Kubernetes object belongs to — verify against the selectors.
RAY_COMPONENT_LABEL = 'cluster.ray.io/component'

Expand All @@ -30,6 +31,33 @@ def set_port(self, port):

SSHCommandRunner.set_port = set_port

# Monkey patch SSHCommandRunner to use a larger timeout when running uptime to
# check cluster liveness. This is needed because the default timeout of 5s is
# too short when the cluster is accessed from different geographical
# locations over VPN.
#
# Ray autoscaler sets the timeout on a per-call basis (as an arg to
# SSHCommandRunner.run). The 5s timeout is hardcoded in
# NodeUpdater.wait_ready() in updater.py and is hard to modify without
# duplicating a large chunk of ray autoscaler code. Instead, we
# monkey patch the run method to check if the command being run is 'uptime',
# and if so change the timeout to 10s.
#
# Fortunately, Ray uses a timeout of 120s for running commands after the
# cluster is ready, so we do not need to modify that.


def run_override_timeout(*args, **kwargs):
    """Delegate to the original ``SSHCommandRunner.run``, forcing a longer
    timeout when the command is ``uptime``.

    This function is installed in place of ``SSHCommandRunner.run``; the
    original method is expected to be stored as ``SSHCommandRunner._run``.

    Args:
        *args: Positional arguments of the intercepted ``run`` call. The
            first element is the runner instance (``self``); the command,
            if passed positionally, is the second.
        **kwargs: Keyword arguments of the intercepted ``run`` call.

    Returns:
        Whatever the original ``run`` method returns.
    """
    # The command may be given positionally (right after `self`), as the
    # `cmd` keyword, or not at all. Blindly indexing args[1] would raise
    # IndexError in the latter two cases.
    if len(args) > 1:
        command = args[1]
    else:
        command = kwargs.get('cmd')

    if command == 'uptime':
        if len(args) > 2:
            # The timeout was passed positionally; setting it via kwargs as
            # well would raise "got multiple values for argument 'timeout'".
            # NOTE(review): assumes `timeout` is the parameter immediately
            # after `cmd` in run()'s signature, as in Ray's
            # SSHCommandRunner.run — confirm against the pinned Ray version.
            args = args[:2] + (UPTIME_SSH_TIMEOUT,) + args[3:]
        else:
            kwargs['timeout'] = UPTIME_SSH_TIMEOUT
    return SSHCommandRunner._run(*args, **kwargs)


# Save the original run method under `_run`, then install the override.
# The save is guarded so that executing this module body more than once
# (e.g. via importlib.reload) does not overwrite `_run` with the override
# itself, which would make run_override_timeout recurse forever.
if '_run' not in vars(SSHCommandRunner):
    SSHCommandRunner._run = SSHCommandRunner.run
SSHCommandRunner.run = run_override_timeout


def head_service_selector(cluster_name: str) -> Dict[str, str]:
"""Selector for Operator-configured head service."""
Expand Down

0 comments on commit ef8839a

Please sign in to comment.