Skip to content

Commit

Permalink
[AWS] unset AWS env vars to avoid autoscaler using the incorrect cred…
Browse files Browse the repository at this point in the history
…entials (#2442)

* unset AWS env vars to avoid autoscaler using the incorrect credentials

* add comment

* Update sky/templates/aws-ray.yml.j2

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

---------

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>
  • Loading branch information
Michaelvll and concretevitamin authored Aug 22, 2023
1 parent f890269 commit ab9daf0
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions sky/templates/aws-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -225,16 +225,20 @@ setup_commands:
# Increment the following to make performance bugs easier to catch:
# current num items (num SSH connections): 1
head_start_ray_commands:
# Commands that (re)start ray on the head node: stop any running ray, start the head
# process, raise the raylet open-file limit, then run the templated dump-port command.
#
# Unset AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY to avoid using credentials from environment
# variables set by the user. SkyPilot's ray cluster should use the `~/.aws/` credentials, as those
# are the ones used to create the cluster, and the autoscaler module started by the `ray start`
# command should use the same credentials. Otherwise, `ray status` will fail to fetch the
# available nodes.
# Reference: https://github.com/skypilot-org/skypilot/issues/2441
# NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
# Line "which prlimit ..": increase the limit of the number of open files for the raylet process,
# as the `ulimit` may not take effect at this point, because it requires all the sessions to be
# reloaded. This is a workaround.
# `|| exit 1` fails the command when `ray start` fails. The prlimit loop only runs when
# `which prlimit` succeeds, and `|| true` keeps a failed prlimit call from failing the command.
- ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
- ray stop; unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
{{dump_port_command}};

{%- if num_nodes > 1 %}
worker_start_ray_commands:
- ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
- ray stop; unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
{%- else %}
worker_start_ray_commands: []
Expand Down

0 comments on commit ab9daf0

Please sign in to comment.