Skip to content

Commit

Permalink
Make all SLURM jobs run on jumpbox runners
Browse files (browse the repository at this point in the history)
  • Loading branch information
yhtang committed Oct 15, 2024
1 parent e8043a5 commit d0800ed
Show file tree
Hide file tree
Showing 7 changed files with 18 additions and 18 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/_runner_ondemand_slurm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ on:
jobs:

launch-slurm-runner:
- runs-on: ubuntu-latest
+ runs-on: jumpbox
steps:
- name: Print environment variables
run: env
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/_test_maxtext.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:
# - [1, 1, 1, 8] # PP, DP, FSDP, TP
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox

steps:
- name: Print environment variables
Expand Down Expand Up @@ -194,7 +194,7 @@ jobs:
- [1, 4, 2, 2]
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox

steps:
- name: Print environment variables
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/_test_pax_rosetta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:
- [1, 1, 2, 4]
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox

env:
BADGE_FILENAME_PREFIX: badge-rosetta-pax-single-process-multi-device-te
Expand Down Expand Up @@ -260,7 +260,7 @@ jobs:
ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox
env:
BADGE_FILENAME_PREFIX: badge-rosetta-pax-multi-node-te
steps:
Expand Down Expand Up @@ -458,7 +458,7 @@ jobs:
- [4, 2, 1, 2]
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox
env:
BADGE_FILENAME_PREFIX: badge-rosetta-pax-multi-node
steps:
Expand Down Expand Up @@ -650,7 +650,7 @@ jobs:
- [1, 8, 1, 1]
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox
env:
BADGE_FILENAME_PREFIX: badge-rosetta-pax-single-node-dropout-te
steps:
Expand Down Expand Up @@ -844,7 +844,7 @@ jobs:
- [1, 8, 1, 1]
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox
env:
BADGE_FILENAME_PREFIX: badge-rosetta-pax-single-process-evaluation-te
steps:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/_test_slurm_pyxis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ jobs:

run-test:
name: ${{ inputs.NAME }}
- runs-on: ubuntu-22.04
+ runs-on: jumpbox
outputs:
SLURM_JOB_ID: ${{ steps.submit.outputs.SLURM_JOB_ID }}
SLURM_STATE: ${{ steps.exit-info.outputs.SLURM_STATE }}
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/_test_t5x_rosetta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False"
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox
env:
BADGE_FILENAME_PREFIX: badge-rosetta-t5x-single-process-multi-device
steps:
Expand Down Expand Up @@ -242,7 +242,7 @@ jobs:
EXTRA_GIN_ARGS: ""
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox
env:
BADGE_FILENAME_PREFIX: badge-rosetta-t5x-multi-gpu-multi-node
steps:
Expand Down Expand Up @@ -428,7 +428,7 @@ jobs:
N_GPU: [8]
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox
env:
BADGE_FILENAME_PREFIX: badge-rosetta-t5x-vit-single-process-multi-device
steps:
Expand Down Expand Up @@ -598,7 +598,7 @@ jobs:
N_NODE: [1, 2]
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox
env:
BADGE_FILENAME_PREFIX: badge-rosetta-t5x-vit-multi-gpu-multi-node
steps:
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/_test_upstream_pax.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:
- [1, 1, 2, 4]
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox

steps:
- name: Print environment variables
Expand Down Expand Up @@ -212,7 +212,7 @@ jobs:
ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox

steps:
- name: Print environment variables
Expand Down Expand Up @@ -361,7 +361,7 @@ jobs:
- [1, 8, 1, 1]
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox

steps:
- name: Print environment variables
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/_test_upstream_t5x.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ jobs:
ADDITIONAL_ARGS: "--enable-fmha 1"
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox

steps:
- name: Print environment variables
Expand Down Expand Up @@ -191,7 +191,7 @@ jobs:
ADDITIONAL_ARGS: "--enable-fmha 1"
fail-fast: false

- runs-on: ubuntu-22.04
+ runs-on: jumpbox

steps:
- name: Print environment variables
Expand Down

0 comments on commit d0800ed

Please sign in to comment.