~Sandbox #249
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Manually dispatched scratch workflow. The dispatch inputs select the image
# pair to triage and the source directories to fast-forward.
name: '~Sandbox'

on:
  workflow_dispatch:
    inputs:
      BASE_IMAGE:
        type: string
        description: 'Base image to fast-forward dependencies (older)'
        required: false
        default: 'ghcr.io/nvidia/t5x:nightly-2023-07-18'
      BROKEN_IMAGE:
        type: string
        description: "Broken image (newer)"
        required: false
        default: 'ghcr.io/nvidia/t5x:nightly-2023-07-20'
      REPO_DIRS:
        type: string
        description: "Space separated dirs to fast-forward (e.g., '/opt/flax /opt/t5x')"
        required: false
        default: ''

env:
  # Image repository that the build jobs tag and push to.
  UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal
  # Issue assignees chosen by the summary jobs.
  PAX_ASSIGNEE: ashors1
  T5X_ASSIGNEE: terrykong
  JAX_ASSIGNEE: yhtang
  # Handles appended to the body of the triage issue.
  STAKEHOLDERS: '@ashors1 @terrykong @yhtang'

permissions:
  contents: read  # to fetch code
  actions: write  # to cancel previous workflows
  packages: write  # to upload container

jobs:
# Placeholder job: logs in to ghcr.io and prints how the sandbox workflow is
# meant to be used.
sandbox:
  runs-on: ubuntu-22.04
  steps:
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v2
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}

    # The heredoc below is emitted verbatim to the job log.
    - name: Print usage
      run: |
        cat << EOF
        This is an empty workflow file located in the main branch of your
        repository. It serves as a testing ground for new GitHub Actions on
        development branches before merging them to the main branch. By
        defining and overloading this workflow on your development branch,
        you can test new actions without affecting your main branch, ensuring
        a smooth integration process once the changes are ready to be merged.
        Usage:
        1. In your development branch, modify the sandbox.yml workflow file
        to include the new actions you want to test. Make sure to commit
        the changes to the development branch.
        2. Navigate to the 'Actions' tab in your repository, select the
        '~Sandbox' workflow, and choose your development branch from the
        branch dropdown menu. Click on 'Run workflow' to trigger the
        workflow on your development branch.
        3. Once you have tested and verified the new actions in the Sandbox
        workflow, you can incorporate them into your main workflow(s) and
        merge the development branch into the main branch. Remember to
        revert the changes to the sandbox.yml file in the main branch to
        keep it empty for future testing.
        EOF
# Derives everything the downstream build/test/summary jobs need from the
# dispatch inputs: the list of nightly tags to rewind to, the image repository,
# the framework name and the build date of the broken image.
metadata:
  runs-on: ubuntu-22.04
  outputs:
    # Dates will be generated [S, S+1, S+2, ..., E-1]
    # - Will go thru S, S+1, ..., E-1 in this order since it is assumed that S was the latest
    #   working image, so it has the best chance to succeed.
    TAGS_BETWEEN: ${{ steps.meta.outputs.TAGS_BETWEEN }}
    # If the BASE_IMAGE is ghcr.io/nvidia/t5x:nightly-YYYY-MM-DD, then BASE_IMAGE_REPO is ghcr.io/nvidia/t5x
    BASE_IMAGE_REPO: ${{ steps.meta.outputs.BASE_IMAGE_REPO }}
    BROKEN_IMAGE: ${{ steps.meta.outputs.BROKEN_IMAGE }}
    # This would be something like either t5x or pax
    FRAMEWORK_BASE: ${{ steps.meta.outputs.FRAMEWORK_BASE }}
    BROKEN_DATE: ${{ steps.meta.outputs.BROKEN_DATE }}
    UPLD_IMAGE: ${{ steps.meta.outputs.UPLD_IMAGE }}
  steps:
    - name: Check out the repository under ${GITHUB_WORKSPACE}
      uses: actions/checkout@v3

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v2
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Generate image metadata
      id: meta
      shell: bash -x -e {0}
      env:
        # Dispatch inputs are handed over as environment variables so their
        # content is never spliced into the script text (no shell injection,
        # no accidental word splitting).
        DISPATCH_BASE_IMAGE: ${{ inputs.BASE_IMAGE }}
        DISPATCH_BROKEN_IMAGE: ${{ inputs.BROKEN_IMAGE }}
      run: |
        # Helper scripts from the checked-out repository; they provide
        # get_build_date and all_image_tags used below.
        source .github/workflows/scripts/get_build_date.sh
        source .github/workflows/scripts/all_image_tags.sh

        BASE_IMAGE="$DISPATCH_BASE_IMAGE"
        BROKEN_IMAGE="$DISPATCH_BROKEN_IMAGE"

        # Everything before the last ':' is the repository; the last path
        # component of the repository is the framework name.
        BASE_IMAGE_REPO="${BASE_IMAGE%:*}"
        FRAMEWORK_BASE="${BASE_IMAGE_REPO##*/}"
        if [[ $FRAMEWORK_BASE != t5x && $FRAMEWORK_BASE != pax ]]; then
          echo "BASE_IMAGE=$BASE_IMAGE can only be ghcr.io/nvidia/pax or ghcr.io/nvidia/t5x"
          exit 1
        fi

        if ! docker manifest inspect "${BASE_IMAGE}" >/dev/null 2>&1; then
          echo "Script assumes $BASE_IMAGE exists, but it wasn't found"
          exit 1
        elif ! docker manifest inspect "${BROKEN_IMAGE}" >/dev/null 2>&1; then
          echo "Script assumes $BROKEN_IMAGE exists, but it wasn't found"
          exit 1
        fi

        ALL_BASE_TAGS="$(all_image_tags ${{ secrets.GITHUB_TOKEN }} "$BASE_IMAGE_REPO")"

        # Prints nightly-<date> for every day in [start_date, end_date).
        generate_tags() {
          start_date=$1
          end_date=$2
          while [[ "$start_date" < "$end_date" ]]; do
            echo "nightly-$start_date"
            start_date=$(date -I -d "$start_date + 1 day")
          done
        }

        # Keeps only the candidate tags that exist in the base image repository.
        filter_valid_tags() {
          grep -F -x -f <(echo "$ALL_BASE_TAGS")
        }

        start_date=${BASE_IMAGE##*nightly-}
        # Without a well-formed date the `date` call in generate_tags fails,
        # start_date becomes empty and the loop above never terminates.
        if [[ ! "$start_date" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]; then
          echo "BASE_IMAGE=$BASE_IMAGE must carry a nightly-YYYY-MM-DD tag" >&2
          exit 1
        fi

        # NOTE(review): presumably prints the build date as YYYY-MM-DD (the
        # string comparisons rely on that); confirm in get_build_date.sh.
        end_date=$(get_build_date ${{ secrets.GITHUB_TOKEN }} "$BROKEN_IMAGE")

        # Checked here in the main shell: an `exit` inside generate_tags would
        # only leave the command-substitution subshell and the step would
        # carry on with an empty tag list.
        if [[ "$start_date" > "$end_date" ]]; then
          echo "$start_date > $end_date, which is not supported" >&2
          exit 1
        fi

        # JSON array of the existing nightly tags, oldest first.
        TAGS_BETWEEN=$(generate_tags "$start_date" "$end_date" | filter_valid_tags | jq -R -s -c 'split("\n")[:-1]')

        {
          echo "TAGS_BETWEEN=$TAGS_BETWEEN"
          echo "BASE_IMAGE_REPO=$BASE_IMAGE_REPO"
          echo "BROKEN_IMAGE=$BROKEN_IMAGE"
          echo "FRAMEWORK_BASE=$FRAMEWORK_BASE"
          echo "BROKEN_DATE=$end_date"
          echo "UPLD_IMAGE=${{ env.UPLD_IMAGE }}"
        } | tee -a "$GITHUB_OUTPUT"
#######
# T5X #
#######
# Builds one fast-forwarded T5X image per tag in TAGS_BETWEEN and pushes it to
# UPLD_IMAGE. Only runs when the base image belongs to the t5x repository.
build-t5x-ff:
  needs: [metadata]
  if: needs.metadata.outputs.FRAMEWORK_BASE == 't5x'
  runs-on: [self-hosted, x86, small]
  strategy:
    # To enforce sequential execution, set to 1
    max-parallel: 2
    matrix:
      base_tag: ${{ fromJson(needs.metadata.outputs.TAGS_BETWEEN) }}
  steps:
    - name: Print environment variables
      run: env

    - name: Check out the repository under ${GITHUB_WORKSPACE}
      uses: actions/checkout@v3

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v2
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Set docker metadata
      id: meta
      uses: docker/metadata-action@v4
      with:
        images: |
          ${{ env.UPLD_IMAGE }}
        flavor: |
          latest=false
        tags: |
          type=raw,value=${{ github.run_id }}-${{ matrix.base_tag }}-ff-t5x-to-${{ needs.metadata.outputs.BROKEN_DATE }}
        labels: |
          org.opencontainers.image.created=${{ needs.metadata.outputs.BROKEN_DATE }}

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v2
      with:
        driver-opts: |
          image=moby/buildkit:v0.10.6

    - name: "Build docker image: ${{ needs.metadata.outputs.UPLD_IMAGE }}:${{ github.run_id }}-${{ matrix.base_tag }}-ff-t5x-to-${{ needs.metadata.outputs.BROKEN_DATE }}"
      uses: docker/build-push-action@v4
      with:
        context: .github/container
        push: true
        file: .github/container/Dockerfile.ff
        tags: ${{ steps.meta.outputs.tags }}
        labels: ${{ steps.meta.outputs.labels }}
        # Each matrix entry must start from its own nightly; passing
        # inputs.BASE_IMAGE here would make every build identical even though
        # the pushed tag names a different base_tag.
        build-args: |
          BASE_IMAGE=${{ needs.metadata.outputs.BASE_IMAGE_REPO }}:${{ matrix.base_tag }}
          BROKEN_IMAGE=${{ inputs.BROKEN_IMAGE }}
          REPO_DIRS=${{ inputs.REPO_DIRS }}

    - name: Log image to Github Step Summary
      run: |
        cat <<EOF | tee -a "$GITHUB_STEP_SUMMARY"
        * ${{ needs.metadata.outputs.UPLD_IMAGE }}:${{ github.run_id }}-${{ matrix.base_tag }}-ff-t5x-to-${{ needs.metadata.outputs.BROKEN_DATE }}
        EOF
# Calls the reusable T5X test workflow once per fast-forwarded image, one tag
# at a time, and keeps going when an individual tag fails.
test-t5x-ff:
  needs:
    - metadata
    - build-t5x-ff
  strategy:
    fail-fast: false
    max-parallel: 1
    matrix:
      base_tag: ${{ fromJson(needs.metadata.outputs.TAGS_BETWEEN) }}
  uses: ./.github/workflows/_test_t5x.yaml
  with:
    T5X_IMAGE: "${{ needs.metadata.outputs.UPLD_IMAGE }}:${{ github.run_id }}-${{ matrix.base_tag }}-ff-t5x-to-${{ needs.metadata.outputs.BROKEN_DATE }}"
  secrets: inherit
# Collects the conclusion of every test-t5x-ff job of this run, renders them as
# a markdown table (step summary + step output) and updates the triage issue.
t5x-ff-summary:
  if: (success() || failure()) && needs.metadata.outputs.FRAMEWORK_BASE == 't5x'
  needs: [metadata, test-t5x-ff]
  runs-on: ubuntu-22.04
  permissions:
    actions: read  # to list this run's jobs through the REST API
    issues: write  # to update the triage issue
  steps:
    - name: Create table summarizing
      id: summary-table
      run: |
        set -ou pipefail

        # Fetches one page of this run's jobs from the REST API.
        get_jobs() {
          page=$1
          curl -s -H "Authorization: Bearer ${{ github.token }}" "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?page=$page"
        }

        page=1
        json_data=$(get_jobs $page)
        total_jobs=$(echo "$json_data" | jq -r '.total_count')
        # Jobs are paginated, so we need to aggregate them
        while [[ "$(echo "$json_data" | jq -r '.jobs | length')" -lt $total_jobs ]]; do
          page=$((page+1))
          json_data=$(jq '.jobs += input.jobs' <(echo "$json_data") <(get_jobs $page))
        done

        # One "<job name>\t<conclusion>" entry per matching test job.
        name_conclusion_array=()
        while IFS= read -r line; do
          name_conclusion_array+=("$line")
        done < <(echo "$json_data" | jq -r '.jobs[] | select(.name | startswith("test-t5x-ff") and contains("outcome")) | "\(.name)\t\(.conclusion)"')

        # The step shell runs with -e, so a failing substitution must be
        # caught with `||`; otherwise the script would stop before the
        # partial table is written out below.
        ret_code=0
        TABLE_MD=$(
        cat <<EOF
        | Rewind to | Test result | Image |
        | --- | --- | --- |
        EOF
          last_success_img_date=""
          i=0
          for tag in $(echo '${{ needs.metadata.outputs.TAGS_BETWEEN }}' | jq -r '.[]'); do
            # ':-' keeps `set -u` from aborting on a short array so the
            # alignment check below can report the mismatch.
            job_step_name=$(echo "${name_conclusion_array[$i]:-}" | cut -f1)
            conclusion=$(echo "${name_conclusion_array[$i]:-}" | cut -f2)
            if [[ ! "$job_step_name" == *$tag* ]]; then
              echo "The runs from the GH API (${name_conclusion_array[*]:-}) do not align with TAGS_BETWEEN=${{ needs.metadata.outputs.TAGS_BETWEEN }}" >&2
              exit 1
            fi
            echo "| $tag | $conclusion | ${{ needs.metadata.outputs.UPLD_IMAGE }}:${{ github.run_id }}-$tag-ff-t5x-to-${{ needs.metadata.outputs.BROKEN_DATE }} |"
            i=$((i+1))
            if [[ $conclusion == success ]]; then
              last_success_img_date=$tag
            fi
          done
          echo -e "| | failure <br> (assumed broken) | ${{ inputs.BROKEN_IMAGE }} (BROKEN_IMAGE) |\n"
          # A passing rewind means the framework sources are fine on an older
          # base image, so the regression is attributed to the base image.
          if [[ -n "$last_success_img_date" ]]; then
            echo "Found a working base nightly image (based on $last_success_img_date); issue likely lies in more recent base nightly images."
            echo "ASSIGNEE=${{ env.JAX_ASSIGNEE }}" >> "$GITHUB_OUTPUT"
          else
            echo "Cannot find a working base nightly image; issue likely lies in the ${{ needs.metadata.outputs.FRAMEWORK_BASE }} libraries."
            echo "ASSIGNEE=${{ env.T5X_ASSIGNEE }}" >> "$GITHUB_OUTPUT"
          fi
        ) || ret_code=$?

        # Writes to both the step summary and the github output
        echo 'TABLE_MD<<EOF' >> "$GITHUB_OUTPUT"
        echo "$TABLE_MD" | tee -a "$GITHUB_STEP_SUMMARY" >> "$GITHUB_OUTPUT"
        echo 'EOF' >> "$GITHUB_OUTPUT"
        exit $ret_code

    - uses: octokit/request-action@v2.x
      with:
        #route: POST /repos/{owner_and_repo}/issues
        route: PATCH /repos/{owner_and_repo}/issues/{issue_number}
        owner_and_repo: ${{ github.repository }}
        issue_number: 218 # DELETE ME
        # NOTE(review): the nested '|' is kept as a literal first line of the
        # value; presumably the action parses each input as YAML again, so it
        # marks a multi-line string. Confirm against the action's README.
        title: |
          |
          [Bot] ${{ needs.metadata.outputs.FRAMEWORK_BASE }} test failures on ${{ needs.metadata.outputs.BROKEN_DATE }}
        body: |
          |
          * Github Action run: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
          ${{ steps.summary-table.outputs.TABLE_MD }}
          ${{ env.STAKEHOLDERS }}
        assignee: ${{ steps.summary-table.outputs.ASSIGNEE }}
        labels: "[\"auto-triage\"]"
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
########
## PAX #
########
# Builds one fast-forwarded PAX image per tag in TAGS_BETWEEN and pushes it to
# UPLD_IMAGE. Only runs when the base image belongs to the pax repository.
build-pax-ff:
  needs: metadata
  if: needs.metadata.outputs.FRAMEWORK_BASE == 'pax'
  runs-on: [self-hosted, x86, small]
  strategy:
    # To enforce sequential execution, set to 1
    max-parallel: 2
    matrix:
      base_tag: ${{ fromJson(needs.metadata.outputs.TAGS_BETWEEN) }}
  steps:
    - name: Print environment variables
      run: env

    - name: Check out the repository under ${GITHUB_WORKSPACE}
      uses: actions/checkout@v3

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v2
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Set docker metadata
      id: meta
      uses: docker/metadata-action@v4
      with:
        images: |
          ${{ env.UPLD_IMAGE }}
        flavor: |
          latest=false
        tags: |
          type=raw,value=${{ github.run_id }}-${{ matrix.base_tag }}-ff-pax-to-${{ needs.metadata.outputs.BROKEN_DATE }}
        labels: |
          org.opencontainers.image.created=${{ needs.metadata.outputs.BROKEN_DATE }}

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v2
      with:
        driver-opts: |
          image=moby/buildkit:v0.10.6

    - name: "Build docker image: ${{ needs.metadata.outputs.UPLD_IMAGE }}:${{ github.run_id }}-${{ matrix.base_tag }}-ff-pax-to-${{ needs.metadata.outputs.BROKEN_DATE }}"
      uses: docker/build-push-action@v4
      with:
        context: .github/container
        push: true
        file: .github/container/Dockerfile.ff
        tags: ${{ steps.meta.outputs.tags }}
        labels: ${{ steps.meta.outputs.labels }}
        # Each matrix entry must start from its own nightly; passing
        # inputs.BASE_IMAGE here would make every build identical even though
        # the pushed tag names a different base_tag.
        build-args: |
          BASE_IMAGE=${{ needs.metadata.outputs.BASE_IMAGE_REPO }}:${{ matrix.base_tag }}
          BROKEN_IMAGE=${{ inputs.BROKEN_IMAGE }}
          REPO_DIRS=${{ inputs.REPO_DIRS }}

    # Mirrors the T5X build job so pushed PAX images are listed in the run summary.
    - name: Log image to Github Step Summary
      run: |
        cat <<EOF | tee -a "$GITHUB_STEP_SUMMARY"
        * ${{ needs.metadata.outputs.UPLD_IMAGE }}:${{ github.run_id }}-${{ matrix.base_tag }}-ff-pax-to-${{ needs.metadata.outputs.BROKEN_DATE }}
        EOF
# Calls the reusable PAX test workflow once per fast-forwarded image, one tag
# at a time, and keeps going when an individual tag fails.
test-pax-ff:
  needs:
    - metadata
    - build-pax-ff
  strategy:
    fail-fast: false
    max-parallel: 1
    matrix:
      base_tag: ${{ fromJson(needs.metadata.outputs.TAGS_BETWEEN) }}
  uses: ./.github/workflows/_test_pax.yaml
  with:
    PAX_IMAGE: "${{ needs.metadata.outputs.UPLD_IMAGE }}:${{ github.run_id }}-${{ matrix.base_tag }}-ff-pax-to-${{ needs.metadata.outputs.BROKEN_DATE }}"
  secrets: inherit
# Collects the conclusion of every test-pax-ff job of this run, renders them as
# a markdown table (step summary + step output) and updates the triage issue.
pax-ff-summary:
  if: (success() || failure()) && needs.metadata.outputs.FRAMEWORK_BASE == 'pax'
  needs: [metadata, test-pax-ff]
  runs-on: ubuntu-22.04
  permissions:
    actions: read  # to list this run's jobs through the REST API
    issues: write  # to update the triage issue
  steps:
    - name: Create table summarizing
      id: summary-table
      run: |
        set -ou pipefail

        # Fetches one page of this run's jobs from the REST API.
        get_jobs() {
          page=$1
          curl -s -H "Authorization: Bearer ${{ github.token }}" "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?page=$page"
        }

        page=1
        json_data=$(get_jobs $page)
        total_jobs=$(echo "$json_data" | jq -r '.total_count')
        # Jobs are paginated, so we need to aggregate them
        while [[ "$(echo "$json_data" | jq -r '.jobs | length')" -lt $total_jobs ]]; do
          page=$((page+1))
          json_data=$(jq '.jobs += input.jobs' <(echo "$json_data") <(get_jobs $page))
        done

        # One "<job name>\t<conclusion>" entry per matching test job.
        name_conclusion_array=()
        while IFS= read -r line; do
          name_conclusion_array+=("$line")
        done < <(echo "$json_data" | jq -r '.jobs[] | select(.name | startswith("test-pax-ff") and contains("outcome")) | "\(.name)\t\(.conclusion)"')

        # The step shell runs with -e, so a failing substitution must be
        # caught with `||`; otherwise the script would stop before the
        # partial table is written out below.
        ret_code=0
        TABLE_MD=$(
        cat <<EOF
        | Rewind to | Test result | Image |
        | --- | --- | --- |
        EOF
          last_success_img_date=""
          i=0
          for tag in $(echo '${{ needs.metadata.outputs.TAGS_BETWEEN }}' | jq -r '.[]'); do
            # ':-' keeps `set -u` from aborting on a short array so the
            # alignment check below can report the mismatch.
            job_step_name=$(echo "${name_conclusion_array[$i]:-}" | cut -f1)
            conclusion=$(echo "${name_conclusion_array[$i]:-}" | cut -f2)
            if [[ ! "$job_step_name" == *$tag* ]]; then
              echo "The runs from the GH API (${name_conclusion_array[*]:-}) do not align with TAGS_BETWEEN=${{ needs.metadata.outputs.TAGS_BETWEEN }}" >&2
              exit 1
            fi
            echo "| $tag | $conclusion | ${{ needs.metadata.outputs.UPLD_IMAGE }}:${{ github.run_id }}-$tag-ff-pax-to-${{ needs.metadata.outputs.BROKEN_DATE }} |"
            i=$((i+1))
            if [[ $conclusion == success ]]; then
              last_success_img_date=$tag
            fi
          done
          echo -e "| | failure <br> (assumed broken) | ${{ inputs.BROKEN_IMAGE }} (BROKEN_IMAGE) |\n"
          # A passing rewind means the framework sources are fine on an older
          # base image, so the regression is attributed to the base image.
          if [[ -n "$last_success_img_date" ]]; then
            echo "Found a working base nightly image (based on $last_success_img_date); issue likely lies in more recent base nightly images."
            echo "ASSIGNEE=${{ env.JAX_ASSIGNEE }}" >> "$GITHUB_OUTPUT"
          else
            echo "Cannot find a working base nightly image; issue likely lies in the ${{ needs.metadata.outputs.FRAMEWORK_BASE }} libraries."
            echo "ASSIGNEE=${{ env.PAX_ASSIGNEE }}" >> "$GITHUB_OUTPUT"
          fi
        ) || ret_code=$?

        # Writes to both the step summary and the github output
        echo 'TABLE_MD<<EOF' >> "$GITHUB_OUTPUT"
        echo "$TABLE_MD" | tee -a "$GITHUB_STEP_SUMMARY" >> "$GITHUB_OUTPUT"
        echo 'EOF' >> "$GITHUB_OUTPUT"
        exit $ret_code

    - uses: octokit/request-action@v2.x
      with:
        #route: POST /repos/{owner_and_repo}/issues
        route: PATCH /repos/{owner_and_repo}/issues/{issue_number}
        owner_and_repo: ${{ github.repository }}
        issue_number: 215 # DELETE ME
        # NOTE(review): the nested '|' is kept as a literal first line of the
        # value; presumably the action parses each input as YAML again, so it
        # marks a multi-line string. Confirm against the action's README.
        title: |
          |
          [Bot] ${{ needs.metadata.outputs.FRAMEWORK_BASE }} test failures on ${{ needs.metadata.outputs.BROKEN_DATE }}
        body: |
          |
          * Github Action run: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
          ${{ steps.summary-table.outputs.TABLE_MD }}
          ${{ env.STAKEHOLDERS }}
        assignee: ${{ steps.summary-table.outputs.ASSIGNEE }}
        labels: "[\"auto-triage\"]"
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}