Skip to content

Commit

Permalink
Merge branch 'main' of ssh://gitlab-master.nvidia.com:12051/dl/jax/jax-toolbox-mirror into pax-documentation-squashed
Browse files (browse the repository at this point in the history)
  • Loading branch information
ashors1 committed Jul 11, 2023
2 parents 9597382 + 33bbecd commit b63a89c
Show file tree
Hide file tree
Showing 16 changed files with 438 additions and 35 deletions.
4 changes: 4 additions & 0 deletions .github/container/test-jax.sh
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,10 @@ set -ex
## Install dependencies

pip install -r `jax_source_dir`/build/test-requirements.txt
# Reason for manually installing matplotlib:
# https://github.com/google/jax/commit/6b76937c530bd8ee185cc9e1991b3696bd10e831
# https://github.com/google/jax/blob/6bc74d2a9874e1fe93a45191bb829c07dfee04fa/tests/BUILD#L134
pip install matplotlib

## Run tests

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/_build_jax.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ jobs:
build:
outputs:
DOCKER_TAGS: ${{ steps.meta.outputs.tags }}
runs-on: [self-hosted, builder]
runs-on: [self-hosted, large-builder]
steps:
- name: Print environment variables
run: env
Expand Down Expand Up @@ -118,4 +118,4 @@ jobs:
REPO_JAX=${{ inputs.REPO_JAX }}
REPO_XLA=${{ inputs.REPO_XLA }}
REF_JAX=${{ inputs.REF_JAX }}
REF_XLA=${{ inputs.REF_XLA }}
REF_XLA=${{ inputs.REF_XLA }}
15 changes: 12 additions & 3 deletions .github/workflows/_publish_badge.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,15 @@ jobs:
gist_id: gistId,
files: {
[filename]: { content },
...Object.fromEntries(Object.entries(gist.files).filter(([name]) => name !== filename))
}
});
...Object.fromEntries(
Object.entries(gist.files)
.filter(([name]) => name !== filename)
.map(([name, value]) => [
name,
Object.fromEntries(
Object.entries(value).filter(([nestedKey]) => nestedKey === "content")
),
])
),
},
});
12 changes: 11 additions & 1 deletion .github/workflows/_test_pax.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,16 @@ jobs:
output/ \
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true
# Record the SLURM state and exit code reported by the `submit` step in
# output/<TEST_CASE_NAME>-status.json. Both values are written as JSON strings
# under the keys "state" and "exitcode".
- name: Write SLURM job status to file
shell: bash -x -e {0}
run: |
python << EOF
import json
with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f:
dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
json.dump(dump, f)
EOF
- name: Upload training logs as artifacts
uses: actions/upload-artifact@v3
with:
Expand All @@ -149,7 +159,7 @@ jobs:
## PAX MGMN training
[view metrics](http://${{ vars.HOSTNAME_TENSORBOARD }}:6006/#scalars&regexInput=${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per)
[view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per)
EOF
) | tee $GITHUB_STEP_SUMMARY
27 changes: 24 additions & 3 deletions .github/workflows/_test_rosetta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,23 @@ on:
description: 'Rosetta image build by NVIDIA/JAX-Toolbox'
required: true
default: 'ghcr.io/nvidia/rosetta-t5x:latest'
outputs:
TEST_ARTIFACT_NAME:
description: 'Name of the unit test artifact for downstream workflows'
value: ${{ jobs.rosetta-tests.outputs.TEST_ARTIFACT_NAME }}

env:
TEST_ARTIFACT_NAME: test-logs

jobs:
rosetta-tests:
strategy:
matrix:
MARKERS: ["", "-m integration"]
TEST_TYPE: ["unit", "integration"]
fail-fast: false
runs-on: [self-hosted, compute, V100]
outputs:
TEST_ARTIFACT_NAME: ${{ env.TEST_ARTIFACT_NAME }}
steps:
- name: Print environment variables
run: |
Expand All @@ -35,9 +44,21 @@ jobs:
shell: bash -x -e {0}
run: |
docker pull ${{ inputs.ROSETTA_IMAGE }}
docker tag ${{ inputs.ROSETTA_IMAGE }} rosetta:latest
- name: Run Rosetta tests w/ docker
shell: bash -x -e {0}
shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log rosetta:latest bash -x -e /cmd.sh
run: |
docker run --gpus all ${{ inputs.ROSETTA_IMAGE }} sh -c "pip install '/opt/rosetta[test]' && pytest /opt/rosetta ${{ matrix.MARKERS }}"
EXTRA_ARGS=""
if [[ ${{ matrix.TEST_TYPE }} == integration ]]; then
EXTRA_ARGS="-m integration"
fi
ROSETTA_PATH=$(dirname $(python -c "import rosetta; print(*rosetta.__path__)"))
pip install "${ROSETTA_PATH}[test]" pytest-reportlog
pytest --report-log=/log/${{ matrix.TEST_TYPE }}-report.jsonl ${ROSETTA_PATH} ${EXTRA_ARGS} || true
- name: Upload unit test json logs
uses: actions/upload-artifact@v3
with:
name: ${{ env.TEST_ARTIFACT_NAME }}
path: /log/${{ matrix.TEST_TYPE }}-report.jsonl
2 changes: 1 addition & 1 deletion .github/workflows/_test_t5x.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ jobs:
## T5X MGMN training
[view metrics](http://${{ vars.HOSTNAME_TENSORBOARD }}:6006/#scalars&regexInput=${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per)
[view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per)
EOF
) | tee $GITHUB_STEP_SUMMARY
34 changes: 32 additions & 2 deletions .github/workflows/nightly-pax-test-mgmn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ jobs:
outputs:
BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }}
PAX_IMAGE: ${{ steps.date.outputs.PAX_IMAGE }}
PUBLISH: ${{ steps.date.outputs.PUBLISH }}
steps:
- name: Set metadata
id: date
Expand All @@ -44,6 +45,7 @@ jobs:
PAX_IMAGE=${{ inputs.PAX_IMAGE }}
PAX_IMAGE=${PAX_IMAGE:-${{ env.DEFAULT_PAX_IMAGE }}}
echo "PAX_IMAGE=${PAX_IMAGE}" >> $GITHUB_OUTPUT
echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT
run-jobs:
needs: metadata
Expand Down Expand Up @@ -95,13 +97,41 @@ jobs:
## PAX MGMN nightly training: ${{ needs.metadata.outputs.BUILD_DATE }}
[view metrics](http://${{ vars.HOSTNAME_TENSORBOARD }}:6006/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per)
[view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per)
EOF
) | tee $GITHUB_STEP_SUMMARY
# Publishes the "Completion" badge for the nightly PAX MGMN run.
# The script reads every *-status.json file matched by EXIT_STATUSES (objects
# with "state" and "exitcode" fields) and reports "<passed>/<total> passed".
publish-completion:
  needs: [metadata, run-jobs]
  uses: ./.github/workflows/_publish_badge.yaml
  if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
  secrets: inherit
  with:
    ENDPOINT_FILENAME: 'pax-test-completion-status.json'
    PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
    SCRIPT: |
      EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*TP*PP/*-status.json"
      PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
      FAILED_TESTS=$(jq -r '. | select ((.state == "FAILED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
      TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l)
      echo "Test statuses:"
      jq -rc 'input_filename,.' $EXIT_STATUSES
      # Green only when at least one status file exists and none failed.
      # The previous extra "|| PASSED == TOTAL" clause was true for 0 == 0 and
      # painted the badge green when no status files were found at all.
      if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then
        BADGE_COLOR=brightgreen
      elif [[ $PASSED_TESTS -eq 0 ]]; then
        BADGE_COLOR=red
      else
        BADGE_COLOR=yellow
      fi
      echo "LABEL='Completion'" >> $GITHUB_OUTPUT
      echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT
      echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
# Fails this run on purpose when it was triggered by a `workflow_run` event
# whose upstream workflow concluded with 'failure'.
# NOTE(review): the trailing `github.event_name != 'workflow_dispatch'` clause
# is already implied by the `github.event_name == 'workflow_run'` check.
if-upstream-failed:
runs-on: ubuntu-latest
if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch'
steps:
- run: echo 'Upstream workflow failed, aborting run' && exit 1
- run: echo 'Upstream workflow failed, aborting run' && exit 1
63 changes: 62 additions & 1 deletion .github/workflows/nightly-rosetta-pax-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ jobs:
BUILD_DATE: ${{ steps.meta-vars.outputs.BUILD_DATE }}
BASE_LIBRARY: ${{ steps.meta-vars.outputs.BASE_LIBRARY }}
BASE_IMAGE: ${{ steps.meta-vars.outputs.BASE_IMAGE }}
PUBLISH: ${{ steps.meta-vars.outputs.PUBLISH }}
steps:
- name: Set build metadata
id: meta-vars
Expand All @@ -50,6 +51,7 @@ jobs:
echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT
echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT
echo "BASE_IMAGE=${BASE_IMAGE}" >> $GITHUB_OUTPUT
echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT
build:
if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
Expand All @@ -61,6 +63,26 @@ jobs:
BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }}
secrets: inherit

# Publishes the "nightly" build badge for the Rosetta PAX image:
# "passing" (brightgreen) when the `build` job succeeded, otherwise
# "failing" (red).
publish-build:
  needs: [metadata, build]
  uses: ./.github/workflows/_publish_badge.yaml
  # `&&` binds tighter than `||`, so the trigger check is parenthesised as a
  # whole. Without that, the `success() || failure()` guard was skipped for
  # every `workflow_dispatch` run (including cancelled ones).
  if: ( success() || failure() ) && ( (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' )
  secrets: inherit
  with:
    ENDPOINT_FILENAME: 'rosetta-pax-build-status.json'
    PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
    SCRIPT: |
      if [[ ${{ needs.build.result }} == "success" ]]; then
        BADGE_COLOR=brightgreen
        MSG=passing
      else
        BADGE_COLOR=red
        MSG=failing
      fi
      echo "LABEL='nightly'" >> $GITHUB_OUTPUT
      echo "MESSAGE='${MSG}'" >> $GITHUB_OUTPUT
      echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
test:
if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
needs: build
Expand All @@ -69,7 +91,46 @@ jobs:
ROSETTA_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }}
secrets: inherit

publish:
# Publishes the "V100" test badge for the Rosetta PAX image.
# The script reads the *.jsonl report files under the test artifact directory
# and counts "call"-phase TestReport entries by outcome (passed/skipped/failed).
publish-test:
  needs: [metadata, build, test]
  uses: ./.github/workflows/_publish_badge.yaml
  # `&&` binds tighter than `||`, so the trigger check is parenthesised as a
  # whole. Without that, the `success() || failure()` guard was skipped for
  # every `workflow_dispatch` run (including cancelled ones).
  if: ( success() || failure() ) && ( (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' )
  secrets: inherit
  with:
    ENDPOINT_FILENAME: 'rosetta-pax-test-status.json'
    PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
    SCRIPT: |
      ARTIFACTS="${{ needs.test.outputs.TEST_ARTIFACT_NAME }}/*.jsonl"
      # Print the outcome of every "call"-phase test report, one per line.
      all_outcomes() {
        cat $ARTIFACTS | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome'
      }
      # Count "call"-phase test reports whose outcome contains $1.
      cnt_type() {
        cat $ARTIFACTS | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l
      }
      SKIPPED_TESTS=$(cnt_type skipped)
      FAILED_TESTS=$(cnt_type failed)
      PASSED_TESTS=$(cnt_type passed)
      TOTAL_TESTS=$(all_outcomes | wc -l)
      echo "Unit/Integration test breakdown:"
      all_outcomes | sort | uniq -c
      if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then
        BADGE_COLOR=brightgreen
      elif [[ $PASSED_TESTS -eq 0 ]]; then
        BADGE_COLOR=red
      else
        BADGE_COLOR=yellow
      fi
      echo "LABEL='V100'" >> $GITHUB_OUTPUT
      # Counts are only reported when the image build itself succeeded.
      if [[ ${{ needs.build.result }} == "success" ]]; then
        echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT
      else
        echo "MESSAGE='n/a'" >> $GITHUB_OUTPUT
      fi
      echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
publish-container:
if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)
needs: [metadata, build]
uses: ./.github/workflows/_publish_container.yaml
Expand Down
63 changes: 62 additions & 1 deletion .github/workflows/nightly-rosetta-t5x-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ jobs:
BUILD_DATE: ${{ steps.meta-vars.outputs.BUILD_DATE }}
BASE_LIBRARY: ${{ steps.meta-vars.outputs.BASE_LIBRARY }}
BASE_IMAGE: ${{ steps.meta-vars.outputs.BASE_IMAGE }}
PUBLISH: ${{ steps.meta-vars.outputs.PUBLISH }}
steps:
- name: Set build metadata
id: meta-vars
Expand All @@ -50,6 +51,7 @@ jobs:
echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT
echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT
echo "BASE_IMAGE=${BASE_IMAGE}" >> $GITHUB_OUTPUT
echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT
build:
if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
Expand All @@ -61,6 +63,26 @@ jobs:
BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }}
secrets: inherit

# Publishes the "nightly" build badge for the Rosetta T5X image:
# "passing" (brightgreen) when the `build` job succeeded, otherwise
# "failing" (red).
publish-build:
  needs: [metadata, build]
  uses: ./.github/workflows/_publish_badge.yaml
  # `&&` binds tighter than `||`, so the trigger check is parenthesised as a
  # whole. Without that, the `success() || failure()` guard was skipped for
  # every `workflow_dispatch` run (including cancelled ones).
  if: ( success() || failure() ) && ( (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' )
  secrets: inherit
  with:
    ENDPOINT_FILENAME: 'rosetta-t5x-build-status.json'
    PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
    SCRIPT: |
      if [[ ${{ needs.build.result }} == "success" ]]; then
        BADGE_COLOR=brightgreen
        MSG=passing
      else
        BADGE_COLOR=red
        MSG=failing
      fi
      echo "LABEL='nightly'" >> $GITHUB_OUTPUT
      echo "MESSAGE='${MSG}'" >> $GITHUB_OUTPUT
      echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
test:
if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
needs: build
Expand All @@ -69,7 +91,46 @@ jobs:
ROSETTA_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }}
secrets: inherit

publish:
# Publishes the "V100" test badge for the Rosetta T5X image.
# The script reads the *.jsonl report files under the test artifact directory
# and counts "call"-phase TestReport entries by outcome (passed/skipped/failed).
publish-test:
  needs: [metadata, build, test]
  uses: ./.github/workflows/_publish_badge.yaml
  # `&&` binds tighter than `||`, so the trigger check is parenthesised as a
  # whole. Without that, the `success() || failure()` guard was skipped for
  # every `workflow_dispatch` run (including cancelled ones).
  if: ( success() || failure() ) && ( (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' )
  secrets: inherit
  with:
    ENDPOINT_FILENAME: 'rosetta-t5x-test-status.json'
    PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
    SCRIPT: |
      ARTIFACTS="${{ needs.test.outputs.TEST_ARTIFACT_NAME }}/*.jsonl"
      # Print the outcome of every "call"-phase test report, one per line.
      all_outcomes() {
        cat $ARTIFACTS | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome'
      }
      # Count "call"-phase test reports whose outcome contains $1.
      cnt_type() {
        cat $ARTIFACTS | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l
      }
      SKIPPED_TESTS=$(cnt_type skipped)
      FAILED_TESTS=$(cnt_type failed)
      PASSED_TESTS=$(cnt_type passed)
      TOTAL_TESTS=$(all_outcomes | wc -l)
      echo "Unit/Integration test breakdown:"
      all_outcomes | sort | uniq -c
      if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then
        BADGE_COLOR=brightgreen
      elif [[ $PASSED_TESTS -eq 0 ]]; then
        BADGE_COLOR=red
      else
        BADGE_COLOR=yellow
      fi
      echo "LABEL='V100'" >> $GITHUB_OUTPUT
      # Counts are only reported when the image build itself succeeded.
      if [[ ${{ needs.build.result }} == "success" ]]; then
        echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT
      else
        echo "MESSAGE='n/a'" >> $GITHUB_OUTPUT
      fi
      echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
publish-container:
if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)
needs: [metadata, build]
uses: ./.github/workflows/_publish_container.yaml
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nightly-t5x-test-mgmn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ jobs:
## T5X MGMN nightly training: ${{ needs.metadata.outputs.BUILD_DATE }}
[view metrics](http://${{ vars.HOSTNAME_TENSORBOARD }}:6006/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per)
[view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per)
EOF
) | tee $GITHUB_STEP_SUMMARY
Expand Down
Loading

0 comments on commit b63a89c

Please sign in to comment.