From cbca97df85f90ecc6175f0bc1692ba2a171e0758 Mon Sep 17 00:00:00 2001
From: Michael Sarahan
Date: Tue, 29 Oct 2024 13:15:42 -0500
Subject: [PATCH] add telemetry

Add OpenTelemetry tracing to the pull-request workflow:

- telemetry-setup records the start time, writes the mTLS certificate
  files, obtains a traceparent and starts the root span.
- The shared-workflow build and test jobs depend on telemetry-setup and
  are handed its traceparent and OTLP endpoint.
- final_span_update reports the completion time for the root span once
  pr-builder has finished, whether or not it succeeded.

---
 .github/workflows/pr.yaml | 230 +++++++++++++++++++++++++++++++-------
 1 file changed, 187 insertions(+), 43 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 0a33f5488a6..05ae8b763ef 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -9,7 +9,64 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+env:
+  # TODO: put this in a shared org-wide secret?
+  OTEL_SERVICE_NAME: 'pr-cugraph'
+  # TODO: this should be set as an org-wide variable
+  OTEL_EXPORTER_OTLP_ENDPOINT: https://tempo.gha-runners.nvidia.com:4318
+  # These are the paths that the secrets held in GitHub are written to. The files do not
+  #   exist unless a step explicitly writes them.
+  # The purpose of setting the environment variables is to tell OpenTelemetry tools where to find them.
+  #   We abuse them a bit by also using them as the write destination for the certificate files.
+  OTEL_EXPORTER_OTLP_CERTIFICATE: "/tmp/certs/ca.crt"
+  OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE: "/tmp/certs/client.crt"
+  OTEL_EXPORTER_OTLP_CLIENT_KEY: "/tmp/certs/client.key"
+  OTEL_EXPORTER_OTLP_PROTOCOL: "http/protobuf"
+  OTEL_EXPORTER_OTLP_HEADERS: ${{ secrets.OTEL_EXPORTER_OTLP_HEADERS }}
+
 jobs:
+  # Records the start time, re-exports the OTLP endpoint and service name as job
+  # outputs, and starts the root span whose traceparent is handed to the other jobs.
+  telemetry-setup:
+    runs-on: ubuntu-latest
+    outputs:
+      start_time: ${{ steps.timestamp.outputs.START_TIME }}
+      traceparent: ${{ steps.telemetry-setup.outputs.traceparent }}
+      endpoint: ${{ steps.var-reexports.outputs.endpoint }}
+      top_level_service_name: ${{ steps.var-reexports.outputs.service_name }}
+    steps:
+      - name: Get starting timestamp
+        id: timestamp
+        run:
+          echo "START_TIME=$(date +%s.%N)" >> ${GITHUB_OUTPUT}
+      - name: Echo endpoint to make it available to shared workflows
+        id: var-reexports
+        run: |
+          echo endpoint="${OTEL_EXPORTER_OTLP_ENDPOINT}" >> ${GITHUB_OUTPUT}
+          echo service_name="${OTEL_SERVICE_NAME}" >> ${GITHUB_OUTPUT}
+      - name: Write certificate files for mTLS
+        # The heredoc delimiter is quoted so the shell writes the secret text
+        # verbatim instead of expanding any $ or backtick it may contain.
+        run: |
+          mkdir -p /tmp/certs
+          cat << 'EOF' > "${OTEL_EXPORTER_OTLP_CERTIFICATE}"
+          ${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }}
+          EOF
+          cat << 'EOF' > "${OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE}"
+          ${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }}
+          EOF
+          cat << 'EOF' > "${OTEL_EXPORTER_OTLP_CLIENT_KEY}"
+          ${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}
+          EOF
+      - name: Telemetry setup
+        id: telemetry-setup
+        uses: rapidsai/shared-actions/telemetry-traceparent@add-telemetry
+      - name: Start root span
+        uses: rapidsai/shared-actions/telemetry-create-span@add-telemetry
+        with:
+          name: "root span"
+          traceparent: ${{steps.telemetry-setup.outputs.traceparent}}
+          start_time: ${{steps.timestamp.outputs.START_TIME}}
   pr-builder:
     needs:
       - changed-files
@@ -21,6 +78,7 @@ jobs:
       - conda-python-build
       - conda-python-tests
       - docs-build
+      - telemetry-setup
       - wheel-build-pylibcugraph
       - wheel-tests-pylibcugraph
       - wheel-build-cugraph
@@ -35,13 +93,13 @@ jobs:
       - wheel-tests-cugraph-equivariant
       - devcontainer
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@add-telemetry
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
   changed-files:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@add-telemetry
     with:
       files_yaml: |
         test_cpp:
@@ -71,48 +129,64 @@ jobs:
           - '!notebooks/**'
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12
+    needs: telemetry-setup
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@add-telemetry
     with:
       enable_check_generated_files: false
+      ignored_pr_jobs: "final_span_update"
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   conda-cpp-build:
-    needs: checks
+    needs:
+      - checks
+      - telemetry-setup
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@add-telemetry
     with:
       build_type: pull-request
       node_type: cpu32
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   conda-cpp-tests:
-    needs: [conda-cpp-build, changed-files]
+    needs: [conda-cpp-build, changed-files, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@add-telemetry
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp
     with:
       build_type: pull-request
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   conda-cpp-checks:
-    needs: conda-cpp-build
+    needs: [conda-cpp-build, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@add-telemetry
     with:
       build_type: pull-request
       enable_check_symbols: true
       symbol_exclusions: (cugraph::ops|hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel)
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   conda-python-build:
-    needs: conda-cpp-build
+    needs: [conda-cpp-build, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@add-telemetry
     with:
       build_type: pull-request
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   conda-python-tests:
-    needs: [conda-python-build, changed-files]
+    needs: [conda-python-build, changed-files, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@add-telemetry
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   conda-notebook-tests:
-    needs: [conda-python-build, changed-files]
+    needs: [conda-python-build, changed-files, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@add-telemetry
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks
     with:
       build_type: pull-request
@@ -120,20 +194,24 @@ jobs:
       arch: "amd64"
       container_image: "rapidsai/ci-conda:cuda11.8.0-ubuntu22.04-py3.10"
       run_script: "ci/test_notebooks.sh"
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   docs-build:
-    needs: conda-python-build
+    needs: [conda-python-build, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@add-telemetry
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
       container_image: "rapidsai/ci-conda:cuda11.8.0-ubuntu22.04-py3.10"
       run_script: "ci/build_docs.sh"
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   wheel-build-pylibcugraph:
-    needs: checks
+    needs: [checks, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@add-telemetry
     with:
       build_type: pull-request
       script: ci/build_wheel_pylibcugraph.sh
@@ -141,103 +219,169 @@ jobs:
       extra-repo-sha: branch-24.12
       extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY
       node_type: cpu32
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   wheel-tests-pylibcugraph:
-    needs: [wheel-build-pylibcugraph, changed-files]
+    needs: [wheel-build-pylibcugraph, changed-files, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@add-telemetry
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel_pylibcugraph.sh
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   wheel-build-cugraph:
-    needs: wheel-tests-pylibcugraph
+    needs: [wheel-tests-pylibcugraph, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@add-telemetry
     with:
       build_type: pull-request
       script: ci/build_wheel_cugraph.sh
       extra-repo: rapidsai/cugraph-ops
       extra-repo-sha: branch-24.12
       extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   wheel-tests-cugraph:
-    needs: [wheel-build-cugraph, changed-files]
+    needs: [wheel-build-cugraph, changed-files, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@add-telemetry
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel_cugraph.sh
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   wheel-build-nx-cugraph:
-    needs: wheel-tests-pylibcugraph
+    needs:
+      - telemetry-setup
+      - wheel-tests-pylibcugraph
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@add-telemetry
     with:
       build_type: pull-request
       script: ci/build_wheel_nx-cugraph.sh
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   wheel-tests-nx-cugraph:
-    needs: [wheel-build-nx-cugraph, changed-files]
+    needs: [wheel-build-nx-cugraph, changed-files, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@add-telemetry
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel_nx-cugraph.sh
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   wheel-build-cugraph-dgl:
-    needs: wheel-tests-cugraph
+    needs: [wheel-tests-cugraph, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@add-telemetry
     with:
       build_type: pull-request
       script: ci/build_wheel_cugraph-dgl.sh
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   wheel-tests-cugraph-dgl:
-    needs: [wheel-build-cugraph-dgl, changed-files]
+    needs: [wheel-build-cugraph-dgl, changed-files, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@add-telemetry
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel_cugraph-dgl.sh
       matrix_filter: map(select(.ARCH == "amd64"))
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   wheel-build-cugraph-pyg:
-    needs: wheel-tests-cugraph
+    needs: [wheel-tests-cugraph, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@add-telemetry
     with:
       build_type: pull-request
       script: ci/build_wheel_cugraph-pyg.sh
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   wheel-tests-cugraph-pyg:
-    needs: [wheel-build-cugraph-pyg, changed-files]
+    needs: [wheel-build-cugraph-pyg, changed-files, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@add-telemetry
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel_cugraph-pyg.sh
       matrix_filter: map(select(.ARCH == "amd64"))
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   wheel-build-cugraph-equivariant:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    needs: telemetry-setup
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@add-telemetry
     with:
       build_type: pull-request
       script: ci/build_wheel_cugraph-equivariant.sh
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   wheel-tests-cugraph-equivariant:
-    needs: [wheel-build-cugraph-equivariant, changed-files]
+    needs: [wheel-build-cugraph-equivariant, changed-files, telemetry-setup]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@add-telemetry
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel_cugraph-equivariant.sh
       matrix_filter: map(select(.ARCH == "amd64"))
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@add-telemetry
+    needs: telemetry-setup
     with:
       arch: '["amd64"]'
       cuda: '["12.5"]'
       node_type: cpu32
       extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY
+      default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+      traceparent: ${{ needs.telemetry-setup.outputs.traceparent }}
       build_command: |
         sccache -z;
         build-all --verbose -j$(nproc --ignore=1) -DBUILD_CUGRAPH_MG_TESTS=ON;
         sccache -s;
+  # Reports the completion time for the root span. The job-level `if: always()`
+  # keeps it from being skipped when pr-builder fails.
+  final_span_update:
+    runs-on: ubuntu-latest
+    needs: [pr-builder, telemetry-setup]
+    if: always()
+    steps:
+      - name: Get final timestamp
+        id: timestamp
+        run:
+          echo "FINAL_TIME=$(date +%s.%N)" >> ${GITHUB_OUTPUT}
+      # Main purpose of this traceparent line here is to ensure that otel-cli is installed.
+      - name: Get job traceparent
+        uses: rapidsai/shared-actions/telemetry-traceparent@add-telemetry
+      - name: Write certificate files for mTLS
+        # The heredoc delimiter is quoted so the shell writes the secret text
+        # verbatim instead of expanding any $ or backtick it may contain.
+        run: |
+          mkdir -p /tmp/certs
+          cat << 'EOF' > "${OTEL_EXPORTER_OTLP_CERTIFICATE}"
+          ${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }}
+          EOF
+          cat << 'EOF' > "${OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE}"
+          ${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }}
+          EOF
+          cat << 'EOF' > "${OTEL_EXPORTER_OTLP_CLIENT_KEY}"
+          ${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}
+          EOF
+      - name: Update root span with final completion time
+        if: always()
+        uses: rapidsai/shared-actions/telemetry-create-span@add-telemetry
+        with:
+          service: ${{needs.telemetry-setup.outputs.top_level_service_name}}
+          name: "end-of-job update"
+          default_endpoint: "${{needs.telemetry-setup.outputs.endpoint}}"
+          traceparent: ${{needs.telemetry-setup.outputs.traceparent}}
+          start_time: ${{needs.telemetry-setup.outputs.start_time}}
+          end_time: ${{steps.timestamp.outputs.FINAL_TIME}}