-
Notifications
You must be signed in to change notification settings - Fork 56
160 lines (152 loc) · 6.82 KB
/
gpu-bench-merge-regression.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# Run GPU regression check only when attempting to merge, shown as skipped status check beforehand
name: GPU merge group regression check

# `pull_request` is listed only so that the check is reported on PRs; the job's
# `if:` condition skips it for every listed PR event type. The benchmark itself
# runs on `merge_group` events.
on:
  pull_request:
    types: [opened, synchronize, reopened, ready_for_review]
    branches: [main]
  merge_group:

# One run per workflow + head ref (falling back to the run id when `head_ref`
# is empty); a newer run in the same group cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  # Run comparative benchmark against main, open issue on regression
  gpu-benchmark:
    # None of the `pull_request` types this workflow subscribes to is `enqueued`,
    # so on PR events this job is skipped; it runs for every other event.
    if: github.event_name != 'pull_request' || github.event.action == 'enqueued'
    name: GPU bench regression check
    runs-on: [self-hosted, gpu-bench]
    steps:
      # Check out the shared CI repo first; the two local actions below are
      # referenced by path (`./.github/actions/...`) from that checkout.
      - uses: actions/checkout@v4
        with:
          repository: lurk-lab/ci-workflows
      - uses: ./.github/actions/gpu-setup
        with:
          gpu-framework: 'cuda'
      - uses: ./.github/actions/ci-env
      # Check out this repository at the triggering ref
      - uses: actions/checkout@v4
      # Install dependencies
      - uses: dtolnay/rust-toolchain@stable
      - uses: Swatinem/rust-cache@v2
      - uses: taiki-e/install-action@v2
        with:
          tool: just@1.22
      - name: Install criterion
        run: |
          cargo install cargo-criterion
          cargo install criterion-table
      # Export values used by later steps:
      #   BASE_COMMIT - base SHA of the merge group
      #   GPU_NAME    - GPU name from the last line of `nvidia-smi` output
      #   GPU_ID      - last whitespace-separated word of GPU_NAME
      - name: Set bench output format and base SHA
        run: |
          echo "LURK_BENCH_OUTPUT=commit-comment" | tee -a $GITHUB_ENV
          echo "BASE_COMMIT=${{ github.event.merge_group.base_sha }}" | tee -a $GITHUB_ENV
          GPU_NAME=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader,nounits | tail -n1)
          echo "GPU_ID=$(echo $GPU_NAME | awk '{ print $NF }')" | tee -a $GITHUB_ENV
          echo "GPU_NAME=$GPU_NAME" | tee -a $GITHUB_ENV
      # Checkout gh-pages to check for cached bench result
      - name: Checkout gh-pages
        uses: actions/checkout@v4
        with:
          ref: gh-pages
          path: gh-pages
      # A cached result is keyed by base commit and GPU id. When found, it is
      # copied to the workspace root under the name the later steps expect.
      - name: Check for cached bench result
        id: cached-bench
        run: |
          if [ -f "fibonacci-${{ env.BASE_COMMIT }}-${{ env.GPU_ID }}.json" ]
          then
            echo "cached=true" | tee -a $GITHUB_OUTPUT
            cp fibonacci-${{ env.BASE_COMMIT }}-${{ env.GPU_ID }}.json ../fibonacci-${{ env.BASE_COMMIT }}.json
          else
            echo "cached=false" | tee -a $GITHUB_OUTPUT
          fi
        working-directory: ${{ github.workspace }}/gh-pages
      # Checkout base branch for comparative bench
      - uses: actions/checkout@v4
        if: steps.cached-bench.outputs.cached == 'false'
        with:
          ref: main
          path: main
      # Copy the script so the base can bench with the same parameters
      - name: Run GPU bench on base branch
        if: steps.cached-bench.outputs.cached == 'false'
        run: |
          # Copy justfile & env to main, overwriting existing config with that of PR branch
          cp ../benches/justfile ../benches/bench.env .
          # Run benchmark
          just gpu-bench-ci fibonacci
          # Copy bench output to PR branch
          cp fibonacci-${{ env.BASE_COMMIT }}.json ..
        working-directory: ${{ github.workspace }}/main
      - name: Run GPU bench on PR branch
        run: |
          just gpu-bench-ci fibonacci
          cp fibonacci-${{ github.sha }}.json ..
        working-directory: ${{ github.workspace }}/benches
      # Each `sed` below inserts one line before every line of tables.toml that
      # consists solely of `"""`.
      - name: copy the benchmark template and prepare it with data
        run: |
          cp .github/tables.toml .
          # Get CPU model
          CPU_MODEL=$(grep '^model name' /proc/cpuinfo | head -1 | awk -F ': ' '{ print $2 }')
          # Get vCPU count
          NUM_VCPUS=$(nproc --all)
          # Get total RAM in GB
          TOTAL_RAM=$(grep MemTotal /proc/meminfo | awk '{$2=$2/(1024^2); print int($2), "GB RAM";}')
          WORKFLOW_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          # Use conditionals to ensure that only non-empty variables are inserted
          [[ ! -z "${{ env.GPU_NAME }}" ]] && sed -i "/^\"\"\"$/i ${{ env.GPU_NAME }}" tables.toml
          [[ ! -z "$CPU_MODEL" ]] && sed -i "/^\"\"\"$/i $CPU_MODEL" tables.toml
          [[ ! -z "$NUM_VCPUS" ]] && sed -i "/^\"\"\"$/i $NUM_VCPUS vCPUs" tables.toml
          [[ ! -z "$TOTAL_RAM" ]] && sed -i "/^\"\"\"$/i $TOTAL_RAM" tables.toml
          sed -i "/^\"\"\"$/i Workflow run: $WORKFLOW_URL" tables.toml
          echo "WORKFLOW_URL=$WORKFLOW_URL" | tee -a $GITHUB_ENV
        working-directory: ${{ github.workspace }}
      # Create a `criterion-table` and write in commit comment
      - name: Run `criterion-table`
        run: cat fibonacci-${{ env.BASE_COMMIT }}.json fibonacci-${{ github.sha }}.json | criterion-table > BENCHMARKS.md
      - name: Write bench on commit comment
        uses: peter-evans/commit-comment@v3
        with:
          body-path: BENCHMARKS.md
      # Check for a slowdown >= `$LURK_BENCH_NOISE_THRESHOLD` (fallback is 10%/1.1x). If so, open an issue but don't block merge
      # Since we are parsing for slowdowns, we simply add 1 to the noise threshold decimal to get the regression factor
      - name: Check for perf regression
        id: regression-check
        run: |
          REGRESSIONS=$(grep -o '[0-9.]*x slower' BENCHMARKS.md | cut -d 'x' -f1)
          echo $REGRESSIONS
          if [ ! -z "${{ env.LURK_BENCH_NOISE_THRESHOLD}}" ]; then
            REGRESSION_FACTOR=$(echo "${{ env.LURK_BENCH_NOISE_THRESHOLD }}+1" | bc)
          else
            REGRESSION_FACTOR=1.1
          fi
          # Export the threshold (as a percentage) BEFORE the loop: the loop exits
          # the script on the first regression, and the issue step needs this value
          # precisely in that case.
          echo "NOISE_THRESHOLD=$(echo "($REGRESSION_FACTOR-1)*100" | bc)" | tee -a $GITHUB_ENV
          for r in $REGRESSIONS
          do
            if (( $(echo "$r >= $REGRESSION_FACTOR" | bc -l) ))
            then
              exit 1
            fi
          done
        # A regression must not fail the job; later steps branch on this step's `outcome`
        continue-on-error: true
      # Not possible to use ${{ github.event.number }} with the `merge_group` trigger
      - name: Get PR number from merge branch
        run: |
          echo "PR_NUMBER=$(echo ${{ github.event.merge_group.head_ref }} | sed -e 's/.*pr-\(.*\)-.*/\1/')" | tee -a $GITHUB_ENV
      # Open an issue only when the regression check actually failed
      - uses: JasonEtco/create-an-issue@v2
        if: steps.regression-check.outcome == 'failure'
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ env.PR_NUMBER }}
          GIT_SHA: ${{ github.sha }}
          WORKFLOW_URL: ${{ env.WORKFLOW_URL }}
          NOISE_THRESHOLD: ${{ env.NOISE_THRESHOLD }}
        with:
          filename: .github/PERF_REGRESSION.md
      # Drop the base result and rename the new result to the
      # `fibonacci-<sha>-<gpu id>.json` form that the cache lookup above searches for
      - name: Remove old main bench
        run: |
          rm fibonacci-${{ env.BASE_COMMIT }}.json
          mv fibonacci-${{ github.sha }}.json fibonacci-${{ github.sha }}-${{ env.GPU_ID }}.json
        working-directory: ${{ github.workspace }}
      - name: Commit bench result to `gh-pages` branch if no regression
        if: steps.regression-check.outcome != 'failure'
        uses: stefanzweifel/git-auto-commit-action@v5
        with:
          branch: gh-pages
          commit_message: '[automated] GPU Benchmark from PR #${{ env.PR_NUMBER }}'
          file_pattern: 'fibonacci-${{ github.sha }}-${{ env.GPU_ID }}.json'