fix: defaultRuntime and publishing workflow (#11)
justinthelaw authored Oct 8, 2024
1 parent 369249a commit 844fbd2
Showing 4 changed files with 82 additions and 20 deletions.
Empty file removed .github/.gitkeep
6 changes: 3 additions & 3 deletions .github/workflows/tag-and-release.yaml
@@ -29,7 +29,7 @@ jobs:

strategy:
matrix:
k3s_version: [v1.30.4-k3s1]
k3s_version: ["v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"]

permissions:
contents: read
@@ -57,7 +57,7 @@ jobs:

strategy:
matrix:
k3s_version: [v1.30.4-k3s1]
k3s_version: ["v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"]
cuda_version:
[
11.8.0-base-ubuntu22.04,
@@ -79,7 +79,7 @@ jobs:

- name: Publish the CUDA K3s image
run: |
uds run publish-image \
uds run publish-cuda-image \
--set VERSION="${{ matrix.k3s_version }}" \
--set CUDA_VERSION="${{ matrix.cuda_version }}" \
--no-progress
91 changes: 77 additions & 14 deletions docs/GPU.md
@@ -64,20 +64,6 @@ uds zarf package deploy oci://justinthelaw/uds-k3d:${VERSION}-cuda --confirm

<!-- x-release-please-end -->

#### Windows (WSL2)

The following additional K3s arg must be provided to the UDS task:

```bash
uds run default-cuda --set K3D_EXTRA_ARGS="--gpus=all"
```

If running from the upstream package, the command would look like this:

```bash
uds zarf package deploy oci://justinthelaw/uds-k3d:${VERSION}-cuda --set K3D_EXTRA_ARGS="--gpus=all" --confirm
```

### Tests

This repository includes two CUDA workload tests that can be executed:
@@ -151,3 +137,80 @@ When you reinstall or start a new GPU-dependent pod, the previous PID (process)

1. Scale the previous GPU-dependent pod deployment down to 0, as the current `RollingUpdate` strategy for vLLM relies on backup/secondary GPUs being available for a graceful turnover
2. Use `nvidia-smi` to check whether the process has been flushed upon Pod termination BEFORE deploying a new GPU-dependent pod, and, if it has not, use `kill -9 <PID>` to manually flush the process
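
As a concrete sketch of the two steps above (the deployment name and PID are illustrative placeholders, not values from this repository):

```bash
# Step 1: scale the previous GPU-dependent deployment down to zero replicas
uds zarf tools kubectl scale deployment <gpu-deployment> --replicas=0

# Step 2: confirm the terminated pod's process no longer appears in the
# nvidia-smi process list before deploying a new GPU-dependent pod
nvidia-smi

# If a stale PID is still listed, flush it manually
kill -9 <PID>
```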

#### macOS

UDS K3d's NVIDIA GPU support does not work on macOS.

#### Windows (WSL2)

The NVIDIA GPU Operator does not work on WSL2 as of version v24.3.0 (see [issue](https://github.com/NVIDIA/gpu-operator/issues/318)); however, the NVIDIA Device Plugin on its own does work as of version 0.15.0-rc1 (see [comment](https://github.com/NVIDIA/k8s-device-plugin/issues/332#issuecomment-1927997436)).

To get around this, the recommended approach is to install UDS K3d without the `cuda` flavor and then deploy the NVIDIA Device Plugin separately:

1. Run `uds run default` or `uds zarf package deploy oci://justinthelaw/uds-k3d:${VERSION} --confirm`
2. Create an `nvidia-device-plugin.yaml` manifest like the one below, and deploy it with `uds zarf tools kubectl apply -f nvidia-device-plugin.yaml`

```yaml
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-daemonset
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-daemonset
    spec:
      runtimeClassName: nvidia # Explicitly request the runtime
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
          name: nvidia-device-plugin-ctr
          env:
            - name: PASS_DEVICE_SPECS
              value: "true"
            - name: FAIL_ON_INIT_ERROR
              value: "true"
            - name: DEVICE_LIST_STRATEGY
              value: envvar
            - name: DEVICE_ID_STRATEGY
              value: uuid
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            - name: MPS_ROOT
              value: /run/nvidia/mps
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
```
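
Once the DaemonSet is running, the GPU should appear as an allocatable `nvidia.com/gpu` resource on the node. A minimal verification sketch, assuming the resource names used in the manifest above:

```bash
# Wait for the device plugin DaemonSet to finish rolling out
uds zarf tools kubectl -n kube-system rollout status daemonset/nvidia-device-plugin-daemonset

# The node should now advertise an nvidia.com/gpu allocatable resource
uds zarf tools kubectl describe nodes | grep -i "nvidia.com/gpu"
```
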
5 changes: 2 additions & 3 deletions values/nvidia-gpu-operator-values.yaml
@@ -58,7 +58,6 @@ validator:
- name: WITH_WORKLOAD
value: "false"
driver:
# RKE2-specific configurations
env:
- name: DISABLE_DEV_CHAR_SYMLINK_CREATION
value: "true"
@@ -73,8 +72,8 @@ operator:
imagePullPolicy: IfNotPresent
imagePullSecrets: []
priorityClassName: system-node-critical
# Explicitly set runtime to containerd, not the default of `docker`
defaultRuntime: containerd
# Can be set to `docker`, `containerd`, or `crio`
defaultRuntime: docker
runtimeClass: nvidia
use_ocp_driver_toolkit: false
# cleanup CRD on chart un-install
