redhat-na-ssa · codekow · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024
diff --git a/components/app-configs/nvidia-gpu-verification/overlays/toleration/kustomization.yaml b/components/app-configs/nvidia-gpu-verification/overlays/toleration/kustomization.yaml
@@ -16,9 +16,6 @@ patches:
           - effect: NoSchedule
             key: nvidia.com/gpu
             operator: Exists
-          - effect: NoSchedule
-            key: nvidia-gpu-only
-            operator: Exists
   - target:
       version: v1
       kind: Pod
@@ -29,6 +26,3 @@ patches:
           - effect: NoSchedule
             key: nvidia.com/gpu
             operator: Exists
-          - effect: NoSchedule
-            key: nvidia-gpu-only
-            operator: Exists
diff --git a/components/operators/devspaces/instance/overlays/nvidia-gpu/kustomization.yaml b/components/operators/devspaces/instance/overlays/nvidia-gpu/kustomization.yaml
@@ -28,6 +28,3 @@ patches:
           - effect: NoSchedule
             key: nvidia.com/gpu
             operator: Exists
-          - effect: NoSchedule
-            key: nvidia-gpu-only
-            operator: Exists
diff --git a/components/operators/gpu-operator-certified/instance/base/cluster-policy.yaml b/components/operators/gpu-operator-certified/instance/base/cluster-policy.yaml
@@ -55,9 +55,6 @@ spec:
       - effect: NoSchedule
         key: nvidia.com/gpu
         operator: Exists
-      - effect: NoSchedule
-        key: nvidia-gpu-only
-        operator: Exists
   devicePlugin:
     enabled: true
     config:

diff --git a/components/operators/gpu-operator-certified/instance/components/aws-gpu-machineset/job.sh b/components/operators/gpu-operator-certified/instance/components/aws-gpu-machineset/job.sh
@@ -8,3 +8,4 @@ INSTANCE_TYPE=${INSTANCE_TYPE:-g4dn.4xlarge}
 ocp_aws_cluster || exit 0
 ocp_aws_create_gpu_machineset "${INSTANCE_TYPE}"
 ocp_create_machineset_autoscale
+# ocp_aws_taint_gpu_machineset
diff --git a/components/operators/gpu-operator-certified/instance/components/aws-gpu-machineset/ocp.sh b/components/operators/gpu-operator-certified/instance/components/aws-gpu-machineset/ocp.sh
@@ -6,9 +6,11 @@
 # ocp_aws_cluster
 # ocp_aws_create_gpu_machineset
 # ocp_aws_clone_worker_machineset
-# ocp_create_machineset_autoscale'
+# ocp_aws_taint_gpu_machineset
+# ocp_create_machineset_autoscale
+# '
 
-# for function in $FUNCTIONS
+# for function in ${FUNCTIONS}
 # do
 #   extract_function $function scripts/library/ocp.sh >> tmp
 #   echo >> tmp
@@ -45,16 +47,7 @@ ocp_aws_create_gpu_machineset(){
   oc -n openshift-machine-api \
     patch "${MACHINE_SET_TYPE}" \
     --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/gpu":""}}}}}}'
-
-  # taint nodes for gpu-only workloads
-  oc -n openshift-machine-api \
-    patch "${MACHINE_SET_TYPE}" \
-    --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia.com/gpu","value":"","effect":"NoSchedule"}]}}}}'
-
-  # oc -n openshift-machine-api \
-  #   patch "${MACHINE_SET_TYPE}" \
-  #   --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia-gpu-only","value":"","effect":"NoSchedule"}]}}}}'
-
+
   # should use the default profile
   # oc -n openshift-machine-api \
   #   patch "${MACHINE_SET_TYPE}" \
@@ -112,6 +105,18 @@ ocp_aws_clone_worker_machineset(){
     --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/'"${SHORT_NAME}"'":""}}}}}}'
 }
 
+ocp_aws_taint_gpu_machineset(){
+  # usage: ocp_aws_taint_gpu_machineset [INSTANCE_TYPE] - add the nvidia.com/gpu NoSchedule taint (gpu-only workloads) to the first machineset whose name matches the instance type prefix (text before the last '.')
+  INSTANCE_TYPE=${1:-g4dn.4xlarge}
+  MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${INSTANCE_TYPE%.*}" | head -n1)
+  # no match: return 1 with a clear message rather than running 'oc patch' on an empty resource name
+  [ -n "${MACHINE_SET_TYPE}" ] || { echo "No machineset found matching: ${INSTANCE_TYPE%.*}" >&2; return 1; }
+  echo "Patching: ${MACHINE_SET_TYPE}"
+  oc -n openshift-machine-api \
+    patch "${MACHINE_SET_TYPE}" \
+    --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia.com/gpu","value":"","effect":"NoSchedule"}]}}}}'
+}
+
 ocp_create_machineset_autoscale(){
   MACHINE_MIN=${1:-0}
   MACHINE_MAX=${2:-4}

diff --git a/components/operators/gpu-operator-certified/instance/components/time-sliced-4/patch-device-plugin-config.yaml b/components/operators/gpu-operator-certified/instance/components/time-sliced-4/patch-device-plugin-config.yaml
@@ -5,11 +5,6 @@ metadata:
 data:
   no-time-sliced: |-
     version: v1
-    sharing:
-      timeSlicing:
-        resources:
-          - name: nvidia.com/gpu
-            replicas: 0
   time-sliced: |-
     version: v1
     sharing:

diff --git a/components/operators/gpu-operator-certified/instance/components/time-sliced/patch-device-plugin-config.yaml b/components/operators/gpu-operator-certified/instance/components/time-sliced/patch-device-plugin-config.yaml
@@ -5,11 +5,6 @@ metadata:
 data:
   no-time-sliced: |-
     version: v1
-    sharing:
-      timeSlicing:
-        resources:
-          - name: nvidia.com/gpu
-            replicas: 0
   time-sliced: |-
     version: v1
     sharing:

diff --git a/components/operators/rhods-operator/instance/components/nvidia-gpu-accelerator-profile/nvidia-profile.yaml b/components/operators/rhods-operator/instance/components/nvidia-gpu-accelerator-profile/nvidia-profile.yaml
@@ -14,6 +14,3 @@ spec:
     - effect: NoSchedule
       key: nvidia.com/gpu
       operator: Exists
-    - effect: NoSchedule
-      key: nvidia-gpu-only
-      operator: Exists
diff --git a/scripts/library/ocp.sh b/scripts/library/ocp.sh
@@ -243,15 +243,6 @@ ocp_aws_create_gpu_machineset(){
   oc -n openshift-machine-api \
     patch "${MACHINE_SET_TYPE}" \
     --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/gpu":""}}}}}}'
-
-  # taint nodes for gpu-only workloads
-  oc -n openshift-machine-api \
-    patch "${MACHINE_SET_TYPE}" \
-    --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia.com/gpu","value":"","effect":"NoSchedule"}]}}}}'
-
-  # oc -n openshift-machine-api \
-  #   patch "${MACHINE_SET_TYPE}" \
-  #   --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia-gpu-only","value":"","effect":"NoSchedule"}]}}}}'
 
   # should use the default profile
   # oc -n openshift-machine-api \
@@ -272,6 +263,18 @@ ocp_aws_create_gpu_machineset(){
     --type=merge --patch '{"spec":{"template":{"spec":{"providerSpec":{"value":{"instanceType":"'"${INSTANCE_TYPE}"'"}}}}}}'
 }
 
+ocp_aws_taint_gpu_machineset(){
+  # usage: ocp_aws_taint_gpu_machineset [INSTANCE_TYPE] - add the nvidia.com/gpu NoSchedule taint (gpu-only workloads) to the first machineset whose name matches the instance type prefix (text before the last '.')
+  INSTANCE_TYPE=${1:-g4dn.4xlarge}
+  MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${INSTANCE_TYPE%.*}" | head -n1)
+  # no match: return 1 with a clear message rather than running 'oc patch' on an empty resource name
+  [ -n "${MACHINE_SET_TYPE}" ] || { echo "No machineset found matching: ${INSTANCE_TYPE%.*}" >&2; return 1; }
+  echo "Patching: ${MACHINE_SET_TYPE}"
+  oc -n openshift-machine-api \
+    patch "${MACHINE_SET_TYPE}" \
+    --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia.com/gpu","value":"","effect":"NoSchedule"}]}}}}'
+}
+
 ocp_create_machineset_autoscale(){
   MACHINE_MIN=${1:-0}
   MACHINE_MAX=${2:-4}
@@ -332,8 +335,8 @@ ocp_set_scheduler_profile(){
   SCHED_PROFILE=${1:-LowNodeUtilization}
 
   # LowNodeUtilization, HighNodeUtilization, NoScoring
-  echo "see https://docs.openshift.com/container-platform/4.11/nodes/scheduling/nodes-scheduler-profiles.html"
-  echo "OPTIONS: LowNodeUtilization, HighNodeUtilization, NoScoring"
+  echo "see https://docs.openshift.com/container-platform/4.16/nodes/scheduling/nodes-scheduler-profiles.html"
+  echo "OPTIONS: LowNodeUtilization (default), HighNodeUtilization, NoScoring"
   echo "SCHED_PROFILE: ${SCHED_PROFILE}"
 
   oc patch schedulers.config.openshift.io/cluster --type merge --patch '{"spec":{"profile": "'"${SCHED_PROFILE}"'"}}' 
@@ -411,13 +414,13 @@ ocp_ack_upgrade_4.13(){
 }
 
 ocp_gpu_taint_nodes(){
-  oc adm taint node -l node-role.kubernetes.io/gpu nvidia-gpu-only=:NoSchedule --overwrite
+  oc adm taint node -l node-role.kubernetes.io/gpu nvidia.com/gpu=:NoSchedule --overwrite
   oc adm drain -l node-role.kubernetes.io/gpu --ignore-daemonsets --delete-emptydir-data
   oc adm uncordon -l node-role.kubernetes.io/gpu
 }
 
 ocp_gpu_untaint_nodes(){
-  oc adm taint node -l node-role.kubernetes.io/gpu nvidia-gpu-only=:NoSchedule-
+  oc adm taint node -l node-role.kubernetes.io/gpu nvidia.com/gpu=:NoSchedule-
 }
 
 ocp_gpu_label_nodes_from_nfd(){

diff --git a/workshop/wip/parasol-insurance/01-setup/kustomization.yaml b/workshop/wip/parasol-insurance/01-setup/kustomization.yaml
@@ -33,9 +33,6 @@ patches:
           - effect: NoSchedule
             key: nvidia.com/gpu
             operator: Exists
-          - effect: NoSchedule
-            key: nvidia-gpu-only
-            operator: Exists
   - path: exclude-sc-ceph-rbd.yaml
   - path: exclude-sc-gp3.yaml
   - path: exclude-cm.yaml