Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update: gpu config #103

Merged
merged 4 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,6 @@ patches:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
- effect: NoSchedule
key: nvidia-gpu-only
operator: Exists
- target:
version: v1
kind: Pod
Expand All @@ -29,6 +26,3 @@ patches:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
- effect: NoSchedule
key: nvidia-gpu-only
operator: Exists
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,3 @@ patches:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
- effect: NoSchedule
key: nvidia-gpu-only
operator: Exists
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,6 @@ spec:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
- effect: NoSchedule
key: nvidia-gpu-only
operator: Exists
devicePlugin:
enabled: true
config:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ INSTANCE_TYPE=${INSTANCE_TYPE:-g4dn.4xlarge}
ocp_aws_cluster || exit 0
ocp_aws_create_gpu_machineset "${INSTANCE_TYPE}"
ocp_create_machineset_autoscale
# ocp_aws_taint_gpu_machineset
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
# ocp_aws_cluster
# ocp_aws_create_gpu_machineset
# ocp_aws_clone_worker_machineset
# ocp_create_machineset_autoscale'
# ocp_aws_taint_gpu_machineset
# ocp_create_machineset_autoscale
# '

# for function in $FUNCTIONS
# for function in ${FUNCTIONS}
# do
# extract_function $function scripts/library/ocp.sh >> tmp
# echo >> tmp
Expand Down Expand Up @@ -45,16 +47,7 @@ ocp_aws_create_gpu_machineset(){
oc -n openshift-machine-api \
patch "${MACHINE_SET_TYPE}" \
--type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/gpu":""}}}}}}'

# taint nodes for gpu-only workloads
oc -n openshift-machine-api \
patch "${MACHINE_SET_TYPE}" \
--type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia.com/gpu","value":"","effect":"NoSchedule"}]}}}}'

# oc -n openshift-machine-api \
# patch "${MACHINE_SET_TYPE}" \
# --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia-gpu-only","value":"","effect":"NoSchedule"}]}}}}'


# should use the default profile
# oc -n openshift-machine-api \
# patch "${MACHINE_SET_TYPE}" \
Expand Down Expand Up @@ -112,6 +105,18 @@ ocp_aws_clone_worker_machineset(){
--type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/'"${SHORT_NAME}"'":""}}}}}}'
}

# ocp_aws_taint_gpu_machineset [INSTANCE_TYPE]
#
# Adds the nvidia.com/gpu:NoSchedule taint to the template of the first
# machineset whose name contains the instance family (INSTANCE_TYPE with the
# size suffix stripped, e.g. g4dn.4xlarge -> g4dn).
#
# Globals:   INSTANCE_TYPE, MACHINE_SET_TYPE (both written; intentionally not
#            local so the behavior seen by sourcing scripts is unchanged)
# Arguments: $1 - AWS instance type (default: g4dn.4xlarge)
# Returns:   1 if no matching machineset is found, otherwise the exit status
#            of `oc patch`
ocp_aws_taint_gpu_machineset(){
  INSTANCE_TYPE=${1:-g4dn.4xlarge}
  MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${INSTANCE_TYPE%.*}" | head -n1)

  # without a match the patch below would be issued against an empty
  # resource name; fail early with a clear message instead
  if [ -z "${MACHINE_SET_TYPE}" ]; then
    echo "No machineset found matching: ${INSTANCE_TYPE%.*}" >&2
    return 1
  fi

  echo "Patching: ${MACHINE_SET_TYPE}"

  # taint nodes for gpu-only workloads
  # NOTE: a merge patch replaces the whole taints list in the template
  oc -n openshift-machine-api \
    patch "${MACHINE_SET_TYPE}" \
    --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia.com/gpu","value":"","effect":"NoSchedule"}]}}}}'
}

ocp_create_machineset_autoscale(){
MACHINE_MIN=${1:-0}
MACHINE_MAX=${2:-4}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,6 @@ metadata:
data:
no-time-sliced: |-
version: v1
sharing:
timeSlicing:
resources:
- name: nvidia.com/gpu
replicas: 0
time-sliced: |-
version: v1
sharing:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,6 @@ metadata:
data:
no-time-sliced: |-
version: v1
sharing:
timeSlicing:
resources:
- name: nvidia.com/gpu
replicas: 0
time-sliced: |-
version: v1
sharing:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,3 @@ spec:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
- effect: NoSchedule
key: nvidia-gpu-only
operator: Exists
29 changes: 16 additions & 13 deletions scripts/library/ocp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -243,15 +243,6 @@ ocp_aws_create_gpu_machineset(){
oc -n openshift-machine-api \
patch "${MACHINE_SET_TYPE}" \
--type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/gpu":""}}}}}}'

# taint nodes for gpu-only workloads
oc -n openshift-machine-api \
patch "${MACHINE_SET_TYPE}" \
--type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia.com/gpu","value":"","effect":"NoSchedule"}]}}}}'

# oc -n openshift-machine-api \
# patch "${MACHINE_SET_TYPE}" \
# --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia-gpu-only","value":"","effect":"NoSchedule"}]}}}}'

# should use the default profile
# oc -n openshift-machine-api \
Expand All @@ -272,6 +263,18 @@ ocp_aws_create_gpu_machineset(){
--type=merge --patch '{"spec":{"template":{"spec":{"providerSpec":{"value":{"instanceType":"'"${INSTANCE_TYPE}"'"}}}}}}'
}

# ocp_aws_taint_gpu_machineset [INSTANCE_TYPE]
#
# Adds the nvidia.com/gpu:NoSchedule taint to the template of the first
# machineset whose name contains the instance family (INSTANCE_TYPE with the
# size suffix stripped, e.g. g4dn.4xlarge -> g4dn).
#
# Globals:   INSTANCE_TYPE, MACHINE_SET_TYPE (both written; intentionally not
#            local so the behavior seen by sourcing scripts is unchanged)
# Arguments: $1 - AWS instance type (default: g4dn.4xlarge)
# Returns:   1 if no matching machineset is found, otherwise the exit status
#            of `oc patch`
ocp_aws_taint_gpu_machineset(){
  INSTANCE_TYPE=${1:-g4dn.4xlarge}
  MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${INSTANCE_TYPE%.*}" | head -n1)

  # without a match the patch below would be issued against an empty
  # resource name; fail early with a clear message instead
  if [ -z "${MACHINE_SET_TYPE}" ]; then
    echo "No machineset found matching: ${INSTANCE_TYPE%.*}" >&2
    return 1
  fi

  echo "Patching: ${MACHINE_SET_TYPE}"

  # taint nodes for gpu-only workloads
  # NOTE: a merge patch replaces the whole taints list in the template
  oc -n openshift-machine-api \
    patch "${MACHINE_SET_TYPE}" \
    --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia.com/gpu","value":"","effect":"NoSchedule"}]}}}}'
}

ocp_create_machineset_autoscale(){
MACHINE_MIN=${1:-0}
MACHINE_MAX=${2:-4}
Expand Down Expand Up @@ -332,8 +335,8 @@ ocp_set_scheduler_profile(){
SCHED_PROFILE=${1:-LowNodeUtilization}

# LowNodeUtilization, HighNodeUtilization, NoScoring
echo "see https://docs.openshift.com/container-platform/4.11/nodes/scheduling/nodes-scheduler-profiles.html"
echo "OPTIONS: LowNodeUtilization, HighNodeUtilization, NoScoring"
echo "see https://docs.openshift.com/container-platform/4.16/nodes/scheduling/nodes-scheduler-profiles.html"
echo "OPTIONS: LowNodeUtilization (default), HighNodeUtilization, NoScoring"
echo "SCHED_PROFILE: ${SCHED_PROFILE}"

oc patch schedulers.config.openshift.io/cluster --type merge --patch '{"spec":{"profile": "'"${SCHED_PROFILE}"'"}}'
Expand Down Expand Up @@ -411,13 +414,13 @@ ocp_ack_upgrade_4.13(){
}

# ocp_gpu_taint_nodes
#
# Applies NoSchedule taints to every node labelled node-role.kubernetes.io/gpu,
# then drains those nodes and uncordons them again.
#
# NOTE(review): this text comes from a rendered diff with the +/- markers
# lost; the nvidia-gpu-only line and the nvidia.com/gpu line look like the
# before/after of one change - confirm which taint lines exist in the real file.
ocp_gpu_taint_nodes(){
oc adm taint node -l node-role.kubernetes.io/gpu nvidia-gpu-only=:NoSchedule --overwrite
oc adm taint node -l node-role.kubernetes.io/gpu nvidia.com/gpu=:NoSchedule --overwrite
# drain so pods already on the nodes are moved off; daemonset pods are
# ignored and emptyDir data is deleted
oc adm drain -l node-role.kubernetes.io/gpu --ignore-daemonsets --delete-emptydir-data
# drain leaves the nodes cordoned; make them schedulable again
oc adm uncordon -l node-role.kubernetes.io/gpu
}

# ocp_gpu_untaint_nodes
#
# Removes NoSchedule taints from every node labelled
# node-role.kubernetes.io/gpu (the trailing '-' on the taint spec means remove).
#
# NOTE(review): this text comes from a rendered diff with the +/- markers
# lost; the nvidia-gpu-only line and the nvidia.com/gpu line look like the
# before/after of one change - confirm which taint lines exist in the real file.
ocp_gpu_untaint_nodes(){
oc adm taint node -l node-role.kubernetes.io/gpu nvidia-gpu-only=:NoSchedule-
oc adm taint node -l node-role.kubernetes.io/gpu nvidia.com/gpu=:NoSchedule-
}

ocp_gpu_label_nodes_from_nfd(){
Expand Down
3 changes: 0 additions & 3 deletions workshop/wip/parasol-insurance/01-setup/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@ patches:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
- effect: NoSchedule
key: nvidia-gpu-only
operator: Exists
- path: exclude-sc-ceph-rbd.yaml
- path: exclude-sc-gp3.yaml
- path: exclude-cm.yaml
Expand Down
Loading