diff --git a/.github/workflows/e2e-workflow.yml b/.github/workflows/e2e-workflow.yml index e31021b5c..b0e86c9ae 100644 --- a/.github/workflows/e2e-workflow.yml +++ b/.github/workflows/e2e-workflow.yml @@ -139,10 +139,11 @@ jobs: shell: bash run: | make gpu-provisioner-helm - kubectl wait --for=condition=available deploy "kaito-gpu-provisioner" -n gpu-provisioner --timeout=300s + kubectl wait --for=condition=available deploy "gpu-provisioner" -n gpu-provisioner --timeout=300s env: AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }} AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }} + GPU_PROVISIONER_VERSION: ${{ vars.GPU_PROVISIONER_VERSION }} - uses: azure/login@8c334a195cbb38e46038007b304988d888bf676a # v2.0.0 with: diff --git a/Makefile b/Makefile index 3337c9c89..589e140eb 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ REGISTRY ?= YOUR_REGISTRY IMG_NAME ?= workspace VERSION ?= v0.2.2 +GPU_PROVISIONER_VERSION ?= 0.2.0 IMG_TAG ?= $(subst v,,$(VERSION)) ROOT_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) @@ -199,25 +200,19 @@ gpu-provisioner-identity-perm: ## Create identity for gpu-provisioner AKS_OIDC_ISSUER=$(shell az aks show -n "$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP)" --subscription $(AZURE_SUBSCRIPTION_ID) --query "oidcIssuerProfile.issuerUrl");\ az identity federated-credential create --name gpu-federatecredential --identity-name gpuIdentity --resource-group "$(AZURE_RESOURCE_GROUP)" --issuer $$AKS_OIDC_ISSUER \ - --subject system:serviceaccount:"gpu-provisioner:gpu-provisioner" --audience api://AzureADTokenExchange --subscription $(AZURE_SUBSCRIPTION_ID) + --subject system:serviceaccount:"$(GPU_NAMESPACE):$(GPU_NAMESPACE)" --audience api://AzureADTokenExchange --subscription $(AZURE_SUBSCRIPTION_ID) -.PHONY: gpu-provisioner-helm gpu-provisioner-helm: ## Update Azure client env vars and settings in helm values.yml az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) $(eval 
IDENTITY_CLIENT_ID=$(shell az identity show --name gpuIdentity --resource-group $(AZURE_RESOURCE_GROUP) --query 'clientId' -o tsv)) $(eval AZURE_TENANT_ID=$(shell az account show | jq -r ".tenantId")) $(eval AZURE_SUBSCRIPTION_ID=$(shell az account show | jq -r ".id")) - yq -i '(.controller.env[] | select(.name=="ARM_SUBSCRIPTION_ID")) .value = "$(AZURE_SUBSCRIPTION_ID)"' ./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.controller.env[] | select(.name=="LOCATION")) .value = "$(AZURE_LOCATION)"' ./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.controller.env[] | select(.name=="ARM_RESOURCE_GROUP")) .value = "$(AZURE_RESOURCE_GROUP)"' ./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.controller.env[] | select(.name=="AZURE_NODE_RESOURCE_GROUP")) .value = "$(AZURE_RESOURCE_GROUP_MC)"' ./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.controller.env[] | select(.name=="AZURE_CLUSTER_NAME")) .value = "$(AZURE_CLUSTER_NAME)"' ./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.settings.azure.clusterName) = "$(AZURE_CLUSTER_NAME)"' ./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.workloadIdentity.clientId) = "$(IDENTITY_CLIENT_ID)"' ./charts/kaito/gpu-provisioner/values.yaml - yq -i '(.workloadIdentity.tenantId) = "$(AZURE_TENANT_ID)"' ./charts/kaito/gpu-provisioner/values.yaml + curl -sO https://raw.githubusercontent.com/Azure/gpu-provisioner/main/hack/deploy/configure-helm-values.sh + chmod +x ./configure-helm-values.sh && ./configure-helm-values.sh $(AZURE_CLUSTER_NAME) $(AZURE_RESOURCE_GROUP) gpuIdentity - helm install kaito-gpu-provisioner ./charts/kaito/gpu-provisioner --namespace $(GPU_NAMESPACE) --create-namespace + helm install $(GPU_NAMESPACE) --values gpu-provisioner-values.yaml --wait \ + https://github.com/Azure/gpu-provisioner/raw/gh-pages/charts/gpu-provisioner-$(GPU_PROVISIONER_VERSION).tgz ##@ Build Dependencies diff --git a/README.md b/README.md index 7ec55e013..879da0507 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ 
The above figure presents the Kaito architecture overview. Its major components - **Workspace controller**: It reconciles the `workspace` custom resource, creates `machine` (explained below) custom resources to trigger node auto provisioning, and creates the inference workload (`deployment` or `statefulset`) based on the model preset configurations. - **Node provisioner controller**: The controller's name is *gpu-provisioner* in [gpu-provisioner helm chart](https://github.com/Azure/gpu-provisioner/tree/main/charts/gpu-provisioner). It uses the `machine` CRD originated from [Karpenter](https://sigs.k8s.io/karpenter) to interact with the workspace controller. It integrates with Azure Kubernetes Service(AKS) APIs to add new GPU nodes to the AKS cluster. -Note that the *gpu-provisioner* is an open sourced component maintained in [this](https://github.com/Azure/gpu-provisioner) repository. It can be replaced by other controllers if they support Karpenter-core APIs. +> Note: The [*gpu-provisioner*](https://github.com/Azure/gpu-provisioner) is an open-source component. It can be replaced by other controllers if they support [Karpenter-core](https://sigs.k8s.io/karpenter) APIs. ## Installation diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 75ead6938..acf518e90 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -32,7 +32,7 @@ var _ = SynchronizedBeforeSuite(func() []byte { //check gpu-provisioner deployment is up and running gpuProvisionerDeployment := &v1.Deployment{ ObjectMeta: metav1.ObjectMeta{ - Name: "kaito-gpu-provisioner", + Name: "gpu-provisioner", Namespace: gpuNamespace, }, }