From ca744f59aa90f49f6bcc8a682faeb4bbd902ef43 Mon Sep 17 00:00:00 2001 From: sarvesh-cast Date: Sat, 19 Oct 2024 15:52:45 +0530 Subject: [PATCH] Fix gitops & existing cluster example (#404) --- examples/aks/aks_cluster_existing/castai.tf | 96 ++++++++-------- .../aks/aks_cluster_existing/providers.tf | 15 +++ .../aks/aks_cluster_existing/variables.tf | 9 +- examples/aks/aks_cluster_gitops/README.md | 105 +++++++++++++++--- examples/aks/aks_cluster_gitops/castai.tf | 100 +++++++++++++++++ examples/aks/aks_cluster_gitops/providers.tf | 1 + 6 files changed, 265 insertions(+), 61 deletions(-) diff --git a/examples/aks/aks_cluster_existing/castai.tf b/examples/aks/aks_cluster_existing/castai.tf index 6848b66f..4ba16abc 100644 --- a/examples/aks/aks_cluster_existing/castai.tf +++ b/examples/aks/aks_cluster_existing/castai.tf @@ -3,21 +3,7 @@ data "azurerm_subscription" "current" {} data "azurerm_kubernetes_cluster" "example" { name = var.cluster_name - resource_group_name = var.cluster_rg -} - -provider "castai" { - api_url = var.castai_api_url - api_token = var.castai_api_token -} - -provider "helm" { - kubernetes { - host = data.azurerm_kubernetes_cluster.example.kube_config.0.host - client_certificate = base64decode(data.azurerm_kubernetes_cluster.example.kube_config.0.client_certificate) - client_key = base64decode(data.azurerm_kubernetes_cluster.example.kube_config.0.client_key) - cluster_ca_certificate = base64decode(data.azurerm_kubernetes_cluster.example.kube_config.0.cluster_ca_certificate) - } + resource_group_name = var.resource_group } # Configure AKS cluster connection to CAST AI using CAST AI aks-cluster module. 
@@ -39,14 +25,15 @@ module "castai-aks-cluster" { tenant_id = data.azurerm_subscription.current.tenant_id - default_node_configuration = module.castai-aks-cluster.castai_node_configurations["default"] + default_node_configuration = module.castai-aks-cluster.castai_node_configurations["default"] + install_workload_autoscaler = true node_configurations = { default = { - disk_cpu_ratio = 0 - subnets = var.subnets - tags = var.tags - max_pods_per_node = 60 + min_disk_size = 100 + disk_cpu_ratio = 0 + subnets = var.subnets + tags = var.tags } } @@ -59,60 +46,79 @@ module "castai-aks-cluster" { should_taint = false constraints = { - on_demand = true - min_cpu = 8 - max_cpu = 96 - max_memory = 786432 + on_demand = true + } + } + example_spot_template = { + configuration_id = module.castai-aks-cluster.castai_node_configurations["default"] + is_enabled = true + should_taint = true + + custom_labels = { + custom-label-key-1 = "custom-label-value-1" + custom-label-key-2 = "custom-label-value-2" + } + + custom_taints = [ + { + key = "custom-taint-key-1" + value = "custom-taint-value-1" + effect = "NoSchedule" + }, + { + key = "custom-taint-key-2" + value = "custom-taint-value-2" + effect = "NoSchedule" + } + ] + constraints = { + spot = true + use_spot_fallbacks = true + fallback_restore_rate_seconds = 1800 + min_cpu = 4 + max_cpu = 100 instance_families = { - exclude = ["standard_FSv2", "standard_Dv4"] + exclude = ["standard_FSv2"] + } + custom_priority = { + instance_families = ["standard_Dv4"] + spot = true } } } } autoscaler_settings = { - enabled = true + enabled = false is_scoped_mode = false node_templates_partial_matching_enabled = false unschedulable_pods = { - enabled = true - - headroom = { - enabled = true - cpu_percentage = 10 - memory_percentage = 10 - } - - headroom_spot = { - enabled = true - cpu_percentage = 10 - memory_percentage = 10 - } + enabled = false } node_downscaler = { - enabled = true + enabled = false empty_nodes = { - enabled = true + enabled = 
false } evictor = { aggressive_mode = false - cycle_interval = "5m10s" + cycle_interval = "60s" dry_run = false - enabled = true + enabled = false node_grace_period_minutes = 10 scoped_mode = false } } cluster_limits = { - enabled = true + enabled = false cpu = { - max_cores = 20 + max_cores = 200 min_cores = 1 } } diff --git a/examples/aks/aks_cluster_existing/providers.tf b/examples/aks/aks_cluster_existing/providers.tf index bdab1922..ad5a6d74 100644 --- a/examples/aks/aks_cluster_existing/providers.tf +++ b/examples/aks/aks_cluster_existing/providers.tf @@ -1,8 +1,23 @@ # Following providers required by AKS and Vnet resources. provider "azurerm" { features {} + subscription_id = var.subscription_id # From Azure version 4.0, Specifying Subscription ID is Mandatory } provider "azuread" { tenant_id = data.azurerm_subscription.current.tenant_id } + +provider "castai" { + api_url = var.castai_api_url + api_token = var.castai_api_token +} + +provider "helm" { + kubernetes { + host = data.azurerm_kubernetes_cluster.example.kube_config.0.host + client_certificate = base64decode(data.azurerm_kubernetes_cluster.example.kube_config.0.client_certificate) + client_key = base64decode(data.azurerm_kubernetes_cluster.example.kube_config.0.client_key) + cluster_ca_certificate = base64decode(data.azurerm_kubernetes_cluster.example.kube_config.0.cluster_ca_certificate) + } +} diff --git a/examples/aks/aks_cluster_existing/variables.tf b/examples/aks/aks_cluster_existing/variables.tf index ffc549c8..392d6c53 100644 --- a/examples/aks/aks_cluster_existing/variables.tf +++ b/examples/aks/aks_cluster_existing/variables.tf @@ -4,11 +4,16 @@ variable "cluster_name" { description = "Name of the AKS cluster, resources will be created for." } -variable "cluster_rg" { +variable "resource_group" { type = string description = "Resource Group of the AKS cluster, resources will be created for." 
} +variable "subscription_id" { + type = string + description = "subscription id" +} + variable "cluster_region" { type = string description = "Region of the AKS cluster, resources will be created for." @@ -47,4 +52,4 @@ variable "tags" { variable "subnets" { type = list(string) description = "Cluster subnets" -} \ No newline at end of file +} diff --git a/examples/aks/aks_cluster_gitops/README.md b/examples/aks/aks_cluster_gitops/README.md index 5417bdf3..50924093 100644 --- a/examples/aks/aks_cluster_gitops/README.md +++ b/examples/aks/aks_cluster_gitops/README.md @@ -1,30 +1,107 @@ ## AKS and CAST AI example for GitOps onboarding flow -Following example shows how to onboard AKS cluster to CAST AI using GitOps flow. -In GitOps flow CAST AI Node Configuration, Node Templates and Autoscaler policies are managed using Terraform, but all Castware components such as `castai-agent`, `castai-cluster-controller`, `castai-evictor`, `castai-spot-handler`, `castai-kvisor` are to be installed using other means (e.g ArgoCD, manual Helm releases, etc.) +## GitOps flow + +Terraform Managed ==> IAM roles, CAST AI Node Configuration, CAST Node Templates and CAST Autoscaler policies + +Helm Managed ==> All Castware components such as `castai-agent`, `castai-cluster-controller`, `castai-evictor`, `castai-spot-handler`, `castai-kvisor`, `castai-workload-autoscaler`, `castai-pod-pinner`, `castai-egressd` are to be installed using other means (e.g ArgoCD, manual Helm releases, etc.) + + + +-------------------------+ + | Start | + +-------------------------+ + | + | TERRAFORM + +-------------------------+ + | 1. Update TF.VARS + 2. Terraform Init & Apply| + +-------------------------+ + | + |GITOPS + +-------------------------+ + | 3. 
Deploy Helm chart of `castai-agent`, `castai-cluster-controller`, `castai-evictor`, `castai-spot-handler`, `castai-kvisor`, `castai-workload-autoscaler`, `castai-pod-pinner` + +-------------------------+ + | + | + +-------------------------+ + | END | + +-------------------------+ -Steps to take to successfully onboard AKS cluster to CAST AI using GitOps flow: Prerequisites: - CAST AI account - Obtained CAST AI [API Access key](https://docs.cast.ai/docs/authentication#obtaining-api-access-key) with Full Access + +### Step 1 & 2: Update TF vars & TF Init, plan & apply +After successful apply, CAST Console UI will be in `Connecting` state. \ +Note generated `CASTAI_CLUSTER_ID` from outputs + + +### Step 3: Deploy Helm chart of CAST Components +Components: `castai-cluster-controller`, `castai-evictor`, `castai-spot-handler`, `castai-kvisor`, `castai-workload-autoscaler`, `castai-pod-pinner` \ +After all CAST AI components are installed in the cluster its status in CAST AI console would change from `Connecting` to `Connected` which means that cluster onboarding process completed successfully. 
+ +``` +CASTAI_API_KEY="" +CASTAI_CLUSTER_ID="" +CAST_CONFIG_SOURCE="castai-cluster-controller" + +#### Mandatory Component: castai-agent +helm upgrade -i castai-agent castai-helm/castai-agent -n castai-agent --create-namespace \ + --set apiKey=$CASTAI_API_KEY \ + --set provider=aks \ + --set createNamespace=false + +#### Mandatory Component: castai-cluster-controller +helm upgrade -i cluster-controller castai-helm/castai-cluster-controller -n castai-agent \ +--set castai.apiKey=$CASTAI_API_KEY \ +--set castai.clusterID=$CASTAI_CLUSTER_ID \ +--set aks.enabled=true \ +--set autoscaling.enabled=true + +#### castai-spot-handler +helm upgrade -i castai-spot-handler castai-helm/castai-spot-handler -n castai-agent \ +--set castai.clusterID=$CASTAI_CLUSTER_ID \ +--set castai.provider=azure + +#### castai-evictor +helm upgrade -i castai-evictor castai-helm/castai-evictor -n castai-agent --set replicaCount=1 + +#### castai-pod-pinner +helm upgrade -i castai-pod-pinner castai-helm/castai-pod-pinner -n castai-agent \ +--set castai.apiKey=$CASTAI_API_KEY \ +--set castai.clusterID=$CASTAI_CLUSTER_ID \ +--set replicaCount=0 + +#### castai-workload-autoscaler +helm upgrade -i castai-workload-autoscaler castai-helm/castai-workload-autoscaler -n castai-agent \ +--set castai.apiKeySecretRef=$CAST_CONFIG_SOURCE \ +--set castai.configMapRef=$CAST_CONFIG_SOURCE + +#### castai-kvisor +helm upgrade -i castai-kvisor castai-helm/castai-kvisor -n castai-agent \ +--set castai.apiKey=$CASTAI_API_KEY \ +--set castai.clusterID=$CASTAI_CLUSTER_ID \ +--set controller.extraArgs.kube-linter-enabled=true \ +--set controller.extraArgs.image-scan-enabled=true \ +--set controller.extraArgs.kube-bench-enabled=true \ +--set controller.extraArgs.kube-bench-cloud-provider=aks +``` + +## Steps Overview + 1. Configure `tf.vars.example` file with required values. If AKS cluster is already managed by Terraform you could instead directly reference those resources. 2. Run `terraform init` -3. 
Run `terraform apply` and make a note of `cluster_id` and `cluster_token` output values. At this stage you would see that your cluster is in `Connecting` state in CAST AI console. -4. Install CAST AI components using Helm. Use `cluster_id` and `cluster_token` values to configure Helm releases: -- Set `castai.apiKey` property to `cluster_token` for following CAST AI components: `castai-cluster-controller`, `castai-kvisor`. -- Set `additionalEnv.STATIC_CLUSTER_ID` property to `cluster_id` and `apiKey` property to `cluster_token` for `castai-agent`. -- Set `castai.clusterID` property to for `castai-cluster-controller`, `castai-spot-handler`, `castai-kvisor` -Example Helm install command: -```bash -helm install cluster-controller castai-helm/castai-cluster-controller --namespace=castai-agent --set castai.apiKey=,provider=aks,castai.clusterID=,createNamespace=false,apiURL="https://api.cast.ai" -``` +3. Run `terraform apply` and make a note of `cluster_id` output values. At this stage you would see that your cluster is in `Connecting` state in CAST AI console +4. Install CAST AI components using Helm. Use `cluster_id` and `api_key` values to configure Helm releases: +- Set `castai.apiKey` property to `api_key` +- Set `castai.clusterID` property to `cluster_id` 5. After all CAST AI components are installed in the cluster its status in CAST AI console would change from `Connecting` to `Connected` which means that cluster onboarding process completed successfully. ## Importing already onboarded cluster to Terraform -This example can also be used to import AKS cluster to Terraform which is already onboarded to CAST AI console trough [script](https://docs.cast.ai/docs/cluster-onboarding#how-it-works). +This example can also be used to import AKS cluster to Terraform which is already onboarded to CAST AI console through [script](https://docs.cast.ai/docs/cluster-onboarding#how-it-works). 
For importing existing cluster follow steps 1-3 above and change `castai_node_configuration.default` Node Configuration name. -This would allow to manage already onboarded clusters' CAST AI Node Configurations and Node Templates through IaC. +This would allow to manage already onboarded clusters' CAST AI Node Configurations and Node Templates through IaC. \ No newline at end of file diff --git a/examples/aks/aks_cluster_gitops/castai.tf b/examples/aks/aks_cluster_gitops/castai.tf index 9bb13a46..0cb1d3db 100644 --- a/examples/aks/aks_cluster_gitops/castai.tf +++ b/examples/aks/aks_cluster_gitops/castai.tf @@ -28,3 +28,103 @@ resource "castai_node_configuration_default" "this" { cluster_id = castai_aks_cluster.this.id configuration_id = castai_node_configuration.default.id } + +resource "castai_node_template" "default_by_castai" { + cluster_id = castai_aks_cluster.this.id + + name = "default-by-castai" + is_default = true + is_enabled = true + configuration_id = castai_node_configuration.default.id + should_taint = true + + constraints { + on_demand = true + } +} + +resource "castai_node_template" "example_spot_template" { + cluster_id = castai_aks_cluster.this.id + + name = "example_spot_template" + is_default = false + is_enabled = true + configuration_id = castai_node_configuration.default.id + should_taint = true + + custom_labels = { + type = "spot" + } + + custom_taints { + key = "dedicated" + value = "backend" + effect = "NoSchedule" + } + + constraints { + spot = true + use_spot_fallbacks = true + fallback_restore_rate_seconds = 300 + enable_spot_diversity = true + spot_diversity_price_increase_limit_percent = 20 + is_gpu_only = false + min_cpu = 2 + max_cpu = 8 + min_memory = 4096 + max_memory = 16384 + architectures = ["amd64"] + burstable_instances = "disabled" + customer_specific = "disabled" + + instance_families { + exclude = ["standard_FSv2"] + } + + custom_priority { + instance_families = ["standard_Dv4"] + spot = true + } + } + +} + +resource 
"castai_autoscaler" "castai_autoscaler_policy" { + cluster_id = castai_aks_cluster.this.id + + autoscaler_settings { + enabled = true + is_scoped_mode = false + node_templates_partial_matching_enabled = false + + unschedulable_pods { + enabled = true + } + + cluster_limits { + enabled = false + + cpu { + min_cores = 1 + max_cores = 200 + } + } + + node_downscaler { + enabled = true + + empty_nodes { + enabled = true + } + + evictor { + aggressive_mode = false + cycle_interval = "60s" + dry_run = false + enabled = false + node_grace_period_minutes = 10 + scoped_mode = false + } + } + } +} diff --git a/examples/aks/aks_cluster_gitops/providers.tf b/examples/aks/aks_cluster_gitops/providers.tf index 7e184c69..e018e80a 100644 --- a/examples/aks/aks_cluster_gitops/providers.tf +++ b/examples/aks/aks_cluster_gitops/providers.tf @@ -1,5 +1,6 @@ provider "azurerm" { features {} + subscription_id = var.subscription_id # From Azure version 4.0, Specifying Subscription ID is Mandatory } provider "azuread" {