Fix gitops & existing cluster example (#404)

sarvesh-cast authored Oct 19, 2024
1 parent ae03ffd commit ca744f5
Showing 6 changed files with 265 additions and 61 deletions.
96 changes: 51 additions & 45 deletions examples/aks/aks_cluster_existing/castai.tf
@@ -3,21 +3,7 @@ data "azurerm_subscription" "current" {}

 data "azurerm_kubernetes_cluster" "example" {
   name                = var.cluster_name
-  resource_group_name = var.cluster_rg
-}
-
-provider "castai" {
-  api_url   = var.castai_api_url
-  api_token = var.castai_api_token
-}
-
-provider "helm" {
-  kubernetes {
-    host                   = data.azurerm_kubernetes_cluster.example.kube_config.0.host
-    client_certificate     = base64decode(data.azurerm_kubernetes_cluster.example.kube_config.0.client_certificate)
-    client_key             = base64decode(data.azurerm_kubernetes_cluster.example.kube_config.0.client_key)
-    cluster_ca_certificate = base64decode(data.azurerm_kubernetes_cluster.example.kube_config.0.cluster_ca_certificate)
-  }
+  resource_group_name = var.resource_group
 }

# Configure AKS cluster connection to CAST AI using CAST AI aks-cluster module.
@@ -39,14 +25,15 @@ module "castai-aks-cluster" {
   tenant_id = data.azurerm_subscription.current.tenant_id

-  default_node_configuration = module.castai-aks-cluster.castai_node_configurations["default"]
+  default_node_configuration  = module.castai-aks-cluster.castai_node_configurations["default"]
+  install_workload_autoscaler = true

   node_configurations = {
     default = {
-      disk_cpu_ratio = 0
-      subnets        = var.subnets
-      tags           = var.tags
+      max_pods_per_node = 60
+      min_disk_size     = 100
+      disk_cpu_ratio    = 0
+      subnets           = var.subnets
+      tags              = var.tags
     }
   }

@@ -59,60 +46,79 @@
       should_taint = false

       constraints = {
-        on_demand = true
+        min_cpu    = 8
+        max_cpu    = 96
+        max_memory = 786432
+        on_demand  = true
       }
     }
     example_spot_template = {
       configuration_id = module.castai-aks-cluster.castai_node_configurations["default"]
       is_enabled       = true
       should_taint     = true

       custom_labels = {
         custom-label-key-1 = "custom-label-value-1"
         custom-label-key-2 = "custom-label-value-2"
       }

       custom_taints = [
         {
           key    = "custom-taint-key-1"
           value  = "custom-taint-value-1"
           effect = "NoSchedule"
         },
         {
           key    = "custom-taint-key-2"
           value  = "custom-taint-value-2"
           effect = "NoSchedule"
         }
       ]
       constraints = {
         spot                          = true
         use_spot_fallbacks            = true
         fallback_restore_rate_seconds = 1800
         min_cpu                       = 4
         max_cpu                       = 100
         instance_families = {
-          exclude = ["standard_FSv2", "standard_Dv4"]
+          exclude = ["standard_FSv2"]
         }
+        custom_priority = {
+          instance_families = ["standard_Dv4"]
+          spot              = true
+        }
       }
     }
   }

   autoscaler_settings = {
-    enabled                                 = true
+    enabled                                 = false
     is_scoped_mode                          = false
     node_templates_partial_matching_enabled = false

     unschedulable_pods = {
-      enabled = true
-
-      headroom = {
-        enabled           = true
-        cpu_percentage    = 10
-        memory_percentage = 10
-      }
-
-      headroom_spot = {
-        enabled           = true
-        cpu_percentage    = 10
-        memory_percentage = 10
-      }
+      enabled = false
     }

     node_downscaler = {
-      enabled = true
+      enabled = false

       empty_nodes = {
-        enabled = true
+        enabled = false
       }

       evictor = {
         aggressive_mode           = false
-        cycle_interval            = "5m10s"
+        cycle_interval            = "60s"
         dry_run                   = false
-        enabled                   = true
+        enabled                   = false
         node_grace_period_minutes = 10
         scoped_mode               = false
       }
     }

     cluster_limits = {
-      enabled = true
+      enabled = false

       cpu = {
-        max_cores = 20
+        max_cores = 200
         min_cores = 1
       }
     }
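For convenience, the cluster ID that later onboarding steps reference can be surfaced as a Terraform output (a minimal sketch; it assumes the module exposes a `cluster_id` output, which the GitOps README below relies on):

```hcl
# outputs.tf (sketch) -- expose the CAST AI cluster ID for the Helm steps
output "cluster_id" {
  description = "CAST AI cluster ID"
  value       = module.castai-aks-cluster.cluster_id
}
```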
15 changes: 15 additions & 0 deletions examples/aks/aks_cluster_existing/providers.tf
@@ -1,8 +1,23 @@
 # Following providers required by AKS and Vnet resources.
 provider "azurerm" {
   features {}
+  subscription_id = var.subscription_id # From azurerm provider v4.0, specifying the subscription ID is mandatory
 }

 provider "azuread" {
   tenant_id = data.azurerm_subscription.current.tenant_id
 }
+
+provider "castai" {
+  api_url   = var.castai_api_url
+  api_token = var.castai_api_token
+}
+
+provider "helm" {
+  kubernetes {
+    host                   = data.azurerm_kubernetes_cluster.example.kube_config.0.host
+    client_certificate     = base64decode(data.azurerm_kubernetes_cluster.example.kube_config.0.client_certificate)
+    client_key             = base64decode(data.azurerm_kubernetes_cluster.example.kube_config.0.client_key)
+    cluster_ca_certificate = base64decode(data.azurerm_kubernetes_cluster.example.kube_config.0.cluster_ca_certificate)
+  }
+}
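If keeping the subscription ID out of variable files is preferred, the azurerm provider also reads it from its standard environment variable (a sketch; the GUID is a placeholder):

```bash
# Equivalent to setting subscription_id in the provider block
export ARM_SUBSCRIPTION_ID="00000000-0000-0000-0000-000000000000"
terraform plan
```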
9 changes: 7 additions & 2 deletions examples/aks/aks_cluster_existing/variables.tf
@@ -4,11 +4,16 @@ variable "cluster_name" {
   description = "Name of the AKS cluster, resources will be created for."
 }

-variable "cluster_rg" {
+variable "resource_group" {
   type        = string
   description = "Resource Group of the AKS cluster, resources will be created for."
 }

+variable "subscription_id" {
+  type        = string
+  description = "Azure subscription ID"
+}
+
 variable "cluster_region" {
   type        = string
   description = "Region of the AKS cluster, resources will be created for."

@@ -47,4 +52,4 @@ variable "tags" {
 variable "subnets" {
   type        = list(string)
   description = "Cluster subnets"
-}
\ No newline at end of file
+}
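For reference, a minimal variables file for this example might look like the following (a sketch; every value is a placeholder, and `castai_api_token` is one of the example's remaining variables not shown in this diff):

```hcl
# tf.vars (sketch) -- placeholder values only
cluster_name     = "my-aks-cluster"
resource_group   = "my-aks-rg"
subscription_id  = "00000000-0000-0000-0000-000000000000"
cluster_region   = "westeurope"
castai_api_token = "<CAST AI API token>"
subnets          = ["<subnet-id>"]
tags             = { environment = "dev" }
```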
105 changes: 91 additions & 14 deletions examples/aks/aks_cluster_gitops/README.md
@@ -1,30 +1,107 @@
 ## AKS and CAST AI example for GitOps onboarding flow

-Following example shows how to onboard AKS cluster to CAST AI using GitOps flow.
-In GitOps flow CAST AI Node Configuration, Node Templates and Autoscaler policies are managed using Terraform, but all Castware components such as `castai-agent`, `castai-cluster-controller`, `castai-evictor`, `castai-spot-handler`, `castai-kvisor` are to be installed using other means (e.g ArgoCD, manual Helm releases, etc.)
+## GitOps flow
+
+Terraform managed ==> IAM roles, CAST AI Node Configuration, CAST AI Node Templates and CAST AI Autoscaler policies.
+
+Helm managed ==> All Castware components such as `castai-agent`, `castai-cluster-controller`, `castai-evictor`, `castai-spot-handler`, `castai-kvisor`, `castai-workload-autoscaler`, `castai-pod-pinner` and `castai-egressd` are installed by other means (e.g. ArgoCD, manual Helm releases).
+
+   +--------------------------+
+   |          Start           |
+   +--------------------------+
+                |
+                | TERRAFORM
+   +--------------------------+
+   | 1. Update tf.vars        |
+   | 2. terraform init & apply|
+   +--------------------------+
+                |
+                | GITOPS
+   +--------------------------+
+   | 3. Deploy Helm charts of |
+   |    Castware components   |
+   +--------------------------+
+                |
+   +--------------------------+
+   |           End            |
+   +--------------------------+

+Steps to take to successfully onboard an AKS cluster to CAST AI using the GitOps flow:
+
+Prerequisites:
+- CAST AI account
+- CAST AI [API access key](https://docs.cast.ai/docs/authentication#obtaining-api-access-key) with Full Access
+
+### Step 1 & 2: Update TF vars; terraform init, plan & apply
+After a successful apply, the cluster will show as `Connecting` in the CAST AI console. \
+Note the generated `CASTAI_CLUSTER_ID` from the Terraform outputs.

+### Step 3: Deploy Helm charts of CAST AI components
+Components: `castai-cluster-controller`, `castai-evictor`, `castai-spot-handler`, `castai-kvisor`, `castai-workload-autoscaler`, `castai-pod-pinner` \
+After all CAST AI components are installed in the cluster, its status in the CAST AI console changes from `Connecting` to `Connected`, which means the cluster onboarding completed successfully.
+
+```
+CASTAI_API_KEY=""
+CASTAI_CLUSTER_ID=""
+CAST_CONFIG_SOURCE="castai-cluster-controller"
+
+#### Mandatory component: castai-agent
+helm upgrade -i castai-agent castai-helm/castai-agent -n castai-agent --create-namespace \
+  --set apiKey=$CASTAI_API_KEY \
+  --set provider=aks \
+  --set createNamespace=false
+
+#### Mandatory component: castai-cluster-controller
+helm upgrade -i cluster-controller castai-helm/castai-cluster-controller -n castai-agent \
+  --set castai.apiKey=$CASTAI_API_KEY \
+  --set castai.clusterID=$CASTAI_CLUSTER_ID \
+  --set aks.enabled=true \
+  --set autoscaling.enabled=true
+
+#### castai-spot-handler
+helm upgrade -i castai-spot-handler castai-helm/castai-spot-handler -n castai-agent \
+  --set castai.clusterID=$CASTAI_CLUSTER_ID \
+  --set castai.provider=azure
+
+#### castai-evictor
+helm upgrade -i castai-evictor castai-helm/castai-evictor -n castai-agent --set replicaCount=1
+
+#### castai-pod-pinner
+helm upgrade -i castai-pod-pinner castai-helm/castai-pod-pinner -n castai-agent \
+  --set castai.apiKey=$CASTAI_API_KEY \
+  --set castai.clusterID=$CASTAI_CLUSTER_ID \
+  --set replicaCount=0
+
+#### castai-workload-autoscaler
+helm upgrade -i castai-workload-autoscaler castai-helm/castai-workload-autoscaler -n castai-agent \
+  --set castai.apiKeySecretRef=$CAST_CONFIG_SOURCE \
+  --set castai.configMapRef=$CAST_CONFIG_SOURCE
+
+#### castai-kvisor
+helm upgrade -i castai-kvisor castai-helm/castai-kvisor -n castai-agent \
+  --set castai.apiKey=$CASTAI_API_KEY \
+  --set castai.clusterID=$CASTAI_CLUSTER_ID \
+  --set controller.extraArgs.kube-linter-enabled=true \
+  --set controller.extraArgs.image-scan-enabled=true \
+  --set controller.extraArgs.kube-bench-enabled=true \
+  --set controller.extraArgs.kube-bench-cloud-provider=aks
+```
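The commands above assume the `castai-helm` chart repository is already configured locally; a sketch of the one-time setup and a post-install check (assuming CAST AI's public charts repository at https://castai.github.io/helm-charts):

```bash
# One-time setup of the chart repository used above
helm repo add castai-helm https://castai.github.io/helm-charts
helm repo update

# All Castware pods should come up in the castai-agent namespace
kubectl get pods -n castai-agent
```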

 ## Steps Overview

 1. Configure `tf.vars.example` file with required values. If AKS cluster is already managed by Terraform you could instead directly reference those resources.
 2. Run `terraform init`
-3. Run `terraform apply` and make a note of `cluster_id` and `cluster_token` output values. At this stage you would see that your cluster is in `Connecting` state in CAST AI console.
-4. Install CAST AI components using Helm. Use `cluster_id` and `cluster_token` values to configure Helm releases:
-   - Set `castai.apiKey` property to `cluster_token` for following CAST AI components: `castai-cluster-controller`, `castai-kvisor`.
-   - Set `additionalEnv.STATIC_CLUSTER_ID` property to `cluster_id` and `apiKey` property to `cluster_token` for `castai-agent`.
-   - Set `castai.clusterID` property to for `castai-cluster-controller`, `castai-spot-handler`, `castai-kvisor`
-   Example Helm install command:
-   ```bash
-   helm install cluster-controller castai-helm/castai-cluster-controller --namespace=castai-agent --set castai.apiKey=<cluster_token>,provider=aks,castai.clusterID=<cluster_id>,createNamespace=false,apiURL="https://api.cast.ai"
-   ```
+3. Run `terraform apply` and make a note of the `cluster_id` output value. At this stage your cluster will show as `Connecting` in the CAST AI console.
+4. Install CAST AI components using Helm. Use the `cluster_id` and `api_key` values to configure the Helm releases (see the sketch after this list):
+   - Set the `castai.apiKey` property to `api_key`
+   - Set the `castai.clusterID` property to `cluster_id`
+5. After all CAST AI components are installed in the cluster, its status in the CAST AI console changes from `Connecting` to `Connected`, which means the cluster onboarding completed successfully.
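A sketch of step 4 for a single component, mirroring the GitOps commands above (angle-bracketed values are placeholders for the Terraform outputs):

```bash
helm upgrade -i cluster-controller castai-helm/castai-cluster-controller -n castai-agent \
  --set castai.apiKey=<api_key> \
  --set castai.clusterID=<cluster_id> \
  --set aks.enabled=true
```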


## Importing already onboarded cluster to Terraform

-This example can also be used to import AKS cluster to Terraform which is already onboarded to CAST AI console trough [script](https://docs.cast.ai/docs/cluster-onboarding#how-it-works).
+This example can also be used to import an AKS cluster to Terraform which is already onboarded to the CAST AI console through the [script](https://docs.cast.ai/docs/cluster-onboarding#how-it-works).
 For importing existing cluster follow steps 1-3 above and change `castai_node_configuration.default` Node Configuration name.
-This would allow to manage already onboarded clusters' CAST AI Node Configurations and Node Templates through IaC.
\ No newline at end of file
+This allows managing already onboarded clusters' CAST AI Node Configurations and Node Templates through IaC.