diff --git a/README.md b/README.md index 9c59194c..419ba21b 100644 --- a/README.md +++ b/README.md @@ -171,6 +171,124 @@ and usage `castai_eks_cluster.this.cluster_token` * default value for `imds_v1` was change to `true`, in case that your configuration didn't had this specified please explicitly set this value to `false` +Migrating from 4.x.x to 5.x.x +--------------------------- + +Version 5.x.x changed: +* Terraform provider adopts [default node template concept](https://docs.cast.ai/docs/default-node-template) +* Removed `spotInstances` field from `autoscaler_policies_json` attribute in `castai_autoscaler_policies` resource +* Removed `customInstancesEnabled` field from `autoscaler_policies_json` attribute in `castai_autoscaler_policies` resource +* Removed `nodeConstraints` field from `autoscaler_policies_json` attribute in `castai_autoscaler_policies` resource +* All valid fields which were removed from `autoscaler_policies_json` have mapping in `castai_node_template` [resource](https://registry.terraform.io/providers/CastAI/castai/latest/docs/resources/node_template) + +Old configuration: +```terraform +resource "castai_autoscaler" "castai_autoscaler_policies" { + cluster_id = data.castai_eks_clusterid.cluster_id.id // or other reference + + autoscaler_policies_json = <<-EOT + { + "enabled": true, + "unschedulablePods": { + "enabled": true, + "customInstancesEnabled": true, + "nodeConstraints": { + "enabled": true, + "minCpuCores": 2, + "maxCpuCores": 4, + "minRamMib": 3814, + "maxRamMib": 16384 + } + }, + "spotInstances": { + "enabled": true, + "clouds": ["gcp"], + "spotBackups": { + "enabled": true + } + }, + "nodeDownscaler": { + "enabled": true, + "emptyNodes": { + "enabled": true + }, + "evictor": { + "aggressiveMode": true, + "cycleInterval": "5m10s", + "dryRun": false, + "enabled": true, + "nodeGracePeriodMinutes": 10, + "scopedMode": false + } + } + } + EOT +} +``` + +New configuration: +```terraform +resource "castai_autoscaler" 
"castai_autoscaler_policies" { + cluster_id = data.castai_eks_clusterid.cluster_id.id // or other reference + + autoscaler_policies_json = <<-EOT + { + "enabled": true, + "unschedulablePods": { + "enabled": true + }, + "nodeDownscaler": { + "enabled": true, + "emptyNodes": { + "enabled": true + }, + "evictor": { + "aggressiveMode": true, + "cycleInterval": "5m10s", + "dryRun": false, + "enabled": true, + "nodeGracePeriodMinutes": 10, + "scopedMode": false + } + } + } + EOT +} + +resource "castai_node_template" "default_by_castai" { + cluster_id = data.castai_eks_clusterid.cluster_id.id // or other reference + + name = "default-by-castai" + configuration_id = castai_node_configuration.default.id // or other reference + is_default = true + should_taint = false + custom_instances_enabled = true + + constraints { + architectures = [ + "amd64", + "arm64", + ] + on_demand = true + spot = true + use_spot_fallbacks = true + min_cpu = 2 + max_cpu = 4 + min_memory = 3814 + max_memory = 16384 + } + + depends_on = [ castai_autoscaler.castai_autoscaler_policies ] +} +``` + +If you have used `castai-eks-cluster` or other modules follow: +https://github.com/castai/terraform-castai-eks-cluster/blob/main/README.md#migrating-from-5xx-to-6xx + +Note: `default-by-castai` default node template is created in background by CAST.ai, when creating managed resource +in Terraform the provider will handle create as update. Alternatively you can perform Terraform state import and +everything will work correctly. 
+ Developing the provider --------------------------- diff --git a/castai/resource_autoscaler.go b/castai/resource_autoscaler.go index 53f45bdf..5cca3a88 100644 --- a/castai/resource_autoscaler.go +++ b/castai/resource_autoscaler.go @@ -44,9 +44,10 @@ func resourceAutoscaler() *schema.Resource { Description: "CAST AI cluster id", }, FieldAutoscalerPoliciesJSON: { - Type: schema.TypeString, - Description: "autoscaler policies JSON string to override current autoscaler settings", - Optional: true, + Type: schema.TypeString, + Description: "autoscaler policies JSON string to override current autoscaler settings", + Optional: true, + ValidateDiagFunc: validateAutoscalerPolicyJSON(), }, FieldAutoscalerPolicies: { Type: schema.TypeString, @@ -229,3 +230,40 @@ func getClusterId(data *schema.ResourceData) string { return value.(string) } + +func validateAutoscalerPolicyJSON() schema.SchemaValidateDiagFunc { + return validation.ToDiagFunc(func(i interface{}, k string) ([]string, []error) { + v, ok := i.(string) + if !ok { + return nil, []error{fmt.Errorf("expected type of %q to be string", k)} + } + policyMap := make(map[string]interface{}) + err := json.Unmarshal([]byte(v), &policyMap) + if err != nil { + return nil, []error{fmt.Errorf("failed to deserialize JSON: %v", err)} + } + errors := make([]error, 0) + if _, found := policyMap["spotInstances"]; found { + errors = append(errors, createValidationError("spotInstances", v)) + } + if unschedulablePods, found := policyMap["unschedulablePods"]; found { + if unschedulablePodsMap, ok := unschedulablePods.(map[string]interface{}); ok { + if _, found := unschedulablePodsMap["customInstancesEnabled"]; found { + errors = append(errors, createValidationError("customInstancesEnabled", v)) + } + if _, found := unschedulablePodsMap["nodeConstraints"]; found { + errors = append(errors, createValidationError("nodeConstraints", v)) + } + } + } + + return nil, errors + }) +} + +func createValidationError(field, value string) error { + 
return fmt.Errorf("'%s' field was removed from policies JSON in 5.0.0. "+ + "The configuration was migrated to default node template.\n\n"+ + "See: https://github.com/castai/terraform-provider-castai#migrating-from-4xx-to-5xx\n\n"+ + "Policy:\n%v", field, value) +} diff --git a/castai/resource_autoscaler_test.go b/castai/resource_autoscaler_test.go index c793d8a1..0f70b726 100644 --- a/castai/resource_autoscaler_test.go +++ b/castai/resource_autoscaler_test.go @@ -8,6 +8,7 @@ import ( "io" "net/http" "reflect" + "strings" "testing" "github.com/golang/mock/gomock" @@ -318,3 +319,129 @@ func JSONBytesEqual(a, b []byte) (bool, error) { } return reflect.DeepEqual(j2, j), nil } + +func Test_validateAutoscalerPolicyJSON(t *testing.T) { + type testData struct { + json string + valid bool + expectedMessage string + } + tests := map[string]testData{ + "should return no diagnostic error for valid autoscaler policies JSON": { + json: ` { + "enabled": true, + "unschedulablePods": { + "enabled": true + }, + "nodeDownscaler": { + "enabled": true, + "emptyNodes": { + "enabled": true + }, + "evictor": { + "aggressiveMode": true, + "cycleInterval": "5m10s", + "dryRun": false, + "enabled": true, + "nodeGracePeriodMinutes": 10, + "scopedMode": false + } + } + }`, + valid: true, + }, + "should return diagnostic error if spot instances block is present in JSON": { + json: ` { + "enabled": true, + "unschedulablePods": { + "enabled": true + }, + "spotInstances": { + "enabled": true, + "clouds": ["gcp"], + "spotBackups": { + "enabled": true + } + }, + "nodeDownscaler": { + "enabled": true, + "emptyNodes": { + "enabled": true + }, + "evictor": { + "aggressiveMode": true, + "cycleInterval": "5m10s", + "dryRun": false, + "enabled": true, + "nodeGracePeriodMinutes": 10, + "scopedMode": false + } + } + }`, + valid: false, + expectedMessage: "'spotInstances' field was removed from policies JSON in 5.0.0. 
The configuration was migrated to default node template.", + }, + "should return diagnostic error if custom instance enabled attribute is present in JSON": { + json: ` { + "enabled": true, + "unschedulablePods": { + "enabled": true, + "customInstancesEnabled": true + }, + "nodeDownscaler": { + "enabled": true, + "emptyNodes": { + "enabled": true + }, + "evictor": { + "aggressiveMode": true, + "cycleInterval": "5m10s", + "dryRun": false, + "enabled": true, + "nodeGracePeriodMinutes": 10, + "scopedMode": false + } + } + }`, + valid: false, + expectedMessage: "'customInstancesEnabled' field was removed from policies JSON in 5.0.0. The configuration was migrated to default node template.", + }, + + "should return diagnostic error if node constraints attribute is present in JSON": { + json: ` { + "enabled": true, + "unschedulablePods": { + "enabled": true, + "nodeConstraints": {} + }, + "nodeDownscaler": { + "enabled": true, + "emptyNodes": { + "enabled": true + }, + "evictor": { + "aggressiveMode": true, + "cycleInterval": "5m10s", + "dryRun": false, + "enabled": true, + "nodeGracePeriodMinutes": 10, + "scopedMode": false + } + } + }`, + valid: false, + expectedMessage: "'nodeConstraints' field was removed from policies JSON in 5.0.0. 
The configuration was migrated to default node template.", + }, + } + for name, tt := range tests { + t.Run(name, func(t *testing.T) { + result := validateAutoscalerPolicyJSON()(tt.json, []cty.PathStep{cty.PathStep(nil)}) + require.Equal(t, tt.valid, !result.HasError()) + if !tt.valid { + for _, d := range result { + require.True(t, strings.Contains(d.Summary, tt.expectedMessage)) + } + } + }) + } +} diff --git a/castai/resource_node_template.go b/castai/resource_node_template.go index 4f3dc980..a107964d 100644 --- a/castai/resource_node_template.go +++ b/castai/resource_node_template.go @@ -6,6 +6,7 @@ import ( "github.com/castai/terraform-provider-castai/castai/sdk" "github.com/google/uuid" "github.com/hashicorp/terraform-plugin-sdk/v2/diag" + "github.com/hashicorp/terraform-plugin-sdk/v2/helper/retry" "github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema" "github.com/hashicorp/terraform-plugin-sdk/v2/helper/validation" "github.com/samber/lo" @@ -566,7 +567,11 @@ func resourceNodeTemplateDelete(ctx context.Context, d *schema.ResourceData, met } func resourceNodeTemplateUpdate(ctx context.Context, d *schema.ResourceData, meta any) diag.Diagnostics { - if !d.HasChanges( + return updateNodeTemplate(ctx, d, meta, false) +} + +func updateNodeTemplate(ctx context.Context, d *schema.ResourceData, meta any, skipChangeCheck bool) diag.Diagnostics { + if !skipChangeCheck && !d.HasChanges( FieldNodeTemplateName, FieldNodeTemplateShouldTaint, FieldNodeTemplateConfigurationId, @@ -578,7 +583,7 @@ func resourceNodeTemplateUpdate(ctx context.Context, d *schema.ResourceData, met FieldNodeTemplateConstraints, FieldNodeTemplateIsEnabled, ) { - log.Printf("[INFO] Nothing to update in node configuration") + log.Printf("[INFO] Nothing to update in node template") return nil } @@ -659,6 +664,12 @@ func resourceNodeTemplateCreate(ctx context.Context, d *schema.ResourceData, met defer log.Printf("[INFO] Create Node Template post call end") client := meta.(*ProviderConfig).api 
clusterID := d.Get(FieldClusterID).(string) + + // default node template is created by default in the background, therefore we need to use PUT instead of POST + if d.Get(FieldNodeTemplateIsDefault).(bool) { + return updateDefaultNodeTemplate(ctx, d, meta) + } + req := sdk.NodeTemplatesAPICreateNodeTemplateJSONRequestBody{ + Name: lo.ToPtr(d.Get(FieldNodeTemplateName).(string)), + IsDefault: lo.ToPtr(d.Get(FieldNodeTemplateIsDefault).(bool)), @@ -723,6 +734,29 @@ func resourceNodeTemplateCreate(ctx context.Context, d *schema.ResourceData, met return resourceNodeTemplateRead(ctx, d, meta) } +func updateDefaultNodeTemplate(ctx context.Context, d *schema.ResourceData, meta any) diag.Diagnostics { + d.SetId(d.Get(FieldNodeTemplateName).(string)) + // make timeout 5 seconds less than the creation timeout + timeout := d.Timeout(schema.TimeoutCreate) - 5*time.Second + // handle situation when default node template is not created yet by autoscaler policy + if err := retry.RetryContext(ctx, timeout, func() *retry.RetryError { + diagnostics := updateNodeTemplate(ctx, d, meta, true) + + for _, d := range diagnostics { + if d.Severity == diag.Error { + if strings.Contains(d.Summary, "node template not found") { + return retry.RetryableError(fmt.Errorf("%s", d.Summary)) + } + return retry.NonRetryableError(fmt.Errorf("%s", d.Summary)) + } + } + return nil + }); err != nil { + return diag.FromErr(err) + } + return nil +} + +func getNodeTemplateByName(ctx context.Context, data *schema.ResourceData, meta any, clusterID string) (*sdk.NodetemplatesV1NodeTemplate, error) { client := meta.(*ProviderConfig).api nodeTemplateName := data.Id() diff --git a/castai/resource_node_template_test.go b/castai/resource_node_template_test.go index 6d931359..741059d8 100644 --- a/castai/resource_node_template_test.go +++ b/castai/resource_node_template_test.go @@ -214,6 +214,73 @@ func TestNodeTemplateResourceReadContextEmptyList(t *testing.T) { r.Equal(result[0].Summary, "failed to find node template with name: 
gpu") } +func TestNodeTemplateResourceCreate_defaultNodeTemplate(t *testing.T) { + r := require.New(t) + mockctrl := gomock.NewController(t) + mockClient := mock_sdk.NewMockClientInterface(mockctrl) + + ctx := context.Background() + provider := &ProviderConfig{ + api: &sdk.ClientWithResponses{ + ClientInterface: mockClient, + }, + } + + clusterId := "b6bfc074-a267-400f-b8f1-db0850c369b1" + body := io.NopCloser(bytes.NewReader([]byte(` + { + "items": [ + { + "template": { + "configurationId": "7dc4f922-29c9-4377-889c-0c8c5fb8d497", + "configurationName": "default", + "name": "default-by-castai", + "isEnabled": true, + "isDefault": true, + "constraints": { + "spot": false, + "onDemand": true, + "minCpu": 10, + "maxCpu": 10000, + "architectures": ["amd64", "arm64"] + }, + "version": "3", + "shouldTaint": true, + "customLabels": {}, + "customTaints": [], + "rebalancingConfig": { + "minNodes": 0 + }, + "customInstancesEnabled": true + } + } + ] + } + `))) + mockClient.EXPECT(). + NodeTemplatesAPIListNodeTemplates(gomock.Any(), clusterId, &sdk.NodeTemplatesAPIListNodeTemplatesParams{IncludeDefault: lo.ToPtr(true)}). + Return(&http.Response{StatusCode: 200, Body: body, Header: map[string][]string{"Content-Type": {"json"}}}, nil) + + mockClient.EXPECT(). + NodeTemplatesAPIUpdateNodeTemplate(gomock.Any(), clusterId, "default-by-castai", gomock.Any()). 
+ Return(&http.Response{StatusCode: 200, Body: io.NopCloser(bytes.NewReader([]byte{}))}, nil) + + resource := resourceNodeTemplate() + val := cty.ObjectVal(map[string]cty.Value{ + FieldClusterId: cty.StringVal(clusterId), + FieldNodeTemplateName: cty.StringVal("default-by-castai"), + FieldNodeTemplateIsDefault: cty.BoolVal(true), + FieldNodeTemplateCustomInstancesEnabled: cty.BoolVal(true), + }) + state := terraform.NewInstanceStateShimmedFromValue(val, 0) + state.ID = "default-by-castai" + + data := resource.Data(state) + result := resource.CreateContext(ctx, data, provider) + r.Nil(result) + r.False(result.HasError()) +} + func TestNodeTemplateResourceDelete_defaultNodeTemplate(t *testing.T) { r := require.New(t) mockctrl := gomock.NewController(t) diff --git a/examples/aks/aks_cluster_autoscaler_policies/castai.tf b/examples/aks/aks_cluster_autoscaler_policies/castai.tf index 56f1f837..29854685 100644 --- a/examples/aks/aks_cluster_autoscaler_policies/castai.tf +++ b/examples/aks/aks_cluster_autoscaler_policies/castai.tf @@ -53,6 +53,21 @@ module "castai-aks-cluster" { } node_templates = { + default_by_castai = { + name = "default-by-castai" + configuration_id = module.castai-aks-cluster.castai_node_configurations["default"] + is_default = true + should_taint = false + + constraints = { + on_demand = true + spot = true + use_spot_fallbacks = true + + enable_spot_diversity = false + spot_diversity_price_increase_limit_percent = 20 + } + } spot_tmpl = { configuration_id = module.castai-aks-cluster.castai_node_configurations["default"] should_taint = true @@ -91,7 +106,6 @@ module "castai-aks-cluster" { // Configure Autoscaler policies as per API specification https://api.cast.ai/v1/spec/#/PoliciesAPI/PoliciesAPIUpsertClusterPolicies. 
// Here: // - unschedulablePods - Unscheduled pods policy - // - spotInstances - Spot instances configuration // - nodeDownscaler - Node deletion policy autoscaler_policies_json = <<-EOT { @@ -99,15 +113,6 @@ module "castai-aks-cluster" { "unschedulablePods": { "enabled": true }, - "spotInstances": { - "enabled": true, - "clouds": ["azure"], - "spotBackups": { - "enabled": true - }, - "spotDiversityEnabled": false, - "spotDiversityPriceIncreaseLimitPercent": 20 - }, "nodeDownscaler": { "enabled": true, "emptyNodes": { diff --git a/examples/eks/eks_cluster_autoscaler_policies/castai.tf b/examples/eks/eks_cluster_autoscaler_policies/castai.tf index eb39ee07..7a2fb4be 100644 --- a/examples/eks/eks_cluster_autoscaler_policies/castai.tf +++ b/examples/eks/eks_cluster_autoscaler_policies/castai.tf @@ -101,6 +101,24 @@ module "castai-eks-cluster" { } node_templates = { + default_by_castai = { + name = "default-by-castai" + configuration_id = module.castai-eks-cluster.castai_node_configurations["default"] + is_default = true + should_taint = false + + constraints = { + on_demand = true + spot = true + use_spot_fallbacks = true + + enable_spot_diversity = false + spot_diversity_price_increase_limit_percent = 20 + + spot_interruption_predictions_enabled = true + spot_interruption_predictions_type = "aws-rebalance-recommendations" + } + } spot_tmpl = { configuration_id = module.castai-eks-cluster.castai_node_configurations["default"] should_taint = true @@ -140,7 +158,6 @@ module "castai-eks-cluster" { # Configure Autoscaler policies as per API specification https://api.cast.ai/v1/spec/#/PoliciesAPI/PoliciesAPIUpsertClusterPolicies. 
# Here: # - unschedulablePods - Unscheduled pods policy - # - spotInstances - Spot instances configuration # - nodeDownscaler - Node deletion policy autoscaler_policies_json = <<-EOT { @@ -148,19 +165,6 @@ module "castai-eks-cluster" { "unschedulablePods": { "enabled": true }, - "spotInstances": { - "enabled": true, - "clouds": ["aws"], - "spotBackups": { - "enabled": true - }, - "spotDiversityEnabled": false, - "spotDiversityPriceIncreaseLimitPercent": 20, - "spotInterruptionPredictions": { - "enabled": true, - "type": "AWSRebalanceRecommendations" - } - }, "nodeDownscaler": { "enabled": true, "emptyNodes": { diff --git a/examples/gke/gke_cluster_autoscaler_policies/castai.tf b/examples/gke/gke_cluster_autoscaler_policies/castai.tf index 17fdcc79..3ffe65b6 100644 --- a/examples/gke/gke_cluster_autoscaler_policies/castai.tf +++ b/examples/gke/gke_cluster_autoscaler_policies/castai.tf @@ -60,6 +60,21 @@ module "castai-gke-cluster" { } node_templates = { + default_by_castai = { + name = "default-by-castai" + configuration_id = module.castai-gke-cluster.castai_node_configurations["default"] + is_default = true + should_taint = false + + constraints = { + on_demand = true + spot = true + use_spot_fallbacks = true + + enable_spot_diversity = false + spot_diversity_price_increase_limit_percent = 20 + } + } spot_tmpl = { configuration_id = module.castai-gke-cluster.castai_node_configurations["default"] should_taint = true @@ -100,7 +115,6 @@ module "castai-gke-cluster" { // Configure Autoscaler policies as per API specification https://api.cast.ai/v1/spec/#/PoliciesAPI/PoliciesAPIUpsertClusterPolicies. 
// Here: // - unschedulablePods - Unscheduled pods policy - // - spotInstances - Spot instances configuration // - nodeDownscaler - Node deletion policy autoscaler_policies_json = <<-EOT { @@ -108,15 +122,6 @@ module "castai-gke-cluster" { "unschedulablePods": { "enabled": true }, - "spotInstances": { - "enabled": true, - "clouds": ["gcp"], - "spotBackups": { - "enabled": true - }, - "spotDiversityEnabled": false, - "spotDiversityPriceIncreaseLimitPercent": 20 - }, "nodeDownscaler": { "enabled": true, "emptyNodes": {