Skip to content

Commit

Permalink
feat: Adopt default node template concept
Browse files Browse the repository at this point in the history
  • Loading branch information
jansyk13 committed Aug 4, 2023
1 parent 88b6ed4 commit cabfd2d
Show file tree
Hide file tree
Showing 8 changed files with 437 additions and 39 deletions.
118 changes: 118 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,124 @@ and usage `castai_eks_cluster.this.cluster_token`
* the default value for `imds_v1` was changed to `true`; if your configuration did not specify it,
please explicitly set this value to `false`
Migrating from 4.x.x to 5.x.x
---------------------------
Version 5.x.x changed:
* Terraform provider adopts [default node template concept](https://docs.cast.ai/docs/default-node-template)
* Removed `spotInstances` field from `autoscaler_policies_json` attribute in `castai_autoscaler_policies` resource
* Removed `customInstancesEnabled` field from `autoscaler_policies_json` attribute in `castai_autoscaler_policies` resource
* Removed `nodeConstraints` field from `autoscaler_policies_json` attribute in `castai_autoscaler_policies` resource
* All valid fields which were removed from `autoscaler_policies_json` have a mapping in the `castai_node_template` [resource](https://registry.terraform.io/providers/CastAI/castai/latest/docs/resources/node_template)
Old configuration:
```terraform
resource "castai_autoscaler" "castai_autoscaler_policies" {
cluster_id = data.castai_eks_clusterid.cluster_id.id // or other reference
autoscaler_policies_json = <<-EOT
{
"enabled": true,
"unschedulablePods": {
"enabled": true,
"customInstancesEnabled": true,
"nodeConstraints": {
"enabled": true,
"minCpuCores": 2,
"maxCpuCores": 4,
"minRamMib": 3814,
"maxRamMib": 16384
}
},
"spotInstances": {
"enabled": true,
"clouds": ["gcp"],
"spotBackups": {
"enabled": true
}
},
"nodeDownscaler": {
"enabled": true,
"emptyNodes": {
"enabled": true
},
"evictor": {
"aggressiveMode": true,
"cycleInterval": "5m10s",
"dryRun": false,
"enabled": true,
"nodeGracePeriodMinutes": 10,
"scopedMode": false
}
}
}
EOT
}
```
New configuration:
```terraform
resource "castai_autoscaler" "castai_autoscaler_policies" {
cluster_id = data.castai_eks_clusterid.cluster_id.id // or other reference
autoscaler_policies_json = <<-EOT
{
"enabled": true,
"unschedulablePods": {
"enabled": true
},
"nodeDownscaler": {
"enabled": true,
"emptyNodes": {
"enabled": true
},
"evictor": {
"aggressiveMode": true,
"cycleInterval": "5m10s",
"dryRun": false,
"enabled": true,
"nodeGracePeriodMinutes": 10,
"scopedMode": false
}
}
}
EOT
}
resource "castai_node_template" "default_by_castai" {
cluster_id = data.castai_eks_clusterid.cluster_id.id // or other reference
name = "default-by-castai"
configuration_id = castai_node_configuration.default.id // or other reference
is_default = true
should_taint = false
custom_instances_enabled = true
constraints {
architectures = [
"amd64",
"arm64",
]
on_demand = true
spot = true
use_spot_fallbacks = true
min_cpu = 2
max_cpu = 4
min_memory = 3814
max_memory = 16384
}
depends_on = [ castai_autoscaler.castai_autoscaler_policies ]
}
```
If you have used `castai-eks-cluster` or other modules follow:
https://github.com/castai/terraform-castai-eks-cluster/blob/main/README.md#migrating-from-5xx-to-6xx
Note: the `default-by-castai` default node template is created in the background by CAST AI. When you declare it
as a managed resource in Terraform, the provider handles the create as an update. Alternatively, you can import it
into the Terraform state and everything will work correctly.
Developing the provider
---------------------------
Expand Down
44 changes: 41 additions & 3 deletions castai/resource_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,10 @@ func resourceAutoscaler() *schema.Resource {
Description: "CAST AI cluster id",
},
FieldAutoscalerPoliciesJSON: {
Type: schema.TypeString,
Description: "autoscaler policies JSON string to override current autoscaler settings",
Optional: true,
Type: schema.TypeString,
Description: "autoscaler policies JSON string to override current autoscaler settings",
Optional: true,
ValidateDiagFunc: validateAutoscalerPolicyJSON(),
},
FieldAutoscalerPolicies: {
Type: schema.TypeString,
Expand Down Expand Up @@ -229,3 +230,40 @@ func getClusterId(data *schema.ResourceData) string {

return value.(string)
}

// validateAutoscalerPolicyJSON returns a schema validator for the autoscaler
// policies JSON attribute.
//
// The validator reports an error when the value is not a string or fails to
// decode into a map, and one error per field that was removed from the policies
// JSON in 5.0.0: the top-level "spotInstances" entry and the
// "customInstancesEnabled" and "nodeConstraints" entries of "unschedulablePods".
func validateAutoscalerPolicyJSON() schema.SchemaValidateDiagFunc {
	checkPolicy := func(raw interface{}, key string) ([]string, []error) {
		policy, isString := raw.(string)
		if !isString {
			return nil, []error{fmt.Errorf("expected type of %q to be string", key)}
		}

		parsed := map[string]interface{}{}
		if err := json.Unmarshal([]byte(policy), &parsed); err != nil {
			return nil, []error{fmt.Errorf("failed to deserialize JSON: %v", err)}
		}

		violations := []error{}
		if _, present := parsed["spotInstances"]; present {
			violations = append(violations, createValidationError("spotInstances", policy))
		}

		// A missing key yields a nil interface, for which the type assertion
		// fails, so this single check covers both "absent" and "not an object".
		pods, isObject := parsed["unschedulablePods"].(map[string]interface{})
		if !isObject {
			return nil, violations
		}
		for _, removed := range []string{"customInstancesEnabled", "nodeConstraints"} {
			if _, present := pods[removed]; present {
				violations = append(violations, createValidationError(removed, policy))
			}
		}

		return nil, violations
	}

	return validation.ToDiagFunc(checkPolicy)
}

// createValidationError builds the error reported when the autoscaler policies
// JSON still contains a field that was removed in 5.0.0. The message names the
// offending field, links to the migration guide and echoes the policy value.
func createValidationError(field, value string) error {
	const format = "'%s' field was removed from policies JSON in 5.0.0. " +
		"The configuration was migrated to default node template.\n\n" +
		"See: https://github.com/castai/terraform-provider-castai#migrating-from-4xx-to-5xx\n\n" +
		"Policy:\n%v"

	return fmt.Errorf(format, field, value)
}
127 changes: 127 additions & 0 deletions castai/resource_autoscaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"io"
"net/http"
"reflect"
"strings"
"testing"

"github.com/golang/mock/gomock"
Expand Down Expand Up @@ -318,3 +319,129 @@ func JSONBytesEqual(a, b []byte) (bool, error) {
}
return reflect.DeepEqual(j2, j), nil
}

// Test_validateAutoscalerPolicyJSON runs the autoscaler policies validator over
// one accepted policy and three policies that each contain a field removed in
// 5.0.0, checking the error state and, for rejected policies, that every
// returned diagnostic summary contains the expected message.
func Test_validateAutoscalerPolicyJSON(t *testing.T) {
	type testCase struct {
		policy      string
		wantValid   bool
		wantMessage string
	}
	cases := map[string]testCase{
		"should return no diagnostic error for valid autoscaler policies JSON": {
			policy: ` {
"enabled": true,
"unschedulablePods": {
"enabled": true
},
"nodeDownscaler": {
"enabled": true,
"emptyNodes": {
"enabled": true
},
"evictor": {
"aggressiveMode": true,
"cycleInterval": "5m10s",
"dryRun": false,
"enabled": true,
"nodeGracePeriodMinutes": 10,
"scopedMode": false
}
}
}`,
			wantValid: true,
		},
		"should return diagnostic error if spot instances block is present in JSON": {
			policy: ` {
"enabled": true,
"unschedulablePods": {
"enabled": true
},
"spotInstances": {
"enabled": true,
"clouds": ["gcp"],
"spotBackups": {
"enabled": true
}
},
"nodeDownscaler": {
"enabled": true,
"emptyNodes": {
"enabled": true
},
"evictor": {
"aggressiveMode": true,
"cycleInterval": "5m10s",
"dryRun": false,
"enabled": true,
"nodeGracePeriodMinutes": 10,
"scopedMode": false
}
}
}`,
			wantValid:   false,
			wantMessage: "'spotInstances' field was removed from policies JSON in 5.0.0. The configuration was migrated to default node template.",
		},
		"should return diagnostic error if custom instance enabled attribute is present in JSON": {
			policy: ` {
"enabled": true,
"unschedulablePods": {
"enabled": true,
"customInstancesEnabled": true
},
"nodeDownscaler": {
"enabled": true,
"emptyNodes": {
"enabled": true
},
"evictor": {
"aggressiveMode": true,
"cycleInterval": "5m10s",
"dryRun": false,
"enabled": true,
"nodeGracePeriodMinutes": 10,
"scopedMode": false
}
}
}`,
			wantValid:   false,
			wantMessage: "'customInstancesEnabled' field was removed from policies JSON in 5.0.0. The configuration was migrated to default node template.",
		},

		"should return diagnostic error if node constraints attribute is present in JSON": {
			policy: ` {
"enabled": true,
"unschedulablePods": {
"enabled": true,
"nodeConstraints": {}
},
"nodeDownscaler": {
"enabled": true,
"emptyNodes": {
"enabled": true
},
"evictor": {
"aggressiveMode": true,
"cycleInterval": "5m10s",
"dryRun": false,
"enabled": true,
"nodeGracePeriodMinutes": 10,
"scopedMode": false
}
}
}`,
			wantValid:   false,
			wantMessage: "'nodeConstraints' field was removed from policies JSON in 5.0.0. The configuration was migrated to default node template.",
		},
	}

	// The validator is built once and shared by every sub-test.
	validate := validateAutoscalerPolicyJSON()
	for name, tc := range cases {
		t.Run(name, func(t *testing.T) {
			diags := validate(tc.policy, []cty.PathStep{cty.PathStep(nil)})
			require.Equal(t, tc.wantValid, !diags.HasError())
			if tc.wantValid {
				return
			}
			// Every diagnostic returned for a rejected policy must carry the message.
			for i := range diags {
				require.True(t, strings.Contains(diags[i].Summary, tc.wantMessage))
			}
		})
	}
}
38 changes: 36 additions & 2 deletions castai/resource_node_template.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"github.com/castai/terraform-provider-castai/castai/sdk"
"github.com/google/uuid"
"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/retry"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/validation"
"github.com/samber/lo"
Expand Down Expand Up @@ -566,7 +567,11 @@ func resourceNodeTemplateDelete(ctx context.Context, d *schema.ResourceData, met
}

func resourceNodeTemplateUpdate(ctx context.Context, d *schema.ResourceData, meta any) diag.Diagnostics {
if !d.HasChanges(
return updateNodeTemplate(ctx, d, meta, false)
}

func updateNodeTemplate(ctx context.Context, d *schema.ResourceData, meta any, skipChangeCheck bool) diag.Diagnostics {
if !skipChangeCheck && !d.HasChanges(
FieldNodeTemplateName,
FieldNodeTemplateShouldTaint,
FieldNodeTemplateConfigurationId,
Expand All @@ -578,7 +583,7 @@ func resourceNodeTemplateUpdate(ctx context.Context, d *schema.ResourceData, met
FieldNodeTemplateConstraints,
FieldNodeTemplateIsEnabled,
) {
log.Printf("[INFO] Nothing to update in node configuration")
log.Printf("[INFO] Nothing to update in node template")
return nil
}

Expand Down Expand Up @@ -659,6 +664,12 @@ func resourceNodeTemplateCreate(ctx context.Context, d *schema.ResourceData, met
defer log.Printf("[INFO] Create Node Template post call end")
client := meta.(*ProviderConfig).api
clusterID := d.Get(FieldClusterID).(string)

// default node template is created by default in the background, therefore we need to use PUT instead of POST
if d.Get(FieldNodeTemplateIsDefault).(bool) {
return updateDefaultNodeTemplate(ctx, d, meta)
}

req := sdk.NodeTemplatesAPICreateNodeTemplateJSONRequestBody{
Name: lo.ToPtr(d.Get(FieldNodeTemplateName).(string)),
IsDefault: lo.ToPtr(d.Get(FieldNodeTemplateIsDefault).(bool)),
Expand Down Expand Up @@ -723,6 +734,29 @@ func resourceNodeTemplateCreate(ctx context.Context, d *schema.ResourceData, met
return resourceNodeTemplateRead(ctx, d, meta)
}

// updateDefaultNodeTemplate handles "create" for the default node template by
// updating it in place instead of creating it.
//
// The resource ID is set to the template name up front, then updateNodeTemplate
// is invoked with the change check skipped. The call is retried while its first
// error diagnostic reports "node template not found"; any other error
// diagnostic aborts immediately. Retrying is bounded by the resource's create
// timeout minus 5 seconds.
//
// Returns nil on success, or diagnostics built from the error returned by the
// retry loop.
func updateDefaultNodeTemplate(ctx context.Context, d *schema.ResourceData, meta any) diag.Diagnostics {
	d.SetId(d.Get(FieldNodeTemplateName).(string))
	// make timeout 5 seconds less than the creation timeout
	timeout := d.Timeout(schema.TimeoutCreate) - 5*time.Second
	// handle situation when default node template is not created yet by autoscaler policy
	err := retry.RetryContext(ctx, timeout, func() *retry.RetryError {
		diagnostics := updateNodeTemplate(ctx, d, meta, true)

		// Only the first error diagnostic decides the outcome. The loop variable
		// is deliberately not named d so it does not shadow the resource data.
		for _, diagnostic := range diagnostics {
			if diagnostic.Severity != diag.Error {
				continue
			}
			// Pass the summary as an argument, never as the format string: a
			// literal '%' in the message would otherwise be mangled.
			summaryErr := fmt.Errorf("%s", diagnostic.Summary)
			if strings.Contains(diagnostic.Summary, "node template not found") {
				return retry.RetryableError(summaryErr)
			}
			return retry.NonRetryableError(summaryErr)
		}
		return nil
	})
	if err != nil {
		return diag.FromErr(err)
	}
	return nil
}

func getNodeTemplateByName(ctx context.Context, data *schema.ResourceData, meta any, clusterID string) (*sdk.NodetemplatesV1NodeTemplate, error) {
client := meta.(*ProviderConfig).api
nodeTemplateName := data.Id()
Expand Down
Loading

0 comments on commit cabfd2d

Please sign in to comment.