Skip to content

Commit

Permalink
[Fix] Retry cluster update on "INVALID_STATE" (#3890)
Browse files Browse the repository at this point in the history
## Changes
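Clusters can only be updated while they are in the Running or Terminated state.
This causes Terraform to fail to update autoscaling clusters while a resize is
in progress. With this change, the cluster edit call is retried for up to
15 minutes whenever the API responds with the `INVALID_STATE` error code.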

## Tests

- [X] `make test` run locally
- [ ] relevant change in `docs/` folder
- [ ] covered with integration tests in `internal/acceptance`
- [ ] relevant acceptance tests are passing
- [X] using Go SDK
  • Loading branch information
hectorcast-db authored Aug 13, 2024
1 parent 9490aa8 commit 81be591
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 1 deletion.
19 changes: 18 additions & 1 deletion clusters/resource_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@ package clusters

import (
"context"
"errors"
"fmt"
"log"
"strings"
"time"

"github.com/hashicorp/go-cty/cty"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/retry"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/validation"

"github.com/databricks/databricks-sdk-go/apierr"
"github.com/databricks/databricks-sdk-go/service/compute"
"github.com/databricks/terraform-provider-databricks/common"
"github.com/databricks/terraform-provider-databricks/libraries"
Expand Down Expand Up @@ -604,7 +607,21 @@ func resourceClusterUpdate(ctx context.Context, d *schema.ResourceData, c *commo
return err
}
cluster.ForceSendFields = []string{"NumWorkers"}
_, err = clusters.Edit(ctx, cluster)

err = retry.RetryContext(ctx, 15*time.Minute, func() *retry.RetryError {
_, err = clusters.Edit(ctx, cluster)
if err == nil {
return nil
}
var apiErr *apierr.APIError
// Only Running and Terminated clusters can be modified. In particular, autoscaling clusters cannot be modified
// while the resizing is ongoing. We retry in this case. Scaling can take several minutes.
if errors.As(err, &apiErr) && apiErr.ErrorCode == "INVALID_STATE" {
return retry.RetryableError(fmt.Errorf("cluster %s cannot be modified in its current state", clusterId))
}
return retry.NonRetryableError(err)
})

}
if err != nil {
return err
Expand Down
96 changes: 96 additions & 0 deletions clusters/resource_cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -965,6 +965,102 @@ func TestResourceClusterUpdate(t *testing.T) {
assert.Equal(t, "abc", d.Id(), "Id should be the same as in reading")
}

func TestResourceClusterUpdate_WhileScaling(t *testing.T) {
d, err := qa.ResourceFixture{
Fixtures: []qa.HTTPFixture{
{
Method: "GET",
Resource: "/api/2.1/clusters/get?cluster_id=abc",
ReuseRequest: true,
Response: compute.ClusterDetails{
ClusterId: "abc",
NumWorkers: 100,
ClusterName: "Shared Autoscaling",
SparkVersion: "7.1-scala12",
NodeTypeId: "i3.xlarge",
AutoterminationMinutes: 15,
State: compute.StateRunning,
},
},
{
Method: "POST",
Resource: "/api/2.1/clusters/events",
ExpectedRequest: compute.GetEvents{
ClusterId: "abc",
Limit: 1,
Order: compute.GetEventsOrderDesc,
EventTypes: []compute.EventType{compute.EventTypePinned, compute.EventTypeUnpinned},
},
Response: compute.GetEventsResponse{
Events: []compute.ClusterEvent{},
TotalCount: 0,
},
},
{
Method: "POST",
Resource: "/api/2.1/clusters/start",
ExpectedRequest: compute.StartCluster{
ClusterId: "abc",
},
},
{
Method: "GET",
Resource: "/api/2.0/libraries/cluster-status?cluster_id=abc",
Response: compute.ClusterLibraryStatuses{
LibraryStatuses: []compute.LibraryFullStatus{},
},
},
{
Method: "POST",
Resource: "/api/2.1/clusters/edit",
ExpectedRequest: compute.ClusterDetails{
AutoterminationMinutes: 15,
ClusterId: "abc",
NumWorkers: 100,
ClusterName: "Shared Autoscaling",
SparkVersion: "7.1-scala12",
NodeTypeId: "i3.xlarge",
},
Response: common.APIErrorBody{
ErrorCode: "INVALID_STATE",
},
Status: 404,
},
{
Method: "POST",
Resource: "/api/2.1/clusters/edit",
ExpectedRequest: compute.ClusterDetails{
AutoterminationMinutes: 15,
ClusterId: "abc",
NumWorkers: 100,
ClusterName: "Shared Autoscaling",
SparkVersion: "7.1-scala12",
NodeTypeId: "i3.xlarge",
},
},
{
Method: "GET",
Resource: "/api/2.0/libraries/cluster-status?cluster_id=abc",
Response: compute.ClusterLibraryStatuses{
LibraryStatuses: []compute.LibraryFullStatus{},
},
},
},
ID: "abc",
Update: true,
Resource: ResourceCluster(),
State: map[string]any{
"autotermination_minutes": 15,
"cluster_name": "Shared Autoscaling",
"spark_version": "7.1-scala12",
"node_type_id": "i3.xlarge",
"num_workers": 100,
},
}.Apply(t)
assert.NoError(t, err)
assert.Equal(t, "abc", d.Id(), "Id should be the same as in reading")
}

func TestResourceClusterUpdateWithPinned(t *testing.T) {
d, err := qa.ResourceFixture{
Fixtures: []qa.HTTPFixture{
Expand Down

0 comments on commit 81be591

Please sign in to comment.