Skip to content

Commit

Permalink
Adding config for retry read-stalled-req (#2575)
Browse files Browse the repository at this point in the history
* Adding config for read-stalled-req

* minor doc change

* review comments

* Added unit and composite test

* minor formatting changes

* fixing format issue
  • Loading branch information
raj-prince authored Oct 11, 2024
1 parent 3207fba commit 766dec9
Show file tree
Hide file tree
Showing 10 changed files with 311 additions and 0 deletions.
76 changes: 76 additions & 0 deletions cfg/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,8 @@ type GcsRetriesConfig struct {
MaxRetrySleep time.Duration `yaml:"max-retry-sleep"`

Multiplier float64 `yaml:"multiplier"`

ReadStall ReadStallGcsRetriesConfig `yaml:"read-stall"`
}

type ListConfig struct {
Expand Down Expand Up @@ -209,6 +211,20 @@ type MonitoringConfig struct {
ExperimentalTracingSamplingRatio float64 `yaml:"experimental-tracing-sampling-ratio"`
}

type ReadStallGcsRetriesConfig struct {
Enable bool `yaml:"enable"`

InitialReqTimeout time.Duration `yaml:"initial-req-timeout"`

MaxReqTimeout time.Duration `yaml:"max-req-timeout"`

MinReqTimeout time.Duration `yaml:"min-req-timeout"`

ReqIncreaseRate float64 `yaml:"req-increase-rate"`

ReqTargetPercentile float64 `yaml:"req-target-percentile"`
}

type WriteConfig struct {
BlockSizeMb int64 `yaml:"block-size-mb"`

Expand Down Expand Up @@ -297,6 +313,12 @@ func BuildFlagSet(flagSet *pflag.FlagSet) error {

flagSet.BoolP("enable-nonexistent-type-cache", "", false, "Once set, if an inode is not found in GCS, a type cache entry with type NonexistentType will be created. This also means new file/dir created might not be seen. For example, if this flag is set, and metadata-cache-ttl-secs is set, then if we create the same file/node in the meantime using the same mount, since we are not refreshing the cache, it will still return nil.")

flagSet.BoolP("enable-read-stall-retry", "", false, "To turn on/off retries for stalled read requests. This is based on a timeout that changes depending on how long similar requests took in the past.")

if err := flagSet.MarkHidden("enable-read-stall-retry"); err != nil {
return err
}

flagSet.BoolP("experimental-enable-json-read", "", false, "By default, GCSFuse uses the GCS XML API to get and read objects. When this flag is specified, GCSFuse uses the GCS JSON API instead.\"")

if err := flagSet.MarkDeprecated("experimental-enable-json-read", "Experimental flag: could be dropped even in a minor release."); err != nil {
Expand Down Expand Up @@ -427,6 +449,36 @@ func BuildFlagSet(flagSet *pflag.FlagSet) error {
return err
}

flagSet.DurationP("read-stall-initial-req-timeout", "", 20000000000*time.Nanosecond, "Initial value of the read-request dynamic timeout.")

if err := flagSet.MarkHidden("read-stall-initial-req-timeout"); err != nil {
return err
}

flagSet.DurationP("read-stall-max-req-timeout", "", 1200000000000*time.Nanosecond, "Upper bound of the read-request dynamic timeout.")

if err := flagSet.MarkHidden("read-stall-max-req-timeout"); err != nil {
return err
}

flagSet.DurationP("read-stall-min-req-timeout", "", 500000000*time.Nanosecond, "Lower bound of the read request dynamic timeout.")

if err := flagSet.MarkHidden("read-stall-min-req-timeout"); err != nil {
return err
}

flagSet.Float64P("read-stall-req-increase-rate", "", 15, "Determines how many increase calls it takes for dynamic timeout to double.")

if err := flagSet.MarkHidden("read-stall-req-increase-rate"); err != nil {
return err
}

flagSet.Float64P("read-stall-req-target-percentile", "", 0.99, "Retry the request which take more than p(targetPercentile * 100) of past similar request.")

if err := flagSet.MarkHidden("read-stall-req-target-percentile"); err != nil {
return err
}

flagSet.IntP("rename-dir-limit", "", 0, "Allow rename a directory containing fewer descendants than this limit.")

flagSet.Float64P("retry-multiplier", "", 2, "Param for exponential backoff algorithm, which is used to increase waiting time b/w two consecutive retries.")
Expand Down Expand Up @@ -552,6 +604,10 @@ func BindFlags(v *viper.Viper, flagSet *pflag.FlagSet) error {
return err
}

if err := v.BindPFlag("gcs-retries.read-stall.enable", flagSet.Lookup("enable-read-stall-retry")); err != nil {
return err
}

if err := v.BindPFlag("gcs-connection.experimental-enable-json-read", flagSet.Lookup("experimental-enable-json-read")); err != nil {
return err
}
Expand Down Expand Up @@ -712,6 +768,26 @@ func BindFlags(v *viper.Viper, flagSet *pflag.FlagSet) error {
return err
}

if err := v.BindPFlag("gcs-retries.read-stall.initial-req-timeout", flagSet.Lookup("read-stall-initial-req-timeout")); err != nil {
return err
}

if err := v.BindPFlag("gcs-retries.read-stall.max-req-timeout", flagSet.Lookup("read-stall-max-req-timeout")); err != nil {
return err
}

if err := v.BindPFlag("gcs-retries.read-stall.min-req-timeout", flagSet.Lookup("read-stall-min-req-timeout")); err != nil {
return err
}

if err := v.BindPFlag("gcs-retries.read-stall.req-increase-rate", flagSet.Lookup("read-stall-req-increase-rate")); err != nil {
return err
}

if err := v.BindPFlag("gcs-retries.read-stall.req-target-percentile", flagSet.Lookup("read-stall-req-target-percentile")); err != nil {
return err
}

if err := v.BindPFlag("file-system.rename-dir-limit", flagSet.Lookup("rename-dir-limit")); err != nil {
return err
}
Expand Down
44 changes: 44 additions & 0 deletions cfg/params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,50 @@
usage: Param for exponential backoff algorithm, which is used to increase waiting time b/w two consecutive retries.
default: 2

- config-path: "gcs-retries.read-stall.enable"
flag-name: "enable-read-stall-retry"
type: "bool"
usage: >-
To turn on/off retries for stalled read requests. This is based on a timeout
that changes depending on how long similar requests took in the past.
default: false
hide-flag: true

- config-path: "gcs-retries.read-stall.initial-req-timeout"
flag-name: "read-stall-initial-req-timeout"
type: "duration"
usage: Initial value of the read-request dynamic timeout.
default: 20s
hide-flag: true

- config-path: "gcs-retries.read-stall.max-req-timeout"
flag-name: "read-stall-max-req-timeout"
type: "duration"
usage: Upper bound of the read-request dynamic timeout.
default: 20m
hide-flag: true

- config-path: "gcs-retries.read-stall.min-req-timeout"
flag-name: "read-stall-min-req-timeout"
type: "duration"
usage: Lower bound of the read request dynamic timeout.
default: 500ms
hide-flag: true

- config-path: "gcs-retries.read-stall.req-increase-rate"
flag-name: "read-stall-req-increase-rate"
type: "float64"
usage: Determines how many increase calls it takes for dynamic timeout to double.
default: 15
hide-flag: true

- config-path: "gcs-retries.read-stall.req-target-percentile"
flag-name: "read-stall-req-target-percentile"
type: "float64"
usage: Retry the request which take more than p(targetPercentile * 100) of past similar request.
default: 0.99
hide-flag: true

- config-path: "implicit-dirs"
flag-name: "implicit-dirs"
type: "bool"
Expand Down
19 changes: 19 additions & 0 deletions cfg/validate.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,21 @@ func isValidWriteStreamingConfig(wc *WriteConfig) error {
return nil
}

func isValidReadStallGcsRetriesConfig(rsrc *ReadStallGcsRetriesConfig) error {
if rsrc == nil {
return nil
}
if rsrc.Enable {
if rsrc.ReqIncreaseRate <= 0 {
return fmt.Errorf("invalid value of increaseRate: %f, can't be 0 or negative", rsrc.ReqIncreaseRate)
}
if rsrc.ReqTargetPercentile <= 0 || rsrc.ReqTargetPercentile >= 1 {
return fmt.Errorf("invalid value of targetPercentile: %f, should be in the range (0, 1)", rsrc.ReqTargetPercentile)
}
}
return nil
}

// ValidateConfig returns a non-nil error if the config is invalid.
func ValidateConfig(v isSet, config *Config) error {
var err error
Expand Down Expand Up @@ -198,5 +213,9 @@ func ValidateConfig(v isSet, config *Config) error {
return fmt.Errorf("error parsing write config: %w", err)
}

if err = isValidReadStallGcsRetriesConfig(&config.GcsRetries.ReadStall); err != nil {
return fmt.Errorf("error parsing read-stall-gcs-retries config: %w", err)
}

return nil
}
64 changes: 64 additions & 0 deletions cfg/validate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,70 @@ func TestValidateConfig_ErrorScenarios(t *testing.T) {
FileSystem: FileSystemConfig{KernelListCacheTtlSecs: 88888888888888888},
},
},
{
name: "read_stall_req_increase_rate_negative",
config: &Config{
Logging: LoggingConfig{LogRotate: validLogRotateConfig()},
FileCache: validFileCacheConfig(t),
MetadataCache: MetadataCacheConfig{
ExperimentalMetadataPrefetchOnMount: "sync",
},
GcsRetries: GcsRetriesConfig{
ReadStall: ReadStallGcsRetriesConfig{
Enable: true,
ReqIncreaseRate: -1,
},
},
},
},
{
name: "read_stall_req_increase_rate_zero",
config: &Config{
Logging: LoggingConfig{LogRotate: validLogRotateConfig()},
FileCache: validFileCacheConfig(t),
MetadataCache: MetadataCacheConfig{
ExperimentalMetadataPrefetchOnMount: "sync",
},
GcsRetries: GcsRetriesConfig{
ReadStall: ReadStallGcsRetriesConfig{
Enable: true,
ReqIncreaseRate: 0,
},
},
},
},
{
name: "read_stall_req_target_percentile_large",
config: &Config{
Logging: LoggingConfig{LogRotate: validLogRotateConfig()},
FileCache: validFileCacheConfig(t),
MetadataCache: MetadataCacheConfig{
ExperimentalMetadataPrefetchOnMount: "sync",
},
GcsRetries: GcsRetriesConfig{
ReadStall: ReadStallGcsRetriesConfig{
Enable: true,
ReqTargetPercentile: 4,
},
},
},
},
{
name: "read_stall_req_target_percentile_negative",
config: &Config{
Logging: LoggingConfig{LogRotate: validLogRotateConfig()},
FileCache: validFileCacheConfig(t),
MetadataCache: MetadataCacheConfig{
ExperimentalMetadataPrefetchOnMount: "sync",
},
GcsRetries: GcsRetriesConfig{
ReadStall: ReadStallGcsRetriesConfig{
Enable: true,
ReqTargetPercentile: -3,
},
},
},
},
}

for _, tc := range testCases {
Expand Down
68 changes: 68 additions & 0 deletions cmd/config_validation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,22 @@ func TestValidateConfigFile_InvalidConfigThrowsError(t *testing.T) {
name: "small_max_blocks_per_file",
configFile: "testdata/write_config/invalid_write_config_due_to_small_max_blocks_per_file.yaml",
},
{
name: "negative req_increase_rate",
configFile: "testdata/gcs_retries/read_stall/invalid_req_increase_rate_negative.yaml",
},
{
name: "zero req_increase_rate",
configFile: "testdata/gcs_retries/read_stall/invalid_req_increase_rate_zero.yaml",
},
{
name: "large req_target_percentile",
configFile: "testdata/gcs_retries/read_stall/invalid_req_target_percentile_large.yaml",
},
{
name: "negative req_target_percentile",
configFile: "testdata/gcs_retries/read_stall/invalid_req_target_percentile_negative.yaml",
},
}

for _, tc := range testCases {
Expand Down Expand Up @@ -670,3 +686,55 @@ func TestValidateConfigFile_MetadataCacheConfigSuccessful(t *testing.T) {
})
}
}

func TestValidateConfigFile_ReadStallConfigSuccessful(t *testing.T) {
testCases := []struct {
name string
configFile string
expectedConfig *cfg.Config
}{
{
// Test default values.
name: "empty_config_file",
configFile: "testdata/empty_file.yaml",
expectedConfig: &cfg.Config{
GcsRetries: cfg.GcsRetriesConfig{
ReadStall: cfg.ReadStallGcsRetriesConfig{
Enable: false,
MinReqTimeout: 500 * time.Millisecond,
MaxReqTimeout: 1200 * time.Second,
InitialReqTimeout: 20 * time.Second,
ReqTargetPercentile: 0.99,
ReqIncreaseRate: 15,
},
},
},
},
{
name: "valid_config_file",
configFile: "testdata/valid_config.yaml",
expectedConfig: &cfg.Config{
GcsRetries: cfg.GcsRetriesConfig{
ReadStall: cfg.ReadStallGcsRetriesConfig{
Enable: true,
MinReqTimeout: 10 * time.Second,
MaxReqTimeout: 200 * time.Second,
InitialReqTimeout: 20 * time.Second,
ReqTargetPercentile: 0.99,
ReqIncreaseRate: 15,
},
},
},
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
gotConfig, err := getConfigObjectWithConfigFile(t, tc.configFile)

if assert.NoError(t, err) {
assert.EqualValues(t, tc.expectedConfig.GcsRetries.ReadStall, gotConfig.GcsRetries.ReadStall)
}
})
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
gcs-retries:
read-stall:
enable:true
min-req-timeout:10s
max-req-timeout:20s
initial-req-timeout:30s
req-increase-rate:-15
req-target-percentile:0.99
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
gcs-retries:
read-stall:
enable:true
min-req-timeout:10s
max-req-timeout:20s
initial-req-timeout:30s
req-increase-rate:0
req-target-percentile:0.99
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
gcs-retries:
read-stall:
enable:true
min-req-timeout:10s
max-req-timeout:20s
initial-req-timeout:30s
req-increase-rate:15
req-target-percentile:4
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
gcs-retries:
read-stall:
enable:true
min-req-timeout:10s
max-req-timeout:20s
initial-req-timeout:30s
req-increase-rate:15
req-target-percentile:-2
Loading

0 comments on commit 766dec9

Please sign in to comment.