internal/retry/retry.go (83 lines of code) (raw):

// Copyright 2024 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package retry implements retry logic helpers to execute arbitrary functions // with defined policy. package retry import ( "context" "fmt" "math" "time" "github.com/GoogleCloudPlatform/galog" ) const ( // DefaultMaximumBackoff is the maximum backoff time between retries. Backoff // will be capped to this value if not set in the policy. DefaultMaximumBackoff = time.Minute * 20 ) // IsRetriable is method signature for implementing to override default logic of // retrying each error. type IsRetriable func(error) bool // Policy represents the struct to configure the retry behavior. type Policy struct { // MaxAttempts represents the maximum number of retry attempts. If set to 0 // then retry will be infinite. MaxAttempts int // BackoffFactor is the multiplier by which retry interval (Jitter) increases // after each retry. For constant backoff set Backoff factor to 1. BackoffFactor float64 // Jitter is the interval before the first retry. Jitter time.Duration // ShouldRetry is optional and the way to override default retry logic of // retry every error. If ShouldRetry is not provided/implemented every error // will be retried until all attempts are exhausted. ShouldRetry IsRetriable // MaximumBackoff is the maximum backoff time between retries. Backoff will be // capped to this value. If not set, then backoff will keep increasing based // on BackoffFactor and Jitter until [DefaultMaximumBackoff] to prevent // overflow. MaximumBackoff time.Duration } // backoff computes interval between retries. Interval is // jitter*(backoffFactor^attempt). For e.g. if jitter was set to 10 and factor // was 3, backoff between attempts would be [10, 30, 90, 270...]. func backoff(attempt int, policy Policy) time.Duration { maxBackoff := policy.MaximumBackoff if maxBackoff == 0 { maxBackoff = DefaultMaximumBackoff } if attempt < 0 { galog.V(2).Debugf("Attempt is negative [%d] probably due to overflow, using max backoff [%v] instead", attempt, maxBackoff) return maxBackoff } b := policy.Jitter.Seconds() * math.Pow(policy.BackoffFactor, float64(attempt)) // Check for overflow BEFORE converting to time.Duration if b > maxBackoff.Seconds() { galog.V(2).Debugf("Computed backoff [%v] exceeds max backoff [%v], using max backoff instead", b, maxBackoff) return maxBackoff } d := time.Duration(b * float64(time.Second)) return d } // isRetriable checks if error is retriable. If ShouldRetry is unimplemented // it always returns true, otherwise overridden method's logic determines the // retry behavior. func isRetriable(policy Policy, err error) bool { if policy.ShouldRetry == nil { return true } return policy.ShouldRetry(err) } // RunWithResponse executes and retries the function on failure based on policy // defined and returns response on success. func RunWithResponse[T any](ctx context.Context, policy Policy, f func() (T, error)) (T, error) { var ( res T err error ) if f == nil { return res, fmt.Errorf("retry function cannot be nil") } for attempt := 0; ; attempt++ { if res, err = f(); err == nil { return res, nil } if !isRetriable(policy, err) { return res, fmt.Errorf("giving up, retry policy returned false on error: %w", err) } galog.Debugf("Attempt %d failed with error %+v", attempt, err) // Return early, no need to wait if all retries have exhausted. if attempt+1 == policy.MaxAttempts { return res, fmt.Errorf("exhausted all (%d) retries, last error: %w", policy.MaxAttempts, err) } select { case <-ctx.Done(): return res, ctx.Err() case <-time.After(backoff(attempt, policy)): // Verify if context is still active. If not, return early. select { case <-ctx.Done(): return res, ctx.Err() default: // Timeout, continue retrying. } } } } // Run executes and retries the function on failure based on policy defined and // returns nil-error on success. func Run(ctx context.Context, policy Policy, f func() error) error { if f == nil { return fmt.Errorf("retry function cannot be nil") } fn := func() (any, error) { return nil, f() } _, err := RunWithResponse(ctx, policy, fn) return err }