From f658d1b8b960a4e70388d8364c2eba1ed2c85c00 Mon Sep 17 00:00:00 2001 From: Rael Garcia Date: Fri, 19 Jun 2026 11:48:52 +0000 Subject: [PATCH 1/5] fix(prow-job-executor): retry Key Vault prow-token lookup on transient failures The prow-token Key Vault lookup used by EV2 gating had no retry. A transient managed-identity/IMDS failure (EOF reading 169.254.169.254 while minting the token) or a Key Vault 429/5xx failed the whole EV2 gating step, forcing the entire deployment job to restart. Wrap lookupProwTokenInKeyVault in an exponential-backoff retry mirroring the existing SubmitJob pattern. Transient errors (credential/IMDS, network, Key Vault 429/5xx) are retried; permanent Key Vault 4xx (401/403/404/400/409) fail fast. Retries are logged and bounded by the parent context. AROSLSRE-1228 --- tools/prow-job-executor/go.mod | 2 +- tools/prow-job-executor/prowtoken.go | 99 +++++++++++++ tools/prow-job-executor/prowtoken_test.go | 164 ++++++++++++++++++++++ 3 files changed, 264 insertions(+), 1 deletion(-) create mode 100644 tools/prow-job-executor/prowtoken_test.go diff --git a/tools/prow-job-executor/go.mod b/tools/prow-job-executor/go.mod index 591f9959..4158c716 100644 --- a/tools/prow-job-executor/go.mod +++ b/tools/prow-job-executor/go.mod @@ -4,6 +4,7 @@ go 1.25.5 require ( github.com/Azure/ARO-Tools/tools/cmdutils v0.0.0-20260227032723-11f678744bf9 + github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.0 github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azsecrets v1.4.0 github.com/go-logr/logr v1.4.3 github.com/google/uuid v1.6.0 @@ -25,7 +26,6 @@ require ( cloud.google.com/go/trace v1.11.5 // indirect contrib.go.opencensus.io/exporter/ocagent v0.7.1-0.20200907061046-05415f1de66d // indirect contrib.go.opencensus.io/exporter/prometheus v0.4.2 // indirect - github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/internal v1.2.0 // indirect diff --git a/tools/prow-job-executor/prowtoken.go b/tools/prow-job-executor/prowtoken.go index a451512e..f649206b 100644 --- a/tools/prow-job-executor/prowtoken.go +++ b/tools/prow-job-executor/prowtoken.go @@ -16,14 +16,44 @@ package prowjobexecutor import ( "context" + "errors" "fmt" + "net/http" + "time" + "github.com/go-logr/logr" "github.com/spf13/cobra" + "k8s.io/apimachinery/pkg/util/wait" + "github.com/Azure/ARO-Tools/tools/cmdutils" + "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azsecrets" ) +// prowTokenLookupBackoff controls retries of the Key Vault prow-token lookup. +// +// The lookup can fail with transient errors that are unrelated to configuration: +// the managed-identity token acquisition reads the instance metadata endpoint +// (169.254.169.254), which intermittently returns connection resets / EOFs on EV2 +// runners, and Key Vault itself can return 429/5xx. Without retries such a momentary +// blip fails the whole EV2 gating step and forces the entire deployment job to be +// restarted from scratch. +// +// Delays between attempts grow 30s, 1, 2, 4, 8, 16 minutes over up to 6 attempts, +// for a worst-case cumulative wait of ~31 minutes before giving up. The parent +// context still bounds the total runtime. +// +// Note: apimachinery's Backoff.Cap not only clamps an individual delay but also +// stops all further retries once the cap is reached, so the schedule is deliberately +// bounded by Steps rather than Cap. +var prowTokenLookupBackoff = wait.Backoff{ + Duration: 30 * time.Second, // Initial delay + Factor: 2.0, // Exponential factor + Jitter: 0.1, // 10% jitter to de-sync concurrent runners + Steps: 6, // Maximum attempts (~31m worst-case cumulative wait) +} + func NewDefaultRawProwTokenOptions() *RawProwTokenOptions { return &RawProwTokenOptions{} } @@ -89,7 +119,76 @@ func (o *validatedProwTokenOptions) Complete(ctx context.Context) (*ProwTokenOpt }, nil } +// lookupProwTokenInKeyVault fetches the prow token from Key Vault, retrying on +// transient failures (managed-identity/IMDS or network blips and Key Vault 429/5xx) +// with exponential backoff. Permanent failures (Key Vault 4xx other than 429, e.g. +// 401/403/404) fail fast. func lookupProwTokenInKeyVault(ctx context.Context, keyVaultURI string, secretName string) (string, error) { + return retryProwTokenLookup(ctx, prowTokenLookupBackoff, func(ctx context.Context) (string, error) { + return lookupProwTokenInKeyVaultOnce(ctx, keyVaultURI, secretName) + }) +} + +// retryProwTokenLookup runs fetch with exponential backoff, retrying only +// transient errors as classified by isRetryableKeyVaultError. The fetch callback is +// injectable so the retry behavior can be unit tested without a live Key Vault. +func retryProwTokenLookup(ctx context.Context, backoff wait.Backoff, fetch func(ctx context.Context) (string, error)) (string, error) { + logger, err := logr.FromContext(ctx) + if err != nil { + logger = logr.Discard() + } + + var token string + var lastErr error + condition := func(ctx context.Context) (bool, error) { + t, err := fetch(ctx) + if err != nil { + lastErr = err + // Only retry known-transient errors. Permanent failures (missing/forbidden + // secret, bad request) surface immediately instead of after a long backoff. + if !isRetryableKeyVaultError(err) { + return false, err // Stop retrying and propagate the error + } + + logger.Info("Prow token lookup failed with a transient error, will retry", "error", err.Error()) + return false, nil + } + + token = t + return true, nil // Success, stop retrying + } + + if err := wait.ExponentialBackoffWithContext(ctx, backoff, condition); err != nil { + if lastErr != nil { + return "", fmt.Errorf("failed to look up prow token after retries: %w", lastErr) + } + return "", err + } + + return token, nil +} + +// isRetryableKeyVaultError reports whether a prow-token lookup error is transient +// and worth retrying. Key Vault HTTP responses are retried only on 429 and 5xx; +// other 4xx (401/403/404/400/409) are permanent and fail fast. Any error without an +// HTTP response status is a credential-acquisition or transport failure (e.g. an +// IMDS connection reset/EOF while minting the managed-identity token) and is treated +// as transient. +func isRetryableKeyVaultError(err error) bool { + var respErr *azcore.ResponseError + if errors.As(err, &respErr) { + code := respErr.StatusCode + return code == http.StatusTooManyRequests || (code >= 500 && code <= 599) + } + + // No HTTP response status: credential acquisition (IMDS/managed identity) or + // transport failure. These are transient on EV2 runners; retry. + return true +} + +// lookupProwTokenInKeyVaultOnce performs a single Key Vault secret lookup without +// retry logic. +func lookupProwTokenInKeyVaultOnce(ctx context.Context, keyVaultURI string, secretName string) (string, error) { // Get Azure credentials using ARO-Tools cred, err := cmdutils.GetAzureTokenCredentials() if err != nil { diff --git a/tools/prow-job-executor/prowtoken_test.go b/tools/prow-job-executor/prowtoken_test.go new file mode 100644 index 00000000..5bc8cab5 --- /dev/null +++ b/tools/prow-job-executor/prowtoken_test.go @@ -0,0 +1,164 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package prowjobexecutor + +import ( + "context" + "errors" + "fmt" + "net/http" + "testing" + "time" + + "github.com/go-logr/logr" + + "k8s.io/apimachinery/pkg/util/wait" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore" +) + +// fastBackoff returns a backoff with negligible delays so retry tests run quickly +// while still exercising a deterministic number of attempts. No Cap is set so the +// attempt count is exactly Steps (apimachinery's Cap would also halt retries early). +func fastBackoff(steps int) wait.Backoff { + return wait.Backoff{ + Duration: time.Millisecond, + Factor: 2.0, + Jitter: 0.0, + Steps: steps, + } +} + +func testContext() context.Context { + return logr.NewContext(context.Background(), logr.Discard()) +} + +func responseError(statusCode int) error { + return &azcore.ResponseError{StatusCode: statusCode} +} + +func TestIsRetryableKeyVaultError(t *testing.T) { + tests := []struct { + name string + err error + want bool + }{ + {name: "key vault 429 is retryable", err: responseError(http.StatusTooManyRequests), want: true}, + {name: "key vault 500 is retryable", err: responseError(http.StatusInternalServerError), want: true}, + {name: "key vault 503 is retryable", err: responseError(http.StatusServiceUnavailable), want: true}, + {name: "key vault 401 is not retryable", err: responseError(http.StatusUnauthorized), want: false}, + {name: "key vault 403 is not retryable", err: responseError(http.StatusForbidden), want: false}, + {name: "key vault 404 is not retryable", err: responseError(http.StatusNotFound), want: false}, + {name: "key vault 400 is not retryable", err: responseError(http.StatusBadRequest), want: false}, + {name: "wrapped key vault 403 is not retryable", err: fmt.Errorf("get secret: %w", responseError(http.StatusForbidden)), want: false}, + {name: "wrapped key vault 503 is retryable", err: fmt.Errorf("get secret: %w", responseError(http.StatusServiceUnavailable)), want: true}, + {name: "imds eof credential error is retryable", err: errors.New("ManagedIdentityCredential: Get \"http://169.254.169.254/metadata/identity/oauth2/token\": EOF"), want: true}, + {name: "generic non-response error is retryable", err: errors.New("boom"), want: true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := isRetryableKeyVaultError(tt.err); got != tt.want { + t.Fatalf("isRetryableKeyVaultError(%v) = %v, want %v", tt.err, got, tt.want) + } + }) + } +} + +func TestRetryProwTokenLookup(t *testing.T) { + tests := []struct { + name string + // errs returned in order before "success"; an entry of nil means success. + errs []error + wantToken string + wantErr bool + wantCalls int + }{ + { + name: "succeeds on first attempt", + errs: []error{nil}, + wantToken: "secret-value", + wantCalls: 1, + }, + { + name: "retries transient imds eof then succeeds", + errs: []error{errors.New("ManagedIdentityCredential: ...: EOF"), nil}, + wantToken: "secret-value", + wantCalls: 2, + }, + { + name: "retries key vault 503 then succeeds", + errs: []error{responseError(http.StatusServiceUnavailable), nil}, + wantToken: "secret-value", + wantCalls: 2, + }, + { + name: "permanent 403 fails fast without retry", + errs: []error{responseError(http.StatusForbidden)}, + wantErr: true, + wantCalls: 1, + }, + { + name: "permanent 404 fails fast without retry", + errs: []error{responseError(http.StatusNotFound)}, + wantErr: true, + wantCalls: 1, + }, + { + name: "persistent transient error exhausts retries", + errs: []error{responseError(http.StatusTooManyRequests)}, // repeats until steps exhausted + wantErr: true, + wantCalls: 4, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + calls := 0 + fetch := func(ctx context.Context) (string, error) { + idx := calls + calls++ + // Repeat the last error once the scripted slice is exhausted, to + // model a persistently-failing transient condition. + if idx >= len(tt.errs) { + idx = len(tt.errs) - 1 + } + if err := tt.errs[idx]; err != nil { + return "", err + } + return "secret-value", nil + } + + token, err := retryProwTokenLookup(testContext(), fastBackoff(4), fetch) + + if tt.wantErr { + if err == nil { + t.Fatalf("expected error, got nil (token %q)", token) + } + } else { + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if token != tt.wantToken { + t.Fatalf("token = %q, want %q", token, tt.wantToken) + } + } + + if calls != tt.wantCalls { + t.Fatalf("fetch called %d times, want %d", calls, tt.wantCalls) + } + }) + } +} From 434e42f850fe41d3d2cf38817ee17f38974d4489 Mon Sep 17 00:00:00 2001 From: Rael Garcia Date: Fri, 19 Jun 2026 13:24:26 +0000 Subject: [PATCH 2/5] fix(prow-job-executor): address review feedback on KV retry - bump backoff Steps 6->7 so the schedule matches the documented ~31.5m worst-case (six sleeps: 30s,1,2,4,8,16m), fixing a Steps/comment mismatch - in retryProwTokenLookup, return a cancelled/expired parent context error as-is and only wrap with "after retries" when retries were genuinely exhausted, so deadline/cancel errors are no longer masked - treat context.Canceled/DeadlineExceeded as non-retryable in isRetryableKeyVaultError to fail fast instead of looping - add tests for context-error classification and fail-fast-on-cancel --- tools/prow-job-executor/prowtoken.go | 32 ++++++++++++++++------- tools/prow-job-executor/prowtoken_test.go | 25 ++++++++++++++++++ 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/tools/prow-job-executor/prowtoken.go b/tools/prow-job-executor/prowtoken.go index f649206b..0c9229af 100644 --- a/tools/prow-job-executor/prowtoken.go +++ b/tools/prow-job-executor/prowtoken.go @@ -40,8 +40,8 @@ import ( // blip fails the whole EV2 gating step and forces the entire deployment job to be // restarted from scratch. // -// Delays between attempts grow 30s, 1, 2, 4, 8, 16 minutes over up to 6 attempts, -// for a worst-case cumulative wait of ~31 minutes before giving up. The parent +// Delays between attempts grow 30s, 1, 2, 4, 8, 16 minutes over up to 7 attempts, +// for a worst-case cumulative wait of ~31.5 minutes before giving up. The parent // context still bounds the total runtime. // // Note: apimachinery's Backoff.Cap not only clamps an individual delay but also @@ -51,7 +51,7 @@ var prowTokenLookupBackoff = wait.Backoff{ Duration: 30 * time.Second, // Initial delay Factor: 2.0, // Exponential factor Jitter: 0.1, // 10% jitter to de-sync concurrent runners - Steps: 6, // Maximum attempts (~31m worst-case cumulative wait) + Steps: 7, // Maximum attempts (~31.5m worst-case cumulative wait) } func NewDefaultRawProwTokenOptions() *RawProwTokenOptions { @@ -143,13 +143,13 @@ func retryProwTokenLookup(ctx context.Context, backoff wait.Backoff, fetch func( condition := func(ctx context.Context) (bool, error) { t, err := fetch(ctx) if err != nil { - lastErr = err // Only retry known-transient errors. Permanent failures (missing/forbidden // secret, bad request) surface immediately instead of after a long backoff. if !isRetryableKeyVaultError(err) { - return false, err // Stop retrying and propagate the error + return false, err // Stop retrying and propagate the error as-is } + lastErr = err logger.Info("Prow token lookup failed with a transient error, will retry", "error", err.Error()) return false, nil } @@ -159,9 +159,16 @@ func retryProwTokenLookup(ctx context.Context, backoff wait.Backoff, fetch func( } if err := wait.ExponentialBackoffWithContext(ctx, backoff, condition); err != nil { + // A cancelled/expired parent context takes precedence: report it as-is rather + // than masking it behind the last transient lookup error. + if ctxErr := ctx.Err(); ctxErr != nil { + return "", ctxErr + } + // Retries were exhausted: surface the last transient error for context. if lastErr != nil { return "", fmt.Errorf("failed to look up prow token after retries: %w", lastErr) } + // A permanent error returned by the condition propagates unchanged. return "", err } @@ -169,12 +176,17 @@ func retryProwTokenLookup(ctx context.Context, backoff wait.Backoff, fetch func( } // isRetryableKeyVaultError reports whether a prow-token lookup error is transient -// and worth retrying. Key Vault HTTP responses are retried only on 429 and 5xx; -// other 4xx (401/403/404/400/409) are permanent and fail fast. Any error without an -// HTTP response status is a credential-acquisition or transport failure (e.g. an -// IMDS connection reset/EOF while minting the managed-identity token) and is treated -// as transient. +// and worth retrying. A cancelled or expired context is never retryable. Key Vault +// HTTP responses are retried only on 429 and 5xx; other 4xx (401/403/404/400/409) +// are permanent and fail fast. Any error without an HTTP response status is a +// credential-acquisition or transport failure (e.g. an IMDS connection reset/EOF +// while minting the managed-identity token) and is treated as transient. func isRetryableKeyVaultError(err error) bool { + // A cancelled/expired parent context must fail fast, never retry. + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return false + } + var respErr *azcore.ResponseError if errors.As(err, &respErr) { code := respErr.StatusCode diff --git a/tools/prow-job-executor/prowtoken_test.go b/tools/prow-job-executor/prowtoken_test.go index 5bc8cab5..fcffe0ae 100644 --- a/tools/prow-job-executor/prowtoken_test.go +++ b/tools/prow-job-executor/prowtoken_test.go @@ -66,6 +66,9 @@ func TestIsRetryableKeyVaultError(t *testing.T) { {name: "wrapped key vault 503 is retryable", err: fmt.Errorf("get secret: %w", responseError(http.StatusServiceUnavailable)), want: true}, {name: "imds eof credential error is retryable", err: errors.New("ManagedIdentityCredential: Get \"http://169.254.169.254/metadata/identity/oauth2/token\": EOF"), want: true}, {name: "generic non-response error is retryable", err: errors.New("boom"), want: true}, + {name: "context canceled is not retryable", err: context.Canceled, want: false}, + {name: "context deadline exceeded is not retryable", err: context.DeadlineExceeded, want: false}, + {name: "wrapped context canceled is not retryable", err: fmt.Errorf("get secret: %w", context.Canceled), want: false}, } for _, tt := range tests { @@ -162,3 +165,25 @@ func TestRetryProwTokenLookup(t *testing.T) { }) } } + +// TestRetryProwTokenLookupContextCanceled verifies that a cancelled parent context +// fails fast (no retries) and the returned error is the context error itself, not a +// "...after retries" wrapper hiding it. +func TestRetryProwTokenLookupContextCanceled(t *testing.T) { + ctx, cancel := context.WithCancel(testContext()) + cancel() + + calls := 0 + fetch := func(ctx context.Context) (string, error) { + calls++ + return "", ctx.Err() + } + + _, err := retryProwTokenLookup(ctx, fastBackoff(4), fetch) + if err == nil { + t.Fatalf("expected error, got nil") + } + if !errors.Is(err, context.Canceled) { + t.Fatalf("error = %v, want it to wrap context.Canceled", err) + } +} From 94ac9f55e9c5b747ce4eb639ef3423e15c91b6fc Mon Sep 17 00:00:00 2001 From: Rael Garcia Date: Mon, 22 Jun 2026 14:00:13 +0000 Subject: [PATCH 3/5] refactor(prow-job-executor): extract generic retry helper; fail fast on permanent KV errors Three call sites (Gangway SubmitJob, GetJobStatus, Key Vault prow-token lookup) each reimplemented the same wait.Backoff retry loop. Extract a single generic helper retry.WithValue[T] and route all three through it. Also narrow the Key Vault classifier (review follow-up): deterministic local failures (unparseable Key Vault URI, empty secret value) are now marked errPermanentKeyVaultLookup and fail fast instead of consuming the full ~31-minute transient backoff budget. IMDS/credential and transport errors remain retryable. --- .../prow-job-executor/internal/retry/retry.go | 89 +++++++++++ .../internal/retry/retry_test.go | 142 ++++++++++++++++++ tools/prow-job-executor/prowjob/client.go | 69 ++------- tools/prow-job-executor/prowtoken.go | 77 +++------- tools/prow-job-executor/prowtoken_test.go | 13 +- 5 files changed, 278 insertions(+), 112 deletions(-) create mode 100644 tools/prow-job-executor/internal/retry/retry.go create mode 100644 tools/prow-job-executor/internal/retry/retry_test.go diff --git a/tools/prow-job-executor/internal/retry/retry.go b/tools/prow-job-executor/internal/retry/retry.go new file mode 100644 index 00000000..da10adf8 --- /dev/null +++ b/tools/prow-job-executor/internal/retry/retry.go @@ -0,0 +1,89 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package retry provides a small, generic exponential-backoff retry helper shared +// by the prow-job-executor's transient-failure paths (Gangway job submission, job +// status polling, and the Key Vault prow-token lookup). +package retry + +import ( + "context" + "fmt" + + "github.com/go-logr/logr" + + "k8s.io/apimachinery/pkg/util/wait" +) + +// WithValue invokes fn with exponential backoff and returns its value once it +// succeeds, retrying only errors that isRetryable classifies as transient. +// +// Behavior: +// - fn is called at least once. On success its value is returned immediately. +// - When fn returns an error that isRetryable reports as false (a permanent or +// deterministic failure), WithValue stops immediately and propagates that error +// unchanged, without consuming the remaining backoff budget. +// - When isRetryable reports true, the error is logged at info level (if a logr +// logger is present on ctx) and the call is retried until the backoff budget is +// exhausted, after which the last transient error is wrapped and returned. +// - A cancelled or expired parent context always takes precedence: its error is +// returned as-is rather than masked behind the last transient error. +// +// fn must respect ctx for cancellation. The parent context bounds the total runtime +// regardless of the backoff schedule. +func WithValue[T any](ctx context.Context, backoff wait.Backoff, isRetryable func(error) bool, fn func(ctx context.Context) (T, error)) (T, error) { + logger, err := logr.FromContext(ctx) + if err != nil { + logger = logr.Discard() + } + + var result T + var lastErr error + condition := func(ctx context.Context) (bool, error) { + v, err := fn(ctx) + if err != nil { + // Permanent/deterministic failures surface immediately instead of + // after a long backoff. + if !isRetryable(err) { + return false, err // Stop retrying and propagate the error as-is. + } + + lastErr = err + logger.Info("Operation failed with a transient error, will retry", "error", err.Error()) + return false, nil + } + + result = v + return true, nil // Success, stop retrying. + } + + if err := wait.ExponentialBackoffWithContext(ctx, backoff, condition); err != nil { + // A cancelled/expired parent context takes precedence: report it as-is + // rather than masking it behind the last transient error. + if ctxErr := ctx.Err(); ctxErr != nil { + var zero T + return zero, ctxErr + } + // Retries were exhausted: surface the last transient error for context. + if lastErr != nil { + var zero T + return zero, fmt.Errorf("retry budget exhausted after transient errors: %w", lastErr) + } + // A permanent error returned by the condition propagates unchanged. + var zero T + return zero, err + } + + return result, nil +} diff --git a/tools/prow-job-executor/internal/retry/retry_test.go b/tools/prow-job-executor/internal/retry/retry_test.go new file mode 100644 index 00000000..b2f00fdf --- /dev/null +++ b/tools/prow-job-executor/internal/retry/retry_test.go @@ -0,0 +1,142 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package retry + +import ( + "context" + "errors" + "strings" + "testing" + "time" + + "github.com/go-logr/logr" + + "k8s.io/apimachinery/pkg/util/wait" +) + +// fastBackoff returns a backoff with negligible delays so retry tests run quickly +// while still exercising a deterministic number of attempts. No Cap is set so the +// attempt count is exactly Steps (apimachinery's Cap would also halt retries early). +func fastBackoff(steps int) wait.Backoff { + return wait.Backoff{ + Duration: time.Millisecond, + Factor: 2.0, + Jitter: 0.0, + Steps: steps, + } +} + +func testContext() context.Context { + return logr.NewContext(context.Background(), logr.Discard()) +} + +// retryAll treats every error as transient. +func retryAll(error) bool { return true } + +func TestWithValueSucceedsFirstAttempt(t *testing.T) { + calls := 0 + got, err := WithValue(testContext(), fastBackoff(4), retryAll, func(context.Context) (int, error) { + calls++ + return 42, nil + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != 42 { + t.Fatalf("got %d, want 42", got) + } + if calls != 1 { + t.Fatalf("fn called %d times, want 1", calls) + } +} + +func TestWithValueRetriesTransientThenSucceeds(t *testing.T) { + calls := 0 + got, err := WithValue(testContext(), fastBackoff(4), retryAll, func(context.Context) (string, error) { + calls++ + if calls < 3 { + return "", errors.New("transient") + } + return "ok", nil + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != "ok" { + t.Fatalf("got %q, want %q", got, "ok") + } + if calls != 3 { + t.Fatalf("fn called %d times, want 3", calls) + } +} + +func TestWithValueFailsFastOnNonRetryable(t *testing.T) { + sentinel := errors.New("permanent") + calls := 0 + _, err := WithValue(testContext(), fastBackoff(4), func(error) bool { return false }, func(context.Context) (int, error) { + calls++ + return 0, sentinel + }) + if !errors.Is(err, sentinel) { + t.Fatalf("error = %v, want it to wrap sentinel", err) + } + if calls != 1 { + t.Fatalf("fn called %d times, want 1 (no retries on permanent error)", calls) + } + // A permanent error must propagate unchanged, not behind the "retry budget" wrapper. + if strings.Contains(err.Error(), "retry budget exhausted") { + t.Fatalf("permanent error should propagate as-is, got %q", err.Error()) + } +} + +func TestWithValueWrapsLastErrorWhenExhausted(t *testing.T) { + sentinel := errors.New("still failing") + calls := 0 + _, err := WithValue(testContext(), fastBackoff(3), retryAll, func(context.Context) (int, error) { + calls++ + return 0, sentinel + }) + if err == nil { + t.Fatal("expected error, got nil") + } + if !errors.Is(err, sentinel) { + t.Fatalf("error = %v, want it to wrap the last transient error", err) + } + if !strings.Contains(err.Error(), "retry budget exhausted") { + t.Fatalf("exhausted error %q should mention the retry budget", err.Error()) + } + if calls != 3 { + t.Fatalf("fn called %d times, want 3", calls) + } +} + +// TestWithValueContextCanceledTakesPrecedence verifies a cancelled parent context +// surfaces as-is rather than behind the "retry budget" wrapper or the last error. +func TestWithValueContextCanceledTakesPrecedence(t *testing.T) { + ctx, cancel := context.WithCancel(testContext()) + cancel() + + calls := 0 + _, err := WithValue(ctx, fastBackoff(4), retryAll, func(ctx context.Context) (int, error) { + calls++ + return 0, ctx.Err() + }) + if !errors.Is(err, context.Canceled) { + t.Fatalf("error = %v, want context.Canceled", err) + } + if strings.Contains(err.Error(), "retry budget exhausted") { + t.Fatalf("context error should surface as-is, got %q", err.Error()) + } +} diff --git a/tools/prow-job-executor/prowjob/client.go b/tools/prow-job-executor/prowjob/client.go index a7a10f19..f3947c9a 100644 --- a/tools/prow-job-executor/prowjob/client.go +++ b/tools/prow-job-executor/prowjob/client.go @@ -34,6 +34,8 @@ import ( prowjobs "sigs.k8s.io/prow/pkg/apis/prowjobs/v1" prowgangway "sigs.k8s.io/prow/pkg/gangway" "sigs.k8s.io/yaml" + + "github.com/Azure/ARO-Tools/tools/prow-job-executor/internal/retry" ) // jobSubmissionResponse represents the minimal JSON response from Gangway API for job submission @@ -85,40 +87,9 @@ func NewClient(token, gangwayURL, prowURL string) *Client { // forcing the entire deployment job to be restarted. Only transient failures // (HTTP 429, 5xx, and network errors) are retried; everything else fails fast. func (c *Client) SubmitJob(ctx context.Context, request *prowgangway.CreateJobExecutionRequest) (string, error) { - logger, err := logr.FromContext(ctx) - if err != nil { - return "", err - } - - var executionID string - var lastErr error - condition := func(ctx context.Context) (bool, error) { - id, err := c.submitJobOnce(ctx, request) - if err != nil { - lastErr = err - // Only retry known-transient errors. Deterministic failures (marshal, - // request construction, response decode, 4xx other than 429) are not - // retried so they surface immediately instead of after a long backoff. - if !isRetryableError(err) { - return false, err // Stop retrying and propagate the error - } - - logger.Info("Job submission failed with a transient error, will retry", "error", err.Error()) - return false, nil - } - - executionID = id - return true, nil // Success, stop retrying - } - - if err := wait.ExponentialBackoffWithContext(ctx, c.submitBackoff, condition); err != nil { - if lastErr != nil { - return "", fmt.Errorf("failed to submit job after retries: %w", lastErr) - } - return "", err - } - - return executionID, nil + return retry.WithValue(ctx, c.submitBackoff, isRetryableError, func(ctx context.Context) (string, error) { + return c.submitJobOnce(ctx, request) + }) } // submitJobOnce performs a single job submission request without retry logic. @@ -167,8 +138,6 @@ func (c *Client) submitJobOnce(ctx context.Context, request *prowgangway.CreateJ // GetJobStatus retrieves the full job information by Prow execution ID with retry logic func (c *Client) GetJobStatus(ctx context.Context, prowExecutionID string) (*prowjobs.ProwJob, error) { - var result *prowjobs.ProwJob - // Configure exponential backoff with jitter backoff := wait.Backoff{ Duration: time.Second, // Initial delay @@ -178,28 +147,16 @@ func (c *Client) GetJobStatus(ctx context.Context, prowExecutionID string) (*pro Cap: 10 * time.Second, // Maximum delay cap } - condition := func(ctx context.Context) (bool, error) { - job, err := c.getJobStatusOnce(ctx, prowExecutionID) - if err != nil { - // Check if this is a non-retryable error (e.g., 403 Forbidden) - if isNonRetryableHTTPError(err) { - return false, err // Stop retrying and propagate the error - } - - // For retryable errors continue - return false, nil - } - - result = job - return true, nil // Success, stop retrying - } - - err := wait.ExponentialBackoffWithContext(ctx, backoff, condition) - if err != nil { - return nil, err + // Everything except a non-retryable HTTP status (e.g. 401/403) is retried: a + // freshly submitted job's status may 404 until it propagates, and transport + // errors are transient. + isRetryable := func(err error) bool { + return !isNonRetryableHTTPError(err) } - return result, nil + return retry.WithValue(ctx, backoff, isRetryable, func(ctx context.Context) (*prowjobs.ProwJob, error) { + return c.getJobStatusOnce(ctx, prowExecutionID) + }) } // getJobStatusOnce performs a single job status request without retry logic diff --git a/tools/prow-job-executor/prowtoken.go b/tools/prow-job-executor/prowtoken.go index 0c9229af..57aa84fb 100644 --- a/tools/prow-job-executor/prowtoken.go +++ b/tools/prow-job-executor/prowtoken.go @@ -21,16 +21,22 @@ import ( "net/http" "time" - "github.com/go-logr/logr" "github.com/spf13/cobra" "k8s.io/apimachinery/pkg/util/wait" "github.com/Azure/ARO-Tools/tools/cmdutils" + "github.com/Azure/ARO-Tools/tools/prow-job-executor/internal/retry" "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azsecrets" ) +// errPermanentKeyVaultLookup marks a prow-token lookup failure as a deterministic, +// local misconfiguration (e.g. an unparseable Key Vault URI or a secret with no +// value). Errors wrapping it are classified non-retryable so they fail fast instead +// of consuming the full ~31-minute backoff budget reserved for transient failures. +var errPermanentKeyVaultLookup = errors.New("permanent Key Vault lookup error") + // prowTokenLookupBackoff controls retries of the Key Vault prow-token lookup. // // The lookup can fail with transient errors that are unrelated to configuration: @@ -124,69 +130,30 @@ func (o *validatedProwTokenOptions) Complete(ctx context.Context) (*ProwTokenOpt // with exponential backoff. Permanent failures (Key Vault 4xx other than 429, e.g. // 401/403/404) fail fast. func lookupProwTokenInKeyVault(ctx context.Context, keyVaultURI string, secretName string) (string, error) { - return retryProwTokenLookup(ctx, prowTokenLookupBackoff, func(ctx context.Context) (string, error) { + return retry.WithValue(ctx, prowTokenLookupBackoff, isRetryableKeyVaultError, func(ctx context.Context) (string, error) { return lookupProwTokenInKeyVaultOnce(ctx, keyVaultURI, secretName) }) } -// retryProwTokenLookup runs fetch with exponential backoff, retrying only -// transient errors as classified by isRetryableKeyVaultError. The fetch callback is -// injectable so the retry behavior can be unit tested without a live Key Vault. -func retryProwTokenLookup(ctx context.Context, backoff wait.Backoff, fetch func(ctx context.Context) (string, error)) (string, error) { - logger, err := logr.FromContext(ctx) - if err != nil { - logger = logr.Discard() - } - - var token string - var lastErr error - condition := func(ctx context.Context) (bool, error) { - t, err := fetch(ctx) - if err != nil { - // Only retry known-transient errors. Permanent failures (missing/forbidden - // secret, bad request) surface immediately instead of after a long backoff. - if !isRetryableKeyVaultError(err) { - return false, err // Stop retrying and propagate the error as-is - } - - lastErr = err - logger.Info("Prow token lookup failed with a transient error, will retry", "error", err.Error()) - return false, nil - } - - token = t - return true, nil // Success, stop retrying - } - - if err := wait.ExponentialBackoffWithContext(ctx, backoff, condition); err != nil { - // A cancelled/expired parent context takes precedence: report it as-is rather - // than masking it behind the last transient lookup error. - if ctxErr := ctx.Err(); ctxErr != nil { - return "", ctxErr - } - // Retries were exhausted: surface the last transient error for context. - if lastErr != nil { - return "", fmt.Errorf("failed to look up prow token after retries: %w", lastErr) - } - // A permanent error returned by the condition propagates unchanged. - return "", err - } - - return token, nil -} - // isRetryableKeyVaultError reports whether a prow-token lookup error is transient -// and worth retrying. A cancelled or expired context is never retryable. Key Vault +// and worth retrying. A cancelled or expired context is never retryable, nor is a +// deterministic local failure marked with errPermanentKeyVaultLookup. Key Vault // HTTP responses are retried only on 429 and 5xx; other 4xx (401/403/404/400/409) -// are permanent and fail fast. Any error without an HTTP response status is a -// credential-acquisition or transport failure (e.g. an IMDS connection reset/EOF -// while minting the managed-identity token) and is treated as transient. +// are permanent and fail fast. Any remaining error without an HTTP response status +// is a credential-acquisition or transport failure (e.g. an IMDS connection +// reset/EOF while minting the managed-identity token) and is treated as transient. func isRetryableKeyVaultError(err error) bool { // A cancelled/expired parent context must fail fast, never retry. if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { return false } + // Deterministic local misconfiguration (bad Key Vault URI, empty secret) is + // permanent; fail fast instead of backing off for ~31 minutes. + if errors.Is(err, errPermanentKeyVaultLookup) { + return false + } + var respErr *azcore.ResponseError if errors.As(err, &respErr) { code := respErr.StatusCode @@ -210,7 +177,8 @@ func lookupProwTokenInKeyVaultOnce(ctx context.Context, keyVaultURI string, secr // Create Key Vault secrets client client, err := azsecrets.NewClient(keyVaultURI, cred, nil) if err != nil { - return "", fmt.Errorf("failed to create Key Vault client: %w", err) + // Bad/unparseable Key Vault URI is a deterministic misconfiguration; fail fast. + return "", fmt.Errorf("failed to create Key Vault client: %w: %w", err, errPermanentKeyVaultLookup) } // Get the secret from Key Vault @@ -220,7 +188,8 @@ func lookupProwTokenInKeyVaultOnce(ctx context.Context, keyVaultURI string, secr } if secret.Value == nil { - return "", fmt.Errorf("secret %q in Key Vault %q has no value", secretName, keyVaultURI) + // A present-but-empty secret is a deterministic misconfiguration; fail fast. + return "", fmt.Errorf("secret %q in Key Vault %q has no value: %w", secretName, keyVaultURI, errPermanentKeyVaultLookup) } return *secret.Value, nil diff --git a/tools/prow-job-executor/prowtoken_test.go b/tools/prow-job-executor/prowtoken_test.go index fcffe0ae..e6d741ee 100644 --- a/tools/prow-job-executor/prowtoken_test.go +++ b/tools/prow-job-executor/prowtoken_test.go @@ -26,6 +26,7 @@ import ( "k8s.io/apimachinery/pkg/util/wait" + "github.com/Azure/ARO-Tools/tools/prow-job-executor/internal/retry" "github.com/Azure/azure-sdk-for-go/sdk/azcore" ) @@ -66,6 +67,8 @@ func TestIsRetryableKeyVaultError(t *testing.T) { {name: "wrapped key vault 503 is retryable", err: fmt.Errorf("get secret: %w", responseError(http.StatusServiceUnavailable)), want: true}, {name: "imds eof credential error is retryable", err: errors.New("ManagedIdentityCredential: Get \"http://169.254.169.254/metadata/identity/oauth2/token\": EOF"), want: true}, {name: "generic non-response error is retryable", err: errors.New("boom"), want: true}, + {name: "permanent local error is not retryable", err: fmt.Errorf("failed to create Key Vault client: %w: %w", errors.New("bad uri"), errPermanentKeyVaultLookup), want: false}, + {name: "wrapped permanent local error is not retryable", err: fmt.Errorf("lookup failed: %w", fmt.Errorf("secret has no value: %w", errPermanentKeyVaultLookup)), want: false}, {name: "context canceled is not retryable", err: context.Canceled, want: false}, {name: "context deadline exceeded is not retryable", err: context.DeadlineExceeded, want: false}, {name: "wrapped context canceled is not retryable", err: fmt.Errorf("get secret: %w", context.Canceled), want: false}, @@ -119,6 +122,12 @@ func TestRetryProwTokenLookup(t *testing.T) { wantErr: true, wantCalls: 1, }, + { + name: "permanent local error fails fast without retry", + errs: []error{fmt.Errorf("secret has no value: %w", errPermanentKeyVaultLookup)}, + wantErr: true, + wantCalls: 1, + }, { name: "persistent transient error exhausts retries", errs: []error{responseError(http.StatusTooManyRequests)}, // repeats until steps exhausted @@ -144,7 +153,7 @@ func TestRetryProwTokenLookup(t *testing.T) { return "secret-value", nil } - token, err := retryProwTokenLookup(testContext(), fastBackoff(4), fetch) + token, err := retry.WithValue(testContext(), fastBackoff(4), isRetryableKeyVaultError, fetch) if tt.wantErr { if err == nil { @@ -179,7 +188,7 @@ func TestRetryProwTokenLookupContextCanceled(t *testing.T) { return "", ctx.Err() } - _, err := retryProwTokenLookup(ctx, fastBackoff(4), fetch) + _, err := retry.WithValue(ctx, fastBackoff(4), isRetryableKeyVaultError, fetch) if err == nil { t.Fatalf("expected error, got nil") } From f1da9fdf5582973572a153b4d4421cc2596d5888 Mon Sep 17 00:00:00 2001 From: Rael Garcia Date: Mon, 22 Jun 2026 14:30:20 +0000 Subject: [PATCH 4/5] fix(prow-job-executor): fail fast on context cancellation in retry helper WithValue now checks ctx.Err() at the top of the error branch so a context cancelled mid-call surfaces as-is, without a misleading "will retry" log or recording it as the last transient error (notably the GetJobStatus path, whose classifier treats context errors as retryable). Strengthen the context-cancellation tests to assert call counts: a context cancelled before the first attempt never invokes fn (0 calls), and a new test deterministically exercises mid-call cancellation (1 call, surfaces context.Canceled, not the exhausted-budget wrapper). --- .../prow-job-executor/internal/retry/retry.go | 8 +++++ .../internal/retry/retry_test.go | 34 +++++++++++++++++-- tools/prow-job-executor/prowtoken_test.go | 8 +++-- 3 files changed, 45 insertions(+), 5 deletions(-) diff --git a/tools/prow-job-executor/internal/retry/retry.go b/tools/prow-job-executor/internal/retry/retry.go index da10adf8..babc829a 100644 --- a/tools/prow-job-executor/internal/retry/retry.go +++ b/tools/prow-job-executor/internal/retry/retry.go @@ -53,6 +53,14 @@ func WithValue[T any](ctx context.Context, backoff wait.Backoff, isRetryable fun condition := func(ctx context.Context) (bool, error) { v, err := fn(ctx) if err != nil { + // A cancelled/expired parent context is terminal: stop immediately, + // without logging a misleading "will retry" or recording it as the last + // transient error. (Some callers' isRetryable does not special-case + // context errors, e.g. GetJobStatus.) + if ctxErr := ctx.Err(); ctxErr != nil { + return false, ctxErr + } + // Permanent/deterministic failures surface immediately instead of // after a long backoff. if !isRetryable(err) { diff --git a/tools/prow-job-executor/internal/retry/retry_test.go b/tools/prow-job-executor/internal/retry/retry_test.go index b2f00fdf..7c80ab7a 100644 --- a/tools/prow-job-executor/internal/retry/retry_test.go +++ b/tools/prow-job-executor/internal/retry/retry_test.go @@ -122,9 +122,10 @@ func TestWithValueWrapsLastErrorWhenExhausted(t *testing.T) { } } -// TestWithValueContextCanceledTakesPrecedence verifies a cancelled parent context -// surfaces as-is rather than behind the "retry budget" wrapper or the last error. -func TestWithValueContextCanceledTakesPrecedence(t *testing.T) { +// TestWithValueContextCanceledBeforeStart verifies a context cancelled before the +// first attempt surfaces as-is and never invokes fn (ExponentialBackoffWithContext +// checks the context before the first condition call). +func TestWithValueContextCanceledBeforeStart(t *testing.T) { ctx, cancel := context.WithCancel(testContext()) cancel() @@ -139,4 +140,31 @@ func TestWithValueContextCanceledTakesPrecedence(t *testing.T) { if strings.Contains(err.Error(), "retry budget exhausted") { t.Fatalf("context error should surface as-is, got %q", err.Error()) } + if calls != 0 { + t.Fatalf("fn called %d times, want 0 (context checked before first attempt)", calls) + } +} + +// TestWithValueContextCanceledDuringCall verifies that when the parent context is +// cancelled while fn is running, the next iteration fails fast on the context error +// without logging a "will retry" or wrapping it as an exhausted-budget error — even +// though fn returned an otherwise-retryable error. +func TestWithValueContextCanceledDuringCall(t *testing.T) { + ctx, cancel := context.WithCancel(testContext()) + + calls := 0 + _, err := WithValue(ctx, fastBackoff(4), retryAll, func(ctx context.Context) (int, error) { + calls++ + cancel() // cancel the parent mid-call + return 0, errors.New("transient") // a normally-retryable error + }) + if !errors.Is(err, context.Canceled) { + t.Fatalf("error = %v, want context.Canceled", err) + } + if strings.Contains(err.Error(), "retry budget exhausted") { + t.Fatalf("cancelled context should not be wrapped as exhausted budget, got %q", err.Error()) + } + if calls != 1 { + t.Fatalf("fn called %d times, want 1 (no retry after cancellation)", calls) + } } diff --git a/tools/prow-job-executor/prowtoken_test.go b/tools/prow-job-executor/prowtoken_test.go index e6d741ee..f2a81474 100644 --- a/tools/prow-job-executor/prowtoken_test.go +++ b/tools/prow-job-executor/prowtoken_test.go @@ -176,8 +176,9 @@ func TestRetryProwTokenLookup(t *testing.T) { } // TestRetryProwTokenLookupContextCanceled verifies that a cancelled parent context -// fails fast (no retries) and the returned error is the context error itself, not a -// "...after retries" wrapper hiding it. +// fails fast (the fetch is never invoked, since ExponentialBackoffWithContext checks +// the context before the first attempt) and the returned error is the context error +// itself, not a "...after retries" wrapper hiding it. func TestRetryProwTokenLookupContextCanceled(t *testing.T) { ctx, cancel := context.WithCancel(testContext()) cancel() @@ -195,4 +196,7 @@ func TestRetryProwTokenLookupContextCanceled(t *testing.T) { if !errors.Is(err, context.Canceled) { t.Fatalf("error = %v, want it to wrap context.Canceled", err) } + if calls != 0 { + t.Fatalf("fetch called %d times, want 0 (context checked before first attempt)", calls) + } } From 34472385bb19b345c6d5dd89a51d0692fc3d94ac Mon Sep 17 00:00:00 2001 From: Rael Garcia Date: Mon, 22 Jun 2026 14:48:44 +0000 Subject: [PATCH 5/5] docs(prow-job-executor): correct GetJobStatus Steps comment to "Maximum attempts" wait.Backoff.Steps is the total number of attempts, not retries. Align this comment with the other two backoffs in the codebase. --- tools/prow-job-executor/prowjob/client.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/prow-job-executor/prowjob/client.go b/tools/prow-job-executor/prowjob/client.go index f3947c9a..9abd1c2f 100644 --- a/tools/prow-job-executor/prowjob/client.go +++ b/tools/prow-job-executor/prowjob/client.go @@ -143,7 +143,7 @@ func (c *Client) GetJobStatus(ctx context.Context, prowExecutionID string) (*pro Duration: time.Second, // Initial delay Factor: 2.0, // Exponential factor Jitter: 0.1, // 10% jitter - Steps: 3, // Maximum retries + Steps: 3, // Maximum attempts Cap: 10 * time.Second, // Maximum delay cap }