From 58cf789ee2a1a4b217f1d5180430be4488e38d27 Mon Sep 17 00:00:00 2001 From: Steve Goodwin Date: Wed, 27 May 2026 12:56:41 -0400 Subject: [PATCH 1/2] OCPBUGS-38676: Add retry for transient API errors to prevent Degraded blips Wrap all API write operations (Apply, Create, Update, Delete) with retry logic to absorb transient errors (conflicts, timeouts, connection refused) during upgrades before they reach status reporting. Without retry, a single transient failure sets Degraded=True for ~1 minute until the next sync resolves it, causing CI test failures. Retry parameters: 3 attempts, 500ms backoff with 2x factor (~3.5s max). Permanent errors (Forbidden, Invalid, AlreadyExists) are not retried. Assisted by Claude Code (Opus 4.6) --- .../controllers/clidownloads/controller.go | 26 ++- .../downloadsdeployment/controller.go | 19 +- .../controllers/oauthclients/oauthclients.go | 5 +- .../oauthclientsecret/oauthclientsecret.go | 6 +- .../poddisruptionbudget/controller.go | 8 +- pkg/console/controllers/service/controller.go | 17 +- .../controllers/serviceaccounts/controller.go | 19 +- .../storageversionmigration/controller.go | 5 +- .../upgradenotification/controller.go | 17 +- pkg/console/controllers/util/retry.go | 51 ++++++ pkg/console/controllers/util/retry_test.go | 165 ++++++++++++++++++ pkg/console/operator/retry.go | 10 ++ pkg/console/operator/sync_v400.go | 107 ++++++++---- 13 files changed, 382 insertions(+), 73 deletions(-) create mode 100644 pkg/console/controllers/util/retry.go create mode 100644 pkg/console/controllers/util/retry_test.go create mode 100644 pkg/console/operator/retry.go diff --git a/pkg/console/controllers/clidownloads/controller.go b/pkg/console/controllers/clidownloads/controller.go index 79ea6f70f5..9d3708c51d 100644 --- a/pkg/console/controllers/clidownloads/controller.go +++ b/pkg/console/controllers/clidownloads/controller.go @@ -228,10 +228,15 @@ func ApplyCLIDownloads(ctx context.Context, consoleClient consoleclientv1.Consol existingCLIDownloads, err := consoleClient.Get(ctx, cliDownloadsName, metav1.GetOptions{}) existingCLIDownloadsCopy := existingCLIDownloads.DeepCopy() if apierrors.IsNotFound(err) { - actualCLIDownloads, err := consoleClient.Create(ctx, requiredCLIDownloads, metav1.CreateOptions{}) - if err != nil { - klog.V(4).Infof("error creating %s consoleclidownloads custom resource: %s", cliDownloadsName, err) - return nil, "FailedCreate", err + var actualCLIDownloads *v1.ConsoleCLIDownload + createErr := controllersutil.RetryOnTransientError(func() error { + var e error + actualCLIDownloads, e = consoleClient.Create(ctx, requiredCLIDownloads, metav1.CreateOptions{}) + return e + }) + if createErr != nil { + klog.V(4).Infof("error creating %s consoleclidownloads custom resource: %s", cliDownloadsName, createErr) + return nil, "FailedCreate", createErr } klog.V(4).Infof("%s consoleclidownloads custom resource created", cliDownloadsName) return actualCLIDownloads, "", nil @@ -249,10 +254,15 @@ func ApplyCLIDownloads(ctx context.Context, consoleClient consoleclientv1.Consol } existingCLIDownloadsCopy.Spec = requiredCLIDownloads.Spec - actualCLIDownloads, err := consoleClient.Update(ctx, existingCLIDownloadsCopy, metav1.UpdateOptions{}) - if err != nil { - klog.V(4).Infof("error updating %s consoleclidownloads custom resource: %v", cliDownloadsName, err) - return nil, "FailedUpdate", err + var actualCLIDownloads *v1.ConsoleCLIDownload + updateErr := controllersutil.RetryOnTransientError(func() error { + var e error + actualCLIDownloads, e = consoleClient.Update(ctx, existingCLIDownloadsCopy, metav1.UpdateOptions{}) + return e + }) + if updateErr != nil { + klog.V(4).Infof("error updating %s consoleclidownloads custom resource: %v", cliDownloadsName, updateErr) + return nil, "FailedUpdate", updateErr } return actualCLIDownloads, "", nil } diff --git a/pkg/console/controllers/downloadsdeployment/controller.go b/pkg/console/controllers/downloadsdeployment/controller.go index aacd32a110..88b14ce3b9 100644 --- a/pkg/console/controllers/downloadsdeployment/controller.go +++ b/pkg/console/controllers/downloadsdeployment/controller.go @@ -118,12 +118,19 @@ func (c *DownloadsDeploymentSyncController) SyncDownloadsDeployment(ctx context. requiredDownloadsDeployment := deploymentsub.DefaultDownloadsDeployment(operatorConfigCopy, infrastructureConfig) - return resourceapply.ApplyDeployment(ctx, - c.deploymentClient, - controllerContext.Recorder(), - requiredDownloadsDeployment, - resourcemerge.ExpectedDeploymentGeneration(requiredDownloadsDeployment, operatorConfigCopy.Status.Generations), - ) + var actualDeployment *appsv1.Deployment + var deploymentChanged bool + deploymentErr := util.RetryOnTransientError(func() error { + var e error + actualDeployment, deploymentChanged, e = resourceapply.ApplyDeployment(ctx, + c.deploymentClient, + controllerContext.Recorder(), + requiredDownloadsDeployment, + resourcemerge.ExpectedDeploymentGeneration(requiredDownloadsDeployment, operatorConfigCopy.Status.Generations), + ) + return e + }) + return actualDeployment, deploymentChanged, deploymentErr } func (c *DownloadsDeploymentSyncController) removeDownloadsDeployment(ctx context.Context) error { diff --git a/pkg/console/controllers/oauthclients/oauthclients.go b/pkg/console/controllers/oauthclients/oauthclients.go index 0e716c56c7..39d1bb34f2 100644 --- a/pkg/console/controllers/oauthclients/oauthclients.go +++ b/pkg/console/controllers/oauthclients/oauthclients.go @@ -226,7 +226,10 @@ func (c *oauthClientsController) syncOAuthClient( clientCopy := oauthClient.DeepCopy() oauthsub.RegisterConsoleToOAuthClient(clientCopy, consoleURL, secretsub.GetSecretString(sec)) - _, _, oauthErr := oauthsub.CustomApplyOAuth(c.oauthClient, clientCopy, ctx) + oauthErr := util.RetryOnTransientError(func() error { + _, _, e := oauthsub.CustomApplyOAuth(c.oauthClient, clientCopy, ctx) + return e + }) if oauthErr != nil { return "FailedRegister", oauthErr } diff --git a/pkg/console/controllers/oauthclientsecret/oauthclientsecret.go b/pkg/console/controllers/oauthclientsecret/oauthclientsecret.go index 1791cca7cf..88cdecb135 100644 --- a/pkg/console/controllers/oauthclientsecret/oauthclientsecret.go +++ b/pkg/console/controllers/oauthclientsecret/oauthclientsecret.go @@ -24,6 +24,7 @@ import ( "github.com/openshift/library-go/pkg/operator/resource/resourceapply" v1helpers "github.com/openshift/library-go/pkg/operator/v1helpers" + "github.com/openshift/console-operator/pkg/console/controllers/util" authnsub "github.com/openshift/console-operator/pkg/console/subresource/authentication" secretsub "github.com/openshift/console-operator/pkg/console/subresource/secret" ) @@ -160,7 +161,10 @@ func (c *oauthClientSecretController) syncSecret(ctx context.Context, clientSecr secret, err := c.targetNSSecretsLister.Secrets(api.TargetNamespace).Get("console-oauth-config") if apierrors.IsNotFound(err) || secretsub.GetSecretString(secret) != clientSecret { - _, _, err = resourceapply.ApplySecret(ctx, c.secretsClient, recorder, secretsub.DefaultSecret(operatorConfig, clientSecret)) + err = util.RetryOnTransientError(func() error { + _, _, e := resourceapply.ApplySecret(ctx, c.secretsClient, recorder, secretsub.DefaultSecret(operatorConfig, clientSecret)) + return e + }) } return err } diff --git a/pkg/console/controllers/poddisruptionbudget/controller.go b/pkg/console/controllers/poddisruptionbudget/controller.go index 9f076d44a4..fd1b13ebfb 100644 --- a/pkg/console/controllers/poddisruptionbudget/controller.go +++ b/pkg/console/controllers/poddisruptionbudget/controller.go @@ -97,11 +97,11 @@ func (c *PodDisruptionBudgetController) Sync(ctx context.Context, controllerCont statusHandler := status.NewStatusHandler(c.operatorClient) requiredPDB := c.getDefaultPodDisruptionBudget() - _, _, pdbErr := resourceapply.ApplyPodDisruptionBudget(ctx, c.pdbClient, controllerContext.Recorder(), requiredPDB) + pdbErr := util.RetryOnTransientError(func() error { + _, _, err := resourceapply.ApplyPodDisruptionBudget(ctx, c.pdbClient, controllerContext.Recorder(), requiredPDB) + return err + }) statusHandler.AddConditions(status.HandleProgressingOrDegraded("PDBSync", "FailedApply", pdbErr)) - if pdbErr != nil { - return statusHandler.FlushAndReturn(pdbErr) - } return statusHandler.FlushAndReturn(pdbErr) } diff --git a/pkg/console/controllers/service/controller.go b/pkg/console/controllers/service/controller.go index 50397339f6..ed2896a614 100644 --- a/pkg/console/controllers/service/controller.go +++ b/pkg/console/controllers/service/controller.go @@ -130,7 +130,10 @@ func (c *ServiceSyncController) Sync(ctx context.Context, controllerContext fact routeConfig := routesub.NewRouteConfig(updatedOperatorConfig, ingressConfig, c.serviceName) requiredSvc := c.getDefaultService(ingressDisabled) - _, _, svcErr := resourceapply.ApplyService(ctx, c.serviceClient, controllerContext.Recorder(), requiredSvc) + svcErr := util.RetryOnTransientError(func() error { + _, _, err := resourceapply.ApplyService(ctx, c.serviceClient, controllerContext.Recorder(), requiredSvc) + return err + }) statusHandler.AddConditions(status.HandleProgressingOrDegraded("ServiceSync", "FailedApply", svcErr)) if svcErr != nil { return statusHandler.FlushAndReturn(svcErr) @@ -147,17 +150,23 @@ func (c *ServiceSyncController) Sync(ctx context.Context, controllerContext fact func (c *ServiceSyncController) SyncRedirectService(ctx context.Context, routeConfig *routesub.RouteConfig, controllerContext factory.SyncContext) (string, error) { if !routeConfig.IsCustomHostnameSet() { - if err := c.removeService(ctx, c.getRedirectServiceName()); err != nil { + err := util.RetryOnTransientError(func() error { + return c.removeService(ctx, c.getRedirectServiceName()) + }) + if err != nil { return "FailedDelete", err } return "", nil } requiredRedirectService := c.getRedirectService() - _, _, redirectSvcErr := resourceapply.ApplyService(ctx, c.serviceClient, controllerContext.Recorder(), requiredRedirectService) + redirectSvcErr := util.RetryOnTransientError(func() error { + _, _, err := resourceapply.ApplyService(ctx, c.serviceClient, controllerContext.Recorder(), requiredRedirectService) + return err + }) if redirectSvcErr != nil { return "FailedApply", redirectSvcErr } - return "", redirectSvcErr + return "", nil } func (c *ServiceSyncController) removeService(ctx context.Context, serviceName string) error { diff --git a/pkg/console/controllers/serviceaccounts/controller.go b/pkg/console/controllers/serviceaccounts/controller.go index 7b0538c766..e5468ae7de 100644 --- a/pkg/console/controllers/serviceaccounts/controller.go +++ b/pkg/console/controllers/serviceaccounts/controller.go @@ -152,14 +152,17 @@ func (c *ServiceAccountSyncController) SyncServiceAccount(ctx context.Context, o return fmt.Errorf("failed to get existing service account %s: %w", c.serviceAccountName, err) } - _, _, err = resourceapply.ApplyServiceAccount(ctx, - c.serviceAccountClient, - controllerContext.Recorder(), - serviceAccount, - ) - - if err != nil { - return fmt.Errorf("failed to apply service account %s: %w", c.serviceAccountName, err) + applyErr := util.RetryOnTransientError(func() error { + _, _, e := resourceapply.ApplyServiceAccount(ctx, + c.serviceAccountClient, + controllerContext.Recorder(), + serviceAccount, + ) + return e + }) + + if applyErr != nil { + return fmt.Errorf("failed to apply service account %s: %w", c.serviceAccountName, applyErr) } return nil diff --git a/pkg/console/controllers/storageversionmigration/controller.go b/pkg/console/controllers/storageversionmigration/controller.go index 400dd33a16..23a660a357 100644 --- a/pkg/console/controllers/storageversionmigration/controller.go +++ b/pkg/console/controllers/storageversionmigration/controller.go @@ -14,6 +14,7 @@ import ( "k8s.io/client-go/dynamic/dynamicinformer" "k8s.io/klog/v2" + "github.com/openshift/console-operator/pkg/console/controllers/util" "github.com/openshift/console-operator/pkg/console/status" "github.com/openshift/library-go/pkg/controller/factory" "github.com/openshift/library-go/pkg/operator/events" @@ -103,7 +104,9 @@ func (c *StorageVersionMigrationController) syncStorageVersionMigration(ctx cont if !succeeded { klog.V(4).Infof("StorageVersionMigration has not succeeded yet") // Delete the StorageVersionMigration if it has not succeeded yet - if err := c.deleteStorageVersionMigration(ctx); err != nil { + if err := util.RetryOnTransientError(func() error { + return c.deleteStorageVersionMigration(ctx) + }); err != nil { return "FailedDeleteSVM", err } return "", nil diff --git a/pkg/console/controllers/upgradenotification/controller.go b/pkg/console/controllers/upgradenotification/controller.go index f1b010163b..7427cbf8eb 100644 --- a/pkg/console/controllers/upgradenotification/controller.go +++ b/pkg/console/controllers/upgradenotification/controller.go @@ -131,14 +131,19 @@ func (c *UpgradeNotificationController) syncClusterUpgradeNotification(ctx conte BackgroundColor: "#F0AB00", }, } - _, err = c.consoleNotificationClient.Create(ctx, notification, metav1.CreateOptions{}) - if err != nil && !apierrors.IsAlreadyExists(err) { - return "FailedCreate", err + createErr := util.RetryOnTransientError(func() error { + _, e := c.consoleNotificationClient.Create(ctx, notification, metav1.CreateOptions{}) + return e + }) + if createErr != nil && !apierrors.IsAlreadyExists(createErr) { + return "FailedCreate", createErr } } else { - err = c.removeUpgradeNotification(ctx) - if err != nil { - return "FailedDelete", err + deleteErr := util.RetryOnTransientError(func() error { + return c.removeUpgradeNotification(ctx) + }) + if deleteErr != nil { + return "FailedDelete", deleteErr } } diff --git a/pkg/console/controllers/util/retry.go b/pkg/console/controllers/util/retry.go new file mode 100644 index 0000000000..3da74d5f26 --- /dev/null +++ b/pkg/console/controllers/util/retry.go @@ -0,0 +1,51 @@ +package util + +import ( + "time" + + // kube + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/retry" + "k8s.io/klog/v2" +) + +// TransientBackoff defines the retry parameters for transient API errors. +// 3 steps with 500ms base and 2.0 factor gives: 500ms, 1s, 2s = ~3.5s max per call. +var TransientBackoff = wait.Backoff{ + Steps: 3, + Duration: 500 * time.Millisecond, + Factor: 2.0, + Jitter: 0.1, +} + +// IsRetryableError returns true for errors worth retrying — everything +// except known permanent errors. This naturally handles both API status +// errors (apierrors.StatusError) and network-level errors (connection +// refused, EOF, TLS failures) without explicit net.Error detection. +func IsRetryableError(err error) bool { + if apierrors.IsForbidden(err) || + apierrors.IsInvalid(err) || + apierrors.IsMethodNotSupported(err) || + apierrors.IsNotAcceptable(err) || + apierrors.IsAlreadyExists(err) { + return false + } + return true +} + +// RetryOnTransientError wraps a function call with retry logic to absorb +// transient API server errors (conflicts, timeouts, connection refused) +// that occur during upgrades. Only API write operations (Apply, Create, +// Update, Delete) should be wrapped — not lister/cache reads. +func RetryOnTransientError(fn func() error) error { + attempt := 0 + return retry.OnError(TransientBackoff, IsRetryableError, func() error { + err := fn() + if err != nil { + attempt++ + klog.V(4).Infof("transient error (attempt %d/%d): %v", attempt, TransientBackoff.Steps, err) + } + return err + }) +} diff --git a/pkg/console/controllers/util/retry_test.go b/pkg/console/controllers/util/retry_test.go new file mode 100644 index 0000000000..8670f3942f --- /dev/null +++ b/pkg/console/controllers/util/retry_test.go @@ -0,0 +1,165 @@ +package util + +import ( + "fmt" + "testing" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +func TestIsRetryableError(t *testing.T) { + tests := []struct { + name string + err error + retryable bool + }{ + { + name: "nil error is retryable", + err: nil, + retryable: true, + }, + { + name: "conflict is retryable", + err: apierrors.NewConflict(schema.GroupResource{Resource: "configmaps"}, "test", fmt.Errorf("conflict")), + retryable: true, + }, + { + name: "server timeout is retryable", + err: apierrors.NewServerTimeout(schema.GroupResource{Resource: "configmaps"}, "get", 0), + retryable: true, + }, + { + name: "too many requests is retryable", + err: apierrors.NewTooManyRequests("slow down", 1), + retryable: true, + }, + { + name: "service unavailable is retryable", + err: apierrors.NewServiceUnavailable("unavailable"), + retryable: true, + }, + { + name: "internal error is retryable", + err: apierrors.NewInternalError(fmt.Errorf("oops")), + retryable: true, + }, + { + name: "not found is retryable", + err: apierrors.NewNotFound(schema.GroupResource{Resource: "configmaps"}, "test"), + retryable: true, + }, + { + name: "generic error is retryable", + err: fmt.Errorf("connection refused"), + retryable: true, + }, + { + name: "forbidden is not retryable", + err: apierrors.NewForbidden(schema.GroupResource{Resource: "configmaps"}, "test", fmt.Errorf("forbidden")), + retryable: false, + }, + { + name: "invalid is not retryable", + err: apierrors.NewInvalid(schema.GroupKind{Kind: "ConfigMap"}, "test", nil), + retryable: false, + }, + { + name: "method not supported is not retryable", + err: apierrors.NewMethodNotSupported(schema.GroupResource{Resource: "configmaps"}, "patch"), + retryable: false, + }, + { + name: "already exists is not retryable", + err: apierrors.NewAlreadyExists(schema.GroupResource{Resource: "configmaps"}, "test"), + retryable: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := IsRetryableError(tt.err) + if got != tt.retryable { + t.Errorf("IsRetryableError(%v) = %v, want %v", tt.err, got, tt.retryable) + } + }) + } +} + +func TestRetryOnTransientError(t *testing.T) { + t.Run("succeeds on first attempt", func(t *testing.T) { + calls := 0 + err := RetryOnTransientError(func() error { + calls++ + return nil + }) + if err != nil { + t.Errorf("expected nil error, got %v", err) + } + if calls != 1 { + t.Errorf("expected 1 call, got %d", calls) + } + }) + + t.Run("retries transient error then succeeds", func(t *testing.T) { + calls := 0 + err := RetryOnTransientError(func() error { + calls++ + if calls < 3 { + return apierrors.NewServerTimeout(schema.GroupResource{Resource: "configmaps"}, "get", 0) + } + return nil + }) + if err != nil { + t.Errorf("expected nil error, got %v", err) + } + if calls != 3 { + t.Errorf("expected 3 calls, got %d", calls) + } + }) + + t.Run("does not retry permanent error", func(t *testing.T) { + calls := 0 + err := RetryOnTransientError(func() error { + calls++ + return apierrors.NewForbidden(schema.GroupResource{Resource: "configmaps"}, "test", fmt.Errorf("forbidden")) + }) + if err == nil { + t.Error("expected error, got nil") + } + if calls != 1 { + t.Errorf("expected 1 call (no retry), got %d", calls) + } + }) + + t.Run("exhausts retries on persistent transient error", func(t *testing.T) { + calls := 0 + err := RetryOnTransientError(func() error { + calls++ + return apierrors.NewServerTimeout(schema.GroupResource{Resource: "configmaps"}, "get", 0) + }) + if err == nil { + t.Error("expected error after exhausting retries, got nil") + } + if calls != TransientBackoff.Steps { + t.Errorf("expected %d calls, got %d", TransientBackoff.Steps, calls) + } + }) + + t.Run("retries generic network error", func(t *testing.T) { + calls := 0 + err := RetryOnTransientError(func() error { + calls++ + if calls < 2 { + return fmt.Errorf("connection refused") + } + return nil + }) + if err != nil { + t.Errorf("expected nil error, got %v", err) + } + if calls != 2 { + t.Errorf("expected 2 calls, got %d", calls) + } + }) +} diff --git a/pkg/console/operator/retry.go b/pkg/console/operator/retry.go new file mode 100644 index 0000000000..4bd7ced437 --- /dev/null +++ b/pkg/console/operator/retry.go @@ -0,0 +1,10 @@ +package operator + +import ( + // operator + "github.com/openshift/console-operator/pkg/console/controllers/util" +) + +func retryOnTransientError(fn func() error) error { + return util.RetryOnTransientError(fn) +} diff --git a/pkg/console/operator/sync_v400.go b/pkg/console/operator/sync_v400.go index 6a5c2c83bf..a305cdaf37 100644 --- a/pkg/console/operator/sync_v400.go +++ b/pkg/console/operator/sync_v400.go @@ -18,7 +18,6 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/client-go/util/retry" "k8s.io/klog/v2" // openshift @@ -263,14 +262,27 @@ func (co *consoleOperator) SyncConsoleConfig(ctx context.Context, consoleConfig klog.V(4).Infof("updating console.config.openshift.io with url: %v", consoleURL) updated := consoleConfig.DeepCopy() updated.Status.ConsoleURL = consoleURL - return co.consoleConfigClient.UpdateStatus(ctx, updated, metav1.UpdateOptions{}) + var result *configv1.Console + err := retryOnTransientError(func() error { + var e error + result, e = co.consoleConfigClient.UpdateStatus(ctx, updated, metav1.UpdateOptions{}) + return e + }) + return result, err } return consoleConfig, nil } func (co *consoleOperator) SyncConsolePublicConfig(ctx context.Context, consoleURL string, recorder events.Recorder) (*corev1.ConfigMap, bool, error) { requiredConfigMap := configmapsub.DefaultPublicConfig(consoleURL) - return resourceapply.ApplyConfigMap(ctx, co.configMapClient, recorder, requiredConfigMap) + var cm *corev1.ConfigMap + var changed bool + err := retryOnTransientError(func() error { + var e error + cm, changed, e = resourceapply.ApplyConfigMap(ctx, co.configMapClient, recorder, requiredConfigMap) + return e + }) + return cm, changed, err } func (co *consoleOperator) SyncDeployment( @@ -309,13 +321,18 @@ func (co *consoleOperator) SyncDeployment( } deploymentsub.LogDeploymentAnnotationChanges(co.deploymentClient, requiredDeployment, ctx) - deployment, _, applyDepErr := resourceapply.ApplyDeployment( - ctx, - co.deploymentClient, - recorder, - requiredDeployment, - resourcemerge.ExpectedDeploymentGeneration(requiredDeployment, updatedOperatorConfig.Status.Generations), - ) + var deployment *appsv1.Deployment + applyDepErr := retryOnTransientError(func() error { + var e error + deployment, _, e = resourceapply.ApplyDeployment( + ctx, + co.deploymentClient, + recorder, + requiredDeployment, + resourcemerge.ExpectedDeploymentGeneration(requiredDeployment, updatedOperatorConfig.Status.Generations), + ) + return e + }) if applyDepErr != nil { return nil, "FailedApply", applyDepErr @@ -418,11 +435,10 @@ func (co *consoleOperator) SyncConfigMap( var cm *corev1.ConfigMap var cmChanged bool var cmErr error - - // Retry on conflicts to handle concurrent ConfigMap updates - cmErr = retry.RetryOnConflict(retry.DefaultBackoff, func() error { - cm, cmChanged, cmErr = resourceapply.ApplyConfigMap(ctx, co.configMapClient, recorder, defaultConfigmap) - return cmErr + cmErr = retryOnTransientError(func() error { + var e error + cm, cmChanged, e = resourceapply.ApplyConfigMap(ctx, co.configMapClient, recorder, defaultConfigmap) + return e }) if cmErr != nil { return nil, "FailedApply", cmErr @@ -499,13 +515,17 @@ func (co *consoleOperator) SyncServiceCAConfigMap(ctx context.Context, operatorC // we can't use `resourceapply.ApplyConfigMap` since it compares data, and the service serving cert operator injects the data existing, err := co.targetNSConfigMapLister.ConfigMaps(required.Namespace).Get(required.Name) if apierrors.IsNotFound(err) { - actual, err := co.configMapClient.ConfigMaps(required.Namespace).Create(ctx, required, metav1.CreateOptions{}) - if err == nil { + var actual *corev1.ConfigMap + createErr := retryOnTransientError(func() error { + var e error + actual, e = co.configMapClient.ConfigMaps(required.Namespace).Create(ctx, required, metav1.CreateOptions{}) + return e + }) + if createErr == nil { klog.V(4).Infoln("service-ca configmap created") - return actual, "", err - } else { - return actual, "FailedCreate", err + return actual, "", nil } + return actual, "FailedCreate", createErr } if err != nil { return nil, "FailedGet", err @@ -518,25 +538,34 @@ func (co *consoleOperator) SyncServiceCAConfigMap(ctx context.Context, operatorC return existing, "", nil } - actual, err := co.configMapClient.ConfigMaps(required.Namespace).Update(ctx, existing, metav1.UpdateOptions{}) - if err == nil { + var actual *corev1.ConfigMap + updateErr := retryOnTransientError(func() error { + var e error + actual, e = co.configMapClient.ConfigMaps(required.Namespace).Update(ctx, existing, metav1.UpdateOptions{}) + return e + }) + if updateErr == nil { klog.V(4).Infoln("service-ca configmap updated") - return actual, "", err - } else { - return actual, "FailedUpdate", err + return actual, "", nil } + return actual, "FailedUpdate", updateErr } func (co *consoleOperator) SyncTrustedCAConfigMap(ctx context.Context, operatorConfig *operatorv1.Console) (trustedCA *corev1.ConfigMap, reason string, err error) { required := configmapsub.DefaultTrustedCAConfigMap(operatorConfig) existing, err := co.targetNSConfigMapLister.ConfigMaps(required.Namespace).Get(required.Name) if apierrors.IsNotFound(err) { - actual, err := co.configMapClient.ConfigMaps(required.Namespace).Create(ctx, required, metav1.CreateOptions{}) - if err != nil { - return actual, "FailedCreate", err + var actual *corev1.ConfigMap + createErr := retryOnTransientError(func() error { + var e error + actual, e = co.configMapClient.ConfigMaps(required.Namespace).Create(ctx, required, metav1.CreateOptions{}) + return e + }) + if createErr != nil { + return actual, "FailedCreate", createErr } klog.V(4).Infoln("trusted-ca-bundle configmap created") - return actual, "", err + return actual, "", nil } if err != nil { return nil, "FailedGet", err @@ -549,12 +578,17 @@ func (co *consoleOperator) SyncTrustedCAConfigMap(ctx context.Context, operatorC return existing, "", nil } - actual, err := co.configMapClient.ConfigMaps(required.Namespace).Update(ctx, existing, metav1.UpdateOptions{}) - if err != nil { - return actual, "FailedUpdate", err + var actual *corev1.ConfigMap + updateErr := retryOnTransientError(func() error { + var e error + actual, e = co.configMapClient.ConfigMaps(required.Namespace).Update(ctx, existing, metav1.UpdateOptions{}) + return e + }) + if updateErr != nil { + return actual, "FailedUpdate", updateErr } klog.V(4).Infoln("trusted-ca-bundle configmap updated") - return actual, "", err + return actual, "", nil } // SyncTechPreview determines if tech preview features should be enabled based on cluster FeatureSet @@ -812,6 +846,11 @@ func (co *consoleOperator) syncSessionSecret( } } - secret, _, err := resourceapply.ApplySecret(ctx, co.secretsClient, recorder, required) + var secret *corev1.Secret + err = retryOnTransientError(func() error { + var e error + secret, _, e = resourceapply.ApplySecret(ctx, co.secretsClient, recorder, required) + return e + }) return secret, err } From 71b7e785321ad0b1d5e8d6c65be06acecaf108c5 Mon Sep 17 00:00:00 2001 From: Steve Goodwin Date: Tue, 2 Jun 2026 15:35:26 -0400 Subject: [PATCH 2/2] Updated 4 sites to re-fetch the latest object via a live API Get inside the retry closure before calling Update/UpdateStatus. Each retry attempt now uses the current resourceVersion instead of the original stale copy. - SyncConsoleConfig: re-fetches consoleConfig before UpdateStatus - SyncServiceCAConfigMap: re-fetches configmap and re-applies metadata before Update - SyncTrustedCAConfigMap: same pattern - ApplyCLIDownloads: re-fetches CLI downloads resource, re-applies spec and metadata before Update Assisted by: Claude (Opus 4.6) --- .../controllers/clidownloads/controller.go | 10 ++++--- pkg/console/operator/sync_v400.go | 26 +++++++++++++------ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/pkg/console/controllers/clidownloads/controller.go b/pkg/console/controllers/clidownloads/controller.go index 9d3708c51d..75a3a271de 100644 --- a/pkg/console/controllers/clidownloads/controller.go +++ b/pkg/console/controllers/clidownloads/controller.go @@ -253,11 +253,15 @@ func ApplyCLIDownloads(ctx context.Context, consoleClient consoleclientv1.Consol return existingCLIDownloadsCopy, "", nil } - existingCLIDownloadsCopy.Spec = requiredCLIDownloads.Spec var actualCLIDownloads *v1.ConsoleCLIDownload updateErr := controllersutil.RetryOnTransientError(func() error { - var e error - actualCLIDownloads, e = consoleClient.Update(ctx, existingCLIDownloadsCopy, metav1.UpdateOptions{}) + latest, e := consoleClient.Get(ctx, cliDownloadsName, metav1.GetOptions{}) + if e != nil { + return e + } + latest.Spec = requiredCLIDownloads.Spec + resourcemerge.EnsureObjectMeta(resourcemerge.BoolPtr(false), &latest.ObjectMeta, requiredCLIDownloads.ObjectMeta) + actualCLIDownloads, e = consoleClient.Update(ctx, latest, metav1.UpdateOptions{}) return e }) if updateErr != nil { diff --git a/pkg/console/operator/sync_v400.go b/pkg/console/operator/sync_v400.go index a305cdaf37..2704bb823e 100644 --- a/pkg/console/operator/sync_v400.go +++ b/pkg/console/operator/sync_v400.go @@ -260,12 +260,14 @@ func (co *consoleOperator) SyncConsoleConfig(ctx context.Context, consoleConfig metrics.HandleConsoleURL(oldURL, consoleURL) if oldURL != consoleURL { klog.V(4).Infof("updating console.config.openshift.io with url: %v", consoleURL) - updated := consoleConfig.DeepCopy() - updated.Status.ConsoleURL = consoleURL var result *configv1.Console err := retryOnTransientError(func() error { - var e error - result, e = co.consoleConfigClient.UpdateStatus(ctx, updated, metav1.UpdateOptions{}) + latest, e := co.consoleConfigClient.Get(ctx, consoleConfig.Name, metav1.GetOptions{}) + if e != nil { + return e + } + latest.Status.ConsoleURL = consoleURL + result, e = co.consoleConfigClient.UpdateStatus(ctx, latest, metav1.UpdateOptions{}) return e }) return result, err @@ -540,8 +542,12 @@ func (co *consoleOperator) SyncServiceCAConfigMap(ctx context.Context, operatorC var actual *corev1.ConfigMap updateErr := retryOnTransientError(func() error { - var e error - actual, e = co.configMapClient.ConfigMaps(required.Namespace).Update(ctx, existing, metav1.UpdateOptions{}) + latest, e := co.configMapClient.ConfigMaps(required.Namespace).Get(ctx, required.Name, metav1.GetOptions{}) + if e != nil { + return e + } + resourcemerge.EnsureObjectMeta(resourcemerge.BoolPtr(false), &latest.ObjectMeta, required.ObjectMeta) + actual, e = co.configMapClient.ConfigMaps(required.Namespace).Update(ctx, latest, metav1.UpdateOptions{}) return e }) if updateErr == nil { @@ -580,8 +586,12 @@ func (co *consoleOperator) SyncTrustedCAConfigMap(ctx context.Context, operatorC var actual *corev1.ConfigMap updateErr := retryOnTransientError(func() error { - var e error - actual, e = co.configMapClient.ConfigMaps(required.Namespace).Update(ctx, existing, metav1.UpdateOptions{}) + latest, e := co.configMapClient.ConfigMaps(required.Namespace).Get(ctx, required.Name, metav1.GetOptions{}) + if e != nil { + return e + } + resourcemerge.EnsureObjectMeta(resourcemerge.BoolPtr(false), &latest.ObjectMeta, required.ObjectMeta) + actual, e = co.configMapClient.ConfigMaps(required.Namespace).Update(ctx, latest, metav1.UpdateOptions{}) return e }) if updateErr != nil {