diff --git a/tools/helm/options.go b/tools/helm/options.go index 5119673..9c563e5 100644 --- a/tools/helm/options.go +++ b/tools/helm/options.go @@ -47,7 +47,8 @@ var ErrNilHelmReleaseResult = errors.New("helm release result is nil") func DefaultOptions() *RawOptions { return &RawOptions{ - Timeout: 5 * time.Minute, + Timeout: 5 * time.Minute, + StaleLockThreshold: DefaultStaleLockThreshold, } } @@ -65,6 +66,7 @@ func BindOptions(opts *RawOptions, cmd *cobra.Command) error { cmd.Flags().StringVar(&opts.KustoEndpoint, "kusto-endpoint", opts.KustoEndpoint, "URI of the Kusto endpoint to use for diagnostics.") cmd.Flags().DurationVar(&opts.Timeout, "timeout", opts.Timeout, "Timeout for waiting on the Helm release.") + cmd.Flags().DurationVar(&opts.StaleLockThreshold, "stale-lock-threshold", opts.StaleLockThreshold, "Fail fast before deploying if the latest release revision has been stuck in a pending (install/upgrade/rollback) state for longer than this duration. Set to 0 to disable the stale-lock check.") cmd.Flags().StringVar(&opts.KubeconfigFile, "kubeconfig", opts.KubeconfigFile, "Path to the kubeconfig.") cmd.Flags().BoolVar(&opts.DryRun, "dry-run", opts.DryRun, "Do not make any changes to the Kubernetes API server.") @@ -87,7 +89,8 @@ type RawOptions struct { KustoTable string KustoEndpoint string - Timeout time.Duration + Timeout time.Duration + StaleLockThreshold time.Duration KubeconfigFile string DryRun bool @@ -125,9 +128,10 @@ type completedOptions struct { KustoTable string KustoEndpoint string - Timeout time.Duration - DryRun bool - RollbackOnFailure bool + Timeout time.Duration + StaleLockThreshold time.Duration + DryRun bool + RollbackOnFailure bool } type Options struct { @@ -152,6 +156,10 @@ func (o *RawOptions) Validate() (*ValidatedOptions, error) { } } + if o.StaleLockThreshold < 0 { + return nil, fmt.Errorf("the stale-lock threshold must not be negative; use --stale-lock-threshold=0 to disable the check, got %s", o.StaleLockThreshold) + } + return &ValidatedOptions{ validatedOptions: &validatedOptions{ RawOptions: o, @@ -255,9 +263,10 @@ func (o *ValidatedOptions) Complete() (*Options, error) { KustoTable: o.KustoTable, KustoEndpoint: o.KustoEndpoint, - Timeout: o.Timeout, - DryRun: o.DryRun, - RollbackOnFailure: o.RollbackOnFailure, + Timeout: o.Timeout, + StaleLockThreshold: o.StaleLockThreshold, + DryRun: o.DryRun, + RollbackOnFailure: o.RollbackOnFailure, }, }, nil } @@ -287,6 +296,15 @@ func (opts *Options) Deploy(ctx context.Context) error { noReleaseYet := errors.Is(err, driver.ErrReleaseNotFound) || isReleaseUninstalled(logger, versions) if !noReleaseYet { + // fail fast with actionable diagnostics if the latest revision is stuck in a + // stale pending (install/upgrade/rollback) state, instead of letting Helm emit + // the opaque "another operation ... is in progress" error during the upgrade. + if opts.StaleLockThreshold > 0 { + if err := checkForStaleReleaseLock(logger, opts.StaleLockThreshold, versions); err != nil { + return err + } + } + // only when a previous release exists, do we need to fixup managed fields opts.DryRun = true logger.Info("Doing dry-run of Helm release.") diff --git a/tools/helm/stale_lock.go b/tools/helm/stale_lock.go new file mode 100644 index 0000000..77745b9 --- /dev/null +++ b/tools/helm/stale_lock.go @@ -0,0 +1,122 @@ +package helm + +import ( + "fmt" + "time" + + "github.com/go-logr/logr" + helmrelease "helm.sh/helm/v4/pkg/release" + helmreleasev1 "helm.sh/helm/v4/pkg/release/v1" +) + +// DefaultStaleLockThreshold is how old a pending Helm release revision must be +// before helmdeploy treats it as a stale lock and fails fast instead of letting +// Helm return the opaque "another operation (install/upgrade/rollback) is in +// progress" error. +const DefaultStaleLockThreshold = 15 * time.Minute + +// StaleReleaseLockError is returned when the latest release revision is stuck in +// a pending state (pending-install, pending-upgrade, or pending-rollback) and is +// older than the configured staleness threshold. It carries enough context for +// an operator to identify and remediate the stale lock. +type StaleReleaseLockError struct { + ReleaseName string + Namespace string + Revision int + Status string + Age time.Duration + Threshold time.Duration + SecretName string +} + +func (e *StaleReleaseLockError) Error() string { + return fmt.Sprintf( + "helm release %q in namespace %q is stuck in pending state %q at revision %d "+ + "(age %s exceeds staleness threshold %s); a previous Helm operation most likely "+ + "crashed or timed out and left a stale release lock.\n"+ + "To recover, back up and delete the stale Helm release secret, then retry the deployment:\n"+ + " kubectl --namespace %s get secret %s -o yaml > %s.backup.yaml\n"+ + " kubectl --namespace %s delete secret %s\n"+ + "Only delete the secret after confirming no Helm operation is genuinely still running.", + e.ReleaseName, e.Namespace, e.Status, e.Revision, + e.Age.Round(time.Second), e.Threshold, + e.Namespace, e.SecretName, e.SecretName, + e.Namespace, e.SecretName, + ) +} + +// checkForStaleReleaseLock inspects the latest revision in the provided release +// history. If that revision is in a pending state and its Info.LastDeployed +// timestamp is older than threshold, it returns a *StaleReleaseLockError so the +// caller can fail fast with actionable diagnostics. Pending revisions whose +// Info.LastDeployed is within the threshold (i.e. a genuinely in-flight +// operation) are left alone and return nil. +func checkForStaleReleaseLock(logger logr.Logger, threshold time.Duration, versionsi []helmrelease.Releaser) error { + versions, err := releaseListToV1List(versionsi) + if err != nil { + // Mirror isReleaseUninstalled: a conversion failure should not block the + // deployment - just skip the stale-lock diagnostics. + logger.Error(err, "cannot convert release list to v1 release list for stale-lock check") + return nil + } + if len(versions) == 0 { + return nil + } + + latest := versions[len(versions)-1] + if latest.Info == nil || !latest.Info.Status.IsPending() { + return nil + } + + age := releaseAge(latest) + if age < threshold { + logger.Info( + "Latest release revision is in a pending state but within the staleness threshold; continuing.", + "release", latest.Name, + "namespace", latest.Namespace, + "revision", latest.Version, + "status", latest.Info.Status, + "age", age.Round(time.Second).String(), + "threshold", threshold.String(), + ) + return nil + } + + secretName := releaseSecretName(latest) + logger.Info( + "Detected stale Helm release lock; failing fast.", + "release", latest.Name, + "namespace", latest.Namespace, + "revision", latest.Version, + "status", latest.Info.Status, + "age", age.Round(time.Second).String(), + "threshold", threshold.String(), + "secret", secretName, + ) + return &StaleReleaseLockError{ + ReleaseName: latest.Name, + Namespace: latest.Namespace, + Revision: latest.Version, + Status: latest.Info.Status.String(), + Age: age, + Threshold: threshold, + SecretName: secretName, + } +} + +// releaseAge reports how long ago the given release revision was last deployed. +func releaseAge(release *helmreleasev1.Release) time.Duration { + // A zero LastDeployed (e.g. a never-successfully-deployed pending-install) + // is treated as age 0 so it is never flagged stale - failing fast on a + // missing timestamp would break genuinely in-flight operations. + if release == nil || release.Info == nil || release.Info.LastDeployed.IsZero() { + return 0 + } + return time.Since(release.Info.LastDeployed) +} + +// releaseSecretName returns the name of the Kubernetes secret backing the given +// release revision, as used by Helm's default secret storage driver. +func releaseSecretName(release *helmreleasev1.Release) string { + return fmt.Sprintf("sh.helm.release.v1.%s.v%d", release.Name, release.Version) +} diff --git a/tools/helm/stale_lock_test.go b/tools/helm/stale_lock_test.go new file mode 100644 index 0000000..298b17e --- /dev/null +++ b/tools/helm/stale_lock_test.go @@ -0,0 +1,218 @@ +package helm + +import ( + "errors" + "strings" + "testing" + "time" + + "github.com/go-logr/logr/testr" + helmrelease "helm.sh/helm/v4/pkg/release" + helmreleasecommon "helm.sh/helm/v4/pkg/release/common" + helmreleasev1 "helm.sh/helm/v4/pkg/release/v1" +) + +func pendingRelease(name, namespace string, revision int, status helmreleasecommon.Status, lastDeployed time.Time) helmrelease.Releaser { + return &helmreleasev1.Release{ + Name: name, + Namespace: namespace, + Version: revision, + Info: &helmreleasev1.Info{ + Status: status, + LastDeployed: lastDeployed, + }, + } +} + +func TestCheckForStaleReleaseLock(t *testing.T) { + const threshold = DefaultStaleLockThreshold + now := time.Now() + + tests := []struct { + name string + versions []helmrelease.Releaser + wantStale bool + wantRelease string + wantNS string + wantRev int + wantStatus string + wantSecret string + }{ + { + name: "stale pending-upgrade fails fast", + versions: []helmrelease.Releaser{ + pendingRelease("backend", "aro-hcp", 7, helmreleasecommon.StatusPendingUpgrade, now.Add(-42*time.Minute)), + }, + wantStale: true, + wantRelease: "backend", + wantNS: "aro-hcp", + wantRev: 7, + wantStatus: "pending-upgrade", + wantSecret: "sh.helm.release.v1.backend.v7", + }, + { + name: "stale pending-install fails fast", + versions: []helmrelease.Releaser{ + pendingRelease("frontend", "aro-hcp", 1, helmreleasecommon.StatusPendingInstall, now.Add(-1*time.Hour)), + }, + wantStale: true, + wantRelease: "frontend", + wantNS: "aro-hcp", + wantRev: 1, + wantStatus: "pending-install", + wantSecret: "sh.helm.release.v1.frontend.v1", + }, + { + name: "stale pending-rollback fails fast", + versions: []helmrelease.Releaser{ + pendingRelease("backend", "aro-hcp", 9, helmreleasecommon.StatusPendingRollback, now.Add(-90*time.Minute)), + }, + wantStale: true, + wantRelease: "backend", + wantNS: "aro-hcp", + wantRev: 9, + wantStatus: "pending-rollback", + wantSecret: "sh.helm.release.v1.backend.v9", + }, + { + name: "normal deployed history does not trigger", + versions: []helmrelease.Releaser{ + pendingRelease("backend", "aro-hcp", 7, helmreleasecommon.StatusDeployed, now.Add(-42*time.Minute)), + }, + wantStale: false, + }, + { + name: "fresh pending operation within threshold does not trigger", + versions: []helmrelease.Releaser{ + pendingRelease("backend", "aro-hcp", 8, helmreleasecommon.StatusPendingUpgrade, now.Add(-2*time.Minute)), + }, + wantStale: false, + }, + { + name: "pending revision with zero LastDeployed does not trigger", + versions: []helmrelease.Releaser{ + pendingRelease("frontend", "aro-hcp", 1, helmreleasecommon.StatusPendingInstall, time.Time{}), + }, + wantStale: false, + }, + { + name: "empty history does not trigger", + versions: nil, + wantStale: false, + }, + { + name: "only the latest revision is considered", + versions: []helmrelease.Releaser{ + pendingRelease("backend", "aro-hcp", 6, helmreleasecommon.StatusPendingUpgrade, now.Add(-1*time.Hour)), + pendingRelease("backend", "aro-hcp", 7, helmreleasecommon.StatusDeployed, now.Add(-30*time.Minute)), + }, + wantStale: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + logger := testr.New(t) + err := checkForStaleReleaseLock(logger, threshold, tc.versions) + + if !tc.wantStale { + if err != nil { + t.Fatalf("expected no stale-lock error, got: %v", err) + } + return + } + + if err == nil { + t.Fatalf("expected a stale-lock error, got nil") + } + var staleErr *StaleReleaseLockError + if !errors.As(err, &staleErr) { + t.Fatalf("expected *StaleReleaseLockError, got %T: %v", err, err) + } + if staleErr.ReleaseName != tc.wantRelease { + t.Errorf("release name: want %q, got %q", tc.wantRelease, staleErr.ReleaseName) + } + if staleErr.Namespace != tc.wantNS { + t.Errorf("namespace: want %q, got %q", tc.wantNS, staleErr.Namespace) + } + if staleErr.Revision != tc.wantRev { + t.Errorf("revision: want %d, got %d", tc.wantRev, staleErr.Revision) + } + if staleErr.Status != tc.wantStatus { + t.Errorf("status: want %q, got %q", tc.wantStatus, staleErr.Status) + } + if staleErr.SecretName != tc.wantSecret { + t.Errorf("secret name: want %q, got %q", tc.wantSecret, staleErr.SecretName) + } + if staleErr.Threshold != threshold { + t.Errorf("threshold: want %s, got %s", threshold, staleErr.Threshold) + } + if staleErr.Age < threshold { + t.Errorf("age %s should be >= threshold %s", staleErr.Age, threshold) + } + }) + } +} + +func TestStaleReleaseLockErrorMessage(t *testing.T) { + err := &StaleReleaseLockError{ + ReleaseName: "backend", + Namespace: "aro-hcp", + Revision: 7, + Status: "pending-upgrade", + Age: 42 * time.Minute, + Threshold: 15 * time.Minute, + SecretName: "sh.helm.release.v1.backend.v7", + } + + msg := err.Error() + for _, want := range []string{ + "backend", + "aro-hcp", + "pending-upgrade", + "revision 7", + "sh.helm.release.v1.backend.v7", + "kubectl", + "delete secret", + } { + if !strings.Contains(msg, want) { + t.Errorf("error message missing %q; full message:\n%s", want, msg) + } + } +} + +func TestValidateStaleLockThreshold(t *testing.T) { + base := func() *RawOptions { + return &RawOptions{ + ReleaseName: "backend", + ReleaseNamespace: "aro-hcp", + ChartDir: "/charts/backend", + ValuesFile: "/values.yaml", + KubeconfigFile: "/kubeconfig", + } + } + + t.Run("negative threshold is rejected", func(t *testing.T) { + o := base() + o.StaleLockThreshold = -1 * time.Minute + if _, err := o.Validate(); err == nil { + t.Fatal("expected validation error for negative stale-lock threshold, got nil") + } + }) + + t.Run("zero threshold is allowed", func(t *testing.T) { + o := base() + o.StaleLockThreshold = 0 + if _, err := o.Validate(); err != nil { + t.Fatalf("expected no error for zero stale-lock threshold, got: %v", err) + } + }) + + t.Run("positive threshold is allowed", func(t *testing.T) { + o := base() + o.StaleLockThreshold = DefaultStaleLockThreshold + if _, err := o.Validate(); err != nil { + t.Fatalf("expected no error for positive stale-lock threshold, got: %v", err) + } + }) +}