Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 26 additions & 8 deletions tools/helm/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ var ErrNilHelmReleaseResult = errors.New("helm release result is nil")

func DefaultOptions() *RawOptions {
return &RawOptions{
Timeout: 5 * time.Minute,
Timeout: 5 * time.Minute,
StaleLockThreshold: DefaultStaleLockThreshold,
}
}

Expand All @@ -65,6 +66,7 @@ func BindOptions(opts *RawOptions, cmd *cobra.Command) error {
cmd.Flags().StringVar(&opts.KustoEndpoint, "kusto-endpoint", opts.KustoEndpoint, "URI of the Kusto endpoint to use for diagnostics.")

cmd.Flags().DurationVar(&opts.Timeout, "timeout", opts.Timeout, "Timeout for waiting on the Helm release.")
cmd.Flags().DurationVar(&opts.StaleLockThreshold, "stale-lock-threshold", opts.StaleLockThreshold, "Fail fast before deploying if the latest release revision has been stuck in a pending (install/upgrade/rollback) state for longer than this duration. Set to 0 to disable the stale-lock check.")

cmd.Flags().StringVar(&opts.KubeconfigFile, "kubeconfig", opts.KubeconfigFile, "Path to the kubeconfig.")
cmd.Flags().BoolVar(&opts.DryRun, "dry-run", opts.DryRun, "Do not make any changes to the Kubernetes API server.")
Expand All @@ -87,7 +89,8 @@ type RawOptions struct {
KustoTable string
KustoEndpoint string

Timeout time.Duration
Timeout time.Duration
StaleLockThreshold time.Duration

KubeconfigFile string
DryRun bool
Expand Down Expand Up @@ -125,9 +128,10 @@ type completedOptions struct {
KustoTable string
KustoEndpoint string

Timeout time.Duration
DryRun bool
RollbackOnFailure bool
Timeout time.Duration
StaleLockThreshold time.Duration
DryRun bool
RollbackOnFailure bool
}

type Options struct {
Expand All @@ -152,6 +156,10 @@ func (o *RawOptions) Validate() (*ValidatedOptions, error) {
}
}

if o.StaleLockThreshold < 0 {
return nil, fmt.Errorf("the stale-lock threshold must not be negative; use --stale-lock-threshold=0 to disable the check, got %s", o.StaleLockThreshold)
}

return &ValidatedOptions{
validatedOptions: &validatedOptions{
RawOptions: o,
Expand Down Expand Up @@ -255,9 +263,10 @@ func (o *ValidatedOptions) Complete() (*Options, error) {
KustoTable: o.KustoTable,
KustoEndpoint: o.KustoEndpoint,

Timeout: o.Timeout,
DryRun: o.DryRun,
RollbackOnFailure: o.RollbackOnFailure,
Timeout: o.Timeout,
StaleLockThreshold: o.StaleLockThreshold,
DryRun: o.DryRun,
RollbackOnFailure: o.RollbackOnFailure,
},
}, nil
}
Expand Down Expand Up @@ -287,6 +296,15 @@ func (opts *Options) Deploy(ctx context.Context) error {
noReleaseYet := errors.Is(err, driver.ErrReleaseNotFound) || isReleaseUninstalled(logger, versions)

if !noReleaseYet {
// fail fast with actionable diagnostics if the latest revision is stuck in a
// stale pending (install/upgrade/rollback) state, instead of letting Helm emit
// the opaque "another operation ... is in progress" error during the upgrade.
if opts.StaleLockThreshold > 0 {
if err := checkForStaleReleaseLock(logger, opts.StaleLockThreshold, versions); err != nil {
return err
}
}

// only when a previous release exists, do we need to fixup managed fields
opts.DryRun = true
logger.Info("Doing dry-run of Helm release.")
Expand Down
122 changes: 122 additions & 0 deletions tools/helm/stale_lock.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
package helm

import (
"fmt"
"time"

"github.com/go-logr/logr"
helmrelease "helm.sh/helm/v4/pkg/release"
helmreleasev1 "helm.sh/helm/v4/pkg/release/v1"
)

// DefaultStaleLockThreshold is how old a pending Helm release revision must be
// before helmdeploy treats it as a stale lock and fails fast instead of letting
// Helm return the opaque "another operation (install/upgrade/rollback) is in
// progress" error.
const DefaultStaleLockThreshold = 15 * time.Minute

// StaleReleaseLockError is returned when the latest release revision is stuck in
// a pending state (pending-install, pending-upgrade, or pending-rollback) and is
// older than the configured staleness threshold. It carries enough context for
// an operator to identify and remediate the stale lock.
type StaleReleaseLockError struct {
ReleaseName string
Namespace string
Revision int
Status string
Age time.Duration
Threshold time.Duration
SecretName string
}

func (e *StaleReleaseLockError) Error() string {
return fmt.Sprintf(
"helm release %q in namespace %q is stuck in pending state %q at revision %d "+
"(age %s exceeds staleness threshold %s); a previous Helm operation most likely "+
"crashed or timed out and left a stale release lock.\n"+
"To recover, back up and delete the stale Helm release secret, then retry the deployment:\n"+
" kubectl --namespace %s get secret %s -o yaml > %s.backup.yaml\n"+
" kubectl --namespace %s delete secret %s\n"+
"Only delete the secret after confirming no Helm operation is genuinely still running.",
e.ReleaseName, e.Namespace, e.Status, e.Revision,
e.Age.Round(time.Second), e.Threshold,
e.Namespace, e.SecretName, e.SecretName,
e.Namespace, e.SecretName,
)
}

// checkForStaleReleaseLock inspects the latest revision in the provided release
// history. If that revision is in a pending state and its Info.LastDeployed
// timestamp is older than threshold, it returns a *StaleReleaseLockError so the
// caller can fail fast with actionable diagnostics. Pending revisions whose
// Info.LastDeployed is within the threshold (i.e. a genuinely in-flight
// operation) are left alone and return nil.
func checkForStaleReleaseLock(logger logr.Logger, threshold time.Duration, versionsi []helmrelease.Releaser) error {
Comment thread
raelga marked this conversation as resolved.
versions, err := releaseListToV1List(versionsi)
if err != nil {
// Mirror isReleaseUninstalled: a conversion failure should not block the
// deployment - just skip the stale-lock diagnostics.
logger.Error(err, "cannot convert release list to v1 release list for stale-lock check")
return nil
}
if len(versions) == 0 {
return nil
}

latest := versions[len(versions)-1]
if latest.Info == nil || !latest.Info.Status.IsPending() {
return nil
}

age := releaseAge(latest)
if age < threshold {
logger.Info(
"Latest release revision is in a pending state but within the staleness threshold; continuing.",
"release", latest.Name,
"namespace", latest.Namespace,
"revision", latest.Version,
"status", latest.Info.Status,
"age", age.Round(time.Second).String(),
"threshold", threshold.String(),
)
return nil
}

secretName := releaseSecretName(latest)
logger.Info(
"Detected stale Helm release lock; failing fast.",
"release", latest.Name,
"namespace", latest.Namespace,
"revision", latest.Version,
"status", latest.Info.Status,
"age", age.Round(time.Second).String(),
"threshold", threshold.String(),
"secret", secretName,
)
return &StaleReleaseLockError{
ReleaseName: latest.Name,
Namespace: latest.Namespace,
Revision: latest.Version,
Status: latest.Info.Status.String(),
Age: age,
Threshold: threshold,
SecretName: secretName,
}
}

// releaseAge reports how long ago the given release revision was last deployed.
func releaseAge(release *helmreleasev1.Release) time.Duration {
// A zero LastDeployed (e.g. a never-successfully-deployed pending-install)
// is treated as age 0 so it is never flagged stale - failing fast on a
// missing timestamp would break genuinely in-flight operations.
if release == nil || release.Info == nil || release.Info.LastDeployed.IsZero() {
return 0
}
return time.Since(release.Info.LastDeployed)
}

// releaseSecretName returns the name of the Kubernetes secret backing the given
// release revision, as used by Helm's default secret storage driver.
func releaseSecretName(release *helmreleasev1.Release) string {
return fmt.Sprintf("sh.helm.release.v1.%s.v%d", release.Name, release.Version)
}
Loading
Loading