From dcdee9c96a32f48af1c8f221df5c0149db74d6f1 Mon Sep 17 00:00:00 2001 From: mdzurick Date: Tue, 9 Jun 2026 18:32:15 +0000 Subject: [PATCH] harden cosign against transient failures by adding retry Signed-off-by: mdzurick --- .github/workflows/docker_images.yml | 48 ++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docker_images.yml b/.github/workflows/docker_images.yml index d2540c58584..46843df6ae7 100644 --- a/.github/workflows/docker_images.yml +++ b/.github/workflows/docker_images.yml @@ -202,7 +202,17 @@ jobs: env: DIGEST: ${{ steps.docker_build.outputs.digest }} TAGS: ${{ steps.metadata.outputs.tags }} - run: cosign sign --yes --recursive "${TAGS}@${DIGEST}" + # Retry up to 5 times (waits: 15s, 30s, 45s, 60s) on transient OIDC/Fulcio network + # failures, e.g. a TLS handshake timeout that outlasts the OIDC token. Each attempt + # fetches a fresh token, avoiding "expired_token" from cosign's in-process retries. + run: | + n=0 + until cosign sign --yes --recursive "${TAGS}@${DIGEST}"; do + n=$((n+1)) + [ $n -ge 5 ] && { echo "::error::cosign sign failed after $n attempts"; exit 1; } + echo "cosign sign attempt $n failed; retrying in $((15*n))s..." + sleep $((15*n)) + done - name: Cache cuda-quantum image if: steps.build_info.outputs.tar_cache && steps.build_info.outputs.tar_archive @@ -417,7 +427,17 @@ jobs: env: DIGEST: ${{ steps.docker_build.outputs.digest }} TAGS: ${{ steps.metadata.outputs.tags }} - run: cosign sign --yes --recursive "${TAGS}@${DIGEST}" + # Retry up to 5 times (waits: 15s, 30s, 45s, 60s) on transient OIDC/Fulcio network + # failures, e.g. a TLS handshake timeout that outlasts the OIDC token. Each attempt + # fetches a fresh token, avoiding "expired_token" from cosign's in-process retries. + run: | + n=0 + until cosign sign --yes --recursive "${TAGS}@${DIGEST}"; do + n=$((n+1)) + [ $n -ge 5 ] && { echo "::error::cosign sign failed after $n attempts"; exit 1; } + echo "cosign sign attempt $n failed; retrying in $((15*n))s..." 
+ sleep $((15*n)) + done - name: Cache cuda-quantum-dev image if: steps.prereqs.outputs.tar_cache && steps.prereqs.outputs.tar_archive @@ -677,7 +697,17 @@ jobs: env: DIGEST: ${{ steps.release_build.outputs.digest }} TAGS: ${{ steps.cudaqdev_metadata.outputs.tags }} - run: cosign sign --yes --recursive "${TAGS}@${DIGEST}" + # Retry up to 5 times (waits: 15s, 30s, 45s, 60s) on transient OIDC/Fulcio network + # failures, e.g. a TLS handshake timeout that outlasts the OIDC token. Each attempt + # fetches a fresh token, avoiding "expired_token" from cosign's in-process retries. + run: | + n=0 + until cosign sign --yes --recursive "${TAGS}@${DIGEST}"; do + n=$((n+1)) + [ $n -ge 5 ] && { echo "::error::cosign sign failed after $n attempts"; exit 1; } + echo "cosign sign attempt $n failed; retrying in $((15*n))s..." + sleep $((15*n)) + done - name: Log in to NGC if: needs.metadata.outputs.push_to_ngc == 'true' @@ -745,7 +775,17 @@ jobs: env: DIGEST: ${{ steps.cudaq_build.outputs.digest }} TAGS: ${{ steps.cudaq_metadata.outputs.tags }} - run: cosign sign --yes --recursive "${TAGS}@${DIGEST}" + # Retry up to 5 times (waits: 15s, 30s, 45s, 60s) on transient OIDC/Fulcio network + # failures, e.g. a TLS handshake timeout that outlasts the OIDC token. Each attempt + # fetches a fresh token, avoiding "expired_token" from cosign's in-process retries. + run: | + n=0 + until cosign sign --yes --recursive "${TAGS}@${DIGEST}"; do + n=$((n+1)) + [ $n -ge 5 ] && { echo "::error::cosign sign failed after $n attempts"; exit 1; } + echo "cosign sign attempt $n failed; retrying in $((15*n))s..." + sleep $((15*n)) + done - name: Install NGC CLI if: inputs.environment && needs.metadata.outputs.push_to_ngc == 'true'