From c0510a384320319613550557aa685f4f9741c2bf Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 12 May 2026 16:14:13 +0800 Subject: [PATCH 1/2] fix(hermes): host-side chown PVC backing dirs after sync (#475) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The in-pod `init-hermes-perms` init container from #446 (c066baa) is neutered on Linux k3d because the embedded k3d config sets `KubeletInUserNamespace=true` (internal/embed/k3d-config.yaml). With user-namespacing, the pod's "root" maps to a host subuid that lacks chown authority over the host bind-mount path, so the in-pod `chown -R 10000:10000 /data` silently no-ops. The next init container (`init-hermes-data`) then fails with `mkdir /data/.hermes/home: Permission denied` and the pod CrashLoopBackOffs. local-path-provisioner's helper-pod sets the volume to 1000:1000 (internal/embed/infrastructure/base/templates/local-path.yaml), which happens to suit OpenClaw but not Hermes (containerUID = 10000). Fix: after `helmfile sync`, host-side chown the PVC backing dirs to containerUID:containerGID by exec-ing into the k3d server container via `docker exec`. That runs at the Docker daemon's real root and is not subject to the user-namespacing that silently breaks the in-pod attempt. The existing `fixRuntimeVolumeOwnership` helper already does exactly this for wallet keystore paths; we wire it into the agent deploy path via a new `ensureHermesPVCOwnership` that: 1. Waits up to 60s for each PVC (`hermes-data`, `remote-signer- keystores`) to be Bound — local-path is WaitForFirstConsumer so the host dir doesn't exist until the pod is scheduled. 2. Chowns each backing dir. 3. If a Hermes pod is currently stuck in Init:CrashLoopBackOff, deletes it so kubelet recreates immediately rather than after exponential backoff (~5 min worst case). Skips the delete when no pod is stuck so routine syncs (e.g. `obol model sync` after `obol model prefer`) do not gratuitously restart a healthy agent. Called from `hermes.Sync` after `helmfile sync` succeeds, so every Onboard / Setup / Sync call exercises it. Validated on spark2 (Linux ARM64, Ubuntu 24.04, NVIDIA GB10) by reverting the PVC dirs to 1000:1000, deleting the Hermes pod, and running `obol model sync`. Before: pod stuck in Init:CrashLoopBackOff with "Permission denied" in init-hermes-data logs. After: PVC dirs flip to 10000:10000 within the sync, pod reaches `Running 2/2` in ~30s with zero CrashLoopBackOff cycles. Test: - `TestHermesPVCPaths` pins the two host paths the helper chowns, so renaming `hermes-data`/`remote-signer-keystores` or relocating the namespace prefix can't silently regress the fix. - Full chown side-effect needs a live k3d cluster and is exercised by the spark2 validation above plus all existing integration flows; no unit-mocked k3d test added. --- internal/hermes/hermes.go | 102 +++++++++++++++++++++++++++++++++ internal/hermes/hermes_test.go | 24 ++++++++ 2 files changed, 126 insertions(+) diff --git a/internal/hermes/hermes.go b/internal/hermes/hermes.go index f4616093..ecfbdd2e 100644 --- a/internal/hermes/hermes.go +++ b/internal/hermes/hermes.go @@ -245,6 +245,11 @@ func Sync(cfg *config.Config, id string, u *ui.UI) error { return fmt.Errorf("helmfile sync failed: %w", err) } + // Host-side chown the PVC backing dirs to the in-pod UID/GID, bypassing + // the user-namespacing that defeats the in-pod `init-hermes-perms` + // chown from #446 (see ensureHermesPVCOwnership doc comment for details). + ensureHermesPVCOwnership(cfg, id, u) + // Publish wallet-metadata ConfigMap for the frontend (namespace now exists). applyWalletMetadataConfigMap(cfg, id, deploymentDir) @@ -1308,3 +1313,100 @@ func fixRuntimeVolumeOwnership(cfg *config.Config, hostPath string, u *ui.UI) { _ = os.Chown(hostPath, containerUID, containerGID) } } + +// hermesPVCPaths returns the host-side PVC backing directories that the Hermes +// pod mounts, in the order they appear in the deployment template. Exposed for +// tests and for callers that need to chown multiple volumes consistently. +func hermesPVCPaths(cfg *config.Config, id string) []string { + namespace := agentruntime.Namespace(agentruntime.Hermes, id) + return []string{ + filepath.Join(cfg.DataDir, namespace, agentruntime.Describe(agentruntime.Hermes).DataPVCName), + filepath.Join(cfg.DataDir, namespace, "remote-signer-keystores"), + } +} + +// ensureHermesPVCOwnership host-side chowns the Hermes PVC backing directories +// to containerUID:containerGID so the agent's init containers can write under +// /data on the first start. +// +// Why this is needed (issue #475): +// - The embedded k3d config (internal/embed/k3d-config.yaml) sets +// KubeletInUserNamespace=true. Pod "root" maps to a host subuid that +// lacks chown authority over the host bind-mount path provisioned by +// local-path-provisioner. The in-pod `init-hermes-perms` chown added in +// #446 (commit c066baa) silently no-ops in this configuration. +// - local-path-provisioner's helper-pod sets the dir to 1000:1000 (see +// internal/embed/infrastructure/base/templates/local-path.yaml). Hermes +// runs as 10000:10000, so the next init container fails on +// `mkdir /data/.hermes/home: Permission denied`. +// +// The fix is to chown from outside the user namespace: `docker exec` into the +// k3d server container runs at the host Docker daemon's authority, which is +// real root and is not subject to the kubelet's user-namespacing. +// +// Best-effort. Waits up to 60s for each PVC to be Bound (local-path uses +// WaitForFirstConsumer, so the host dir doesn't exist until the consuming +// pod is scheduled). On non-k3d backends fixRuntimeVolumeOwnership falls +// back to a plain os.Chown. +// +// If a Hermes pod is currently stuck in Init:CrashLoopBackOff because of the +// pre-fix permissions, deletes it so kubelet re-creates with the corrected +// perms immediately rather than after exponential backoff (up to ~5 min). +// Skips the delete when no pod is stuck so repeated `Sync` calls +// (e.g. `obol model sync` after `obol model prefer`) do not gratuitously +// restart a healthy agent. +func ensureHermesPVCOwnership(cfg *config.Config, id string, u *ui.UI) { + namespace := agentruntime.Namespace(agentruntime.Hermes, id) + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + kubectlBin := filepath.Join(cfg.BinDir, "kubectl") + + for _, pvc := range []string{ + agentruntime.Describe(agentruntime.Hermes).DataPVCName, + "remote-signer-keystores", + } { + waitCmd := exec.Command(kubectlBin, + "wait", "--for=jsonpath={.status.phase}=Bound", + "--timeout=60s", "pvc/"+pvc, "-n", namespace) + waitCmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) + _ = waitCmd.Run() // best-effort; continue even on timeout + } + + for _, p := range hermesPVCPaths(cfg, id) { + fixRuntimeVolumeOwnership(cfg, p, u) + } + + if hermesInitStuck(cfg, namespace) { + deleteCmd := exec.Command(kubectlBin, + "-n", namespace, "delete", "pod", + "-l", "app.kubernetes.io/name=hermes", + "--ignore-not-found", "--wait=false") + deleteCmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) + if err := deleteCmd.Run(); err == nil && u != nil { + u.Info("Restarted Hermes pod to apply fresh volume ownership") + } + } +} + +// hermesInitStuck reports whether at least one Hermes pod has an init +// container in CrashLoopBackOff or an Error waiting state — the signature of +// the perm-denied symptom this fix targets. Returns false on any kubectl +// failure so that a transient API hiccup does not trigger spurious restarts. +func hermesInitStuck(cfg *config.Config, namespace string) bool { + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + kubectlBin := filepath.Join(cfg.BinDir, "kubectl") + cmd := exec.Command(kubectlBin, + "-n", namespace, "get", "pods", + "-l", "app.kubernetes.io/name=hermes", + "-o", "jsonpath={.items[*].status.initContainerStatuses[*].state.waiting.reason}") + cmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) + out, err := cmd.Output() + if err != nil { + return false + } + for _, reason := range strings.Fields(string(out)) { + if strings.Contains(reason, "CrashLoop") || strings.Contains(reason, "Error") { + return true + } + } + return false +} diff --git a/internal/hermes/hermes_test.go b/internal/hermes/hermes_test.go index 4840ca68..2ce35a7a 100644 --- a/internal/hermes/hermes_test.go +++ b/internal/hermes/hermes_test.go @@ -351,3 +351,27 @@ func mkdirInstance(t *testing.T, cfg *config.Config, id string) { t.Fatalf("create Hermes instance %q: %v", id, err) } } + +// TestHermesPVCPaths pins the host-side directories that +// ensureHermesPVCOwnership chowns. Refactoring either path (renaming a PVC, +// relocating the namespace prefix in agentruntime, etc.) without updating +// this list would silently regress the #475 fix on Linux k3d, because the +// chown would land on a non-existent path while the real PVC backing dir +// kept its local-path-provisioner default ownership of 1000:1000. +func TestHermesPVCPaths(t *testing.T) { + cfg := testConfig(t) + const id = "obol-agent" + namespace := agentruntime.Namespace(agentruntime.Hermes, id) + + got := hermesPVCPaths(cfg, id) + want := []string{ + filepath.Join(cfg.DataDir, namespace, "hermes-data"), + filepath.Join(cfg.DataDir, namespace, "remote-signer-keystores"), + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("hermesPVCPaths(%q) = %#v; want %#v", id, got, want) + } + if got[0] == got[1] { + t.Fatalf("expected distinct paths, got %q twice", got[0]) + } +} From a99107da99053c3564b0d8330dda6b9186536fc8 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 12 May 2026 17:52:01 +0800 Subject: [PATCH 2/2] fix(hermes): exclude remote-signer-keystores from PVC chown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first revision of ensureHermesPVCOwnership chowned BOTH PVCs in the Hermes namespace — `hermes-data` AND `remote-signer-keystores` — to containerUID:containerGID (10000:10000). That broke the remote-signer pod on Linux k3d: remote-signer-7bc7c4759-nj8lx 0/1 CrashLoopBackOff {"level":"ERROR","message":"failed to load keystores", "error":"io error: Permission denied (os error 13)"} The two PVCs have different UID/GID contracts: hermes-data owned by the Hermes Deployment runAsUser=10000, runAsGroup=10000, fsGroup=10000 ✓ chown to 10000:10000 is correct remote-signer-keystores owned by the obol/remote-signer Helm release runAsUser=65532, fsGroup=1000 (read-only mount) ✗ chown to 10000:10000 makes /data/keystores unreadable to the pod ✓ the local-path-provisioner default of 1000:1000 already matches the fsGroup contract, so leaving it untouched is the safe behavior Reproduced today on spark2 after the original PR #475 fix landed: the Hermes pod recovered correctly, but the remote-signer pod that had been healthy on the prior boot now CrashLoopBackOff'd until the keystore volume was manually chowned back to 1000:1000. Changes: - hermesPVCPaths: drop the remote-signer-keystores entry. Doc comment explains why the negative is intentional. - ensureHermesPVCOwnership: drop remote-signer-keystores from the PVC-wait loop too — no reason to wait on a volume we no longer touch. - TestHermesPVCPaths: tightened to assert the single Hermes path. - TestHermesPVCPaths_ExcludesRemoteSignerKeystores: NEW negative guard so a future "re-include all PVCs in the namespace" refactor fails the test before it ships. --- internal/hermes/hermes.go | 20 +++++++++++++----- internal/hermes/hermes_test.go | 37 ++++++++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/internal/hermes/hermes.go b/internal/hermes/hermes.go index ecfbdd2e..59e8c66a 100644 --- a/internal/hermes/hermes.go +++ b/internal/hermes/hermes.go @@ -1314,14 +1314,22 @@ func fixRuntimeVolumeOwnership(cfg *config.Config, hostPath string, u *ui.UI) { } } -// hermesPVCPaths returns the host-side PVC backing directories that the Hermes -// pod mounts, in the order they appear in the deployment template. Exposed for -// tests and for callers that need to chown multiple volumes consistently. +// hermesPVCPaths returns the host-side PVC backing directories owned by the +// Hermes pod and chowned to containerUID:containerGID. +// +// Intentionally limited to PVCs that the Hermes container itself mounts — +// `remote-signer-keystores` is excluded even though it sits in the same +// namespace because the remote-signer pod runs as runAsUser=65532 with +// fsGroup=1000 (obol/remote-signer chart) and forcing its volume to +// 10000:10000 (Hermes' UID) makes the remote-signer crash-loop on +// `failed to load keystores: Permission denied (os error 13)` against +// the read-only /data/keystores mount. The local-path-provisioner default +// of 1000:1000 already matches that pod's fsGroup contract, so leaving +// that volume untouched is the safe behavior. func hermesPVCPaths(cfg *config.Config, id string) []string { namespace := agentruntime.Namespace(agentruntime.Hermes, id) return []string{ filepath.Join(cfg.DataDir, namespace, agentruntime.Describe(agentruntime.Hermes).DataPVCName), - filepath.Join(cfg.DataDir, namespace, "remote-signer-keystores"), } } @@ -1360,9 +1368,11 @@ func ensureHermesPVCOwnership(cfg *config.Config, id string, u *ui.UI) { kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") kubectlBin := filepath.Join(cfg.BinDir, "kubectl") + // Wait only for the PVCs hermesPVCPaths chowns. remote-signer-keystores + // is intentionally NOT in this loop — see the doc comment on + // hermesPVCPaths for why. for _, pvc := range []string{ agentruntime.Describe(agentruntime.Hermes).DataPVCName, - "remote-signer-keystores", } { waitCmd := exec.Command(kubectlBin, "wait", "--for=jsonpath={.status.phase}=Bound", diff --git a/internal/hermes/hermes_test.go b/internal/hermes/hermes_test.go index 2ce35a7a..e18499a0 100644 --- a/internal/hermes/hermes_test.go +++ b/internal/hermes/hermes_test.go @@ -353,11 +353,11 @@ func mkdirInstance(t *testing.T, cfg *config.Config, id string) { } // TestHermesPVCPaths pins the host-side directories that -// ensureHermesPVCOwnership chowns. Refactoring either path (renaming a PVC, -// relocating the namespace prefix in agentruntime, etc.) without updating -// this list would silently regress the #475 fix on Linux k3d, because the -// chown would land on a non-existent path while the real PVC backing dir -// kept its local-path-provisioner default ownership of 1000:1000. +// ensureHermesPVCOwnership chowns. Renaming the Hermes data PVC or +// relocating the namespace prefix in agentruntime without updating this +// list would silently regress the #475 fix on Linux k3d, because the chown +// would land on a non-existent path while the real PVC backing dir kept +// its local-path-provisioner default ownership of 1000:1000. func TestHermesPVCPaths(t *testing.T) { cfg := testConfig(t) const id = "obol-agent" @@ -366,12 +366,33 @@ func TestHermesPVCPaths(t *testing.T) { got := hermesPVCPaths(cfg, id) want := []string{ filepath.Join(cfg.DataDir, namespace, "hermes-data"), - filepath.Join(cfg.DataDir, namespace, "remote-signer-keystores"), } if !reflect.DeepEqual(got, want) { t.Fatalf("hermesPVCPaths(%q) = %#v; want %#v", id, got, want) } - if got[0] == got[1] { - t.Fatalf("expected distinct paths, got %q twice", got[0]) +} + +// TestHermesPVCPaths_ExcludesRemoteSignerKeystores pins the negative half of +// the contract: the helper MUST NOT include the remote-signer-keystores PVC +// path. The first revision of this fix included it, which chowned the +// volume to 10000:10000 (Hermes' UID) and broke the remote-signer pod — +// remote-signer runs as runAsUser=65532 with fsGroup=1000, expecting the +// local-path-provisioner default ownership of 1000:1000. The result was +// a remote-signer CrashLoopBackOff with +// +// failed to load keystores: Permission denied (os error 13) +// +// against /data/keystores. This guard makes that regression impossible to +// re-introduce by accident. +func TestHermesPVCPaths_ExcludesRemoteSignerKeystores(t *testing.T) { + cfg := testConfig(t) + const id = "obol-agent" + namespace := agentruntime.Namespace(agentruntime.Hermes, id) + + keystoreVolume := filepath.Join(cfg.DataDir, namespace, "remote-signer-keystores") + for _, p := range hermesPVCPaths(cfg, id) { + if p == keystoreVolume { + t.Fatalf("hermesPVCPaths included %q — the remote-signer pod (runAsUser=65532, fsGroup=1000) crash-loops when this volume is chowned to Hermes' containerUID:containerGID", keystoreVolume) + } } }