diff --git a/internal/hermes/hermes.go b/internal/hermes/hermes.go index f4616093..59e8c66a 100644 --- a/internal/hermes/hermes.go +++ b/internal/hermes/hermes.go @@ -245,6 +245,11 @@ func Sync(cfg *config.Config, id string, u *ui.UI) error { return fmt.Errorf("helmfile sync failed: %w", err) } + // Host-side chown the PVC backing dirs to the in-pod UID/GID, bypassing + // the user-namespacing that defeats the in-pod `init-hermes-perms` + // chown from #446 (see ensureHermesPVCOwnership doc comment for details). + ensureHermesPVCOwnership(cfg, id, u) + // Publish wallet-metadata ConfigMap for the frontend (namespace now exists). applyWalletMetadataConfigMap(cfg, id, deploymentDir) @@ -1308,3 +1313,110 @@ func fixRuntimeVolumeOwnership(cfg *config.Config, hostPath string, u *ui.UI) { _ = os.Chown(hostPath, containerUID, containerGID) } } + +// hermesPVCPaths returns the host-side PVC backing directories owned by the +// Hermes pod and chowned to containerUID:containerGID. +// +// Intentionally limited to PVCs that the Hermes container itself mounts — +// `remote-signer-keystores` is excluded even though it sits in the same +// namespace because the remote-signer pod runs as runAsUser=65532 with +// fsGroup=1000 (obol/remote-signer chart) and forcing its volume to +// 10000:10000 (Hermes' UID) makes the remote-signer crash-loop on +// `failed to load keystores: Permission denied (os error 13)` against +// the read-only /data/keystores mount. The local-path-provisioner default +// of 1000:1000 already matches that pod's fsGroup contract, so leaving +// that volume untouched is the safe behavior. +func hermesPVCPaths(cfg *config.Config, id string) []string { + namespace := agentruntime.Namespace(agentruntime.Hermes, id) + return []string{ + filepath.Join(cfg.DataDir, namespace, agentruntime.Describe(agentruntime.Hermes).DataPVCName), + } +} + +// ensureHermesPVCOwnership host-side chowns the Hermes PVC backing directories +// to containerUID:containerGID so the agent's init containers can write under +// /data on the first start. +// +// Why this is needed (issue #475): +// - The embedded k3d config (internal/embed/k3d-config.yaml) sets +// KubeletInUserNamespace=true. Pod "root" maps to a host subuid that +// lacks chown authority over the host bind-mount path provisioned by +// local-path-provisioner. The in-pod `init-hermes-perms` chown added in +// #446 (commit c066baa) silently no-ops in this configuration. +// - local-path-provisioner's helper-pod sets the dir to 1000:1000 (see +// internal/embed/infrastructure/base/templates/local-path.yaml). Hermes +// runs as 10000:10000, so the next init container fails on +// `mkdir /data/.hermes/home: Permission denied`. +// +// The fix is to chown from outside the user namespace: `docker exec` into the +// k3d server container runs at the host Docker daemon's authority, which is +// real root and is not subject to the kubelet's user-namespacing. +// +// Best-effort. Waits up to 60s for each PVC to be Bound (local-path uses +// WaitForFirstConsumer, so the host dir doesn't exist until the consuming +// pod is scheduled). On non-k3d backends fixRuntimeVolumeOwnership falls +// back to a plain os.Chown. +// +// If a Hermes pod is currently stuck in Init:CrashLoopBackOff because of the +// pre-fix permissions, deletes it so kubelet re-creates with the corrected +// perms immediately rather than after exponential backoff (up to ~5 min). +// Skips the delete when no pod is stuck so repeated `Sync` calls +// (e.g. `obol model sync` after `obol model prefer`) do not gratuitously +// restart a healthy agent. +func ensureHermesPVCOwnership(cfg *config.Config, id string, u *ui.UI) { + namespace := agentruntime.Namespace(agentruntime.Hermes, id) + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + kubectlBin := filepath.Join(cfg.BinDir, "kubectl") + + // Wait only for the PVCs hermesPVCPaths chowns. remote-signer-keystores + // is intentionally NOT in this loop — see the doc comment on + // hermesPVCPaths for why. + for _, pvc := range []string{ + agentruntime.Describe(agentruntime.Hermes).DataPVCName, + } { + waitCmd := exec.Command(kubectlBin, + "wait", "--for=jsonpath={.status.phase}=Bound", + "--timeout=60s", "pvc/"+pvc, "-n", namespace) + waitCmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) + _ = waitCmd.Run() // best-effort; continue even on timeout + } + + for _, p := range hermesPVCPaths(cfg, id) { + fixRuntimeVolumeOwnership(cfg, p, u) + } + + if hermesInitStuck(cfg, namespace) { + deleteCmd := exec.Command(kubectlBin, + "-n", namespace, "delete", "pod", + "-l", "app.kubernetes.io/name=hermes", + "--ignore-not-found", "--wait=false") + deleteCmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) + if err := deleteCmd.Run(); err == nil && u != nil { + u.Info("Restarted Hermes pod to apply fresh volume ownership") + } + } +} + +// hermesInitStuck reports whether at least one Hermes pod has an init +// container in CrashLoopBackOff or an Error waiting state — the signature of +// the perm-denied symptom this fix targets. Returns false on any kubectl +// failure so that a transient API hiccup does not trigger spurious restarts. +func hermesInitStuck(cfg *config.Config, namespace string) bool { + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + kubectlBin := filepath.Join(cfg.BinDir, "kubectl") + cmd := exec.Command(kubectlBin, + "-n", namespace, "get", "pods", + "-l", "app.kubernetes.io/name=hermes", + "-o", "jsonpath={.items[*].status.initContainerStatuses[*].state.waiting.reason}") + cmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) + out, err := cmd.Output() + if err != nil { + return false + } + for _, reason := range strings.Fields(string(out)) { + if strings.Contains(reason, "CrashLoop") || strings.Contains(reason, "Error") { + return true + } + } + return false +} diff --git a/internal/hermes/hermes_test.go b/internal/hermes/hermes_test.go index 4840ca68..e18499a0 100644 --- a/internal/hermes/hermes_test.go +++ b/internal/hermes/hermes_test.go @@ -351,3 +351,48 @@ func mkdirInstance(t *testing.T, cfg *config.Config, id string) { t.Fatalf("create Hermes instance %q: %v", id, err) } } + +// TestHermesPVCPaths pins the host-side directories that +// ensureHermesPVCOwnership chowns. Renaming the Hermes data PVC or +// relocating the namespace prefix in agentruntime without updating this +// list would silently regress the #475 fix on Linux k3d, because the chown +// would land on a non-existent path while the real PVC backing dir kept +// its local-path-provisioner default ownership of 1000:1000. +func TestHermesPVCPaths(t *testing.T) { + cfg := testConfig(t) + const id = "obol-agent" + namespace := agentruntime.Namespace(agentruntime.Hermes, id) + + got := hermesPVCPaths(cfg, id) + want := []string{ + filepath.Join(cfg.DataDir, namespace, "hermes-data"), + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("hermesPVCPaths(%q) = %#v; want %#v", id, got, want) + } +} + +// TestHermesPVCPaths_ExcludesRemoteSignerKeystores pins the negative half of +// the contract: the helper MUST NOT include the remote-signer-keystores PVC +// path. The first revision of this fix included it, which chowned the +// volume to 10000:10000 (Hermes' UID) and broke the remote-signer pod — +// remote-signer runs as runAsUser=65532 with fsGroup=1000, expecting the +// local-path-provisioner default ownership of 1000:1000. The result was +// a remote-signer CrashLoopBackOff with +// +// failed to load keystores: Permission denied (os error 13) +// +// against /data/keystores. This guard makes that regression impossible to +// re-introduce by accident. +func TestHermesPVCPaths_ExcludesRemoteSignerKeystores(t *testing.T) { + cfg := testConfig(t) + const id = "obol-agent" + namespace := agentruntime.Namespace(agentruntime.Hermes, id) + + keystoreVolume := filepath.Join(cfg.DataDir, namespace, "remote-signer-keystores") + for _, p := range hermesPVCPaths(cfg, id) { + if p == keystoreVolume { + t.Fatalf("hermesPVCPaths included %q — the remote-signer pod (runAsUser=65532, fsGroup=1000) crash-loops when this volume is chowned to Hermes' containerUID:containerGID", keystoreVolume) + } + } +}