Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions internal/hermes/hermes.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,11 @@ func Sync(cfg *config.Config, id string, u *ui.UI) error {
return fmt.Errorf("helmfile sync failed: %w", err)
}

// Host-side chown the PVC backing dirs to the in-pod UID/GID, bypassing
// the user-namespacing that defeats the in-pod `init-hermes-perms`
// chown from #446 (see ensureHermesPVCOwnership doc comment for details).
ensureHermesPVCOwnership(cfg, id, u)

// Publish wallet-metadata ConfigMap for the frontend (namespace now exists).
applyWalletMetadataConfigMap(cfg, id, deploymentDir)

Expand Down Expand Up @@ -1308,3 +1313,110 @@ func fixRuntimeVolumeOwnership(cfg *config.Config, hostPath string, u *ui.UI) {
_ = os.Chown(hostPath, containerUID, containerGID)
}
}

// hermesPVCPaths returns the host-side backing directories of the PVCs
// mounted by the Hermes container itself — the paths that get chowned to
// containerUID:containerGID.
//
// The list is deliberately narrow: `remote-signer-keystores` sits in the
// same namespace but is NOT included. The remote-signer pod
// (obol/remote-signer chart) runs as runAsUser=65532 with fsGroup=1000,
// and forcing its volume to Hermes' 10000:10000 makes it crash-loop with
// `failed to load keystores: Permission denied (os error 13)` against the
// read-only /data/keystores mount. The local-path-provisioner default of
// 1000:1000 already satisfies that pod's fsGroup contract, so leaving the
// volume untouched is the safe behavior.
func hermesPVCPaths(cfg *config.Config, id string) []string {
	ns := agentruntime.Namespace(agentruntime.Hermes, id)
	dataPVC := agentruntime.Describe(agentruntime.Hermes).DataPVCName
	return []string{filepath.Join(cfg.DataDir, ns, dataPVC)}
}

// ensureHermesPVCOwnership chowns the Hermes PVC backing directories on the
// host to containerUID:containerGID so the agent's init containers can write
// under /data on first start.
//
// Background (issue #475): the embedded k3d config
// (internal/embed/k3d-config.yaml) enables KubeletInUserNamespace=true, so
// pod "root" maps to a host subuid with no chown authority over the
// local-path-provisioner host bind-mount — the in-pod `init-hermes-perms`
// chown from #446 (commit c066baa) silently no-ops. Meanwhile the
// provisioner's helper-pod leaves the dir at 1000:1000 (see
// internal/embed/infrastructure/base/templates/local-path.yaml), and Hermes
// runs as 10000:10000, so the next init container dies on
// `mkdir /data/.hermes/home: Permission denied`.
//
// The remedy is a chown from outside the user namespace: `docker exec` into
// the k3d server container acts with the host Docker daemon's authority —
// real root, unaffected by the kubelet's user-namespacing.
//
// Everything here is best-effort. Each target PVC gets up to 60s to reach
// Bound (local-path uses WaitForFirstConsumer, so the host dir only exists
// once the consuming pod is scheduled). On non-k3d backends
// fixRuntimeVolumeOwnership falls back to a plain os.Chown.
//
// If a Hermes pod is already wedged in Init:CrashLoopBackOff from the
// pre-fix permissions, it is deleted so kubelet recreates it with correct
// perms immediately instead of waiting out exponential backoff (~5 min).
// When nothing is stuck, no delete happens, so repeated `Sync` calls
// (e.g. `obol model sync` after `obol model prefer`) never gratuitously
// restart a healthy agent.
func ensureHermesPVCOwnership(cfg *config.Config, id string, u *ui.UI) {
	ns := agentruntime.Namespace(agentruntime.Hermes, id)
	kubectl := filepath.Join(cfg.BinDir, "kubectl")
	env := append(os.Environ(),
		"KUBECONFIG="+filepath.Join(cfg.ConfigDir, "kubeconfig.yaml"))

	// Wait only for the PVCs that hermesPVCPaths chowns.
	// remote-signer-keystores is intentionally absent — see the doc
	// comment on hermesPVCPaths for the rationale.
	targets := []string{agentruntime.Describe(agentruntime.Hermes).DataPVCName}
	for _, pvc := range targets {
		wait := exec.Command(kubectl,
			"wait", "--for=jsonpath={.status.phase}=Bound",
			"--timeout=60s", "pvc/"+pvc, "-n", ns)
		wait.Env = env
		_ = wait.Run() // best-effort; a timeout here is not fatal
	}

	for _, dir := range hermesPVCPaths(cfg, id) {
		fixRuntimeVolumeOwnership(cfg, dir, u)
	}

	// Only kick the pod when it is demonstrably stuck on the old perms.
	if !hermesInitStuck(cfg, ns) {
		return
	}
	del := exec.Command(kubectl,
		"-n", ns, "delete", "pod",
		"-l", "app.kubernetes.io/name=hermes",
		"--ignore-not-found", "--wait=false")
	del.Env = env
	if err := del.Run(); err == nil && u != nil {
		u.Info("Restarted Hermes pod to apply fresh volume ownership")
	}
}

// hermesInitStuck reports whether any Hermes pod currently has an init
// container waiting in a CrashLoopBackOff or Error state — the telltale of
// the permission-denied symptom this fix targets. Any kubectl failure yields
// false, so a transient API hiccup never triggers a spurious restart.
func hermesInitStuck(cfg *config.Config, namespace string) bool {
	cmd := exec.Command(
		filepath.Join(cfg.BinDir, "kubectl"),
		"-n", namespace, "get", "pods",
		"-l", "app.kubernetes.io/name=hermes",
		"-o", "jsonpath={.items[*].status.initContainerStatuses[*].state.waiting.reason}")
	cmd.Env = append(os.Environ(),
		"KUBECONFIG="+filepath.Join(cfg.ConfigDir, "kubeconfig.yaml"))

	out, err := cmd.Output()
	if err != nil {
		return false
	}
	for _, reason := range strings.Fields(string(out)) {
		// Substring match covers the whole family of failure reasons
		// (CrashLoopBackOff, Error, CreateContainerError, ...).
		if strings.Contains(reason, "CrashLoop") || strings.Contains(reason, "Error") {
			return true
		}
	}
	return false
}
45 changes: 45 additions & 0 deletions internal/hermes/hermes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,3 +351,48 @@ func mkdirInstance(t *testing.T, cfg *config.Config, id string) {
t.Fatalf("create Hermes instance %q: %v", id, err)
}
}

// TestHermesPVCPaths pins the host-side directories that
// ensureHermesPVCOwnership chowns. If the Hermes data PVC were renamed, or
// the namespace prefix in agentruntime moved, without this list being
// updated, the #475 fix would silently regress on Linux k3d: the chown
// would target a path that does not exist while the real PVC backing dir
// kept the local-path-provisioner default ownership of 1000:1000.
func TestHermesPVCPaths(t *testing.T) {
	cfg := testConfig(t)
	const id = "obol-agent"
	ns := agentruntime.Namespace(agentruntime.Hermes, id)

	want := []string{filepath.Join(cfg.DataDir, ns, "hermes-data")}
	if got := hermesPVCPaths(cfg, id); !reflect.DeepEqual(got, want) {
		t.Fatalf("hermesPVCPaths(%q) = %#v; want %#v", id, got, want)
	}
}

// TestHermesPVCPaths_ExcludesRemoteSignerKeystores pins the negative side of
// the contract: the helper MUST NOT return the remote-signer-keystores PVC
// path. An earlier revision of this fix did include it, chowning the volume
// to 10000:10000 (Hermes' UID) and breaking the remote-signer pod, which
// runs as runAsUser=65532 with fsGroup=1000 and expects the
// local-path-provisioner default ownership of 1000:1000. The symptom was a
// remote-signer CrashLoopBackOff with
//
//	failed to load keystores: Permission denied (os error 13)
//
// against /data/keystores. This guard keeps that regression from being
// reintroduced by accident.
func TestHermesPVCPaths_ExcludesRemoteSignerKeystores(t *testing.T) {
	cfg := testConfig(t)
	const id = "obol-agent"
	ns := agentruntime.Namespace(agentruntime.Hermes, id)

	forbidden := filepath.Join(cfg.DataDir, ns, "remote-signer-keystores")
	for _, path := range hermesPVCPaths(cfg, id) {
		if path == forbidden {
			t.Fatalf("hermesPVCPaths included %q — the remote-signer pod (runAsUser=65532, fsGroup=1000) crash-loops when this volume is chowned to Hermes' containerUID:containerGID", forbidden)
		}
	}
}
Loading