diff --git a/.github/workflows/docker-publish-storefront.yml b/.github/workflows/docker-publish-storefront.yml
index f78721fc..54ddc5de 100644
--- a/.github/workflows/docker-publish-storefront.yml
+++ b/.github/workflows/docker-publish-storefront.yml
@@ -6,6 +6,11 @@ on:
       - main
     tags:
       - 'v*'
+    # NOTE(review): keeping this path filter saves CI cost, but it can skip
+    # publishing storefront images for some main commits. Since runtime resolves
+    # storefront refs from obol's GitCommit, missing short-SHA tags can cause
+    # ImagePullBackOff. If the team wants per-commit immutable availability,
+    # remove this paths filter.
     paths:
       - 'web/public-storefront/**'
       - 'Dockerfile.public-storefront'
diff --git a/.github/workflows/docker-publish-x402.yml b/.github/workflows/docker-publish-x402.yml
index f66e7b46..490bb48a 100644
--- a/.github/workflows/docker-publish-x402.yml
+++ b/.github/workflows/docker-publish-x402.yml
@@ -6,6 +6,11 @@ on:
       - main
     tags:
       - 'v*'
+    # NOTE: this path filter saves CI cost, but it also means some main commits
+    # won't publish matching short-SHA tags for these images. Any runtime path
+    # that resolves image refs from obol's GitCommit (images.Resolve) can fail to
+    # pull if that SHA tag was never published. If per-commit immutability is
+    # required for all main commits, remove this filter (as with the storefront workflow).
     paths:
       - 'internal/x402/**'
      - 'internal/serviceoffercontroller/**'
diff --git a/cmd/obol/model.go b/cmd/obol/model.go
index f7c402e8..2d4b0670 100644
--- a/cmd/obol/model.go
+++ b/cmd/obol/model.go
@@ -233,6 +233,15 @@ func setupCloudProvider(cfg *config.Config, u *ui.UI, provider, apiKey string, m
         return err
     }
 
+    // User expectation: rerunning `obol model setup` with a new provider/model
+    // should make that model active immediately. ConfigureLiteLLM merges entries
+    // but does not guarantee head-of-list ordering, so explicitly promote the
+    // selected model (when one was provided) to become the default.
+    if len(models) > 0 && strings.TrimSpace(models[0]) != "" {
+        if err := model.PreferModels(cfg, u, []string{models[0]}); err != nil {
+            return fmt.Errorf("set default model to %s: %w", models[0], err)
+        }
+    }
     u.Print("")
     u.Successf("Model configured. To change later, run: obol model setup (or obol model remove <alias>)")
 
diff --git a/docs/guides/stale-resources-troubleshooting.md b/docs/guides/stale-resources-troubleshooting.md
new file mode 100644
index 00000000..3371a4ea
--- /dev/null
+++ b/docs/guides/stale-resources-troubleshooting.md
@@ -0,0 +1,248 @@
+# Stale Resource Troubleshooting Runbook
+
+This runbook helps operators and agents diagnose stale Kubernetes resources in Obol Stack and recover cleanly when `obol stack up` appears healthy but routes or images are stale.
+
+## Scope
+
+Use this when you see issues such as:
+
+- `obol.stack` serves an old UI or the wrong backend.
+- `503 Service Temporarily Unavailable` from nginx.
+- tunnel root (`https://<tunnel-host>/`) returns "no available server".
+- pods stuck in `ImagePullBackOff` for Obol-managed images.
+- `obol sell demo` times out while waiting for pod readiness.
+
+---
+
+## 1) Quick triage commands
+
+Run these first:
+
+```bash
+obol kubectl get pods -A -o wide
+obol kubectl get deploy -A
+obol kubectl get svc -A
+obol kubectl get httproutes.gateway.networking.k8s.io -A
+obol kubectl get ingress -A
+```
+
+If a pod is not ready:
+
+```bash
+obol kubectl describe pod <pod-name> -n <namespace>
+obol kubectl get events -n <namespace> --sort-by=.lastTimestamp
+```
+
+---
+
+## 2) Detect legacy ingress-nginx conflicts
+
+Symptoms:
+
+- `curl -I http://obol.stack` returns nginx 503.
+- old `Ingress` objects still exist in `default`.
+
+Check:
+
+```bash
+obol kubectl get ingress -A
+obol kubectl get deploy,svc -n default | rg "ingress-nginx|obol-frontend-obol-app|erpc"
+```
+
+Expected on a modern stack:
+
+- `HTTPRoute` resources (Gateway API) are present.
+- no legacy nginx ingress objects for `obol.stack`.
+
+If legacy resources exist, remove them:
+
+```bash
+obol kubectl delete ingress obol-frontend-obol-app -n default --ignore-not-found
+obol kubectl delete ingress erpc -n default --ignore-not-found
+obol kubectl delete deployment ingress-nginx-controller -n default --ignore-not-found
+obol kubectl delete service ingress-nginx-controller -n default --ignore-not-found
+```
+
+Verify:
+
+```bash
+curl -I http://obol.stack
+curl -I http://obol.stack:8080
+```
+
+Both should return `200` from the current frontend (Traefik path).
+
+---
+
+## 3) Fix tunnel storefront failures
+
+Symptoms:
+
+- tunnel root returns "no available server".
+- `tunnel-storefront` pod in `ImagePullBackOff`.
+
+Check:
+
+```bash
+obol kubectl get pods -n traefik
+obol kubectl describe pod <tunnel-storefront-pod> -n traefik
+```
+
+If the error is an image tag that cannot be found:
+
+```bash
+obol kubectl set image deployment/tunnel-storefront -n traefik \
+  storefront=ghcr.io/obolnetwork/obol-stack-public-storefront:latest
+obol kubectl rollout status deployment/tunnel-storefront -n traefik --timeout=180s
+```
+
+Verify:
+
+```bash
+curl -I https://<tunnel-host>/
+```
+
+### Investigation: quick tunnel hostname drift causing storefront 404
+
+Observed pattern during the incident:
+
+- `obol tunnel status` reported an active quick URL.
+- `https://<tunnel-host>/skill.md` returned `200`.
+- `https://<tunnel-host>/` returned `404 page not found` (or `530` during reconnect).
+
+Root cause:
+
+- `traefik/tunnel-storefront` `HTTPRoute` was pinned to an older quick tunnel
+  hostname in `spec.hostnames`.
+- `cloudflared` rotated to a new `*.trycloudflare.com` hostname.
+- The root storefront route missed due to the hostname mismatch, while other public
+  routes without hostname pinning (for example `x402/obol-skill-md-route`) kept
+  working.
+
+How to confirm quickly:
+
+```bash
+# 1) Current quick tunnel URL from status/logs
+obol tunnel status
+obol kubectl logs -n traefik deploy/cloudflared --since=10m \
+  | rg -o 'https://[a-z0-9-]+\.trycloudflare\.com' | tail -1
+
+# 2) What hostname the storefront route is pinned to
+obol kubectl get httproute tunnel-storefront -n traefik -o jsonpath='{.spec.hostnames[0]}'
+
+# 3) Reproduce the mismatch behavior
+curl -i https://<tunnel-host>/
+curl -i https://<tunnel-host>/skill.md
+```
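+
+The two hostname checks above (1 and 2) can be rolled into a single drift check. This is a sketch, not a built-in obol command: it assumes the `traefik` namespace and `tunnel-storefront` route name from this incident, and that `rg` is available.
+
+```bash
+# Hostname cloudflared is currently serving (scheme stripped for comparison)
+live="$(obol kubectl logs -n traefik deploy/cloudflared --since=10m \
+  | rg -o 'https://[a-z0-9-]+\.trycloudflare\.com' | tail -1 | sed 's#https://##')"
+
+# Hostname the storefront route is pinned to (empty if not host-pinned)
+pinned="$(obol kubectl get httproute tunnel-storefront -n traefik \
+  -o jsonpath='{.spec.hostnames[0]}')"
+
+echo "live hostname:   ${live:-<none found>}"
+echo "pinned hostname: ${pinned:-<route not host-pinned>}"
+
+if [ -n "$pinned" ] && [ "$live" != "$pinned" ]; then
+  echo "MISMATCH: tunnel-storefront is pinned to a stale hostname"
+fi
+```
+
+If `pinned` comes back empty, the route is not host-pinned and hostname rotation alone does not explain the 404.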
+
+Immediate recovery options:
+
+```bash
+# Option A: repin to the current quick hostname
+obol kubectl patch httproute tunnel-storefront -n traefik --type merge \
+  -p '{"spec":{"hostnames":["<tunnel-host>"]}}'
+
+# Option B: remove the host pin (works across quick hostname rotations)
+obol kubectl patch httproute tunnel-storefront -n traefik --type json \
+  -p='[{"op":"remove","path":"/spec/hostnames"}]'
+```
+
+Post-fix verification:
+
+```bash
+curl -i https://<tunnel-host>/
+curl -i https://<tunnel-host>/skill.md
+obol kubectl run -n traefik tmp-curl --rm -i --restart=Never --image=curlimages/curl -- \
+  sh -lc "curl -i -H 'Host: <tunnel-host>' http://traefik.traefik.svc.cluster.local/ | sed -n '1,12p'"
+```
+
+Expected:
+
+- storefront `/` returns `200`.
+- `/skill.md` continues to return `200`.
+- internal Traefik check with an explicit `Host` header returns `200`.
+
+---
+
+## 4) Fix demo pod scheduling timeouts
+
+Symptoms:
+
+- `obol sell demo` fails waiting for pod readiness.
+- pod remains `Pending`.
+
+Check the scheduler reason:
+
+```bash
+obol kubectl describe pod -n demo -l app=demo-hello
+obol kubectl describe node
+```
+
+Common cause: `Insufficient memory`.
+
+Free memory by removing high-request workloads not needed for this test, then wait for the rollout:
+
+```bash
+obol kubectl rollout status deployment/demo-hello -n demo --timeout=180s
+```
+
+---
+
+## 5) Image freshness model (important)
+
+`obol stack up` is deterministic and does not always "pull latest":
+
+- Some images are pinned by version tag in values files.
+- Some are digest-pinned.
+- Some are commit-derived via `images.Resolve(...)` (using the obol binary's `GitCommit`).
+
+If commit-derived image tags are not published for that commit SHA, runtime pull failures occur.
+
+---
+
+## 5b) LiteLLM 401 due to accidental OpenAI placeholders
+
+Observed failure pattern:
+
+- chat fails with `Incorrect API key provided: <placeholder-key>`
+- LiteLLM config contains a placeholder-style model alias and a matching placeholder `OPENAI_API_KEY`
+
+Why this happens:
+
+- auto cloud-provider detection can import a stale/default agent model from `~/.openclaw/openclaw.json`
+- if the shell environment has a placeholder OpenAI key, LiteLLM can be patched with invalid credentials
+
+Mitigations:
+
+- set the intended provider key (for Anthropic: `ANTHROPIC_API_KEY`) before `obol stack up`
+- run `obol model prefer <alias>` then `obol model sync`
+- remove wrong aliases with `obol model remove <alias>`
+
+---
+
+## 6) Agent checklist for stale resources
+
+Use this checklist in order:
+
+1. `obol kubectl get ingress -A` -> must be empty (or contain only expected non-Obol custom ingresses).
+2. `obol kubectl get httproutes.gateway.networking.k8s.io -A` -> confirm `obol-frontend` and service routes exist.
+3. `obol kubectl get pods -n traefik` -> `traefik`, `cloudflared`, and `tunnel-storefront` healthy.
+4. `obol kubectl describe pod ...` for any `Pending`/`ImagePullBackOff`.
+5. `curl -I http://obol.stack` and `curl -I http://obol.stack:8080` -> expect `200`.
+6. `curl -I https://<tunnel-host>/` -> expect `200`.
+7. If an image pull fails, verify the referenced tag exists in the registry and compare it to the `Git Commit` reported by `obol version` (see the sketch below).
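+
+A sketch for checklist item 7. Assumptions: `obol version` prints a `Git Commit` field, commit-derived refs use the `sha-<short-commit>` tag convention seen elsewhere in the stack, and the storefront repository below is only an example; adjust the parsing and image name to your case.
+
+```bash
+# 1) Image ref the failing pod is actually trying to pull
+obol kubectl get pod <pod-name> -n <namespace> \
+  -o jsonpath='{.spec.containers[*].image}{"\n"}'
+
+# 2) Commit the installed obol binary reports
+obol version | grep -i 'git commit'
+
+# 3) Does the registry have a matching short-SHA tag?
+#    (docker needs network access to ghcr.io, and auth for private images)
+docker manifest inspect ghcr.io/obolnetwork/obol-stack-public-storefront:sha-<short-commit> \
+  >/dev/null && echo "tag exists" || echo "tag missing: publish it or repin the deployment"
+```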
+
+---
+
+## 7) Code-level hardening recommendations
+
+Current hardening implemented:
+
+- `stack up` now checks for known stale ingress conflicts on every run.
+- Default mode is non-destructive (warn only). Auto-clean is opt-in with `OBOL_STACK_AUTO_CLEAN_LEGACY=true`.
+
+Additional recommended hardening:
+
+- Ensure publish workflows produce short-SHA images for all commits that can be deployed by commit-derived refs.
+- Prefer immutable pins (digest or explicit version tags) over `:latest` for client reproducibility.
+- Add CI checks that verify deploy-time image refs exist in the registry before release (sketched below).
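+
+For the last recommendation, a minimal sketch of what such a CI gate could look like. Nothing here exists in the current workflows: `SHORT_SHA`, the image list, and the tag format are illustrative assumptions, and `docker manifest inspect` needs registry access (plus auth for private images).
+
+```bash
+#!/usr/bin/env bash
+# Fail the job if any deploy-time image ref is missing from the registry.
+set -euo pipefail
+
+SHORT_SHA="${SHORT_SHA:?set to the short commit SHA being released}"
+
+refs=(
+  "ghcr.io/obolnetwork/obol-stack-public-storefront:sha-${SHORT_SHA}"
+)
+
+for ref in "${refs[@]}"; do
+  if ! docker manifest inspect "$ref" >/dev/null 2>&1; then
+    echo "missing image ref: $ref" >&2
+    exit 1
+  fi
+done
+echo "all deploy-time image refs exist"
+```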
diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml
index 4e70ec1e..8627852e 100644
--- a/internal/embed/infrastructure/base/templates/llm.yaml
+++ b/internal/embed/infrastructure/base/templates/llm.yaml
@@ -148,6 +148,8 @@ spec:
           # No Postgres required — /model/new and /model/delete work via
           # in-memory router + config.yaml persistence.
           # Source: https://github.com/ObolNetwork/litellm
+          # Pinned to a specific build for reproducibility; bump this ref to roll
+          # newer LiteLLM behavior into stack up.
           image: ghcr.io/obolnetwork/litellm:sha-c16b156
           imagePullPolicy: IfNotPresent
           args:
diff --git a/internal/embed/infrastructure/cloudflared/values.yaml b/internal/embed/infrastructure/cloudflared/values.yaml
index a41a4715..16ac7cec 100644
--- a/internal/embed/infrastructure/cloudflared/values.yaml
+++ b/internal/embed/infrastructure/cloudflared/values.yaml
@@ -5,6 +5,9 @@ transport:
 
 image:
   repository: cloudflare/cloudflared
+  # Keep this pinned to a reviewed release for reproducible tunnel behavior.
+  # If you want "latest" cadence with safety, automate bumping this value
+  # to newer immutable versions in PRs rather than using :latest directly.
   tag: "2026.3.0"
 
 metrics:
diff --git a/internal/stack/stack.go b/internal/stack/stack.go
index f5bae1a5..abe11560 100644
--- a/internal/stack/stack.go
+++ b/internal/stack/stack.go
@@ -429,6 +429,13 @@ func syncDefaults(cfg *config.Config, u *ui.UI, kubeconfigPath string, dataDir s
         return fmt.Errorf("failed to apply defaults helmfile: %w", err)
     }
 
+    // Non-destructive stale-resource guard.
+    // By default this only warns. Set OBOL_STACK_AUTO_CLEAN_LEGACY=true to
+    // explicitly opt in to automatic cleanup of known legacy ingress resources.
+    if err := reconcileStackInvariants(cfg, u); err != nil {
+        u.Warnf("Stack reconciliation encountered errors: %v", err)
+    }
+
     u.Success("Default infrastructure deployed")
 
     if previousLiteLLMConfig != "" {
@@ -597,6 +604,81 @@ func obolPluginInstalled(marketplaceName string) bool {
     return false
 }
 
+func reconcileStackInvariants(cfg *config.Config, u *ui.UI) error {
+    hasLegacy, err := hasLegacyIngressResources(cfg)
+    if err != nil {
+        return err
+    }
+    if hasLegacy {
+        if strings.EqualFold(strings.TrimSpace(os.Getenv("OBOL_STACK_AUTO_CLEAN_LEGACY")), "true") {
+            u.Dim("Found legacy ingress-nginx/default resources; auto-clean enabled")
+            if err := cleanupLegacyIngressResources(cfg); err != nil {
+                return err
+            }
+            u.Dim("Legacy ingress conflicts removed")
+            return nil
+        }
+        u.Warn("Legacy ingress resources detected from older stack layouts.")
+        u.Dim("  No resources were deleted (safe default).")
+        u.Dim("  To auto-clean on next run: OBOL_STACK_AUTO_CLEAN_LEGACY=true obol stack up")
+        u.Dim("  Or clean manually:")
+        u.Dim("    obol kubectl delete ingress obol-frontend-obol-app -n default --ignore-not-found")
+        u.Dim("    obol kubectl delete ingress erpc -n default --ignore-not-found")
+        u.Dim("    obol kubectl delete deployment ingress-nginx-controller -n default --ignore-not-found")
+        u.Dim("    obol kubectl delete service ingress-nginx-controller -n default --ignore-not-found")
+    }
+    return nil
+}
+
+// cleanupLegacyIngressResources removes obsolete ingress-nginx and default
+// namespace frontend/eRPC resources created by old stack layouts.
+// Safe/idempotent: deletes use --ignore-not-found.
+func cleanupLegacyIngressResources(cfg *config.Config) error {
+    bin, kc := kubectl.Paths(cfg)
+    resources := [][]string{
+        {"ingress", "obol-frontend-obol-app", "-n", "default"},
+        {"ingress", "erpc", "-n", "default"},
+        {"deployment", "ingress-nginx-controller", "-n", "default"},
+        {"service", "ingress-nginx-controller", "-n", "default"},
+    }
+
+    var errs []string
+    for _, r := range resources {
+        args := append([]string{"delete"}, r...)
+        args = append(args, "--ignore-not-found")
+        if err := kubectl.RunSilent(bin, kc, args...); err != nil {
+            errs = append(errs, err.Error())
+        }
+    }
+    if len(errs) > 0 {
+        return errors.New(strings.Join(errs, "; "))
+    }
+    return nil
+}
+
+func hasLegacyIngressResources(cfg *config.Config) (bool, error) {
+    bin, kc := kubectl.Paths(cfg)
+    resources := [][]string{
+        {"ingress", "obol-frontend-obol-app", "-n", "default"},
+        {"ingress", "erpc", "-n", "default"},
+        {"deployment", "ingress-nginx-controller", "-n", "default"},
+        {"service", "ingress-nginx-controller", "-n", "default"},
+    }
+    for _, r := range resources {
+        args := append([]string{"get"}, r...)
+        args = append(args, "-o", "name")
+        _, err := kubectl.Output(bin, kc, args...)
+        if err == nil {
+            return true, nil
+        }
+        if strings.Contains(strings.ToLower(err.Error()), "notfound") {
+            continue
+        }
+        return false, err
+    }
+    return false, nil
+}
+
 // autoConfigureLLM detects host Ollama and imported cloud providers, then
 // auto-configures LiteLLM so inference works out of the box.
 // Patches all providers first, then does a single restart.
diff --git a/internal/tunnel/tunnel.go b/internal/tunnel/tunnel.go
index 367ad877..30decc9d 100644
--- a/internal/tunnel/tunnel.go
+++ b/internal/tunnel/tunnel.go
@@ -922,7 +922,13 @@ func CreateStorefront(cfg *config.Config, tunnelURL string) error {
                 },
             },
         },
-        // HTTPRoute: tunnel hostname → storefront (more specific than frontend catch-all).
+        // HTTPRoute: tunnel hostname -> storefront (more specific than frontend catch-all).
+        //
+        // NOTE: quick tunnels rotate hostnames frequently and can leave stale
+        // host-pinned routes behind during restart races. If storefront "/" starts
+        // returning 404 while "/skill.md" still works, stale host pinning is the
+        // first thing to check. A robust fix is to avoid host pinning for quick
+        // tunnels (or update hostnames from the active cloudflared pod logs).
         {
             "apiVersion": "gateway.networking.k8s.io/v1",
             "kind": "HTTPRoute",