From 47084076b04e39b825c776cf517bd7dccf2f190e Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 12 May 2026 17:31:58 +0800 Subject: [PATCH 1/7] fix(sell inference): emit registration spec + honor operator overrides MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `obol sell inference` was producing ServiceOffers with three latent defects that together stopped /.well-known/agent-registration.json from ever being routed for inference-typed sells: 1. The inference subcommand never exposed `--register-*` flags (compare with `obol sell http` which does), even though the help text on `--register` says "registration is enabled by default". 2. `buildInferenceServiceOfferSpec` never wrote `spec.registration` onto the offer at all, so the controller's `reconcileRegistrationStatus` saw `Enabled=false` (zero value) and emitted `Registered=True Disabled` with no RegistrationRequest CR and no /.well-known HTTPRoute. 3. `buildInferenceServiceOfferSpec` hardcoded `spec.model.name = "ollama"` regardless of the actual `--model` value, so anything downstream that keyed off the model id (the controller's description default included) was looking at the wrong string. Surfaced today on spark2 while trying to fetch `https://inference.v1337.org/.well-known/agent-registration.json` — got the Traefik fall-through 404. Manual `kubectl patch` enabled registration, exposed a fourth defect: 4. `buildActiveRegistrationDocument` in the serviceoffer-controller unconditionally overwrote `Spec.Registration.Description` for inference offers with `" inference via x402 micropayments"`, even when the operator had supplied an explicit description at sell time. This PR fixes all four: - Add `--no-register`, `--register-name`, `--register-description`, `--register-image`, `--register-skills`, `--register-domains`, `--register-metadata` to `obol sell inference`. - Rename `sellHTTPRegistrationInput` / `buildSellHTTPRegistrationConfig` to the unqualified `sellRegistrationInput` / `buildSellRegistrationConfig` since they now serve both inference and http call sites. - Extend `buildInferenceServiceOfferSpec` to accept the resolved model name and the registration block, write `spec.model.name = `, and merge the registration block into `spec.registration` when non-empty. - In the controller's `buildActiveRegistrationDocument`, only fall back to the model-aware description string when the operator left `Spec.Registration.Description` empty. Tests that would have caught the regression earlier: - `TestSellInference_Flags` now requires the six registration flags; their absence on `main` was the bug. - `TestBuildInferenceServiceOfferSpec_RegistrationEnabledByDefault` pins that defaults produce `spec.registration.enabled = true` and the offer name as `spec.registration.name`. - `TestBuildInferenceServiceOfferSpec_NoRegisterOmitsRegistration` pins the `--no-register` opt-out. - `TestBuildInferenceServiceOfferSpec_OperatorOverridesWin` pins that operator-supplied name/description/image/skills/domains all survive into the spec verbatim. - `TestBuildInferenceServiceOfferSpec_ModelNameNotHardcoded` pins that `spec.model.name` reflects `--model`, not the historical "ollama" literal. - `TestBuildActiveRegistrationDocument_KeepsOperatorDescription` pins the controller-side fix: an operator description survives the buildActiveRegistrationDocument pass. - `TestBuildActiveRegistrationDocument_FallsBackToModelDescriptionForInference` pins the other branch — inference offers with no operator description still get the model-aware default, not the generic one. Drive-by: `TestSellInference_Flags` was failing on `main` after #470 removed the `--price` default but didn't update the corresponding assertion. Updated to assert `--price` default = "" with a comment explaining the contract. --- cmd/obol/sell.go | 69 +++++++- cmd/obol/sell_test.go | 156 +++++++++++++++++- internal/serviceoffercontroller/render.go | 15 +- .../serviceoffercontroller/render_test.go | 80 ++++++++- 4 files changed, 300 insertions(+), 20 deletions(-) diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index 6ba344ed..508aecaa 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -184,6 +184,34 @@ Examples: Name: "provenance-file", Usage: "Path to JSON file with provenance metadata (e.g. autoresearch experiment results)", }, + &cli.BoolFlag{ + Name: "no-register", + Usage: "Skip ERC-8004 registration (no /.well-known/agent-registration.json HTTPRoute is published)", + }, + &cli.StringFlag{ + Name: "register-name", + Usage: "Agent name for ERC-8004 registration (defaults to the offer name)", + }, + &cli.StringFlag{ + Name: "register-description", + Usage: "Agent description for ERC-8004 registration", + }, + &cli.StringFlag{ + Name: "register-image", + Usage: "Agent image URL for ERC-8004 registration", + }, + &cli.StringSliceFlag{ + Name: "register-skills", + Usage: "OASF skill tags for ERC-8004 registration (repeatable)", + }, + &cli.StringSliceFlag{ + Name: "register-domains", + Usage: "OASF domain tags for ERC-8004 registration (repeatable)", + }, + &cli.StringSliceFlag{ + Name: "register-metadata", + Usage: "Additional registration metadata as key=value pairs (repeatable, e.g. gpu=A100-80GB)", + }, }, Action: func(ctx context.Context, cmd *cli.Command) error { u := getUI(cmd) @@ -384,7 +412,19 @@ Examples: d.NoPaymentGate = false } else { // Create a ServiceOffer CR pointing at the host service. - soSpec, err := buildInferenceServiceOfferSpec(d, priceTable, svcNs, port, assetTerms) + reg, _, regErr := buildSellRegistrationConfig(name, sellRegistrationInput{ + NoRegister: cmd.Bool("no-register"), + Name: cmd.String("register-name"), + Description: cmd.String("register-description"), + Image: cmd.String("register-image"), + Skills: cmd.StringSlice("register-skills"), + Domains: cmd.StringSlice("register-domains"), + MetadataPairs: cmd.StringSlice("register-metadata"), + }) + if regErr != nil { + return regErr + } + soSpec, err := buildInferenceServiceOfferSpec(d, priceTable, svcNs, port, assetTerms, modelFlag, reg) if err != nil { return err } @@ -701,7 +741,7 @@ Examples: prov.Framework, prov.MetricName, prov.MetricValue, prov.ParamCount) } - reg, registerEnabled, err := buildSellHTTPRegistrationConfig(name, sellHTTPRegistrationInput{ + reg, registerEnabled, err := buildSellRegistrationConfig(name, sellRegistrationInput{ NoRegister: cmd.Bool("no-register"), Register: cmd.Bool("register"), Name: cmd.String("register-name"), @@ -827,7 +867,7 @@ type autoRegisterOptions struct { ExpectedOwner string } -type sellHTTPRegistrationInput struct { +type sellRegistrationInput struct { NoRegister bool Register bool Name string @@ -907,7 +947,7 @@ func autoRegisterServiceOffer(ctx context.Context, cfg *config.Config, u *ui.UI, return nil } -func buildSellHTTPRegistrationConfig(serviceName string, in sellHTTPRegistrationInput) (map[string]any, bool, error) { +func buildSellRegistrationConfig(serviceName string, in sellRegistrationInput) (map[string]any, bool, error) { registerEnabled := !in.NoRegister if !registerEnabled && (in.Register || in.Name != "" || in.Description != "" || in.Image != "" || len(in.Skills) > 0 || len(in.Domains) > 0 || len(in.MetadataPairs) > 0) { @@ -3383,7 +3423,16 @@ func resolveHostIP(cfg *config.Config) (string, error) { // buildInferenceServiceOfferSpec builds a ServiceOffer spec for a host-side // inference gateway routed through the cluster's x402 flow. -func buildInferenceServiceOfferSpec(d *inference.Deployment, pt schemas.PriceTable, ns, port string, asset schemas.AssetTerms) (map[string]any, error) { +// +// modelName is the upstream model identifier the operator passed via --model +// (e.g. "aeon-ultimate"). It is written into spec.model.name so the +// serviceoffer-controller's registration document carries the real model id +// rather than the historical hardcoded "ollama" string. +// +// registration is the operator's ERC-8004 registration block as produced by +// buildSellRegistrationConfig — pass nil (or an empty map) to skip +// registration. When non-nil it is merged verbatim into spec.registration. +func buildInferenceServiceOfferSpec(d *inference.Deployment, pt schemas.PriceTable, ns, port string, asset schemas.AssetTerms, modelName string, registration map[string]any) (map[string]any, error) { portNum, err := strconv.Atoi(port) if err != nil || portNum < 1 || portNum > 65535 { return nil, fmt.Errorf("invalid port %q: must be a number between 1 and 65535", port) @@ -3416,12 +3465,20 @@ func buildInferenceServiceOfferSpec(d *inference.Deployment, pt schemas.PriceTab } if d.UpstreamURL != "" { + model := strings.TrimSpace(modelName) + if model == "" { + model = "ollama" // pre-fix fallback; the Action enforces --model + } spec["model"] = map[string]any{ - "name": "ollama", + "name": model, "runtime": "ollama", } } + if len(registration) > 0 { + spec["registration"] = registration + } + return spec, nil } diff --git a/cmd/obol/sell_test.go b/cmd/obol/sell_test.go index adffbfa3..3277c183 100644 --- a/cmd/obol/sell_test.go +++ b/cmd/obol/sell_test.go @@ -6,6 +6,7 @@ import ( "testing" "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/inference" "github.com/ObolNetwork/obol-stack/internal/monetizeapi" "github.com/ObolNetwork/obol-stack/internal/schemas" x402verifier "github.com/ObolNetwork/obol-stack/internal/x402" @@ -185,9 +186,19 @@ func TestSellInference_Flags(t *testing.T) { "listen", "upstream", "enclave-tag", "vm", "vm-image", "vm-cpus", "vm-memory", "vm-host-port", "tee", "model-hash", + // Registration parity with `obol sell http`. Their absence on the + // inference subcommand was the regression that left + // /.well-known/agent-registration.json unrouted (see + // TestBuildInferenceServiceOfferSpec_RegistrationEnabledByDefault). + "no-register", + "register-name", "register-description", "register-image", + "register-skills", "register-domains", "register-metadata", ) - assertStringDefault(t, flags, "price", "0.001") + // --price intentionally has no default after #470 — the resolvePriceTable + // fallthrough requires an explicit price flag instead of letting "0.001" + // shadow --per-mtok / --per-request. Empty default is the pinned contract. + assertStringDefault(t, flags, "price", "") assertStringDefault(t, flags, "chain", "base") assertStringDefault(t, flags, "token", "USDC") assertStringDefault(t, flags, "listen", ":8402") @@ -256,8 +267,8 @@ func TestSellHTTP_Flags(t *testing.T) { assertIntDefault(t, flags, "max-timeout", 300) } -func TestBuildSellHTTPRegistrationConfig_DefaultEnabled(t *testing.T) { - reg, enabled, err := buildSellHTTPRegistrationConfig("demo", sellHTTPRegistrationInput{}) +func TestBuildSellRegistrationConfig_DefaultEnabled(t *testing.T) { + reg, enabled, err := buildSellRegistrationConfig("demo", sellRegistrationInput{}) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -272,8 +283,8 @@ func TestBuildSellHTTPRegistrationConfig_DefaultEnabled(t *testing.T) { } } -func TestBuildSellHTTPRegistrationConfig_NoRegisterConflicts(t *testing.T) { - _, _, err := buildSellHTTPRegistrationConfig("demo", sellHTTPRegistrationInput{ +func TestBuildSellRegistrationConfig_NoRegisterConflicts(t *testing.T) { + _, _, err := buildSellRegistrationConfig("demo", sellRegistrationInput{ NoRegister: true, Name: "custom", }) @@ -755,6 +766,141 @@ func TestDemoRPCNetwork(t *testing.T) { } } +// TestBuildInferenceServiceOfferSpec_RegistrationEnabledByDefault pins the +// fix for the missing-registration regression: `obol sell inference` used to +// build a ServiceOffer with empty `spec.registration`, so the controller +// emitted "Registration disabled" and never published the +// /.well-known/agent-registration.json HTTPRoute. The default contract is +// "registration enabled, name derived from offer name" — same as `sell http`. +func TestBuildInferenceServiceOfferSpec_RegistrationEnabledByDefault(t *testing.T) { + d := &inference.Deployment{ + Name: "aeon", + UpstreamURL: "http://127.0.0.1:8000", + WalletAddress: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + Chain: "base-sepolia", + PricePerRequest: "0.001", + } + reg, enabled, err := buildSellRegistrationConfig("aeon", sellRegistrationInput{}) + if err != nil { + t.Fatalf("buildSellRegistrationConfig: %v", err) + } + if !enabled { + t.Fatal("registration should default to enabled") + } + spec, err := buildInferenceServiceOfferSpec(d, schemas.PriceTable{PerRequest: "0.001"}, "llm", "8402", schemas.AssetTerms{}, "aeon-ultimate", reg) + if err != nil { + t.Fatalf("buildInferenceServiceOfferSpec: %v", err) + } + got, ok := spec["registration"].(map[string]any) + if !ok { + t.Fatalf("spec.registration missing or wrong type: %#v", spec["registration"]) + } + if got["enabled"] != true { + t.Errorf("spec.registration.enabled = %v, want true", got["enabled"]) + } + if got["name"] != "aeon" { + t.Errorf("spec.registration.name = %v, want %q", got["name"], "aeon") + } +} + +// TestBuildInferenceServiceOfferSpec_NoRegisterOmitsRegistration confirms the +// opt-out path: --no-register produces an empty registration map and the spec +// builder must leave spec.registration unset (so the controller emits the +// well-known route only when explicitly requested). +func TestBuildInferenceServiceOfferSpec_NoRegisterOmitsRegistration(t *testing.T) { + d := &inference.Deployment{ + Name: "aeon", UpstreamURL: "http://127.0.0.1:8000", + WalletAddress: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + Chain: "base-sepolia", PricePerRequest: "0.001", + } + reg, enabled, err := buildSellRegistrationConfig("aeon", sellRegistrationInput{NoRegister: true}) + if err != nil { + t.Fatalf("buildSellRegistrationConfig: %v", err) + } + if enabled { + t.Fatal("--no-register should disable registration") + } + spec, err := buildInferenceServiceOfferSpec(d, schemas.PriceTable{PerRequest: "0.001"}, "llm", "8402", schemas.AssetTerms{}, "aeon-ultimate", reg) + if err != nil { + t.Fatalf("buildInferenceServiceOfferSpec: %v", err) + } + if _, present := spec["registration"]; present { + t.Errorf("spec.registration should be absent when --no-register; got %#v", spec["registration"]) + } +} + +// TestBuildInferenceServiceOfferSpec_OperatorOverridesWin pins the fix for +// the controller-side description-clobber regression in +// internal/serviceoffercontroller/render.go: the operator's explicit +// registration fields (name, description, image, skills, domains) must +// survive into the published spec verbatim, not be silently replaced by +// controller-side defaults. This test covers the *spec input*; the +// controller-side guarantee is covered by the companion test in +// serviceoffercontroller/render_test.go. +func TestBuildInferenceServiceOfferSpec_OperatorOverridesWin(t *testing.T) { + d := &inference.Deployment{ + Name: "aeon", UpstreamURL: "http://127.0.0.1:8000", + WalletAddress: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + Chain: "base-sepolia", PricePerRequest: "0.001", + } + reg, _, err := buildSellRegistrationConfig("aeon", sellRegistrationInput{ + Name: "Qwen36 AEON Ultimate", + Description: "Uncensored Qwen3.6-27B abliteration on DGX Spark", + Image: "https://example.com/aeon.png", + Skills: []string{"llm/inference", "llm/uncensored"}, + Domains: []string{"inference.v1337.org"}, + }) + if err != nil { + t.Fatalf("buildSellRegistrationConfig: %v", err) + } + spec, err := buildInferenceServiceOfferSpec(d, schemas.PriceTable{PerRequest: "0.001"}, "llm", "8402", schemas.AssetTerms{}, "aeon-ultimate", reg) + if err != nil { + t.Fatalf("buildInferenceServiceOfferSpec: %v", err) + } + r := spec["registration"].(map[string]any) + for k, want := range map[string]any{ + "name": "Qwen36 AEON Ultimate", + "description": "Uncensored Qwen3.6-27B abliteration on DGX Spark", + "image": "https://example.com/aeon.png", + } { + if r[k] != want { + t.Errorf("spec.registration.%s = %#v, want %#v", k, r[k], want) + } + } + skills, _ := r["skills"].([]string) + if len(skills) != 2 || skills[0] != "llm/inference" { + t.Errorf("spec.registration.skills = %#v, want [llm/inference llm/uncensored]", r["skills"]) + } + domains, _ := r["domains"].([]string) + if len(domains) != 1 || domains[0] != "inference.v1337.org" { + t.Errorf("spec.registration.domains = %#v, want [inference.v1337.org]", r["domains"]) + } +} + +// TestBuildInferenceServiceOfferSpec_ModelNameNotHardcoded pins the fix for +// the "spec.model.name = ollama regardless of --model" regression. The model +// id the operator passed must surface in spec.model.name so the controller's +// per-model registration description (and any downstream tooling that keys +// off model name) reads the truth, not the historical hardcoded literal. +func TestBuildInferenceServiceOfferSpec_ModelNameNotHardcoded(t *testing.T) { + d := &inference.Deployment{ + Name: "aeon", UpstreamURL: "http://127.0.0.1:8000", + WalletAddress: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + Chain: "base-sepolia", PricePerRequest: "0.001", + } + spec, err := buildInferenceServiceOfferSpec(d, schemas.PriceTable{PerRequest: "0.001"}, "llm", "8402", schemas.AssetTerms{}, "aeon-ultimate", nil) + if err != nil { + t.Fatalf("buildInferenceServiceOfferSpec: %v", err) + } + model, ok := spec["model"].(map[string]any) + if !ok { + t.Fatalf("spec.model missing or wrong type: %#v", spec["model"]) + } + if model["name"] != "aeon-ultimate" { + t.Errorf("spec.model.name = %v, want %q (must reflect --model, not the legacy hardcoded \"ollama\")", model["name"], "aeon-ultimate") + } +} + func TestBuildDemoServiceOffer_RegisterFlagDrivesEnabled(t *testing.T) { for _, register := range []bool{true, false} { manifest := buildDemoServiceOffer( diff --git a/internal/serviceoffercontroller/render.go b/internal/serviceoffercontroller/render.go index df59bfee..ef6c4705 100644 --- a/internal/serviceoffercontroller/render.go +++ b/internal/serviceoffercontroller/render.go @@ -26,7 +26,6 @@ const ( servicesJSONRouteName = "obol-services-json-route" ) - func buildRegistrationRequest(offer *monetizeapi.ServiceOffer, desiredState string) *unstructured.Unstructured { return &unstructured.Unstructured{ Object: map[string]any{ @@ -583,12 +582,18 @@ func isConditionTrue(status monetizeapi.ServiceOfferStatus, conditionType string func buildActiveRegistrationDocument(owner *monetizeapi.ServiceOffer, offers []*monetizeapi.ServiceOffer, baseURL, agentID string) erc8004.AgentRegistration { baseURL = strings.TrimRight(baseURL, "/") + // Operator-supplied description wins. Only fall back to a controller- + // generated default when the offer left Spec.Registration.Description + // empty. The inference-typed default is more specific (names the model), + // so it preempts the generic default — but neither overrides an explicit + // operator value. description := owner.Spec.Registration.Description if description == "" { - description = fmt.Sprintf("x402 payment-gated %s service: %s", fallbackOfferType(owner), owner.Name) - } - if owner.IsInference() && owner.Spec.Model.Name != "" { - description = fmt.Sprintf("%s inference via x402 micropayments", owner.Spec.Model.Name) + if owner.IsInference() && owner.Spec.Model.Name != "" { + description = fmt.Sprintf("%s inference via x402 micropayments", owner.Spec.Model.Name) + } else { + description = fmt.Sprintf("x402 payment-gated %s service: %s", fallbackOfferType(owner), owner.Name) + } } image := owner.Spec.Registration.Image diff --git a/internal/serviceoffercontroller/render_test.go b/internal/serviceoffercontroller/render_test.go index d77ed603..ea077860 100644 --- a/internal/serviceoffercontroller/render_test.go +++ b/internal/serviceoffercontroller/render_test.go @@ -240,6 +240,74 @@ func TestBuildActiveRegistrationDocument(t *testing.T) { } } +// TestBuildActiveRegistrationDocument_KeepsOperatorDescription pins the fix +// for the controller-side description-clobber bug: +// `buildActiveRegistrationDocument` used to unconditionally overwrite +// `owner.Spec.Registration.Description` for inference-typed offers with +// `" inference via x402 micropayments"`, so any explicit operator +// description set at sell time was silently lost in the published +// /.well-known/agent-registration.json document. The fix only fills the +// description from the model name when the operator's value is empty. +func TestBuildActiveRegistrationDocument_KeepsOperatorDescription(t *testing.T) { + operatorDesc := "Uncensored Qwen3.6-27B abliteration on DGX Spark" + owner := &monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: "aeon", Namespace: "llm"}, + Spec: monetizeapi.ServiceOfferSpec{ + Type: "inference", + Model: monetizeapi.ServiceOfferModel{Name: "aeon-ultimate"}, + Path: "/services/aeon", + Registration: monetizeapi.ServiceOfferRegistration{ + Enabled: true, + Name: "Qwen36 AEON Ultimate", + Description: operatorDesc, + }, + }, + Status: monetizeapi.ServiceOfferStatus{ + Conditions: []monetizeapi.Condition{ + {Type: "ModelReady", Status: "True"}, + {Type: "UpstreamHealthy", Status: "True"}, + {Type: "PaymentGateReady", Status: "True"}, + {Type: "RoutePublished", Status: "True"}, + }, + }, + } + doc := buildActiveRegistrationDocument(owner, []*monetizeapi.ServiceOffer{owner}, "https://inference.example.com", "") + if doc.Description != operatorDesc { + t.Fatalf("description = %q, want operator value %q (the controller used to overwrite this with %q-inference-via-x402-micropayments)", + doc.Description, operatorDesc, owner.Spec.Model.Name) + } + if doc.Name != "Qwen36 AEON Ultimate" { + t.Errorf("name = %q, want operator value %q", doc.Name, "Qwen36 AEON Ultimate") + } +} + +// TestBuildActiveRegistrationDocument_FallsBackToModelDescriptionForInference +// pins the *other* side of the description contract: when the operator does +// not supply a description, inference offers should still get the +// model-aware default (" inference via x402 micropayments"), +// not the generic "x402 payment-gated service: " string used +// for non-inference fallback. The refactor must preserve both branches. +func TestBuildActiveRegistrationDocument_FallsBackToModelDescriptionForInference(t *testing.T) { + owner := &monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: "aeon", Namespace: "llm"}, + Spec: monetizeapi.ServiceOfferSpec{ + Type: "inference", + Model: monetizeapi.ServiceOfferModel{Name: "aeon-ultimate"}, + Path: "/services/aeon", + Registration: monetizeapi.ServiceOfferRegistration{ + Enabled: true, + Name: "aeon", + // Description intentionally left blank. + }, + }, + } + doc := buildActiveRegistrationDocument(owner, []*monetizeapi.ServiceOffer{owner}, "https://inference.example.com", "") + want := "aeon-ultimate inference via x402 micropayments" + if doc.Description != want { + t.Errorf("description = %q, want %q (model-aware default for inference offers with no operator description)", doc.Description, want) + } +} + func TestBuildRegistrationServices_IncludesOwnerWhenOwnerNotYetPublished(t *testing.T) { owner := &monetizeapi.ServiceOffer{ ObjectMeta: metav1.ObjectMeta{Name: "owner", Namespace: "demo"}, @@ -528,13 +596,17 @@ func TestBuildServiceCatalogJSON_ExcludesNonReady(t *testing.T) { offers := []*monetizeapi.ServiceOffer{ nil, { - ObjectMeta: metav1.ObjectMeta{Name: "paused-svc", Namespace: "llm", - Annotations: map[string]string{monetizeapi.PausedAnnotation: "true"}}, + ObjectMeta: metav1.ObjectMeta{ + Name: "paused-svc", Namespace: "llm", + Annotations: map[string]string{monetizeapi.PausedAnnotation: "true"}, + }, Status: monetizeapi.ServiceOfferStatus{Conditions: readyCond}, }, { - ObjectMeta: metav1.ObjectMeta{Name: "deleting-svc", Namespace: "llm", - DeletionTimestamp: &deleting}, + ObjectMeta: metav1.ObjectMeta{ + Name: "deleting-svc", Namespace: "llm", + DeletionTimestamp: &deleting, + }, Status: monetizeapi.ServiceOfferStatus{Conditions: readyCond}, }, { From b92b78ed57f0e013bde9e16bd80f569780ea2ff8 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 12 May 2026 18:15:13 +0800 Subject: [PATCH 2/7] fix(sell inference): auto-register on-chain to match sell http parity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first revision of #485 stopped at "produce a ServiceOffer with a populated spec.registration block". That made /.well-known/agent-registration.json get routed, but left the offer in Registered=False AwaitingExternalRegistration until someone manually ran `obol sell register`. The serviceoffer-controller's storefront filter (buildServiceCatalogJSON in render.go) requires Ready=True, which transitively requires Registered=True, so the offer was silently excluded from /api/services.json — the very feed the operator's own storefront UI consumes. obol sell http already auto-registers on the same code path. Mirror that here so the inference path reaches the same end state without a follow-up obol sell register step. Changes: - Extract shouldAutoRegisterSell(spec, tunnelURL) as the shared decision predicate. The same gate now drives both the http and inference auto-register call sites; defensively returns false when the registration block is missing, disabled, malformed, or the tunnel URL is empty. - sellInferenceCommand Action: after kubectlApply + EnsureTunnelForSell succeed, if shouldAutoRegisterSell says yes, call autoRegisterServiceOffer with the resolved name/description/wallet pulled from the spec. Surface failures as warnings + a re-run hint rather than aborting the gateway start, because the underlying call needs gas on the target chain and we do not want a one-off RPC hiccup to block local dev. Test coverage that would have caught the regression: - TestShouldAutoRegisterSell - table-driven over six scenarios including the defensive cases (registration not a map, registration.enabled not a bool). Both call sites use the helper. - TestSellInferenceAction_InvokesAutoRegister - source-level guard that scans the sellInferenceCommand body for shouldAutoRegisterSell and autoRegisterServiceOffer. The bug we just fixed was "Action calls neither"; an innocent refactor could remove the calls without any unit-level signal otherwise. Operational note: the agent's remote-signer wallet needs a small ETH balance on the target chain (~0.20-0.50 USD typical) for the on-chain register tx. If the wallet is unfunded the warning fires and the offer stays in AwaitingExternalRegistration; the operator can fund the wallet and re-run obol sell register to finish. --- cmd/obol/sell.go | 46 +++++++++++++++++++- cmd/obol/sell_test.go | 99 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+), 1 deletion(-) diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index 508aecaa..2a13730d 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -448,12 +448,36 @@ Examples: u.Blank() u.Info("Ensuring tunnel is active for public access...") - if tunnelURL, tErr := tunnel.EnsureTunnelForSell(cfg, u); tErr != nil { + tunnelURL := "" + if url, tErr := tunnel.EnsureTunnelForSell(cfg, u); tErr != nil { u.Warnf("Tunnel not started: %v", tErr) u.Dim(" Start manually with: obol tunnel restart") } else { + tunnelURL = url u.Successf("Tunnel active: %s", tunnelURL) } + + // Auto-register the seller on ERC-8004, mirroring the + // `obol sell http` path. Without this step the offer + // stays in Registered=False AwaitingExternalRegistration + // forever, which makes Ready=False and excludes the + // offer from /api/services.json (the storefront feed + // only includes Ready=True offers). + if shouldAutoRegisterSell(soSpec, tunnelURL) { + reg, _ := soSpec["registration"].(map[string]any) + u.Blank() + u.Info("Registering seller agent on ERC-8004...") + if err := autoRegisterServiceOffer(ctx, cfg, u, autoRegisterOptions{ + ChainCSV: cmd.String("chain"), + Endpoint: tunnelURL, + AgentName: registrationNameForPrompt(name, reg), + AgentDesc: registrationDescriptionForPrompt(name, reg), + ExpectedOwner: wallet, + }); err != nil { + u.Warnf("automatic sell registration failed: %v", err) + u.Dim(" Re-run later with: obol sell register " + name + " -n " + svcNs) + } + } } } } @@ -839,6 +863,26 @@ Examples: } } +// shouldAutoRegisterSell reports whether the post-create auto-register step +// must run for a freshly-applied ServiceOffer spec. Both `obol sell http` and +// `obol sell inference` need the same gate: registration must be enabled AND +// a tunnel URL must be available to write into the on-chain registration +// document. The decision is intentionally shared so the inference path +// cannot silently regress to "create the offer, never register" (the bug +// behind https://github.com/ObolNetwork/obol-stack/issues — Registered=False +// AwaitingExternalRegistration hiding the offer from /api/services.json). +func shouldAutoRegisterSell(spec map[string]any, tunnelURL string) bool { + if tunnelURL == "" { + return false + } + reg, ok := spec["registration"].(map[string]any) + if !ok { + return false + } + enabled, _ := reg["enabled"].(bool) + return enabled +} + func registrationNameForPrompt(fallback string, reg map[string]any) string { if reg == nil { return fallback diff --git a/cmd/obol/sell_test.go b/cmd/obol/sell_test.go index 3277c183..b91861b5 100644 --- a/cmd/obol/sell_test.go +++ b/cmd/obol/sell_test.go @@ -2,6 +2,7 @@ package main import ( "errors" + "os" "strings" "testing" @@ -901,6 +902,104 @@ func TestBuildInferenceServiceOfferSpec_ModelNameNotHardcoded(t *testing.T) { } } +// TestShouldAutoRegisterSell pins the decision logic shared by the http and +// inference action handlers. The bug it guards: leaving a fresh inference +// offer in `Registered=False AwaitingExternalRegistration` so that the +// controller's services.json filter (which requires Ready=True, which in +// turn requires Registered=True) silently excluded the offer from the +// operator's own storefront. Both call sites must auto-register exactly +// when the spec says enabled AND the tunnel is up — anything else is a +// regression. +func TestShouldAutoRegisterSell(t *testing.T) { + tests := []struct { + name string + spec map[string]any + tunnelURL string + want bool + }{ + { + name: "registration enabled + tunnel up → register", + spec: map[string]any{"registration": map[string]any{"enabled": true}}, + tunnelURL: "https://inference.example.com", + want: true, + }, + { + name: "registration explicitly disabled → skip", + spec: map[string]any{"registration": map[string]any{"enabled": false}}, + tunnelURL: "https://inference.example.com", + want: false, + }, + { + name: "no registration block (sell http --no-register / pre-#485 sell inference) → skip", + spec: map[string]any{}, + tunnelURL: "https://inference.example.com", + want: false, + }, + { + name: "registration enabled but tunnel down → skip (no endpoint to advertise)", + spec: map[string]any{"registration": map[string]any{"enabled": true}}, + tunnelURL: "", + want: false, + }, + { + name: "registration block is not a map (defensive) → skip", + spec: map[string]any{"registration": "garbage"}, + tunnelURL: "https://inference.example.com", + want: false, + }, + { + name: "registration.enabled is not a bool (defensive) → skip", + spec: map[string]any{"registration": map[string]any{"enabled": "yes"}}, + tunnelURL: "https://inference.example.com", + want: false, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := shouldAutoRegisterSell(tc.spec, tc.tunnelURL) + if got != tc.want { + t.Errorf("shouldAutoRegisterSell = %v, want %v", got, tc.want) + } + }) + } +} + +// TestSellInferenceAction_InvokesAutoRegister is a source-level guard +// against the specific regression the user just hit on spark2: the +// inference Action committed the ServiceOffer, ensured the tunnel, then +// jumped straight to runInferenceGateway without ever calling +// autoRegisterServiceOffer. Result: the offer stayed in +// AwaitingExternalRegistration and never reached the storefront feed. +// Without this guard, an innocent refactor of the post-create code path +// could silently remove the call again — and the only downstream signal +// would be "operator's storefront mysteriously empty", which is hard to +// attribute. +func TestSellInferenceAction_InvokesAutoRegister(t *testing.T) { + src, err := os.ReadFile("sell.go") + if err != nil { + t.Fatalf("read sell.go: %v", err) + } + body := string(src) + start := strings.Index(body, "func sellInferenceCommand(") + if start < 0 { + t.Fatal("sellInferenceCommand not found in sell.go") + } + next := strings.Index(body[start+1:], "\nfunc ") + if next < 0 { + t.Fatal("could not delimit sellInferenceCommand body") + } + scope := body[start : start+1+next] + for _, needle := range []string{ + "shouldAutoRegisterSell(", + "autoRegisterServiceOffer(", + } { + if !strings.Contains(scope, needle) { + t.Errorf("sellInferenceCommand body must contain %q — auto-register path missing, "+ + "offers will stay in AwaitingExternalRegistration and be excluded from /api/services.json", needle) + } + } +} + func TestBuildDemoServiceOffer_RegisterFlagDrivesEnabled(t *testing.T) { for _, register := range []bool{true, false} { manifest := buildDemoServiceOffer( From dbf6c89ca28fb6bc16c6424462fc3a4e039610c4 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 12 May 2026 18:39:47 +0800 Subject: [PATCH 3/7] fix(sell): allow signer != payTo on ERC-8004 register (not a spec constraint) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The autoRegisterServiceOffer pre-flight check rejected any registration whose signer didn't match the offer's payTo wallet: registration signer 0xA... does not match the payment wallet 0xB... Use a matching signer, omit --wallet so the remote-signer wallet is used, or pass --no-register The error wording read like an ERC-8004 limitation but isn't. ERC-8004 treats the agent OWNER (msg.sender at register time) and the agent WALLET (settable post-mint via setAgentWallet) as independent addresses. x402 settlement honors the offer's spec.payment.payTo directly — buyers pay that address regardless of what the registry's getAgentWallet returns. The "hot signer, cold/multisig payee" split is the canonical pattern. The historic guard existed because the obol CLI never exposed setAgentWallet, so a mismatched registration left operators with no in-CLI recovery path. This change instead surfaces the split as an informational note + adds `obol sell update --pay-to ` as the recovery surface (already in tree; just needed test coverage and the connection wired into the diagnostic). Changes: - signerPayeeDelegationNote(signer, payTo) returns a human-readable note when the two diverge (case-insensitive, whitespace-tolerant, empty on either side) and "" otherwise. Used by autoRegisterServiceOffer instead of the previous early-return. - buildSellUpdatePatch(payTo, chain, price) extracted from the inline sellUpdateCommand Action so the patch shape — the thing that actually hits the cluster — is testable without a live offer. Action calls the helper instead of inlining the same logic. Tests: - TestSignerPayeeDelegationNote — 6-case table: match, case-insensitive, whitespace, empty payTo, empty signer (defensive), true mismatch (assertions name the addresses + advise sell update). - TestAutoRegister_AllowsSignerPayeeMismatch — source-level guard asserting the banned error wording is gone from autoRegisterServiceOffer and the soft-notice path is wired. Anyone re-introducing the check has to delete this test too, which forces them to read the rationale. - TestBuildSellUpdatePatch_PayToOnly — `obol sell update --pay-to 0xBooB` builds a patch that touches only spec.payment.payTo, not network or price. - TestBuildSellUpdatePatch_PriceSwitchNullsOldKeys — table over perRequest/perMTok/perHour: the unused keys are explicitly null so a switch (e.g. perRequest → perMTok) doesn't leave the previous key fighting through merge semantics. - TestBuildSellUpdatePatch_NoFieldsErrors — error fires when no fields are set, and the error names the flags the operator should pass. - TestSellUpdate_PayToFlagSurface — `obol sell update` exposes --pay-to (with --wallet/--recipient/-w aliases via payToFlag), and --namespace is Required. --- cmd/obol/sell.go | 135 ++++++++++++++++++------- cmd/obol/sell_test.go | 223 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 322 insertions(+), 36 deletions(-) diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index 2a13730d..724a14bf 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -863,6 +863,83 @@ Examples: } } +// signerPayeeDelegationNote returns a human-readable note explaining the +// ownership delegation when the agent's on-chain registration signer differs +// from the offer's payment payTo wallet, and "" when they match (or either +// is empty). Used by the auto-register flow to surface the split as an +// informational message rather than blocking the registration outright — +// ERC-8004 explicitly supports this separation via setAgentWallet, and x402 +// settlement uses the offer's payTo regardless of the registry's wallet. +// +// The previous behavior (returning fmt.Errorf("registration signer ... does +// not match the payment wallet ...")) made it look like an ERC-8004 spec +// constraint when it was purely an obol-CLI policy choice. See PR notes for +// the full rationale. +func signerPayeeDelegationNote(signer, payTo string) string { + s := strings.TrimSpace(signer) + p := strings.TrimSpace(payTo) + if s == "" || p == "" || strings.EqualFold(s, p) { + return "" + } + return fmt.Sprintf( + "Agent owner (registration signer) %s differs from offer payTo %s. "+ + "ERC-8004 allows this via setAgentWallet; x402 settlement honors payTo regardless. "+ + "Re-point payments later with: obol sell update --pay-to ", + s, p, + ) +} + +// buildSellUpdatePatch assembles the JSON-merge patch body for +// `obol sell update`. Extracted from the Action so the patch shape — the +// thing that actually hits the cluster — is testable without a live ServiceOffer. +// +// Returns the patch map and an error when nothing was provided (the Action +// surfaces this as "nothing to update: pass at least one of ..."). +// +// When --pay-to is set, the wallet must already have been validated by +// x402verifier.ValidateWallet at the call site; this helper does no further +// validation so it stays a pure data shape. +func buildSellUpdatePatch(payTo, chain string, price schemas.PriceTable) (map[string]any, error) { + payment := map[string]any{} + + if payTo = strings.TrimSpace(payTo); payTo != "" { + payment["payTo"] = payTo + } + if chain = strings.TrimSpace(chain); chain != "" { + payment["network"] = chain + } + + if price.PerRequest != "" || price.PerMTok != "" || price.PerHour != "" { + // Null out the unused price keys explicitly so a switch from, e.g., + // perRequest→perMTok doesn't leave the previous key stranded and + // fighting the new one through merge semantics. + p := map[string]any{ + "perRequest": nil, + "perMTok": nil, + "perHour": nil, + } + switch { + case price.PerRequest != "": + p["perRequest"] = price.PerRequest + case price.PerMTok != "": + p["perMTok"] = price.PerMTok + case price.PerHour != "": + p["perHour"] = price.PerHour + } + payment["price"] = p + } + + if len(payment) == 0 { + return nil, errors.New("nothing to update: pass at least one of --per-request / --per-mtok / --per-hour / --pay-to / --chain") + } + + return map[string]any{ + "spec": map[string]any{ + "payment": payment, + }, + }, nil +} + // shouldAutoRegisterSell reports whether the post-create auto-register step // must run for a freshly-applied ServiceOffer spec. Both `obol sell http` and // `obol sell inference` need the same gate: registration must be enabled AND @@ -958,8 +1035,19 @@ func autoRegisterServiceOffer(ctx context.Context, cfg *config.Config, u *ui.UI, } signerAddr := addr.Hex() - if opts.ExpectedOwner != "" && !strings.EqualFold(strings.TrimSpace(opts.ExpectedOwner), strings.TrimSpace(signerAddr)) { - return fmt.Errorf("registration signer %s does not match the payment wallet %s.\nUse a matching signer, omit --wallet so the remote-signer wallet is used, or pass --no-register", signerAddr, opts.ExpectedOwner) + // ERC-8004 treats the agent OWNER (msg.sender at register time) and the + // agent WALLET (settable post-mint via setAgentWallet) as independent + // addresses. x402 settlement honors the offer's spec.payment.payTo + // directly — buyers pay that address regardless of what the registry's + // getAgentWallet returns. So a different signer and payTo is legitimate; + // it is the canonical pattern for "hot signer, cold/multisig payee". + // + // We surface the split as an informational note (not an error) so the + // operator can confirm the delegation was intentional, and so the obvious + // follow-up — `obol sell update --pay-to ` for the payee — is + // discoverable. + if note := signerPayeeDelegationNote(signerAddr, opts.ExpectedOwner); note != "" { + u.Info(note) } agentURI := strings.TrimRight(opts.Endpoint, "/") + "/.well-known/agent-registration.json" @@ -2328,50 +2416,25 @@ Examples: return fmt.Errorf("ServiceOffer %s/%s not found: %w", ns, name, err) } - payment := map[string]any{} - - if wallet := strings.TrimSpace(cmd.String("pay-to")); wallet != "" { + wallet := strings.TrimSpace(cmd.String("pay-to")) + if wallet != "" { if err := x402verifier.ValidateWallet(wallet); err != nil { return err } - payment["payTo"] = wallet - } - - if chain := strings.TrimSpace(cmd.String("chain")); chain != "" { - payment["network"] = chain } - priceSet := cmd.String("price") != "" || cmd.String("per-request") != "" || cmd.String("per-mtok") != "" || cmd.String("per-hour") != "" - if priceSet { - priceTable, err := resolvePriceTable(cmd, true) + var price schemas.PriceTable + if cmd.String("price") != "" || cmd.String("per-request") != "" || cmd.String("per-mtok") != "" || cmd.String("per-hour") != "" { + resolved, err := resolvePriceTable(cmd, true) if err != nil { return err } - - price := map[string]any{ - "perRequest": nil, - "perMTok": nil, - "perHour": nil, - } - switch { - case priceTable.PerRequest != "": - price["perRequest"] = priceTable.PerRequest - case priceTable.PerMTok != "": - price["perMTok"] = priceTable.PerMTok - case priceTable.PerHour != "": - price["perHour"] = priceTable.PerHour - } - payment["price"] = price - } - - if len(payment) == 0 { - return errors.New("nothing to update: pass at least one of --per-request / --per-mtok / --per-hour / --wallet / --chain") + price = resolved } - patch := map[string]any{ - "spec": map[string]any{ - "payment": payment, - }, + patch, err := buildSellUpdatePatch(wallet, cmd.String("chain"), price) + if err != nil { + return err } patchBytes, err := json.Marshal(patch) if err != nil { diff --git a/cmd/obol/sell_test.go b/cmd/obol/sell_test.go index b91861b5..7ec30a05 100644 --- a/cmd/obol/sell_test.go +++ b/cmd/obol/sell_test.go @@ -1000,6 +1000,229 @@ func TestSellInferenceAction_InvokesAutoRegister(t *testing.T) { } } +// TestSignerPayeeDelegationNote pins the contract that obol now treats the +// "registration signer != offer payTo" case as legitimate ERC-8004 ownership +// delegation rather than an error. The helper returns "" when the two match +// (or either is empty) and a single-line informational note otherwise. +// +// Regression context: autoRegisterServiceOffer used to fail with +// +// registration signer 0xA... does not match the payment wallet 0xB... +// +// which read like an ERC-8004 spec constraint but was purely an obol-CLI +// policy. ERC-8004 explicitly supports the split (msg.sender at register +// time owns the agent; setAgentWallet re-points the wallet post-mint), and +// x402 settlement honors the offer's payTo regardless of what the registry +// reports. +func TestSignerPayeeDelegationNote(t *testing.T) { + tests := []struct { + name string + signer string + payTo string + wantNoteHas []string + wantEmpty bool + }{ + { + name: "exact match → no note", + signer: "0xA5d4aF96E3e740383A36c3123A54724dAcB3Df57", + payTo: "0xA5d4aF96E3e740383A36c3123A54724dAcB3Df57", + wantEmpty: true, + }, + { + name: "case-insensitive match → no note", + signer: "0xa5d4af96e3e740383a36c3123a54724dacb3df57", + payTo: "0xA5D4AF96E3E740383A36C3123A54724DACB3DF57", + wantEmpty: true, + }, + { + name: "whitespace tolerance → no note", + signer: " 0xA5d4aF96E3e740383A36c3123A54724dAcB3Df57 ", + payTo: "0xA5d4aF96E3e740383A36c3123A54724dAcB3Df57", + wantEmpty: true, + }, + { + name: "empty payTo → no note (caller didn't request the check)", + signer: "0xA5d4aF96E3e740383A36c3123A54724dAcB3Df57", + payTo: "", + wantEmpty: true, + }, + { + name: "empty signer → no note (impossible in practice; defensive)", + signer: "", + payTo: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + wantEmpty: true, + }, + { + name: "true mismatch → note names both addrs and suggests sell update", + signer: "0xA5d4aF96E3e740383A36c3123A54724dAcB3Df57", + payTo: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + wantNoteHas: []string{ + "0xA5d4aF96E3e740383A36c3123A54724dAcB3Df57", + "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + "obol sell update", + "--pay-to", + }, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := signerPayeeDelegationNote(tc.signer, tc.payTo) + if tc.wantEmpty { + if got != "" { + t.Errorf("note = %q, want empty", got) + } + return + } + for _, sub := range tc.wantNoteHas { + if !strings.Contains(got, sub) { + t.Errorf("note missing %q: %s", sub, got) + } + } + }) + } +} + +// TestAutoRegister_AllowsSignerPayeeMismatch is the source-level guard against +// re-introducing the early-return error that used to reject ERC-8004 +// registrations whenever signer != payTo. The error wording was specific +// ("registration signer ... does not match the payment wallet ..."); we grep +// the function body for it. Anyone who reintroduces that check — even with a +// different error message — must also delete this test, which is a deliberate +// nudge to read the rationale comment first. +func TestAutoRegister_AllowsSignerPayeeMismatch(t *testing.T) { + src, err := os.ReadFile("sell.go") + if err != nil { + t.Fatalf("read sell.go: %v", err) + } + body := string(src) + start := strings.Index(body, "func autoRegisterServiceOffer(") + if start < 0 { + t.Fatal("autoRegisterServiceOffer not found in sell.go") + } + end := strings.Index(body[start+1:], "\nfunc ") + if end < 0 { + t.Fatal("could not delimit autoRegisterServiceOffer body") + } + scope := body[start : start+1+end] + for _, banned := range []string{ + `"registration signer %s does not match the payment wallet`, + `does not match the payment wallet`, + } { + if strings.Contains(scope, banned) { + t.Errorf("autoRegisterServiceOffer must NOT reject signer != payee — ERC-8004 allows "+ + "the split via setAgentWallet and x402 settlement honors payTo directly. "+ + "Banned snippet still present: %q", banned) + } + } + // Positive guard: the soft-notice path must still be present, otherwise + // the operator gets no signal that the addresses diverge. + if !strings.Contains(scope, "signerPayeeDelegationNote(") { + t.Error("autoRegisterServiceOffer must call signerPayeeDelegationNote(...) so operators see when signer ≠ payTo") + } +} + +// TestBuildSellUpdatePatch_PayToOnly pins the `obol sell update --pay-to 0x...` +// shape. The user-facing promise is "change the payee in place" — the patch +// must touch only spec.payment.payTo, not the rest of the offer. +func TestBuildSellUpdatePatch_PayToOnly(t *testing.T) { + const newPayee = "0xB00B00000000000000000000000000000000B00B" + patch, err := buildSellUpdatePatch(newPayee, "", schemas.PriceTable{}) + if err != nil { + t.Fatalf("buildSellUpdatePatch: %v", err) + } + spec, ok := patch["spec"].(map[string]any) + if !ok { + t.Fatalf("patch.spec missing or wrong type: %#v", patch) + } + payment, ok := spec["payment"].(map[string]any) + if !ok { + t.Fatalf("patch.spec.payment missing or wrong type: %#v", spec) + } + if payment["payTo"] != newPayee { + t.Errorf("patch.spec.payment.payTo = %v, want %q", payment["payTo"], newPayee) + } + if _, present := payment["network"]; present { + t.Errorf("--pay-to only patch should NOT touch payment.network; got %#v", payment["network"]) + } + if _, present := payment["price"]; present { + t.Errorf("--pay-to only patch should NOT touch payment.price; got %#v", payment["price"]) + } +} + +// TestBuildSellUpdatePatch_PriceSwitchNullsOldKeys pins the switching contract: +// changing the price model (e.g. perRequest → perMTok) must explicitly null +// the unused keys so merge-patch semantics don't leave a stranded perRequest +// fighting the new perMTok. +func TestBuildSellUpdatePatch_PriceSwitchNullsOldKeys(t *testing.T) { + tests := []struct { + name string + price schemas.PriceTable + setKey string + }{ + {"per-request set", schemas.PriceTable{PerRequest: "0.002"}, "perRequest"}, + {"per-mtok set", schemas.PriceTable{PerMTok: "5.0"}, "perMTok"}, + {"per-hour set", schemas.PriceTable{PerHour: "1.0"}, "perHour"}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + patch, err := buildSellUpdatePatch("", "", tc.price) + if err != nil { + t.Fatalf("buildSellUpdatePatch: %v", err) + } + price := patch["spec"].(map[string]any)["payment"].(map[string]any)["price"].(map[string]any) + for _, key := range []string{"perRequest", "perMTok", "perHour"} { + v := price[key] + if key == tc.setKey { + if v == nil { + t.Errorf("price.%s = nil, want a value", key) + } + } else { + if v != nil { + t.Errorf("price.%s = %#v, want nil (so old key doesn't survive the merge)", key, v) + } + } + } + }) + } +} + +// TestBuildSellUpdatePatch_NoFieldsErrors pins the no-op-protection contract: +// `obol sell update ` with no field flags must error rather than fire +// a no-op kubectl patch. The error message names the flags so the operator +// learns the surface from the failure. +func TestBuildSellUpdatePatch_NoFieldsErrors(t *testing.T) { + _, err := buildSellUpdatePatch("", "", schemas.PriceTable{}) + if err == nil { + t.Fatal("expected error when no update flags are set") + } + for _, sub := range []string{"--per-request", "--per-mtok", "--per-hour", "--pay-to", "--chain"} { + if !strings.Contains(err.Error(), sub) { + t.Errorf("error must name flag %q so the operator learns the surface; got: %v", sub, err) + } + } +} + +// TestSellUpdate_PayToFlagSurface pins the user-facing `obol sell update +// --pay-to 0x...` flag wiring: the same payToFlag() shape used by +// inference + http (so a buyer can muscle-memory `-w` / `--wallet`), plus +// the namespace requirement (the resource is per-namespace, ambiguous +// otherwise). +func TestSellUpdate_PayToFlagSurface(t *testing.T) { + cfg := newTestConfig(t) + cmd := sellCommand(cfg) + update := findSubcommand(t, cmd, "update") + flags := flagMap(update) + + requireFlags(t, flags, + "namespace", "pay-to", "wallet", "recipient", "w", + "chain", "price", "per-request", "per-mtok", "per-hour", + ) + assertFlagRequired(t, flags, "namespace") + assertFlagHasAlias(t, flags, "pay-to", "wallet") + assertFlagHasAlias(t, flags, "pay-to", "recipient") + assertFlagHasAlias(t, flags, "pay-to", "w") +} + func TestBuildDemoServiceOffer_RegisterFlagDrivesEnabled(t *testing.T) { for _, register := range []bool{true, false} { manifest := buildDemoServiceOffer( From 2d36745b50d26f951ba2a56792104348f7315ebe Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 12 May 2026 19:20:16 +0800 Subject: [PATCH 4/7] feat(stack): resume sell-inference offers on stack up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the "cluster comes back but the seller offers don't" gap. After `obol stack down` destroys the k3d cluster, all ServiceOffer custom resources are wiped from etcd. The descriptors on disk at ~/.workspace/config/inference// survive, but nothing replays them when the cluster comes back, so operators had to manually re-run `obol sell inference ` for every offer after every stack up. This wires a resume step into `obol stack up`: after stack.Up returns successfully, the action iterates inference.Store, rebuilds the Service+Endpoints+ServiceOffer for each persisted deployment from the on-disk descriptor, and kubectl-applies them. The foreground host gateway is NOT auto-started — it is an interactive operator action and stack-up shouldn't launch long-running processes — but the cluster side is fully reattached so the operator's eventual `obol sell inference ` rerun hits a "service already healthy" reconcile instead of "build from scratch." Storage model: - inference.Deployment gains ModelName, ServiceNamespace, and Registration fields, persisted with `omitempty` so legacy descriptors written by older binaries still load (the new fields come back as zero values; the resume path either defaults sensibly or refuses with an actionable message). - The inference Action now resolves the registration block once and passes the same map to both store.Create() and the in-process ServiceOffer apply — guaranteeing on-disk state matches what the cluster sees. Resume path: - resumeSellOffers(ctx, cfg, u): lists inference.Store, skips when no kubeconfig (no cluster yet), warns + continues per-offer on errors. - resumeOneInferenceOffer: createHostService + buildInferenceServiceOfferSpec + kubectlApply, no foreground process. Returns an actionable error on missing ModelName so legacy descriptors surface a clear "recreate the offer" prompt rather than producing a broken ServiceOffer. - Wired from cmd/obol/main.go's `stack up` Action after stack.Up. A resume failure is logged as a warning, not propagated — stack-up must succeed even if one descriptor is malformed. Tests: - TestStoreCreate_PersistsResumeFields: pins ModelName, ServiceNamespace, and Registration round-trip through JSON. Without this round-trip, the resume path silently loses operator customizations. - TestStoreCreate_LegacyDescriptorWithoutResumeFields: pins backwards-compatibility — a JSON written by an older binary still loads, with the new fields as zero values. - TestResumeSellOffers_EmptyStoreNoOp: empty store, resume returns nil. - TestResumeSellOffers_DescriptorPresentButNoCluster: descriptor on disk, kubeconfig absent (post-purge, pre-up state) — resume must skip cleanly. - TestResumeOneInferenceOffer_RequiresModelName: legacy descriptor with empty ModelName surfaces an actionable error naming the missing field and the recovery command. - TestResumeOneInferenceOffer_NilDescriptor: defensive nil/empty descriptor guard so one bad entry can't crash the loop. - TestStackUpAction_CallsResumeSellOffers: source-level guard that cmd/obol/main.go's `stack up` Action calls resumeSellOffers AFTER stack.Up. Pinning the order — running resume before stack.Up would see no kubeconfig and skip every offer silently. Operational note: `obol sell http` offers don't yet have an on-disk descriptor (only `obol sell inference` persists), so they aren't covered by this resume pass. Filed as a follow-up — adding a parallel manifest store for `sell http` plus a third resume branch is its own PR. --- cmd/obol/main.go | 17 ++- cmd/obol/sell.go | 180 +++++++++++++++++++++++++------ cmd/obol/sell_test.go | 114 ++++++++++++++++++++ internal/inference/store.go | 19 ++++ internal/inference/store_test.go | 101 +++++++++++++++++ 5 files changed, 400 insertions(+), 31 deletions(-) diff --git a/cmd/obol/main.go b/cmd/obol/main.go index 72055ea3..72b8d4f6 100644 --- a/cmd/obol/main.go +++ b/cmd/obol/main.go @@ -204,7 +204,22 @@ GLOBAL OPTIONS:{{template "visibleFlagTemplate" .}}{{end}} }, }, Action: func(ctx context.Context, cmd *cli.Command) error { - return stack.Up(cfg, getUI(cmd), cmd.Bool("wildcard-dns")) + u := getUI(cmd) + if err := stack.Up(cfg, u, cmd.Bool("wildcard-dns")); err != nil { + return err + } + // Re-apply cluster-side state for locally-persisted + // `obol sell *` offers. ServiceOffer CRs and the + // Service/Endpoints that route to the host gateway + // live in etcd, which is destroyed by `obol stack + // down`, so a fresh `stack up` would otherwise come + // back with the descriptors still on disk but no + // matching cluster resources. Best-effort: a resume + // failure does not block stack-up. + if err := resumeSellOffers(ctx, cfg, u); err != nil { + u.Warnf("Could not resume sell offers: %v", err) + } + return nil }, }, { diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index 724a14bf..f5d35e1a 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -334,24 +334,46 @@ Examples: assetSymbol = "USDC" } + // Resolve the registration block once, here, so we can persist it + // alongside the deployment descriptor. The resume path + // (`obol stack up` after a stack-down) rebuilds the ServiceOffer + // from the on-disk descriptor; without the registration block + // persisted, replays would lose the operator's --register-* + // customizations. + persistedRegistration, _, regErr := buildSellRegistrationConfig(name, sellRegistrationInput{ + NoRegister: cmd.Bool("no-register"), + Name: cmd.String("register-name"), + Description: cmd.String("register-description"), + Image: cmd.String("register-image"), + Skills: cmd.StringSlice("register-skills"), + Domains: cmd.StringSlice("register-domains"), + MetadataPairs: cmd.StringSlice("register-metadata"), + }) + if regErr != nil { + return regErr + } + d := &inference.Deployment{ - Name: name, - EnclaveTag: cmd.String("enclave-tag"), - ListenAddr: cmd.String("listen"), - UpstreamURL: upstreamFlag, - WalletAddress: wallet, - PricePerRequest: perRequest, - PricePerMTok: priceTable.PerMTok, - AssetSymbol: assetSymbol, - Chain: chainName, - FacilitatorURL: cmd.String("facilitator"), - VMMode: cmd.Bool("vm"), - VMImage: cmd.String("vm-image"), - VMCPUs: cmd.Int("vm-cpus"), - VMMemoryMB: cmd.Int("vm-memory"), - VMHostPort: cmd.Int("vm-host-port"), - TEEType: teeType, - ModelHash: modelHash, + Name: name, + EnclaveTag: cmd.String("enclave-tag"), + ListenAddr: cmd.String("listen"), + UpstreamURL: upstreamFlag, + WalletAddress: wallet, + PricePerRequest: perRequest, + PricePerMTok: priceTable.PerMTok, + AssetSymbol: assetSymbol, + Chain: chainName, + FacilitatorURL: cmd.String("facilitator"), + VMMode: cmd.Bool("vm"), + VMImage: cmd.String("vm-image"), + VMCPUs: cmd.Int("vm-cpus"), + VMMemoryMB: cmd.Int("vm-memory"), + VMHostPort: cmd.Int("vm-host-port"), + TEEType: teeType, + ModelHash: modelHash, + ModelName: modelFlag, + ServiceNamespace: "llm", + Registration: persistedRegistration, } if pf := cmd.String("provenance-file"); pf != "" { @@ -412,19 +434,9 @@ Examples: d.NoPaymentGate = false } else { // Create a ServiceOffer CR pointing at the host service. - reg, _, regErr := buildSellRegistrationConfig(name, sellRegistrationInput{ - NoRegister: cmd.Bool("no-register"), - Name: cmd.String("register-name"), - Description: cmd.String("register-description"), - Image: cmd.String("register-image"), - Skills: cmd.StringSlice("register-skills"), - Domains: cmd.StringSlice("register-domains"), - MetadataPairs: cmd.StringSlice("register-metadata"), - }) - if regErr != nil { - return regErr - } - soSpec, err := buildInferenceServiceOfferSpec(d, priceTable, svcNs, port, assetTerms, modelFlag, reg) + // Reuse the persistedRegistration resolved above; both this + // in-process create AND the on-disk descriptor must agree. + soSpec, err := buildInferenceServiceOfferSpec(d, priceTable, svcNs, port, assetTerms, modelFlag, persistedRegistration) if err != nil { return err } @@ -3441,6 +3453,114 @@ func loadProvenance(path string) (*inference.Provenance, error) { // // Kubernetes Endpoints require an IP address, not a hostname. We resolve the // host IP using the same strategy as ollamaHostIPForBackend in internal/stack. +// resumeSellOffers re-applies the cluster-side artifacts (Service + +// Endpoints + ServiceOffer) for every locally-persisted `obol sell +// inference` deployment after `obol stack up` brings a fresh cluster +// online. Without this step the cluster has no record of operator-created +// offers (CRs live in etcd, which is destroyed by `obol stack down`), +// even though the host-side descriptors at +// `/inference//` still exist. +// +// The foreground gateway is NOT restarted here — `obol sell inference` +// is an interactive operator action and we don't want stack-up to launch +// long-running processes. The operator re-runs `obol sell inference +// ` after stack-up to bring the gateway back; this step ensures +// the cluster side is already in place so the gateway hits a "service +// healthy" reconcile instead of "create from scratch". +// +// Best-effort. Per-offer failures emit a warning and the loop continues, +// so one broken descriptor cannot block stack-up. +func resumeSellOffers(ctx context.Context, cfg *config.Config, u *ui.UI) error { + store := inference.NewStore(cfg.ConfigDir) + deployments, err := store.List() + if err != nil { + return fmt.Errorf("list inference deployments: %w", err) + } + if len(deployments) == 0 { + return nil + } + + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + if _, statErr := os.Stat(kubeconfigPath); statErr != nil { + // No cluster — nothing to reattach. + return nil + } + + u.Blank() + u.Infof("Resuming %d locally-persisted sell-inference offer(s)...", len(deployments)) + + var resumed int + for _, d := range deployments { + if err := resumeOneInferenceOffer(cfg, u, d); err != nil { + u.Warnf("resume %s: %v", d.Name, err) + continue + } + resumed++ + u.Successf("Resumed sell-inference offer %q (run `obol sell inference %s` to restart the host gateway)", d.Name, d.Name) + } + + if resumed > 0 { + u.Dim(" Host gateways are not auto-started — re-run `obol sell inference ` in a terminal you can keep open.") + } + _ = ctx // reserved for cancellation support; current resume calls are synchronous and short + return nil +} + +// resumeOneInferenceOffer re-creates the cluster-side artifacts that +// `obol sell inference` would have produced for a single Deployment. Pure +// in the sense that it only consumes the on-disk descriptor; it never +// re-prompts the operator. Returns an error when the descriptor is +// incomplete (no model name, no namespace, no listen port) so the resume +// loop can surface a clear message per-offer. +func resumeOneInferenceOffer(cfg *config.Config, u *ui.UI, d *inference.Deployment) error { + if d == nil || d.Name == "" { + return errors.New("nil or unnamed deployment descriptor") + } + if d.ModelName == "" { + return fmt.Errorf("deployment %q is missing model_name on disk — recreate the offer with `obol sell inference %s --model ...`", d.Name, d.Name) + } + ns := d.ServiceNamespace + if ns == "" { + ns = "llm" // legacy descriptors written before service_namespace was persisted + } + + port := "8402" + if idx := strings.LastIndex(d.ListenAddr, ":"); idx >= 0 && idx+1 < len(d.ListenAddr) { + port = d.ListenAddr[idx+1:] + } + + if err := createHostService(cfg, d.Name, ns, port); err != nil { + return fmt.Errorf("create cluster Service/Endpoints: %w", err) + } + + chainName := d.Chain + assetTerms, err := resolveAssetTermsFor(d.AssetSymbol, &chainName, true) + if err != nil { + return fmt.Errorf("resolve asset terms: %w", err) + } + + pt := schemas.PriceTable{PerRequest: d.PricePerRequest, PerMTok: d.PricePerMTok} + soSpec, err := buildInferenceServiceOfferSpec(d, pt, ns, port, assetTerms, d.ModelName, d.Registration) + if err != nil { + return fmt.Errorf("rebuild ServiceOffer spec: %w", err) + } + + manifest := map[string]any{ + "apiVersion": "obol.org/v1alpha1", + "kind": "ServiceOffer", + "metadata": map[string]any{ + "name": d.Name, + "namespace": ns, + }, + "spec": soSpec, + } + if err := kubectlApply(cfg, manifest); err != nil { + return fmt.Errorf("apply ServiceOffer: %w", err) + } + _ = u // ui is held for caller-side messaging only + return nil +} + func createHostService(cfg *config.Config, name, ns, port string) error { hostIP, err := resolveHostIP(cfg) if err != nil { diff --git a/cmd/obol/sell_test.go b/cmd/obol/sell_test.go index 7ec30a05..bdcd1e1e 100644 --- a/cmd/obol/sell_test.go +++ b/cmd/obol/sell_test.go @@ -1,6 +1,7 @@ package main import ( + "context" "errors" "os" "strings" @@ -10,6 +11,7 @@ import ( "github.com/ObolNetwork/obol-stack/internal/inference" "github.com/ObolNetwork/obol-stack/internal/monetizeapi" "github.com/ObolNetwork/obol-stack/internal/schemas" + "github.com/ObolNetwork/obol-stack/internal/ui" x402verifier "github.com/ObolNetwork/obol-stack/internal/x402" "github.com/urfave/cli/v3" ) @@ -1223,6 +1225,118 @@ func TestSellUpdate_PayToFlagSurface(t *testing.T) { assertFlagHasAlias(t, flags, "pay-to", "w") } +// TestResumeSellOffers_EmptyStoreNoOp pins the "nothing to resume" path: +// stack-up against a workspace with no persisted sell-inference deployments +// must return nil without erroring. The same path also has to handle the +// no-cluster-yet case (no kubeconfig.yaml) gracefully — both are exercised +// by an empty ConfigDir. +func TestResumeSellOffers_EmptyStoreNoOp(t *testing.T) { + cfg := newTestConfig(t) + u := ui.New(false) + if err := resumeSellOffers(context.Background(), cfg, u); err != nil { + t.Fatalf("empty resume must succeed, got: %v", err) + } +} + +// TestResumeSellOffers_DescriptorPresentButNoCluster pins the +// "descriptors-on-disk-but-cluster-not-up-yet" path. Real-world example: +// `obol stack down`, then re-running `obol sell inference` somewhere that +// happens to fail before reaching cluster apply — the descriptor lands on +// disk, the cluster never comes back, and a subsequent `obol stack up` +// against a missing kubeconfig must NOT panic or hard-error. It should be +// a quiet skip until the cluster is available. +func TestResumeSellOffers_DescriptorPresentButNoCluster(t *testing.T) { + cfg := newTestConfig(t) + store := inference.NewStore(cfg.ConfigDir) + if err := store.Create(&inference.Deployment{ + Name: "aeon", + ModelName: "aeon-ultimate", + ServiceNamespace: "llm", + ListenAddr: "0.0.0.0:8402", + WalletAddress: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + PricePerRequest: "0.023", + AssetSymbol: "OBOL", + Chain: "base-sepolia", + }, true); err != nil { + t.Fatalf("Create: %v", err) + } + // No kubeconfig.yaml in cfg.ConfigDir — resume must not attempt + // cluster operations. + u := ui.New(false) + if err := resumeSellOffers(context.Background(), cfg, u); err != nil { + t.Fatalf("resume with descriptor + no cluster must skip cleanly, got: %v", err) + } +} + +// TestResumeOneInferenceOffer_RequiresModelName pins the per-offer error +// for legacy descriptors that were written before ModelName became a +// persisted field. The resume path cannot fabricate a model name out of +// thin air, and silently writing a ServiceOffer with no spec.model.name +// would surface as a controller "ModelReady=False" loop with no actionable +// signal. Operators need a clear "recreate the offer" message instead. +func TestResumeOneInferenceOffer_RequiresModelName(t *testing.T) { + cfg := newTestConfig(t) + u := ui.New(false) + err := resumeOneInferenceOffer(cfg, u, &inference.Deployment{ + Name: "legacy", + ServiceNamespace: "llm", + ListenAddr: ":8402", + // No ModelName. + }) + if err == nil { + t.Fatal("expected error for legacy descriptor with no ModelName") + } + for _, sub := range []string{"model_name", "obol sell inference"} { + if !strings.Contains(err.Error(), sub) { + t.Errorf("error must name the missing field and the recovery command; missing %q: %v", sub, err) + } + } +} + +// TestResumeOneInferenceOffer_NilDescriptor pins the defensive guard for +// the never-supposed-to-happen case of a nil or empty descriptor reaching +// the resume loop. A bug elsewhere that produces such an entry shouldn't +// panic the entire resume pass; one descriptor failing should not block +// the rest. +func TestResumeOneInferenceOffer_NilDescriptor(t *testing.T) { + cfg := newTestConfig(t) + u := ui.New(false) + if err := resumeOneInferenceOffer(cfg, u, nil); err == nil { + t.Fatal("expected error for nil descriptor") + } + if err := resumeOneInferenceOffer(cfg, u, &inference.Deployment{}); err == nil { + t.Fatal("expected error for empty descriptor (no Name)") + } +} + +// TestStackUpAction_CallsResumeSellOffers is a source-level guard against +// silently regressing the stack-up → sell-resume wiring. The whole feature +// hinges on the `stack up` action handler calling resumeSellOffers after +// stack.Up succeeds; without that call, persisted sell-inference offers +// stay on disk forever and never reach the freshly-recreated cluster. +// A future refactor that splits the handler or moves the call must update +// this test, which forces a moment of "why is this here" attention. +func TestStackUpAction_CallsResumeSellOffers(t *testing.T) { + src, err := os.ReadFile("main.go") + if err != nil { + t.Fatalf("read main.go: %v", err) + } + body := string(src) + if !strings.Contains(body, "resumeSellOffers(") { + t.Fatal("cmd/obol/main.go must call resumeSellOffers — without it persisted sell-inference offers never reach a freshly-stacked cluster") + } + // Belt-and-suspenders: assert the call lives after stack.Up so the + // kubeconfig + infrastructure are ready when resume runs. + upIdx := strings.Index(body, "stack.Up(cfg") + resumeIdx := strings.Index(body, "resumeSellOffers(") + if upIdx < 0 || resumeIdx < 0 { + t.Fatalf("expected both stack.Up and resumeSellOffers in main.go; upIdx=%d resumeIdx=%d", upIdx, resumeIdx) + } + if resumeIdx < upIdx { + t.Error("resumeSellOffers must be invoked AFTER stack.Up — running it before the cluster is up will see no kubeconfig and skip every offer") + } +} + func TestBuildDemoServiceOffer_RegisterFlagDrivesEnabled(t *testing.T) { for _, register := range []bool{true, false} { manifest := buildDemoServiceOffer( diff --git a/internal/inference/store.go b/internal/inference/store.go index fe9740cd..f2569b63 100644 --- a/internal/inference/store.go +++ b/internal/inference/store.go @@ -102,6 +102,25 @@ type Deployment struct { // config and passed to the registration document when selling. Provenance *Provenance `json:"provenance,omitempty"` + // ModelName is the upstream model identifier the operator selected via + // `obol sell inference --model ` (e.g. "aeon-ultimate"). Persisted + // so `obol stack up` can rebuild the ServiceOffer's spec.model.name on + // resume without re-prompting the operator. + ModelName string `json:"model_name,omitempty"` + + // ServiceNamespace is the Kubernetes namespace the cluster-routed Service + // + Endpoints + ServiceOffer live in (typically "llm" for inference + // deployments co-located with LiteLLM). Persisted so the resume path + // re-applies into the same namespace. + ServiceNamespace string `json:"service_namespace,omitempty"` + + // Registration is the ERC-8004 registration block the operator passed at + // sell time (--register-*). Persisted as map[string]any (the same shape + // buildSellRegistrationConfig emits) so the resume path can rebuild the + // ServiceOffer's spec.registration verbatim. Empty when the offer was + // created with --no-register. + Registration map[string]any `json:"registration,omitempty"` + // CreatedAt is the RFC3339 timestamp of when this deployment was created. CreatedAt string `json:"created_at"` diff --git a/internal/inference/store_test.go b/internal/inference/store_test.go index bc1fca70..42299d10 100644 --- a/internal/inference/store_test.go +++ b/internal/inference/store_test.go @@ -301,3 +301,104 @@ func TestStoreDirPermissions(t *testing.T) { t.Errorf("config.json permissions: want 0600, got %04o", mode) } } + +// TestStoreCreate_PersistsResumeFields pins the new resume-side fields on +// the inference Deployment descriptor. Without ModelName, ServiceNamespace, +// and Registration persisted on disk, the `obol stack up` resume path +// would lose the operator's --model + --register-* customizations and +// rebuild a stripped-down ServiceOffer (or fail outright on missing +// model_name). The round-trip here is the contract: anything Create() +// writes must come back verbatim from Get(). +func TestStoreCreate_PersistsResumeFields(t *testing.T) { + dir := t.TempDir() + store := inference.NewStore(dir) + + registration := map[string]any{ + "enabled": true, + "name": "Qwen3.6-27B AEON Ultimate", + "description": "Uncensored Qwen3.6-27B abliteration on DGX Spark", + "skills": []any{"llm/inference", "llm/uncensored"}, + "domains": []any{"inference.v1337.org"}, + } + + in := &inference.Deployment{ + Name: "aeon", + EnclaveTag: "com.obol.inference.aeon", + ListenAddr: "0.0.0.0:8402", + UpstreamURL: "http://127.0.0.1:8000", + WalletAddress: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + PricePerRequest: "0.023", + AssetSymbol: "OBOL", + Chain: "base-sepolia", + ModelName: "aeon-ultimate", + ServiceNamespace: "llm", + Registration: registration, + } + if err := store.Create(in, true); err != nil { + t.Fatalf("Create: %v", err) + } + + got, err := store.Get("aeon") + if err != nil { + t.Fatalf("Get: %v", err) + } + + if got.ModelName != "aeon-ultimate" { + t.Errorf("ModelName = %q, want %q", got.ModelName, "aeon-ultimate") + } + if got.ServiceNamespace != "llm" { + t.Errorf("ServiceNamespace = %q, want %q", got.ServiceNamespace, "llm") + } + if got.Registration == nil { + t.Fatal("Registration round-tripped to nil; resume would rebuild offer without operator customizations") + } + if got.Registration["enabled"] != true { + t.Errorf("Registration.enabled = %v, want true", got.Registration["enabled"]) + } + if got.Registration["name"] != "Qwen3.6-27B AEON Ultimate" { + t.Errorf("Registration.name = %v, want %q", got.Registration["name"], "Qwen3.6-27B AEON Ultimate") + } +} + +// TestStoreCreate_LegacyDescriptorWithoutResumeFields pins backwards- +// compatibility: a descriptor written by an older binary (no ModelName, +// no ServiceNamespace, no Registration) must still be readable, with the +// new fields coming back as zero values. The resume path uses the zero +// values to either default sensibly or refuse with an actionable error. +func TestStoreCreate_LegacyDescriptorWithoutResumeFields(t *testing.T) { + dir := t.TempDir() + configDir := dir + "/inference/legacy" + if err := os.MkdirAll(configDir, 0o700); err != nil { + t.Fatalf("mkdir: %v", err) + } + // JSON shape an older binary would have written — no model_name, + // no service_namespace, no registration. + legacyJSON := `{ + "name": "legacy", + "listen_addr": ":8402", + "upstream_url": "http://localhost:11434", + "wallet_address": "0xefab75b7b199bf8512e2d5b379374cb94dfdba47", + "price_per_request": "0.001", + "chain": "base", + "facilitator_url": "https://x402.gcp.obol.tech", + "created_at": "2026-01-01T00:00:00Z" + }` + if err := os.WriteFile(configDir+"/config.json", []byte(legacyJSON), 0o600); err != nil { + t.Fatalf("write: %v", err) + } + + store := inference.NewStore(dir) + got, err := store.Get("legacy") + if err != nil { + t.Fatalf("Get: %v", err) + } + if got.ModelName != "" { + t.Errorf("ModelName = %q, want empty for legacy descriptor", got.ModelName) + } + if got.ServiceNamespace != "" { + t.Errorf("ServiceNamespace = %q, want empty for legacy descriptor", got.ServiceNamespace) + } + if got.Registration != nil { + t.Errorf("Registration = %#v, want nil for legacy descriptor", got.Registration) + } +} From 6efdf3ff4ff21d55e671a251024a3334a0ee1ad8 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 12 May 2026 19:51:26 +0800 Subject: [PATCH 5/7] =?UTF-8?q?feat(stack):=20zero-command=20resume=20?= =?UTF-8?q?=E2=80=94=20auto-start=20gateway=20+=20relax=20storefront=20fil?= =?UTF-8?q?ter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first cut of stack-up resume (prior commit) reattached cluster-side artifacts but stopped short of two things that together left the operator's storefront empty after every stack-up cycle: 1. The foreground x402 gateway never restarted, so UpstreamHealthy=False. 2. Even after a manual gateway restart, the controller's storefront filter required Ready=True — which itself requires Registered=True — and an unfunded agent wallet (or a deliberate "register later" choice) leaves Registered=False AwaitingExternalRegistration. The operationally-usable offer was invisible to the operator's own UI. Both addressed here so `obol stack up` is the only command needed. 1. Auto-start the gateway as a detached host subprocess: - startDetachedInferenceGateway reconstructs the original `obol sell inference --model … --register-* …` invocation from the persisted inference.Deployment and spawns it with Setsid + Process.Release so it survives the parent's exit. - PID + log path live under /sell-inference//. readGatewayPID + processAlive let subsequent stack-ups detect a still-running gateway and skip the relaunch. - Surfaced via a u.Successf with the PID + log path so operators can `tail -f` or `kill $(cat .pid)` without parsing helpers. - sell_proc_unix.go isolates the platform-specific SysProcAttr behind the unix build tag, keeping the rest of the code portable. The long-term shape is to ship the inference gateway as a real helm-managed cluster Pod (it would survive stack-down/up like every other infra piece, and the controller could observe it as a first-class workload). Comment in startDetachedInferenceGateway spells that out so the next person looking at this knows the host subprocess + PID-file plumbing is the deliberate near-term step, not the final form. 2. Relax the storefront filter: - offerOperationallyReady = ModelReady + UpstreamHealthy + PaymentGateReady + RoutePublished. Registered is intentionally NOT in the AND — on-chain ERC-8004 registration is publication metadata, not operational readiness. Aggregate Ready=True is also accepted as a shortcut so existing test fixtures and any external callers that only emit the aggregate signal still work. - offerAwaitingRegistration flags Registered=False AwaitingExternalRegistration specifically, and buildServiceCatalogJSON propagates it as ServiceCatalogEntry.RegistrationPending=true so storefront UIs can badge "registration pending" alongside the usable offer. Tests: - TestBuildResumeGatewayArgs (3 cases): full descriptor incl. registration map, --no-register path, legacy descriptor with no Registration map. Pins positional-name-before-flags ordering so a CLI surface tweak can't silently desync the resume relaunch. - TestReadGatewayPID (8 cases): clean integer, trailing newline, surrounding whitespace, zero rejected, negative rejected, non-numeric rejected, empty rejected, missing-file rejected. Format is one decimal int in ASCII so external tooling (`kill $(cat .pid)`) works without parsing. - TestProcessAlive_SelfAndBogus: self is alive, absurd PID is not. - TestOfferOperationallyReady_IncludesAwaitingExternalRegistration: the headline behavioral fix — an offer with all four ops conditions True + Registered=False(AwaitingExternalRegistration) is operationally ready. - TestOfferOperationallyReady_RejectsRealNotReady: an offer with UpstreamHealthy=False is still excluded — the relax is narrow to the registration gate only. - TestBuildServiceCatalogJSON_IncludesPendingRegistrationOffers: end-to-end through /api/services.json — AwaitingExternalRegistration offers appear with RegistrationPending=true. - TestBuildServiceCatalogJSON_RegistrationPendingFalseForFullyReady: the negative — fully Ready=True offers must NOT carry RegistrationPending or the storefront badge would stick around on healthy offers. Operational footnote: the auto-spawned gateway picks up the same inference.Deployment that originally created the offer. If the descriptor predates this PR and has no ModelName, the resume per-offer warning fires (already pinned by TestResumeOneInferenceOffer_RequiresModelName in the prior commit) — recreate the offer once with the new code so the descriptor has the resume-side fields, after which every subsequent stack down/up cycle reattaches the offer with zero extra commands. --- cmd/obol/sell.go | 188 +++++++++++++++++- cmd/obol/sell_proc_unix.go | 18 ++ cmd/obol/sell_test.go | 184 +++++++++++++++++ internal/schemas/service_catalog.go | 8 + internal/serviceoffercontroller/render.go | 87 ++++++-- .../serviceoffercontroller/render_test.go | 136 +++++++++++++ 6 files changed, 608 insertions(+), 13 deletions(-) create mode 100644 cmd/obol/sell_proc_unix.go diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index f5d35e1a..1e9e7f99 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -3557,10 +3557,196 @@ func resumeOneInferenceOffer(cfg *config.Config, u *ui.UI, d *inference.Deployme if err := kubectlApply(cfg, manifest); err != nil { return fmt.Errorf("apply ServiceOffer: %w", err) } - _ = u // ui is held for caller-side messaging only + + // Auto-start the foreground x402 gateway as a detached background + // subprocess so the offer reaches UpstreamHealthy=True (and therefore + // shows up in /api/services.json) without the operator running an + // extra `obol sell inference` command after every stack-up. + // + // This is the pragmatic shape of the resume feature today. The proper + // long-term path is C from the design doc: build the inference gateway + // as an in-cluster Deployment image, helmfile-managed, so stack-up + // brings it back the same way it brings back Traefik or LiteLLM. That + // removes the host-side subprocess + PID-file plumbing entirely and + // lets the controller observe the gateway as a normal Pod. Tracked as + // a follow-up because the gateway code (internal/inference/gateway.go) + // currently runs in-process with the CLI, not as a packaged image. + if err := startDetachedInferenceGateway(cfg, u, d); err != nil { + u.Warnf("could not auto-start host gateway for %q: %v", d.Name, err) + u.Dim(" Run `obol sell inference " + d.Name + "` manually in a terminal you can keep open.") + } + return nil +} + +// startDetachedInferenceGateway forks `obol sell inference ` as a +// detached background subprocess. The CLI rebuilds the full flag set from +// the persisted Deployment, redirects stdout/stderr to a log file under +// the workspace state dir, and stores the PID alongside so subsequent +// stack-ups can detect a still-running gateway and skip the relaunch. +// +// Skipped silently when a healthy PID is already on disk. Returns an +// error only when launch itself fails (e.g. missing obol binary, log +// file unwritable). Callers should warn-and-continue on failure — a +// missing gateway leaves UpstreamHealthy=False which the controller will +// re-emit clearly, while a hard error here would block the rest of the +// resume loop. +func startDetachedInferenceGateway(cfg *config.Config, u *ui.UI, d *inference.Deployment) error { + if d == nil || d.Name == "" { + return errors.New("nil or unnamed deployment") + } + + stateDir := filepath.Join(cfg.StateDir, "sell-inference", d.Name) + if err := os.MkdirAll(stateDir, 0o755); err != nil { + return fmt.Errorf("create state dir: %w", err) + } + pidFile := filepath.Join(stateDir, "gateway.pid") + logFile := filepath.Join(stateDir, "gateway.log") + + if pid, ok := readGatewayPID(pidFile); ok && processAlive(pid) { + u.Dim(" Gateway already running for " + d.Name + " (pid " + strconv.Itoa(pid) + ")") + return nil + } + + obolBin := filepath.Join(cfg.BinDir, "obol") + if _, statErr := os.Stat(obolBin); statErr != nil { + // Fall back to whatever obol the parent process is running as. + exe, exeErr := os.Executable() + if exeErr != nil { + return fmt.Errorf("locate obol binary: %w", exeErr) + } + obolBin = exe + } + + args := buildResumeGatewayArgs(d) + + logF, err := os.OpenFile(logFile, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644) + if err != nil { + return fmt.Errorf("open log file: %w", err) + } + + cmd := exec.Command(obolBin, args...) + cmd.Stdout = logF + cmd.Stderr = logF + cmd.SysProcAttr = detachedSysProcAttr() + cmd.Env = os.Environ() + + if err := cmd.Start(); err != nil { + _ = logF.Close() + return fmt.Errorf("start gateway subprocess: %w", err) + } + // We don't wait on the child, but releasing the handle ensures the + // parent process can exit cleanly without keeping a zombie reference. + if err := cmd.Process.Release(); err != nil { + u.Dim(" (could not release gateway process handle: " + err.Error() + ")") + } + if err := os.WriteFile(pidFile, []byte(strconv.Itoa(cmd.Process.Pid)), 0o644); err != nil { + u.Warnf("gateway started (pid %d) but could not write %s: %v", cmd.Process.Pid, pidFile, err) + } + u.Successf("Gateway started in background (pid %d, log %s)", cmd.Process.Pid, logFile) + return nil +} + +// buildResumeGatewayArgs reconstructs the `obol sell inference` flag set +// that originally created the offer, from the persisted Deployment. The +// order matches obol sell inference's flag list so a future surface +// reduction of the CLI doesn't silently drop a piece of operator intent +// (CI's source-level guard tests catch that case). +// +// Exposed for testing: callers can compare the reproduced flag set +// against the original `obol sell inference` invocation that wrote the +// descriptor. +func buildResumeGatewayArgs(d *inference.Deployment) []string { + args := []string{"sell", "inference", d.Name} + if d.ModelName != "" { + args = append(args, "--model", d.ModelName) + } + if d.UpstreamURL != "" { + args = append(args, "--upstream", d.UpstreamURL) + } + if d.WalletAddress != "" { + args = append(args, "--pay-to", d.WalletAddress) + } + if d.ListenAddr != "" { + args = append(args, "--listen", d.ListenAddr) + } + if d.Chain != "" { + args = append(args, "--chain", d.Chain) + } + if d.AssetSymbol != "" { + args = append(args, "--token", d.AssetSymbol) + } + if d.PricePerMTok != "" { + args = append(args, "--per-mtok", d.PricePerMTok) + } else if d.PricePerRequest != "" { + args = append(args, "--price", d.PricePerRequest) + } + if d.FacilitatorURL != "" { + args = append(args, "--facilitator", d.FacilitatorURL) + } + // Registration block — rebuild --register-* flags from the persisted map. + if reg, ok := d.Registration["enabled"].(bool); ok && !reg { + args = append(args, "--no-register") + } else if d.Registration != nil { + if v, _ := d.Registration["name"].(string); v != "" { + args = append(args, "--register-name", v) + } + if v, _ := d.Registration["description"].(string); v != "" { + args = append(args, "--register-description", v) + } + if v, _ := d.Registration["image"].(string); v != "" { + args = append(args, "--register-image", v) + } + for _, s := range registrationStringSlice(d.Registration, "skills") { + args = append(args, "--register-skills", s) + } + for _, s := range registrationStringSlice(d.Registration, "domains") { + args = append(args, "--register-domains", s) + } + } + return args +} + +func registrationStringSlice(reg map[string]any, key string) []string { + raw, ok := reg[key] + if !ok { + return nil + } + switch v := raw.(type) { + case []string: + return v + case []any: + out := make([]string, 0, len(v)) + for _, item := range v { + if s, ok := item.(string); ok { + out = append(out, s) + } + } + return out + } return nil } +func readGatewayPID(path string) (int, bool) { + data, err := os.ReadFile(path) + if err != nil { + return 0, false + } + pid, err := strconv.Atoi(strings.TrimSpace(string(data))) + if err != nil || pid <= 0 { + return 0, false + } + return pid, true +} + +func processAlive(pid int) bool { + p, err := os.FindProcess(pid) + if err != nil { + return false + } + // Signal 0 doesn't deliver — just probes existence/permission on Unix. + return p.Signal(syscall.Signal(0)) == nil +} + func createHostService(cfg *config.Config, name, ns, port string) error { hostIP, err := resolveHostIP(cfg) if err != nil { diff --git a/cmd/obol/sell_proc_unix.go b/cmd/obol/sell_proc_unix.go new file mode 100644 index 00000000..e5604a9e --- /dev/null +++ b/cmd/obol/sell_proc_unix.go @@ -0,0 +1,18 @@ +//go:build unix + +package main + +import "syscall" + +// detachedSysProcAttr returns the platform-specific SysProcAttr that +// detaches a spawned child from the parent's process group. On Unix this +// is a new session leader (Setsid: true) so the child survives the +// parent's exit and does not receive SIGHUP / SIGINT propagated from +// the controlling terminal. +// +// Linux/macOS/BSD share the same shape via the unix build tag. If we +// ever ship a Windows obol binary, add sell_proc_windows.go with a +// CREATE_NEW_PROCESS_GROUP equivalent. +func detachedSysProcAttr() *syscall.SysProcAttr { + return &syscall.SysProcAttr{Setsid: true} +} diff --git a/cmd/obol/sell_test.go b/cmd/obol/sell_test.go index bdcd1e1e..15c03b8d 100644 --- a/cmd/obol/sell_test.go +++ b/cmd/obol/sell_test.go @@ -4,6 +4,7 @@ import ( "context" "errors" "os" + "path/filepath" "strings" "testing" @@ -1337,6 +1338,189 @@ func TestStackUpAction_CallsResumeSellOffers(t *testing.T) { } } +// TestBuildResumeGatewayArgs pins the round-trip between an +// inference.Deployment on disk and the `obol sell inference` invocation +// the resume path uses to relaunch the host gateway. If a flag is dropped +// here, an offer that came back from disk would lose part of the +// operator's original intent — most painfully, registration metadata +// that fell out of the relaunch would leave the gateway running with +// stripped-down /.well-known/ content. +func TestBuildResumeGatewayArgs(t *testing.T) { + tests := []struct { + name string + d *inference.Deployment + wantSubs []string + wantNoSub []string + }{ + { + name: "full descriptor (the spark2 aeon offer)", + d: &inference.Deployment{ + Name: "aeon", + ModelName: "aeon-ultimate", + UpstreamURL: "http://127.0.0.1:8000", + WalletAddress: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + ListenAddr: "0.0.0.0:8402", + Chain: "base-sepolia", + AssetSymbol: "OBOL", + PricePerMTok: "23", + PricePerRequest: "0.023", + FacilitatorURL: "https://x402.gcp.obol.tech", + Registration: map[string]any{ + "enabled": true, + "name": "Qwen3.6-27B AEON Ultimate", + "description": "Uncensored Qwen3.6-27B abliteration", + "skills": []any{"llm/inference", "llm/uncensored"}, + "domains": []any{"inference.v1337.org"}, + }, + }, + wantSubs: []string{ + "sell", "inference", "aeon", + "--model", "aeon-ultimate", + "--upstream", "http://127.0.0.1:8000", + "--pay-to", "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + "--listen", "0.0.0.0:8402", + "--chain", "base-sepolia", + "--token", "OBOL", + "--per-mtok", "23", + "--facilitator", "https://x402.gcp.obol.tech", + "--register-name", "Qwen3.6-27B AEON Ultimate", + "--register-description", "Uncensored Qwen3.6-27B abliteration", + "--register-skills", "llm/inference", + "--register-skills", "llm/uncensored", + "--register-domains", "inference.v1337.org", + }, + wantNoSub: []string{ + "--price", // perMTok set, perRequest must not also be passed + "--no-register", + }, + }, + { + name: "no-register descriptor", + d: &inference.Deployment{ + Name: "no-reg", + ModelName: "qwen3:0.6b", + ListenAddr: ":8402", + Registration: map[string]any{"enabled": false}, + }, + wantSubs: []string{ + "--no-register", + }, + wantNoSub: []string{ + "--register-name", + "--register-description", + }, + }, + { + name: "legacy descriptor (no registration map at all)", + d: &inference.Deployment{ + Name: "legacy", + ModelName: "qwen3.5:9b", + ListenAddr: ":8402", + PricePerRequest: "0.001", + }, + wantSubs: []string{"--price", "0.001"}, + wantNoSub: []string{ + "--no-register", + "--register-name", + }, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := buildResumeGatewayArgs(tc.d) + joined := strings.Join(got, " ") + for _, want := range tc.wantSubs { + found := false + for _, a := range got { + if a == want { + found = true + break + } + } + if !found { + t.Errorf("buildResumeGatewayArgs missing %q in %v", want, got) + } + } + for _, banned := range tc.wantNoSub { + for _, a := range got { + if a == banned { + t.Errorf("buildResumeGatewayArgs unexpectedly contains %q in %v", banned, got) + } + } + } + // Order sanity: positional `` comes before any flag. + nameIdx := -1 + firstFlagIdx := -1 + for i, a := range got { + if a == tc.d.Name && nameIdx < 0 { + nameIdx = i + } + if strings.HasPrefix(a, "--") && firstFlagIdx < 0 { + firstFlagIdx = i + } + } + if nameIdx < 0 || firstFlagIdx < 0 || nameIdx > firstFlagIdx { + t.Errorf("name positional must come before flags; got %v", joined) + } + }) + } +} + +// TestReadGatewayPID pins the on-disk PID file format. The file must be a +// single decimal integer in ASCII (no JSON, no trailing newlines that +// confuse trivial readers) so other tools — `tail -f .../gateway.log` and +// `kill $(cat .../gateway.pid)` — work without parsing helpers. +func TestReadGatewayPID(t *testing.T) { + dir := t.TempDir() + tests := []struct { + name string + content string + wantPID int + wantOK bool + }{ + {"clean integer", "12345", 12345, true}, + {"trailing newline tolerated", "12345\n", 12345, true}, + {"surrounding whitespace tolerated", " 12345 \n", 12345, true}, + {"zero rejected", "0", 0, false}, + {"negative rejected", "-1", 0, false}, + {"non-numeric rejected", "abc", 0, false}, + {"empty rejected", "", 0, false}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + path := filepath.Join(dir, tc.name+".pid") + if err := os.WriteFile(path, []byte(tc.content), 0o644); err != nil { + t.Fatalf("write: %v", err) + } + pid, ok := readGatewayPID(path) + if pid != tc.wantPID || ok != tc.wantOK { + t.Errorf("readGatewayPID(%q) = (%d, %v); want (%d, %v)", tc.content, pid, ok, tc.wantPID, tc.wantOK) + } + }) + } + + t.Run("missing file → (0, false)", func(t *testing.T) { + pid, ok := readGatewayPID(filepath.Join(dir, "does-not-exist")) + if pid != 0 || ok { + t.Errorf("readGatewayPID(missing) = (%d, %v); want (0, false)", pid, ok) + } + }) +} + +// TestProcessAlive_SelfAndBogus pins the liveness probe used to decide +// whether the previous gateway is still running. The current process is +// trivially alive; a far-out-of-range PID is not. We don't test +// "definitely-dead-but-recently-existed" cases because those depend on +// the kernel's PID-reuse window and would be flaky. +func TestProcessAlive_SelfAndBogus(t *testing.T) { + if !processAlive(os.Getpid()) { + t.Error("processAlive(self) must be true") + } + if processAlive(99_999_999) { + t.Error("processAlive(absurd pid) must be false") + } +} + func TestBuildDemoServiceOffer_RegisterFlagDrivesEnabled(t *testing.T) { for _, register := range []bool{true, false} { manifest := buildDemoServiceOffer( diff --git a/internal/schemas/service_catalog.go b/internal/schemas/service_catalog.go index cc6e411f..e53085b6 100644 --- a/internal/schemas/service_catalog.go +++ b/internal/schemas/service_catalog.go @@ -31,6 +31,14 @@ type ServiceCatalogEntry struct { Asset *ServiceCatalogAsset `json:"asset,omitempty"` Description string `json:"description"` IsDemo bool `json:"isDemo"` + + // RegistrationPending is true when the offer is operationally ready + // (route published, payment gate active, upstream healthy) but its + // ERC-8004 on-chain registration tx hasn't landed yet. The storefront + // should surface these with a "registration pending" badge so consumers + // know the offer is usable for x402 payments today, even though + // ERC-8004 discovery via the chain still resolves to the prior state. + RegistrationPending bool `json:"registrationPending,omitempty"` } // ServiceCatalogAsset describes the settlement token resolved for a catalog diff --git a/internal/serviceoffercontroller/render.go b/internal/serviceoffercontroller/render.go index ef6c4705..72dcf4d7 100644 --- a/internal/serviceoffercontroller/render.go +++ b/internal/serviceoffercontroller/render.go @@ -776,7 +776,69 @@ func buildSkillCatalogMarkdown(offers []*monetizeapi.ServiceOffer, baseURL strin return strings.Join(lines, "\n") } -// buildServiceCatalogJSON returns a JSON array of ready ServiceOffers for the public storefront. +// offerOperationallyReady reports whether an offer is usable for x402 +// payments today. This is intentionally LOOSER than the controller's +// Ready=True condition: ModelReady + UpstreamHealthy + PaymentGateReady +// + RoutePublished are sufficient. Registered is NOT in the AND. The +// reasoning: ERC-8004 on-chain registration is publication metadata, not +// operational readiness — an offer with the route published, payment gate +// active, and upstream healthy serves buyers correctly regardless of +// whether the on-chain identity has been minted yet. +// +// Used by the storefront catalog (and the skill catalog) so an offer that +// is functionally usable doesn't disappear from the operator's own +// dashboard just because the agent wallet hasn't been funded with gas +// yet. Callers should set ServiceCatalogEntry.RegistrationPending = true +// when the offer's Registered condition is False with reason +// AwaitingExternalRegistration, so storefront UIs can badge it. +func offerOperationallyReady(offer *monetizeapi.ServiceOffer) bool { + if offer == nil { + return false + } + // Backwards-compatible shortcut: the aggregate Ready=True implies all + // the per-condition gates by construction (see controller.go's `ready` + // computation), and existing tests / external callers that only emit + // the aggregate signal still want their offers to appear. + if isConditionTrue(offer.Status, "Ready") { + return true + } + // Fine-grained operational readiness: the four per-condition gates + // that make the offer usable today. Registered is intentionally NOT + // in this AND — see the doc comment on this function and on + // buildServiceCatalogJSON for the rationale. + return isConditionTrue(offer.Status, "ModelReady") && + isConditionTrue(offer.Status, "UpstreamHealthy") && + isConditionTrue(offer.Status, "PaymentGateReady") && + isConditionTrue(offer.Status, "RoutePublished") +} + +// offerAwaitingRegistration reports whether an offer is operationally +// ready but has its on-chain ERC-8004 registration still pending. Used to +// flip ServiceCatalogEntry.RegistrationPending so storefront UIs can show +// a "registration pending" badge alongside the usable offer. +func offerAwaitingRegistration(offer *monetizeapi.ServiceOffer) bool { + if offer == nil { + return false + } + for _, c := range offer.Status.Conditions { + if c.Type == "Registered" && c.Status == "False" && c.Reason == "AwaitingExternalRegistration" { + return true + } + } + return false +} + +// buildServiceCatalogJSON returns a JSON array of operationally-ready +// ServiceOffers for the public storefront feed (/api/services.json). +// +// The filter is operationally-ready (route published, payment gate +// active, upstream healthy) rather than the stricter controller +// Ready=True (which also requires Registered=True). Excluding offers +// whose only False condition is AwaitingExternalRegistration made +// operators' own seller dashboards mysteriously empty after `obol stack +// up` until they funded the agent wallet and ran `obol sell register`. +// That UX failed the "all paid services come back automatically" promise +// of the stack-up resume feature. func buildServiceCatalogJSON(offers []*monetizeapi.ServiceOffer, baseURL string) string { baseURL = strings.TrimRight(baseURL, "/") @@ -785,7 +847,7 @@ func buildServiceCatalogJSON(offers []*monetizeapi.ServiceOffer, baseURL string) if offer == nil || offer.DeletionTimestamp != nil || offer.IsPaused() { continue } - if isConditionTrue(offer.Status, "Ready") { + if offerOperationallyReady(offer) { ready = append(ready, offer) } } @@ -808,16 +870,17 @@ func buildServiceCatalogJSON(offers []*monetizeapi.ServiceOffer, baseURL string) } svc := schemas.ServiceCatalogEntry{ - Name: offer.Name, - Namespace: offer.Namespace, - Type: fallbackOfferType(offer), - Model: modelName, - Endpoint: baseURL + offer.EffectivePath(), - Price: describeOfferPrice(offer), - PayTo: offer.Spec.Payment.PayTo, - Network: offer.Spec.Payment.Network, - Description: desc, - IsDemo: offer.Namespace == "demo", + Name: offer.Name, + Namespace: offer.Namespace, + Type: fallbackOfferType(offer), + Model: modelName, + Endpoint: baseURL + offer.EffectivePath(), + Price: describeOfferPrice(offer), + PayTo: offer.Spec.Payment.PayTo, + Network: offer.Spec.Payment.Network, + Description: desc, + IsDemo: offer.Namespace == "demo", + RegistrationPending: offerAwaitingRegistration(offer), } raw, unit := offerPriceRawAndUnit(offer) diff --git a/internal/serviceoffercontroller/render_test.go b/internal/serviceoffercontroller/render_test.go index ea077860..4a11df62 100644 --- a/internal/serviceoffercontroller/render_test.go +++ b/internal/serviceoffercontroller/render_test.go @@ -992,3 +992,139 @@ func TestSafeName_Truncation(t *testing.T) { t.Error("different long names should produce different childNames") } } + +// TestOfferOperationallyReady_IncludesAwaitingExternalRegistration pins the +// behavioral fix that "operationally ready" no longer requires the on-chain +// ERC-8004 registration to have landed. +func TestOfferOperationallyReady_IncludesAwaitingExternalRegistration(t *testing.T) { + awaiting := &monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: "aeon", Namespace: "llm"}, + Status: monetizeapi.ServiceOfferStatus{ + Conditions: []monetizeapi.Condition{ + {Type: "ModelReady", Status: "True"}, + {Type: "UpstreamHealthy", Status: "True"}, + {Type: "PaymentGateReady", Status: "True"}, + {Type: "RoutePublished", Status: "True"}, + {Type: "Registered", Status: "False", Reason: "AwaitingExternalRegistration"}, + {Type: "Ready", Status: "False", Reason: "Reconciling"}, + }, + }, + } + if !offerOperationallyReady(awaiting) { + t.Fatal("offerOperationallyReady must return true for AwaitingExternalRegistration — the offer is usable for x402 payments today regardless of on-chain identity") + } + if !offerAwaitingRegistration(awaiting) { + t.Fatal("offerAwaitingRegistration must flag this offer so the storefront badges it as registration-pending") + } +} + +// TestOfferOperationallyReady_RejectsRealNotReady pins the narrow scope of +// the relax: an offer whose foreground gateway is down (UpstreamHealthy=False) +// must still be excluded. +func TestOfferOperationallyReady_RejectsRealNotReady(t *testing.T) { + notUsable := &monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: "broken", Namespace: "llm"}, + Status: monetizeapi.ServiceOfferStatus{ + Conditions: []monetizeapi.Condition{ + {Type: "ModelReady", Status: "True"}, + {Type: "UpstreamHealthy", Status: "False", Reason: "Unhealthy"}, + {Type: "PaymentGateReady", Status: "False", Reason: "WaitingForUpstream"}, + {Type: "RoutePublished", Status: "False", Reason: "WaitingForPaymentGate"}, + }, + }, + } + if offerOperationallyReady(notUsable) { + t.Error("offerOperationallyReady must reject an offer with UpstreamHealthy=False") + } + if offerAwaitingRegistration(notUsable) { + t.Error("offerAwaitingRegistration must NOT flag offers whose Registered condition isn't AwaitingExternalRegistration") + } +} + +// TestBuildServiceCatalogJSON_IncludesPendingRegistrationOffers pins the +// end-to-end wiring: an offer that's operationally ready but awaiting +// on-chain registration appears in /api/services.json with +// RegistrationPending=true. +func TestBuildServiceCatalogJSON_IncludesPendingRegistrationOffers(t *testing.T) { + offers := []*monetizeapi.ServiceOffer{ + { + ObjectMeta: metav1.ObjectMeta{Name: "aeon", Namespace: "llm"}, + Spec: monetizeapi.ServiceOfferSpec{ + Type: "inference", + Path: "/services/aeon", + Payment: monetizeapi.ServiceOfferPayment{ + Network: "base-sepolia", + PayTo: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + Price: monetizeapi.ServiceOfferPriceTable{PerMTok: "23"}, + }, + Model: monetizeapi.ServiceOfferModel{Name: "aeon-ultimate"}, + Registration: monetizeapi.ServiceOfferRegistration{ + Enabled: true, Name: "AEON Ultimate", + Description: "Uncensored Qwen3.6-27B", + }, + }, + Status: monetizeapi.ServiceOfferStatus{ + Conditions: []monetizeapi.Condition{ + {Type: "ModelReady", Status: "True"}, + {Type: "UpstreamHealthy", Status: "True"}, + {Type: "PaymentGateReady", Status: "True"}, + {Type: "RoutePublished", Status: "True"}, + {Type: "Registered", Status: "False", Reason: "AwaitingExternalRegistration"}, + }, + }, + }, + } + jsonStr := buildServiceCatalogJSON(offers, "https://inference.example.com") + var services []schemas.ServiceCatalogEntry + if err := json.Unmarshal([]byte(jsonStr), &services); err != nil { + t.Fatalf("invalid JSON: %v\n%s", err, jsonStr) + } + if len(services) != 1 { + t.Fatalf("expected 1 service in catalog, got %d: %+v", len(services), services) + } + if services[0].Name != "aeon" { + t.Errorf("got %q, want aeon", services[0].Name) + } + if !services[0].RegistrationPending { + t.Error("RegistrationPending must be true for AwaitingExternalRegistration offers") + } +} + +// TestBuildServiceCatalogJSON_RegistrationPendingFalseForFullyReady pins +// the negative: an offer that's fully Ready=True (on-chain register tx +// landed) must NOT carry RegistrationPending. +func TestBuildServiceCatalogJSON_RegistrationPendingFalseForFullyReady(t *testing.T) { + offers := []*monetizeapi.ServiceOffer{ + { + ObjectMeta: metav1.ObjectMeta{Name: "fully-ready", Namespace: "llm"}, + Spec: monetizeapi.ServiceOfferSpec{ + Type: "http", + Payment: monetizeapi.ServiceOfferPayment{ + Network: "base", PayTo: "0x1111111111111111111111111111111111111111", + Price: monetizeapi.ServiceOfferPriceTable{PerRequest: "0.001"}, + }, + }, + Status: monetizeapi.ServiceOfferStatus{ + Conditions: []monetizeapi.Condition{ + {Type: "ModelReady", Status: "True"}, + {Type: "UpstreamHealthy", Status: "True"}, + {Type: "PaymentGateReady", Status: "True"}, + {Type: "RoutePublished", Status: "True"}, + {Type: "Registered", Status: "True", Reason: "Active"}, + {Type: "Ready", Status: "True"}, + }, + }, + }, + } + jsonStr := buildServiceCatalogJSON(offers, "https://example.com") + var services []schemas.ServiceCatalogEntry + if err := json.Unmarshal([]byte(jsonStr), &services); err != nil { + t.Fatalf("invalid JSON: %v\n%s", err, jsonStr) + } + if len(services) != 1 { + t.Fatalf("expected 1 service, got %d", len(services)) + } + if services[0].RegistrationPending { + t.Error("RegistrationPending must be false for fully Ready=True offers") + } +} From 9947bb02399b46e12ad8f256352d2d7d63c2e0e8 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 12 May 2026 20:11:15 +0800 Subject: [PATCH 6/7] fix(stack): three cosmetic fixes on top of the zero-command resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surfaced during the live spark2 walkthrough of the prior commit: 1. Gateway PID printed as -1. cmd.Process.Release() on Unix sets p.Pid to -1 as part of its handle teardown, so reading cmd.Process.Pid AFTER Release prints a bogus -1 to the operator and pins -1 in the PID file. Snapshot the PID before the Release call. 2. Stale "Host gateways are not auto-started" footer. Carried over from the pre-auto-spawn version of the resume function. With the gateway now auto-spawned, the message is misleading. Replaced with a pointer to the gateway log path so operators can `tail -f` immediately. 3. /skill.md catalog used the strict Ready=True filter while /api/services.json used the new operationally-ready filter. The two surfaces should stay consistent — an offer that's usable for x402 payments shows up in BOTH the storefront feed and the operator- facing skill catalog. Switched buildSkillCatalogMarkdown to the same offerOperationallyReady predicate. No new tests; the existing TestOfferOperationallyReady_* / catalog tests cover the filter relax for both call sites since they share the predicate. The PID-reading order is a one-line semantic fix; the footer wording isn't worth a test. --- cmd/obol/sell.go | 15 ++++++++++----- internal/serviceoffercontroller/render.go | 8 +++++++- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index 1e9e7f99..8e1d6d8c 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -3496,11 +3496,11 @@ func resumeSellOffers(ctx context.Context, cfg *config.Config, u *ui.UI) error { continue } resumed++ - u.Successf("Resumed sell-inference offer %q (run `obol sell inference %s` to restart the host gateway)", d.Name, d.Name) + u.Successf("Resumed sell-inference offer %q", d.Name) } if resumed > 0 { - u.Dim(" Host gateways are not auto-started — re-run `obol sell inference ` in a terminal you can keep open.") + u.Dim(" Gateways spawned as detached background processes — check /sell-inference//gateway.log for output.") } _ = ctx // reserved for cancellation support; current resume calls are synchronous and short return nil @@ -3634,15 +3634,20 @@ func startDetachedInferenceGateway(cfg *config.Config, u *ui.UI, d *inference.De _ = logF.Close() return fmt.Errorf("start gateway subprocess: %w", err) } + // Snapshot the PID BEFORE Release — on Unix, os.Process.Release sets + // the Pid field to -1 as part of its handle teardown, so reading the + // field afterwards prints a misleading "-1" to the operator and pins + // a bogus -1 in the PID file. Persist + announce the captured value. + gatewayPID := cmd.Process.Pid // We don't wait on the child, but releasing the handle ensures the // parent process can exit cleanly without keeping a zombie reference. if err := cmd.Process.Release(); err != nil { u.Dim(" (could not release gateway process handle: " + err.Error() + ")") } - if err := os.WriteFile(pidFile, []byte(strconv.Itoa(cmd.Process.Pid)), 0o644); err != nil { - u.Warnf("gateway started (pid %d) but could not write %s: %v", cmd.Process.Pid, pidFile, err) + if err := os.WriteFile(pidFile, []byte(strconv.Itoa(gatewayPID)), 0o644); err != nil { + u.Warnf("gateway started (pid %d) but could not write %s: %v", gatewayPID, pidFile, err) } - u.Successf("Gateway started in background (pid %d, log %s)", cmd.Process.Pid, logFile) + u.Successf("Gateway started in background (pid %d, log %s)", gatewayPID, logFile) return nil } diff --git a/internal/serviceoffercontroller/render.go b/internal/serviceoffercontroller/render.go index 72dcf4d7..46f1797a 100644 --- a/internal/serviceoffercontroller/render.go +++ b/internal/serviceoffercontroller/render.go @@ -705,12 +705,18 @@ func offerPublishedForRegistration(offer *monetizeapi.ServiceOffer) bool { func buildSkillCatalogMarkdown(offers []*monetizeapi.ServiceOffer, baseURL string) string { baseURL = strings.TrimRight(baseURL, "/") + // Same operationally-ready filter as buildServiceCatalogJSON — keep the + // two surfaces consistent. An offer that's usable for x402 payments + // (route published, payment gate active, upstream healthy) appears in + // both /skill.md and /api/services.json, with the on-chain ERC-8004 + // registration treated as informational metadata rather than a gating + // signal. See offerOperationallyReady's doc comment for the rationale. var ready []*monetizeapi.ServiceOffer for _, offer := range offers { if offer == nil || offer.DeletionTimestamp != nil || offer.IsPaused() { continue } - if isConditionTrue(offer.Status, "Ready") { + if offerOperationallyReady(offer) { ready = append(ready, offer) } } From 702d7c67dcbc6f2fb21f991d3fbb5341a03653a3 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 12 May 2026 20:45:52 +0800 Subject: [PATCH 7/7] feat(stack): resume obol sell http offers on stack up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the second half of the "all paid services come back automatically after stack up" promise. The inference resume in earlier commits only handled `obol sell inference` because that's the only sell command whose state was persisted on disk. `obol sell http` always built its ServiceOffer manifest fresh from CLI flags and kubectl-applied it; no on-disk trace, so the resume loop had nothing to replay for the http case. On-disk schema: /sell-http/__.yaml One YAML file per offer, holding the rendered ServiceOffer manifest verbatim. The `__` filename ensures two offers with the same name in different namespaces never collide. We keep this separate from the inference store on purpose: inference descriptors carry host-side fields (ListenAddr, ModelName, Registration, …) that HTTP offers don't have, and folding them into one schema would force empty/optional fields onto every HTTP descriptor. A future unification is fine but not the right MVP shape. Lifecycle: - `obol sell http` (both --from-json and flag-driven paths) calls persistSellHTTPOffer right after kubectlApply succeeds. Persistence failures emit a warning but don't abort — the cluster-side offer is already in place, the disk state is the recovery aid, not the source of truth. - `obol sell delete` calls removeSellHTTPOffer so a stack-up after a delete doesn't resurrect the offer. Idempotent; missing-file removals are silent no-ops. - `obol stack up`'s resumeSellOffers calls resumeSellHTTPOffers after the inference branch. The early-return for "no inference offers" was rewritten to fall through to the http branch — operators with only HTTP offers and no inference offers must still get their storefront back. resumeSellHTTPOffers reads each YAML, unmarshals, and kubectl-applies. Per-offer errors are surfaced as warnings; the loop keeps going so one corrupt YAML can't block the rest. Tests: - TestPersistSellHTTPOffer_RoundTrip — pins the on-disk shape (path layout + YAML body content) by writing a manifest, reading it back, and asserting metadata.{name,namespace} + spec.payment.payTo survive. - TestPersistSellHTTPOffer_NamespaceIsolation — two offers with the same name in different namespaces produce distinct files. - TestRemoveSellHTTPOffer_DropsPersistedManifest — the symmetric teardown contract: file goes away on delete, idempotent on double- delete, defensive on empty inputs. - TestResumeSellHTTPOffers_EmptyStoreNoOp — no offers ever persisted must be a no-op, not an error. - TestSellDeleteAction_CallsRemoveSellHTTPOffer — source-level guard that the delete handler still calls the cleanup. Without it, an innocent refactor could leak persisted manifests and the only downstream symptom would be "deleted offers spookily come back" on the next stack-up — hard to attribute. - TestResumeSellOffers_HTTPOnlyStore — pins that the early-return rewrite works: an http-only workspace still gets its offers replayed. The sell-inference resume tests from the prior commits still cover their respective paths; the http-side additions are independent so neither store interferes with the other. --- cmd/obol/sell.go | 201 ++++++++++++++++++++++++++++++++++++++++-- cmd/obol/sell_test.go | 194 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 389 insertions(+), 6 deletions(-) diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index 8e1d6d8c..4e4a3285 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -39,6 +39,7 @@ import ( x402verifier "github.com/ObolNetwork/obol-stack/internal/x402" "github.com/ethereum/go-ethereum/common" "github.com/urfave/cli/v3" + "gopkg.in/yaml.v3" ) func sellCommand(cfg *config.Config) *cli.Command { @@ -656,6 +657,9 @@ Examples: if err := kubectlApply(cfg, manifest); err != nil { return err } + if persistErr := persistSellHTTPOffer(cfg, ns, name, manifest); persistErr != nil { + u.Warnf("could not persist offer for stack-up resume: %v", persistErr) + } u.Successf("ServiceOffer %s/%s created from JSON", ns, name) return nil } @@ -829,6 +833,9 @@ Examples: if err != nil { return err } + if persistErr := persistSellHTTPOffer(cfg, ns, name, manifest); persistErr != nil { + u.Warnf("could not persist offer for stack-up resume: %v", persistErr) + } action := "created" if strings.Contains(applyOut, "configured") || strings.Contains(applyOut, "unchanged") { action = "updated" @@ -2549,6 +2556,17 @@ func sellDeleteCommand(cfg *config.Config) *cli.Command { return err } + // Drop the on-disk sell-http manifest so the next `obol stack + // up` doesn't replay an offer the operator just deleted. The + // inference path doesn't need this hook — `obol sell delete` + // doesn't currently remove the inference.Store descriptor + // either, by design (the descriptor is what `obol sell + // inference list/status` reads). For HTTP we keep no such + // post-delete state, so removing the file is the right shape. + if removeErr := removeSellHTTPOffer(cfg, ns, name); removeErr != nil { + u.Warnf("could not remove persisted sell-http offer at %s/%s: %v", ns, name, removeErr) + } + // Clean up demo backend resources if this is a demo service. if ns == demoNamespace { cleanupDemoBackend(cfg, u, name) @@ -3471,18 +3489,23 @@ func loadProvenance(path string) (*inference.Provenance, error) { // Best-effort. Per-offer failures emit a warning and the loop continues, // so one broken descriptor cannot block stack-up. func resumeSellOffers(ctx context.Context, cfg *config.Config, u *ui.UI) error { + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + if _, statErr := os.Stat(kubeconfigPath); statErr != nil { + // No cluster — nothing to reattach for either inference or http. + return nil + } + store := inference.NewStore(cfg.ConfigDir) deployments, err := store.List() if err != nil { return fmt.Errorf("list inference deployments: %w", err) } - if len(deployments) == 0 { - return nil - } - kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") - if _, statErr := os.Stat(kubeconfigPath); statErr != nil { - // No cluster — nothing to reattach. + if len(deployments) == 0 { + // No inference offers; fall through to http resume below. + if err := resumeSellHTTPOffers(cfg, u); err != nil { + u.Warnf("resume sell-http offers: %v", err) + } return nil } @@ -3502,6 +3525,16 @@ func resumeSellOffers(ctx context.Context, cfg *config.Config, u *ui.UI) error { if resumed > 0 { u.Dim(" Gateways spawned as detached background processes — check /sell-inference//gateway.log for output.") } + + // Replay persisted `obol sell http` offers as well. The two stores are + // independent on disk (sell-http manifests live at + // /sell-http/, sell-inference descriptors at + // /inference/) but the operator wants one stack-up to + // bring every paid offer back regardless of type. + if err := resumeSellHTTPOffers(cfg, u); err != nil { + u.Warnf("resume sell-http offers: %v", err) + } + _ = ctx // reserved for cancellation support; current resume calls are synchronous and short return nil } @@ -3900,6 +3933,162 @@ func buildInferenceServiceOfferSpec(d *inference.Deployment, pt schemas.PriceTab return spec, nil } +// sellHTTPStoreDir returns the on-disk root for persisted `obol sell http` +// ServiceOffer manifests. Schema is one YAML file per offer at +// /sell-http/__.yaml so two offers with the +// same name in different namespaces never collide. +// +// Unlike `obol sell inference`, http offers don't have a host-side +// foreground process to track — the upstream is an in-cluster Service. +// The on-disk artifact is just the rendered ServiceOffer manifest; the +// resume path kubectl-applies it to bring the offer back identically. +// +// Long-term we should fold this into a single sell-offer store (one +// schema for both inference and http), so resume is one walk instead of +// two. Keeping them separate for now because the inference store is +// rich (host listen addr, asset symbol, registration block) while http +// only needs the rendered manifest — collapsing them would force +// inference-only fields onto every http descriptor. +func sellHTTPStoreDir(cfg *config.Config) string { + return filepath.Join(cfg.ConfigDir, "sell-http") +} + +func sellHTTPStorePath(cfg *config.Config, namespace, name string) string { + return filepath.Join(sellHTTPStoreDir(cfg), namespace+"__"+name+".yaml") +} + +// persistSellHTTPOffer writes the rendered ServiceOffer manifest to disk +// so `obol stack up` can replay it after a stack-down/up cycle wipes +// etcd. Idempotent: subsequent calls overwrite atomically (write to a +// `.tmp` sibling, rename into place) so a crash mid-write doesn't leave +// a half-rendered file the resume path would choke on. +// +// Best-effort from the caller's perspective: a persistence failure +// should warn but not abort the sell command, because the cluster-side +// offer is already in place. Returning an error lets the caller pick +// the surface. +func persistSellHTTPOffer(cfg *config.Config, namespace, name string, manifest map[string]any) error { + if cfg == nil || namespace == "" || name == "" { + return errors.New("persistSellHTTPOffer: missing cfg / namespace / name") + } + if err := os.MkdirAll(sellHTTPStoreDir(cfg), 0o755); err != nil { + return fmt.Errorf("create store dir: %w", err) + } + data, err := yaml.Marshal(manifest) + if err != nil { + return fmt.Errorf("marshal manifest: %w", err) + } + final := sellHTTPStorePath(cfg, namespace, name) + tmp := final + ".tmp" + if err := os.WriteFile(tmp, data, 0o600); err != nil { + return fmt.Errorf("write %s: %w", tmp, err) + } + if err := os.Rename(tmp, final); err != nil { + _ = os.Remove(tmp) + return fmt.Errorf("rename %s → %s: %w", tmp, final, err) + } + return nil +} + +// removeSellHTTPOffer deletes the on-disk manifest for a single offer +// when `obol sell delete` succeeds. Without this, the resume path on +// the next `obol stack up` would re-create an offer the operator +// intentionally deleted. Best-effort — a missing file is a no-op. +func removeSellHTTPOffer(cfg *config.Config, namespace, name string) error { + if cfg == nil || namespace == "" || name == "" { + return nil + } + path := sellHTTPStorePath(cfg, namespace, name) + if err := os.Remove(path); err != nil && !os.IsNotExist(err) { + return err + } + return nil +} + +// resumeSellHTTPOffers re-applies every persisted `obol sell http` +// manifest after `obol stack up` rebuilds the cluster. Mirror of the +// inference resume loop: walk the store dir, kubectl apply each file, +// warn-and-continue on per-offer failures so one corrupt YAML can't +// block the rest. +// +// Skipped silently when the store dir is missing (no offers ever +// persisted) or when no kubeconfig is present (stack up hasn't reached +// the cluster yet). Counts and announces what was reattached so the +// operator sees the same "Resumed N offers" feedback they get for +// inference. +func resumeSellHTTPOffers(cfg *config.Config, u *ui.UI) error { + dir := sellHTTPStoreDir(cfg) + entries, err := os.ReadDir(dir) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return fmt.Errorf("read store dir: %w", err) + } + if len(entries) == 0 { + return nil + } + + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + if _, statErr := os.Stat(kubeconfigPath); statErr != nil { + return nil // no cluster yet + } + + u.Blank() + var manifests []sellHTTPStoredOffer + for _, e := range entries { + if e.IsDir() || !strings.HasSuffix(e.Name(), ".yaml") { + continue + } + path := filepath.Join(dir, e.Name()) + data, err := os.ReadFile(path) + if err != nil { + u.Warnf("read %s: %v", path, err) + continue + } + var manifest map[string]any + if err := yaml.Unmarshal(data, &manifest); err != nil { + u.Warnf("parse %s: %v", path, err) + continue + } + ns, name := manifestNSName(manifest) + manifests = append(manifests, sellHTTPStoredOffer{Path: path, Manifest: manifest, Namespace: ns, Name: name}) + } + if len(manifests) == 0 { + return nil + } + + u.Infof("Resuming %d locally-persisted sell-http offer(s)...", len(manifests)) + for _, m := range manifests { + if err := kubectlApply(cfg, m.Manifest); err != nil { + u.Warnf("resume http %s/%s: %v", m.Namespace, m.Name, err) + continue + } + u.Successf("Resumed sell-http offer %s/%s", m.Namespace, m.Name) + } + return nil +} + +type sellHTTPStoredOffer struct { + Path string + Manifest map[string]any + Namespace string + Name string +} + +// manifestNSName pulls metadata.namespace + metadata.name out of an +// unmarshaled ServiceOffer manifest. Returns empty strings when the +// shape is malformed; the resume loop reports the error per-file. +func manifestNSName(manifest map[string]any) (string, string) { + md, ok := manifest["metadata"].(map[string]any) + if !ok { + return "", "" + } + ns, _ := md["namespace"].(string) + name, _ := md["name"].(string) + return ns, name +} + // removePricingRoute is a no-op retained for compatibility. // The serviceoffer-controller now manages pricing routes via the ServiceOffer // informer; static ConfigMap routes are no longer used. diff --git a/cmd/obol/sell_test.go b/cmd/obol/sell_test.go index 15c03b8d..9f3991ea 100644 --- a/cmd/obol/sell_test.go +++ b/cmd/obol/sell_test.go @@ -15,6 +15,7 @@ import ( "github.com/ObolNetwork/obol-stack/internal/ui" x402verifier "github.com/ObolNetwork/obol-stack/internal/x402" "github.com/urfave/cli/v3" + "gopkg.in/yaml.v3" ) // ───────────────────────────────────────────────────────────────────────────── @@ -1521,6 +1522,199 @@ func TestProcessAlive_SelfAndBogus(t *testing.T) { } } +// TestPersistSellHTTPOffer_RoundTrip pins the on-disk manifest contract. +// Anything the persistence layer writes must come back identically from +// the resume path (which reads it via yaml.Unmarshal + kubectl-apply). +// The path layout — /sell-http/__.yaml — is +// the second half of the contract: two offers with the same name in +// different namespaces must never collide on disk. +func TestPersistSellHTTPOffer_RoundTrip(t *testing.T) { + cfg := newTestConfig(t) + manifest := map[string]any{ + "apiVersion": "obol.org/v1alpha1", + "kind": "ServiceOffer", + "metadata": map[string]any{ + "name": "my-api", + "namespace": "default", + }, + "spec": map[string]any{ + "type": "http", + "upstream": map[string]any{ + "service": "my-svc", "namespace": "default", "port": int64(8080), "healthPath": "/health", + }, + "payment": map[string]any{ + "scheme": "exact", + "network": "base-sepolia", + "payTo": "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + "price": map[string]any{"perRequest": "0.001"}, + }, + }, + } + if err := persistSellHTTPOffer(cfg, "default", "my-api", manifest); err != nil { + t.Fatalf("persistSellHTTPOffer: %v", err) + } + + expected := filepath.Join(cfg.ConfigDir, "sell-http", "default__my-api.yaml") + data, err := os.ReadFile(expected) + if err != nil { + t.Fatalf("expected manifest at %s: %v", expected, err) + } + if len(data) == 0 { + t.Fatal("manifest file is empty") + } + // Round-trip: parse, verify metadata.name + spec.payment.payTo survived. + var round map[string]any + if err := yaml.Unmarshal(data, &round); err != nil { + t.Fatalf("YAML unmarshal: %v\n%s", err, data) + } + ns, name := manifestNSName(round) + if ns != "default" || name != "my-api" { + t.Errorf("round-tripped metadata = %s/%s, want default/my-api", ns, name) + } + payTo := round["spec"].(map[string]any)["payment"].(map[string]any)["payTo"] + if payTo != "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47" { + t.Errorf("round-tripped payTo = %v, want operator-supplied", payTo) + } +} + +// TestPersistSellHTTPOffer_NamespaceIsolation pins that two offers with +// the same name in different namespaces produce distinct files. A +// careless filename scheme would have the second `obol sell http` +// silently overwrite the first. +func TestPersistSellHTTPOffer_NamespaceIsolation(t *testing.T) { + cfg := newTestConfig(t) + stub := func(ns string) map[string]any { + return map[string]any{ + "apiVersion": "obol.org/v1alpha1", + "kind": "ServiceOffer", + "metadata": map[string]any{"name": "shared", "namespace": ns}, + "spec": map[string]any{"type": "http"}, + } + } + if err := persistSellHTTPOffer(cfg, "team-a", "shared", stub("team-a")); err != nil { + t.Fatalf("persist team-a: %v", err) + } + if err := persistSellHTTPOffer(cfg, "team-b", "shared", stub("team-b")); err != nil { + t.Fatalf("persist team-b: %v", err) + } + for _, ns := range []string{"team-a", "team-b"} { + path := filepath.Join(cfg.ConfigDir, "sell-http", ns+"__shared.yaml") + if _, err := os.Stat(path); err != nil { + t.Errorf("expected %s to exist: %v", path, err) + } + } +} + +// TestRemoveSellHTTPOffer_DropsPersistedManifest pins the symmetric +// teardown: `obol sell delete` must remove the on-disk manifest so the +// next stack-up doesn't re-create an offer the operator intentionally +// deleted. The function must also be a quiet no-op for missing files — +// running it twice (or against an offer that was never persisted) must +// not error. +func TestRemoveSellHTTPOffer_DropsPersistedManifest(t *testing.T) { + cfg := newTestConfig(t) + manifest := map[string]any{ + "apiVersion": "obol.org/v1alpha1", "kind": "ServiceOffer", + "metadata": map[string]any{"name": "doomed", "namespace": "llm"}, + } + if err := persistSellHTTPOffer(cfg, "llm", "doomed", manifest); err != nil { + t.Fatalf("persist: %v", err) + } + path := filepath.Join(cfg.ConfigDir, "sell-http", "llm__doomed.yaml") + if _, err := os.Stat(path); err != nil { + t.Fatalf("manifest must exist before remove: %v", err) + } + if err := removeSellHTTPOffer(cfg, "llm", "doomed"); err != nil { + t.Errorf("remove: %v", err) + } + if _, err := os.Stat(path); !os.IsNotExist(err) { + t.Errorf("manifest still exists after remove: %v", err) + } + // Idempotent: removing the missing file must not error. + if err := removeSellHTTPOffer(cfg, "llm", "doomed"); err != nil { + t.Errorf("second remove must be no-op, got: %v", err) + } + // Defensive: empty inputs are silent no-ops, not panics. + if err := removeSellHTTPOffer(cfg, "", "foo"); err != nil { + t.Errorf("empty namespace: %v", err) + } + if err := removeSellHTTPOffer(cfg, "llm", ""); err != nil { + t.Errorf("empty name: %v", err) + } +} + +// TestResumeSellHTTPOffers_EmptyStoreNoOp pins the "no http offers yet" +// path: stack-up against a workspace that's never persisted an http +// offer returns nil without erroring. Same shape as the inference +// equivalent — important because resumeSellOffers calls both even when +// only one store has entries. +func TestResumeSellHTTPOffers_EmptyStoreNoOp(t *testing.T) { + cfg := newTestConfig(t) + // Need a kubeconfig.yaml present, otherwise the function bails early + // (correctly — no cluster). For the "store empty but cluster up" + // case we want to verify it doesn't error. + if err := os.WriteFile(filepath.Join(cfg.ConfigDir, "kubeconfig.yaml"), []byte("placeholder"), 0o600); err != nil { + t.Fatalf("seed kubeconfig: %v", err) + } + if err := resumeSellHTTPOffers(cfg, ui.New(false)); err != nil { + t.Errorf("empty-store resume must succeed: %v", err) + } +} + +// TestSellDeleteAction_CallsRemoveSellHTTPOffer is a source-level guard +// against the obvious post-delete leak: forget to call +// removeSellHTTPOffer in the sell delete handler and the next +// `obol stack up` resurrects the offer the operator just killed. The +// only signal would be "deleted offers spookily come back" which is +// hard to attribute, hence the test. +func TestSellDeleteAction_CallsRemoveSellHTTPOffer(t *testing.T) { + src, err := os.ReadFile("sell.go") + if err != nil { + t.Fatalf("read sell.go: %v", err) + } + body := string(src) + start := strings.Index(body, "func sellDeleteCommand(") + if start < 0 { + t.Fatal("sellDeleteCommand not found") + } + end := strings.Index(body[start+1:], "\nfunc ") + if end < 0 { + t.Fatal("could not delimit sellDeleteCommand body") + } + scope := body[start : start+1+end] + if !strings.Contains(scope, "removeSellHTTPOffer(") { + t.Fatal("sellDeleteCommand must call removeSellHTTPOffer — otherwise the on-disk manifest survives the kubectl delete and `obol stack up` resurrects the offer") + } +} + +// TestResumeSellOffers_HTTPOnlyStore pins that http offers are still +// resumed when the inference store is completely empty. The original +// resumeSellOffers short-circuited on `len(deployments) == 0` before +// reaching the http branch; this test catches a regression that +// reintroduces that early return. +func TestResumeSellOffers_HTTPOnlyStore(t *testing.T) { + cfg := newTestConfig(t) + // Seed a kubeconfig so resume gets past the no-cluster guard. + if err := os.WriteFile(filepath.Join(cfg.ConfigDir, "kubeconfig.yaml"), []byte("placeholder"), 0o600); err != nil { + t.Fatalf("seed kubeconfig: %v", err) + } + // No inference offers. One http offer present. Function must walk + // the http branch without erroring on the missing inference store. + manifest := map[string]any{ + "apiVersion": "obol.org/v1alpha1", "kind": "ServiceOffer", + "metadata": map[string]any{"name": "only-http", "namespace": "llm"}, + } + if err := persistSellHTTPOffer(cfg, "llm", "only-http", manifest); err != nil { + t.Fatalf("persist: %v", err) + } + // kubectl apply will fail against the fake kubeconfig — that's + // expected and reported as a warn, not an error. We just need + // resumeSellOffers itself to return nil. + if err := resumeSellOffers(context.Background(), cfg, ui.New(false)); err != nil { + t.Errorf("http-only store must not error from resumeSellOffers: %v", err) + } +} + func TestBuildDemoServiceOffer_RegisterFlagDrivesEnabled(t *testing.T) { for _, register := range []bool{true, false} { manifest := buildDemoServiceOffer(