From 47084076b04e39b825c776cf517bd7dccf2f190e Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 12 May 2026 17:31:58 +0800 Subject: [PATCH 1/6] fix(sell inference): emit registration spec + honor operator overrides MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `obol sell inference` was producing ServiceOffers with three latent defects that together stopped /.well-known/agent-registration.json from ever being routed for inference-typed sells: 1. The inference subcommand never exposed `--register-*` flags (compare with `obol sell http` which does), even though the help text on `--register` says "registration is enabled by default". 2. `buildInferenceServiceOfferSpec` never wrote `spec.registration` onto the offer at all, so the controller's `reconcileRegistrationStatus` saw `Enabled=false` (zero value) and emitted `Registered=True Disabled` with no RegistrationRequest CR and no /.well-known HTTPRoute. 3. `buildInferenceServiceOfferSpec` hardcoded `spec.model.name = "ollama"` regardless of the actual `--model` value, so anything downstream that keyed off the model id (the controller's description default included) was looking at the wrong string. Surfaced today on spark2 while trying to fetch `https://inference.v1337.org/.well-known/agent-registration.json` — got the Traefik fall-through 404. Manual `kubectl patch` enabled registration, exposed a fourth defect: 4. `buildActiveRegistrationDocument` in the serviceoffer-controller unconditionally overwrote `Spec.Registration.Description` for inference offers with `" inference via x402 micropayments"`, even when the operator had supplied an explicit description at sell time. This PR fixes all four: - Add `--no-register`, `--register-name`, `--register-description`, `--register-image`, `--register-skills`, `--register-domains`, `--register-metadata` to `obol sell inference`. - Rename `sellHTTPRegistrationInput` / `buildSellHTTPRegistrationConfig` to the unqualified `sellRegistrationInput` / `buildSellRegistrationConfig` since they now serve both inference and http call sites. - Extend `buildInferenceServiceOfferSpec` to accept the resolved model name and the registration block, write `spec.model.name = `, and merge the registration block into `spec.registration` when non-empty. - In the controller's `buildActiveRegistrationDocument`, only fall back to the model-aware description string when the operator left `Spec.Registration.Description` empty. Tests that would have caught the regression earlier: - `TestSellInference_Flags` now requires the six registration flags; their absence on `main` was the bug. - `TestBuildInferenceServiceOfferSpec_RegistrationEnabledByDefault` pins that defaults produce `spec.registration.enabled = true` and the offer name as `spec.registration.name`. - `TestBuildInferenceServiceOfferSpec_NoRegisterOmitsRegistration` pins the `--no-register` opt-out. - `TestBuildInferenceServiceOfferSpec_OperatorOverridesWin` pins that operator-supplied name/description/image/skills/domains all survive into the spec verbatim. - `TestBuildInferenceServiceOfferSpec_ModelNameNotHardcoded` pins that `spec.model.name` reflects `--model`, not the historical "ollama" literal. - `TestBuildActiveRegistrationDocument_KeepsOperatorDescription` pins the controller-side fix: an operator description survives the buildActiveRegistrationDocument pass. - `TestBuildActiveRegistrationDocument_FallsBackToModelDescriptionForInference` pins the other branch — inference offers with no operator description still get the model-aware default, not the generic one. Drive-by: `TestSellInference_Flags` was failing on `main` after #470 removed the `--price` default but didn't update the corresponding assertion. Updated to assert `--price` default = "" with a comment explaining the contract. --- cmd/obol/sell.go | 69 +++++++- cmd/obol/sell_test.go | 156 +++++++++++++++++- internal/serviceoffercontroller/render.go | 15 +- .../serviceoffercontroller/render_test.go | 80 ++++++++- 4 files changed, 300 insertions(+), 20 deletions(-) diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index 6ba344ed..508aecaa 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -184,6 +184,34 @@ Examples: Name: "provenance-file", Usage: "Path to JSON file with provenance metadata (e.g. autoresearch experiment results)", }, + &cli.BoolFlag{ + Name: "no-register", + Usage: "Skip ERC-8004 registration (no /.well-known/agent-registration.json HTTPRoute is published)", + }, + &cli.StringFlag{ + Name: "register-name", + Usage: "Agent name for ERC-8004 registration (defaults to the offer name)", + }, + &cli.StringFlag{ + Name: "register-description", + Usage: "Agent description for ERC-8004 registration", + }, + &cli.StringFlag{ + Name: "register-image", + Usage: "Agent image URL for ERC-8004 registration", + }, + &cli.StringSliceFlag{ + Name: "register-skills", + Usage: "OASF skill tags for ERC-8004 registration (repeatable)", + }, + &cli.StringSliceFlag{ + Name: "register-domains", + Usage: "OASF domain tags for ERC-8004 registration (repeatable)", + }, + &cli.StringSliceFlag{ + Name: "register-metadata", + Usage: "Additional registration metadata as key=value pairs (repeatable, e.g. gpu=A100-80GB)", + }, }, Action: func(ctx context.Context, cmd *cli.Command) error { u := getUI(cmd) @@ -384,7 +412,19 @@ Examples: d.NoPaymentGate = false } else { // Create a ServiceOffer CR pointing at the host service. - soSpec, err := buildInferenceServiceOfferSpec(d, priceTable, svcNs, port, assetTerms) + reg, _, regErr := buildSellRegistrationConfig(name, sellRegistrationInput{ + NoRegister: cmd.Bool("no-register"), + Name: cmd.String("register-name"), + Description: cmd.String("register-description"), + Image: cmd.String("register-image"), + Skills: cmd.StringSlice("register-skills"), + Domains: cmd.StringSlice("register-domains"), + MetadataPairs: cmd.StringSlice("register-metadata"), + }) + if regErr != nil { + return regErr + } + soSpec, err := buildInferenceServiceOfferSpec(d, priceTable, svcNs, port, assetTerms, modelFlag, reg) if err != nil { return err } @@ -701,7 +741,7 @@ Examples: prov.Framework, prov.MetricName, prov.MetricValue, prov.ParamCount) } - reg, registerEnabled, err := buildSellHTTPRegistrationConfig(name, sellHTTPRegistrationInput{ + reg, registerEnabled, err := buildSellRegistrationConfig(name, sellRegistrationInput{ NoRegister: cmd.Bool("no-register"), Register: cmd.Bool("register"), Name: cmd.String("register-name"), @@ -827,7 +867,7 @@ type autoRegisterOptions struct { ExpectedOwner string } -type sellHTTPRegistrationInput struct { +type sellRegistrationInput struct { NoRegister bool Register bool Name string @@ -907,7 +947,7 @@ func autoRegisterServiceOffer(ctx context.Context, cfg *config.Config, u *ui.UI, return nil } -func buildSellHTTPRegistrationConfig(serviceName string, in sellHTTPRegistrationInput) (map[string]any, bool, error) { +func buildSellRegistrationConfig(serviceName string, in sellRegistrationInput) (map[string]any, bool, error) { registerEnabled := !in.NoRegister if !registerEnabled && (in.Register || in.Name != "" || in.Description != "" || in.Image != "" || len(in.Skills) > 0 || len(in.Domains) > 0 || len(in.MetadataPairs) > 0) { @@ -3383,7 +3423,16 @@ func resolveHostIP(cfg *config.Config) (string, error) { // buildInferenceServiceOfferSpec builds a ServiceOffer spec for a host-side // inference gateway routed through the cluster's x402 flow. -func buildInferenceServiceOfferSpec(d *inference.Deployment, pt schemas.PriceTable, ns, port string, asset schemas.AssetTerms) (map[string]any, error) { +// +// modelName is the upstream model identifier the operator passed via --model +// (e.g. "aeon-ultimate"). It is written into spec.model.name so the +// serviceoffer-controller's registration document carries the real model id +// rather than the historical hardcoded "ollama" string. +// +// registration is the operator's ERC-8004 registration block as produced by +// buildSellRegistrationConfig — pass nil (or an empty map) to skip +// registration. When non-nil it is merged verbatim into spec.registration. +func buildInferenceServiceOfferSpec(d *inference.Deployment, pt schemas.PriceTable, ns, port string, asset schemas.AssetTerms, modelName string, registration map[string]any) (map[string]any, error) { portNum, err := strconv.Atoi(port) if err != nil || portNum < 1 || portNum > 65535 { return nil, fmt.Errorf("invalid port %q: must be a number between 1 and 65535", port) @@ -3416,12 +3465,20 @@ func buildInferenceServiceOfferSpec(d *inference.Deployment, pt schemas.PriceTab } if d.UpstreamURL != "" { + model := strings.TrimSpace(modelName) + if model == "" { + model = "ollama" // pre-fix fallback; the Action enforces --model + } spec["model"] = map[string]any{ - "name": "ollama", + "name": model, "runtime": "ollama", } } + if len(registration) > 0 { + spec["registration"] = registration + } + return spec, nil } diff --git a/cmd/obol/sell_test.go b/cmd/obol/sell_test.go index adffbfa3..3277c183 100644 --- a/cmd/obol/sell_test.go +++ b/cmd/obol/sell_test.go @@ -6,6 +6,7 @@ import ( "testing" "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/inference" "github.com/ObolNetwork/obol-stack/internal/monetizeapi" "github.com/ObolNetwork/obol-stack/internal/schemas" x402verifier "github.com/ObolNetwork/obol-stack/internal/x402" @@ -185,9 +186,19 @@ func TestSellInference_Flags(t *testing.T) { "listen", "upstream", "enclave-tag", "vm", "vm-image", "vm-cpus", "vm-memory", "vm-host-port", "tee", "model-hash", + // Registration parity with `obol sell http`. Their absence on the + // inference subcommand was the regression that left + // /.well-known/agent-registration.json unrouted (see + // TestBuildInferenceServiceOfferSpec_RegistrationEnabledByDefault). + "no-register", + "register-name", "register-description", "register-image", + "register-skills", "register-domains", "register-metadata", ) - assertStringDefault(t, flags, "price", "0.001") + // --price intentionally has no default after #470 — the resolvePriceTable + // fallthrough requires an explicit price flag instead of letting "0.001" + // shadow --per-mtok / --per-request. Empty default is the pinned contract. + assertStringDefault(t, flags, "price", "") assertStringDefault(t, flags, "chain", "base") assertStringDefault(t, flags, "token", "USDC") assertStringDefault(t, flags, "listen", ":8402") @@ -256,8 +267,8 @@ func TestSellHTTP_Flags(t *testing.T) { assertIntDefault(t, flags, "max-timeout", 300) } -func TestBuildSellHTTPRegistrationConfig_DefaultEnabled(t *testing.T) { - reg, enabled, err := buildSellHTTPRegistrationConfig("demo", sellHTTPRegistrationInput{}) +func TestBuildSellRegistrationConfig_DefaultEnabled(t *testing.T) { + reg, enabled, err := buildSellRegistrationConfig("demo", sellRegistrationInput{}) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -272,8 +283,8 @@ func TestBuildSellHTTPRegistrationConfig_DefaultEnabled(t *testing.T) { } } -func TestBuildSellHTTPRegistrationConfig_NoRegisterConflicts(t *testing.T) { - _, _, err := buildSellHTTPRegistrationConfig("demo", sellHTTPRegistrationInput{ +func TestBuildSellRegistrationConfig_NoRegisterConflicts(t *testing.T) { + _, _, err := buildSellRegistrationConfig("demo", sellRegistrationInput{ NoRegister: true, Name: "custom", }) @@ -755,6 +766,141 @@ func TestDemoRPCNetwork(t *testing.T) { } } +// TestBuildInferenceServiceOfferSpec_RegistrationEnabledByDefault pins the +// fix for the missing-registration regression: `obol sell inference` used to +// build a ServiceOffer with empty `spec.registration`, so the controller +// emitted "Registration disabled" and never published the +// /.well-known/agent-registration.json HTTPRoute. The default contract is +// "registration enabled, name derived from offer name" — same as `sell http`. +func TestBuildInferenceServiceOfferSpec_RegistrationEnabledByDefault(t *testing.T) { + d := &inference.Deployment{ + Name: "aeon", + UpstreamURL: "http://127.0.0.1:8000", + WalletAddress: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + Chain: "base-sepolia", + PricePerRequest: "0.001", + } + reg, enabled, err := buildSellRegistrationConfig("aeon", sellRegistrationInput{}) + if err != nil { + t.Fatalf("buildSellRegistrationConfig: %v", err) + } + if !enabled { + t.Fatal("registration should default to enabled") + } + spec, err := buildInferenceServiceOfferSpec(d, schemas.PriceTable{PerRequest: "0.001"}, "llm", "8402", schemas.AssetTerms{}, "aeon-ultimate", reg) + if err != nil { + t.Fatalf("buildInferenceServiceOfferSpec: %v", err) + } + got, ok := spec["registration"].(map[string]any) + if !ok { + t.Fatalf("spec.registration missing or wrong type: %#v", spec["registration"]) + } + if got["enabled"] != true { + t.Errorf("spec.registration.enabled = %v, want true", got["enabled"]) + } + if got["name"] != "aeon" { + t.Errorf("spec.registration.name = %v, want %q", got["name"], "aeon") + } +} + +// TestBuildInferenceServiceOfferSpec_NoRegisterOmitsRegistration confirms the +// opt-out path: --no-register produces an empty registration map and the spec +// builder must leave spec.registration unset (so the controller emits the +// well-known route only when explicitly requested). +func TestBuildInferenceServiceOfferSpec_NoRegisterOmitsRegistration(t *testing.T) { + d := &inference.Deployment{ + Name: "aeon", UpstreamURL: "http://127.0.0.1:8000", + WalletAddress: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + Chain: "base-sepolia", PricePerRequest: "0.001", + } + reg, enabled, err := buildSellRegistrationConfig("aeon", sellRegistrationInput{NoRegister: true}) + if err != nil { + t.Fatalf("buildSellRegistrationConfig: %v", err) + } + if enabled { + t.Fatal("--no-register should disable registration") + } + spec, err := buildInferenceServiceOfferSpec(d, schemas.PriceTable{PerRequest: "0.001"}, "llm", "8402", schemas.AssetTerms{}, "aeon-ultimate", reg) + if err != nil { + t.Fatalf("buildInferenceServiceOfferSpec: %v", err) + } + if _, present := spec["registration"]; present { + t.Errorf("spec.registration should be absent when --no-register; got %#v", spec["registration"]) + } +} + +// TestBuildInferenceServiceOfferSpec_OperatorOverridesWin pins the fix for +// the controller-side description-clobber regression in +// internal/serviceoffercontroller/render.go: the operator's explicit +// registration fields (name, description, image, skills, domains) must +// survive into the published spec verbatim, not be silently replaced by +// controller-side defaults. This test covers the *spec input*; the +// controller-side guarantee is covered by the companion test in +// serviceoffercontroller/render_test.go. +func TestBuildInferenceServiceOfferSpec_OperatorOverridesWin(t *testing.T) { + d := &inference.Deployment{ + Name: "aeon", UpstreamURL: "http://127.0.0.1:8000", + WalletAddress: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + Chain: "base-sepolia", PricePerRequest: "0.001", + } + reg, _, err := buildSellRegistrationConfig("aeon", sellRegistrationInput{ + Name: "Qwen36 AEON Ultimate", + Description: "Uncensored Qwen3.6-27B abliteration on DGX Spark", + Image: "https://example.com/aeon.png", + Skills: []string{"llm/inference", "llm/uncensored"}, + Domains: []string{"inference.v1337.org"}, + }) + if err != nil { + t.Fatalf("buildSellRegistrationConfig: %v", err) + } + spec, err := buildInferenceServiceOfferSpec(d, schemas.PriceTable{PerRequest: "0.001"}, "llm", "8402", schemas.AssetTerms{}, "aeon-ultimate", reg) + if err != nil { + t.Fatalf("buildInferenceServiceOfferSpec: %v", err) + } + r := spec["registration"].(map[string]any) + for k, want := range map[string]any{ + "name": "Qwen36 AEON Ultimate", + "description": "Uncensored Qwen3.6-27B abliteration on DGX Spark", + "image": "https://example.com/aeon.png", + } { + if r[k] != want { + t.Errorf("spec.registration.%s = %#v, want %#v", k, r[k], want) + } + } + skills, _ := r["skills"].([]string) + if len(skills) != 2 || skills[0] != "llm/inference" { + t.Errorf("spec.registration.skills = %#v, want [llm/inference llm/uncensored]", r["skills"]) + } + domains, _ := r["domains"].([]string) + if len(domains) != 1 || domains[0] != "inference.v1337.org" { + t.Errorf("spec.registration.domains = %#v, want [inference.v1337.org]", r["domains"]) + } +} + +// TestBuildInferenceServiceOfferSpec_ModelNameNotHardcoded pins the fix for +// the "spec.model.name = ollama regardless of --model" regression. The model +// id the operator passed must surface in spec.model.name so the controller's +// per-model registration description (and any downstream tooling that keys +// off model name) reads the truth, not the historical hardcoded literal. +func TestBuildInferenceServiceOfferSpec_ModelNameNotHardcoded(t *testing.T) { + d := &inference.Deployment{ + Name: "aeon", UpstreamURL: "http://127.0.0.1:8000", + WalletAddress: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + Chain: "base-sepolia", PricePerRequest: "0.001", + } + spec, err := buildInferenceServiceOfferSpec(d, schemas.PriceTable{PerRequest: "0.001"}, "llm", "8402", schemas.AssetTerms{}, "aeon-ultimate", nil) + if err != nil { + t.Fatalf("buildInferenceServiceOfferSpec: %v", err) + } + model, ok := spec["model"].(map[string]any) + if !ok { + t.Fatalf("spec.model missing or wrong type: %#v", spec["model"]) + } + if model["name"] != "aeon-ultimate" { + t.Errorf("spec.model.name = %v, want %q (must reflect --model, not the legacy hardcoded \"ollama\")", model["name"], "aeon-ultimate") + } +} + func TestBuildDemoServiceOffer_RegisterFlagDrivesEnabled(t *testing.T) { for _, register := range []bool{true, false} { manifest := buildDemoServiceOffer( diff --git a/internal/serviceoffercontroller/render.go b/internal/serviceoffercontroller/render.go index df59bfee..ef6c4705 100644 --- a/internal/serviceoffercontroller/render.go +++ b/internal/serviceoffercontroller/render.go @@ -26,7 +26,6 @@ const ( servicesJSONRouteName = "obol-services-json-route" ) - func buildRegistrationRequest(offer *monetizeapi.ServiceOffer, desiredState string) *unstructured.Unstructured { return &unstructured.Unstructured{ Object: map[string]any{ @@ -583,12 +582,18 @@ func isConditionTrue(status monetizeapi.ServiceOfferStatus, conditionType string func buildActiveRegistrationDocument(owner *monetizeapi.ServiceOffer, offers []*monetizeapi.ServiceOffer, baseURL, agentID string) erc8004.AgentRegistration { baseURL = strings.TrimRight(baseURL, "/") + // Operator-supplied description wins. Only fall back to a controller- + // generated default when the offer left Spec.Registration.Description + // empty. The inference-typed default is more specific (names the model), + // so it preempts the generic default — but neither overrides an explicit + // operator value. description := owner.Spec.Registration.Description if description == "" { - description = fmt.Sprintf("x402 payment-gated %s service: %s", fallbackOfferType(owner), owner.Name) - } - if owner.IsInference() && owner.Spec.Model.Name != "" { - description = fmt.Sprintf("%s inference via x402 micropayments", owner.Spec.Model.Name) + if owner.IsInference() && owner.Spec.Model.Name != "" { + description = fmt.Sprintf("%s inference via x402 micropayments", owner.Spec.Model.Name) + } else { + description = fmt.Sprintf("x402 payment-gated %s service: %s", fallbackOfferType(owner), owner.Name) + } } image := owner.Spec.Registration.Image diff --git a/internal/serviceoffercontroller/render_test.go b/internal/serviceoffercontroller/render_test.go index d77ed603..ea077860 100644 --- a/internal/serviceoffercontroller/render_test.go +++ b/internal/serviceoffercontroller/render_test.go @@ -240,6 +240,74 @@ func TestBuildActiveRegistrationDocument(t *testing.T) { } } +// TestBuildActiveRegistrationDocument_KeepsOperatorDescription pins the fix +// for the controller-side description-clobber bug: +// `buildActiveRegistrationDocument` used to unconditionally overwrite +// `owner.Spec.Registration.Description` for inference-typed offers with +// `" inference via x402 micropayments"`, so any explicit operator +// description set at sell time was silently lost in the published +// /.well-known/agent-registration.json document. The fix only fills the +// description from the model name when the operator's value is empty. +func TestBuildActiveRegistrationDocument_KeepsOperatorDescription(t *testing.T) { + operatorDesc := "Uncensored Qwen3.6-27B abliteration on DGX Spark" + owner := &monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: "aeon", Namespace: "llm"}, + Spec: monetizeapi.ServiceOfferSpec{ + Type: "inference", + Model: monetizeapi.ServiceOfferModel{Name: "aeon-ultimate"}, + Path: "/services/aeon", + Registration: monetizeapi.ServiceOfferRegistration{ + Enabled: true, + Name: "Qwen36 AEON Ultimate", + Description: operatorDesc, + }, + }, + Status: monetizeapi.ServiceOfferStatus{ + Conditions: []monetizeapi.Condition{ + {Type: "ModelReady", Status: "True"}, + {Type: "UpstreamHealthy", Status: "True"}, + {Type: "PaymentGateReady", Status: "True"}, + {Type: "RoutePublished", Status: "True"}, + }, + }, + } + doc := buildActiveRegistrationDocument(owner, []*monetizeapi.ServiceOffer{owner}, "https://inference.example.com", "") + if doc.Description != operatorDesc { + t.Fatalf("description = %q, want operator value %q (the controller used to overwrite this with %q-inference-via-x402-micropayments)", + doc.Description, operatorDesc, owner.Spec.Model.Name) + } + if doc.Name != "Qwen36 AEON Ultimate" { + t.Errorf("name = %q, want operator value %q", doc.Name, "Qwen36 AEON Ultimate") + } +} + +// TestBuildActiveRegistrationDocument_FallsBackToModelDescriptionForInference +// pins the *other* side of the description contract: when the operator does +// not supply a description, inference offers should still get the +// model-aware default (" inference via x402 micropayments"), +// not the generic "x402 payment-gated service: " string used +// for non-inference fallback. The refactor must preserve both branches. +func TestBuildActiveRegistrationDocument_FallsBackToModelDescriptionForInference(t *testing.T) { + owner := &monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: "aeon", Namespace: "llm"}, + Spec: monetizeapi.ServiceOfferSpec{ + Type: "inference", + Model: monetizeapi.ServiceOfferModel{Name: "aeon-ultimate"}, + Path: "/services/aeon", + Registration: monetizeapi.ServiceOfferRegistration{ + Enabled: true, + Name: "aeon", + // Description intentionally left blank. + }, + }, + } + doc := buildActiveRegistrationDocument(owner, []*monetizeapi.ServiceOffer{owner}, "https://inference.example.com", "") + want := "aeon-ultimate inference via x402 micropayments" + if doc.Description != want { + t.Errorf("description = %q, want %q (model-aware default for inference offers with no operator description)", doc.Description, want) + } +} + func TestBuildRegistrationServices_IncludesOwnerWhenOwnerNotYetPublished(t *testing.T) { owner := &monetizeapi.ServiceOffer{ ObjectMeta: metav1.ObjectMeta{Name: "owner", Namespace: "demo"}, @@ -528,13 +596,17 @@ func TestBuildServiceCatalogJSON_ExcludesNonReady(t *testing.T) { offers := []*monetizeapi.ServiceOffer{ nil, { - ObjectMeta: metav1.ObjectMeta{Name: "paused-svc", Namespace: "llm", - Annotations: map[string]string{monetizeapi.PausedAnnotation: "true"}}, + ObjectMeta: metav1.ObjectMeta{ + Name: "paused-svc", Namespace: "llm", + Annotations: map[string]string{monetizeapi.PausedAnnotation: "true"}, + }, Status: monetizeapi.ServiceOfferStatus{Conditions: readyCond}, }, { - ObjectMeta: metav1.ObjectMeta{Name: "deleting-svc", Namespace: "llm", - DeletionTimestamp: &deleting}, + ObjectMeta: metav1.ObjectMeta{ + Name: "deleting-svc", Namespace: "llm", + DeletionTimestamp: &deleting, + }, Status: monetizeapi.ServiceOfferStatus{Conditions: readyCond}, }, { From b92b78ed57f0e013bde9e16bd80f569780ea2ff8 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 12 May 2026 18:15:13 +0800 Subject: [PATCH 2/6] fix(sell inference): auto-register on-chain to match sell http parity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first revision of #485 stopped at "produce a ServiceOffer with a populated spec.registration block". That made /.well-known/agent-registration.json get routed, but left the offer in Registered=False AwaitingExternalRegistration until someone manually ran `obol sell register`. The serviceoffer-controller's storefront filter (buildServiceCatalogJSON in render.go) requires Ready=True, which transitively requires Registered=True, so the offer was silently excluded from /api/services.json — the very feed the operator's own storefront UI consumes. obol sell http already auto-registers on the same code path. Mirror that here so the inference path reaches the same end state without a follow-up obol sell register step. Changes: - Extract shouldAutoRegisterSell(spec, tunnelURL) as the shared decision predicate. The same gate now drives both the http and inference auto-register call sites; defensively returns false when the registration block is missing, disabled, malformed, or the tunnel URL is empty. - sellInferenceCommand Action: after kubectlApply + EnsureTunnelForSell succeed, if shouldAutoRegisterSell says yes, call autoRegisterServiceOffer with the resolved name/description/wallet pulled from the spec. Surface failures as warnings + a re-run hint rather than aborting the gateway start, because the underlying call needs gas on the target chain and we do not want a one-off RPC hiccup to block local dev. Test coverage that would have caught the regression: - TestShouldAutoRegisterSell - table-driven over six scenarios including the defensive cases (registration not a map, registration.enabled not a bool). Both call sites use the helper. - TestSellInferenceAction_InvokesAutoRegister - source-level guard that scans the sellInferenceCommand body for shouldAutoRegisterSell and autoRegisterServiceOffer. The bug we just fixed was "Action calls neither"; an innocent refactor could remove the calls without any unit-level signal otherwise. Operational note: the agent's remote-signer wallet needs a small ETH balance on the target chain (~0.20-0.50 USD typical) for the on-chain register tx. If the wallet is unfunded the warning fires and the offer stays in AwaitingExternalRegistration; the operator can fund the wallet and re-run obol sell register to finish. --- cmd/obol/sell.go | 46 +++++++++++++++++++- cmd/obol/sell_test.go | 99 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+), 1 deletion(-) diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index 508aecaa..2a13730d 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -448,12 +448,36 @@ Examples: u.Blank() u.Info("Ensuring tunnel is active for public access...") - if tunnelURL, tErr := tunnel.EnsureTunnelForSell(cfg, u); tErr != nil { + tunnelURL := "" + if url, tErr := tunnel.EnsureTunnelForSell(cfg, u); tErr != nil { u.Warnf("Tunnel not started: %v", tErr) u.Dim(" Start manually with: obol tunnel restart") } else { + tunnelURL = url u.Successf("Tunnel active: %s", tunnelURL) } + + // Auto-register the seller on ERC-8004, mirroring the + // `obol sell http` path. Without this step the offer + // stays in Registered=False AwaitingExternalRegistration + // forever, which makes Ready=False and excludes the + // offer from /api/services.json (the storefront feed + // only includes Ready=True offers). + if shouldAutoRegisterSell(soSpec, tunnelURL) { + reg, _ := soSpec["registration"].(map[string]any) + u.Blank() + u.Info("Registering seller agent on ERC-8004...") + if err := autoRegisterServiceOffer(ctx, cfg, u, autoRegisterOptions{ + ChainCSV: cmd.String("chain"), + Endpoint: tunnelURL, + AgentName: registrationNameForPrompt(name, reg), + AgentDesc: registrationDescriptionForPrompt(name, reg), + ExpectedOwner: wallet, + }); err != nil { + u.Warnf("automatic sell registration failed: %v", err) + u.Dim(" Re-run later with: obol sell register " + name + " -n " + svcNs) + } + } } } } @@ -839,6 +863,26 @@ Examples: } } +// shouldAutoRegisterSell reports whether the post-create auto-register step +// must run for a freshly-applied ServiceOffer spec. Both `obol sell http` and +// `obol sell inference` need the same gate: registration must be enabled AND +// a tunnel URL must be available to write into the on-chain registration +// document. The decision is intentionally shared so the inference path +// cannot silently regress to "create the offer, never register" (the bug +// behind https://github.com/ObolNetwork/obol-stack/issues — Registered=False +// AwaitingExternalRegistration hiding the offer from /api/services.json). +func shouldAutoRegisterSell(spec map[string]any, tunnelURL string) bool { + if tunnelURL == "" { + return false + } + reg, ok := spec["registration"].(map[string]any) + if !ok { + return false + } + enabled, _ := reg["enabled"].(bool) + return enabled +} + func registrationNameForPrompt(fallback string, reg map[string]any) string { if reg == nil { return fallback diff --git a/cmd/obol/sell_test.go b/cmd/obol/sell_test.go index 3277c183..b91861b5 100644 --- a/cmd/obol/sell_test.go +++ b/cmd/obol/sell_test.go @@ -2,6 +2,7 @@ package main import ( "errors" + "os" "strings" "testing" @@ -901,6 +902,104 @@ func TestBuildInferenceServiceOfferSpec_ModelNameNotHardcoded(t *testing.T) { } } +// TestShouldAutoRegisterSell pins the decision logic shared by the http and +// inference action handlers. The bug it guards: leaving a fresh inference +// offer in `Registered=False AwaitingExternalRegistration` so that the +// controller's services.json filter (which requires Ready=True, which in +// turn requires Registered=True) silently excluded the offer from the +// operator's own storefront. Both call sites must auto-register exactly +// when the spec says enabled AND the tunnel is up — anything else is a +// regression. +func TestShouldAutoRegisterSell(t *testing.T) { + tests := []struct { + name string + spec map[string]any + tunnelURL string + want bool + }{ + { + name: "registration enabled + tunnel up → register", + spec: map[string]any{"registration": map[string]any{"enabled": true}}, + tunnelURL: "https://inference.example.com", + want: true, + }, + { + name: "registration explicitly disabled → skip", + spec: map[string]any{"registration": map[string]any{"enabled": false}}, + tunnelURL: "https://inference.example.com", + want: false, + }, + { + name: "no registration block (sell http --no-register / pre-#485 sell inference) → skip", + spec: map[string]any{}, + tunnelURL: "https://inference.example.com", + want: false, + }, + { + name: "registration enabled but tunnel down → skip (no endpoint to advertise)", + spec: map[string]any{"registration": map[string]any{"enabled": true}}, + tunnelURL: "", + want: false, + }, + { + name: "registration block is not a map (defensive) → skip", + spec: map[string]any{"registration": "garbage"}, + tunnelURL: "https://inference.example.com", + want: false, + }, + { + name: "registration.enabled is not a bool (defensive) → skip", + spec: map[string]any{"registration": map[string]any{"enabled": "yes"}}, + tunnelURL: "https://inference.example.com", + want: false, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := shouldAutoRegisterSell(tc.spec, tc.tunnelURL) + if got != tc.want { + t.Errorf("shouldAutoRegisterSell = %v, want %v", got, tc.want) + } + }) + } +} + +// TestSellInferenceAction_InvokesAutoRegister is a source-level guard +// against the specific regression the user just hit on spark2: the +// inference Action committed the ServiceOffer, ensured the tunnel, then +// jumped straight to runInferenceGateway without ever calling +// autoRegisterServiceOffer. Result: the offer stayed in +// AwaitingExternalRegistration and never reached the storefront feed. +// Without this guard, an innocent refactor of the post-create code path +// could silently remove the call again — and the only downstream signal +// would be "operator's storefront mysteriously empty", which is hard to +// attribute. +func TestSellInferenceAction_InvokesAutoRegister(t *testing.T) { + src, err := os.ReadFile("sell.go") + if err != nil { + t.Fatalf("read sell.go: %v", err) + } + body := string(src) + start := strings.Index(body, "func sellInferenceCommand(") + if start < 0 { + t.Fatal("sellInferenceCommand not found in sell.go") + } + next := strings.Index(body[start+1:], "\nfunc ") + if next < 0 { + t.Fatal("could not delimit sellInferenceCommand body") + } + scope := body[start : start+1+next] + for _, needle := range []string{ + "shouldAutoRegisterSell(", + "autoRegisterServiceOffer(", + } { + if !strings.Contains(scope, needle) { + t.Errorf("sellInferenceCommand body must contain %q — auto-register path missing, "+ + "offers will stay in AwaitingExternalRegistration and be excluded from /api/services.json", needle) + } + } +} + func TestBuildDemoServiceOffer_RegisterFlagDrivesEnabled(t *testing.T) { for _, register := range []bool{true, false} { manifest := buildDemoServiceOffer( From dbf6c89ca28fb6bc16c6424462fc3a4e039610c4 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 12 May 2026 18:39:47 +0800 Subject: [PATCH 3/6] fix(sell): allow signer != payTo on ERC-8004 register (not a spec constraint) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The autoRegisterServiceOffer pre-flight check rejected any registration whose signer didn't match the offer's payTo wallet: registration signer 0xA... does not match the payment wallet 0xB... Use a matching signer, omit --wallet so the remote-signer wallet is used, or pass --no-register The error wording read like an ERC-8004 limitation but isn't. ERC-8004 treats the agent OWNER (msg.sender at register time) and the agent WALLET (settable post-mint via setAgentWallet) as independent addresses. x402 settlement honors the offer's spec.payment.payTo directly — buyers pay that address regardless of what the registry's getAgentWallet returns. The "hot signer, cold/multisig payee" split is the canonical pattern. The historic guard existed because the obol CLI never exposed setAgentWallet, so a mismatched registration left operators with no in-CLI recovery path. This change instead surfaces the split as an informational note + adds `obol sell update --pay-to ` as the recovery surface (already in tree; just needed test coverage and the connection wired into the diagnostic). Changes: - signerPayeeDelegationNote(signer, payTo) returns a human-readable note when the two diverge (case-insensitive, whitespace-tolerant, empty on either side) and "" otherwise. Used by autoRegisterServiceOffer instead of the previous early-return. - buildSellUpdatePatch(payTo, chain, price) extracted from the inline sellUpdateCommand Action so the patch shape — the thing that actually hits the cluster — is testable without a live offer. Action calls the helper instead of inlining the same logic. Tests: - TestSignerPayeeDelegationNote — 6-case table: match, case-insensitive, whitespace, empty payTo, empty signer (defensive), true mismatch (assertions name the addresses + advise sell update). - TestAutoRegister_AllowsSignerPayeeMismatch — source-level guard asserting the banned error wording is gone from autoRegisterServiceOffer and the soft-notice path is wired. Anyone re-introducing the check has to delete this test too, which forces them to read the rationale. - TestBuildSellUpdatePatch_PayToOnly — `obol sell update --pay-to 0xBooB` builds a patch that touches only spec.payment.payTo, not network or price. - TestBuildSellUpdatePatch_PriceSwitchNullsOldKeys — table over perRequest/perMTok/perHour: the unused keys are explicitly null so a switch (e.g. perRequest → perMTok) doesn't leave the previous key fighting through merge semantics. - TestBuildSellUpdatePatch_NoFieldsErrors — error fires when no fields are set, and the error names the flags the operator should pass. - TestSellUpdate_PayToFlagSurface — `obol sell update` exposes --pay-to (with --wallet/--recipient/-w aliases via payToFlag), and --namespace is Required. --- cmd/obol/sell.go | 135 ++++++++++++++++++------- cmd/obol/sell_test.go | 223 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 322 insertions(+), 36 deletions(-) diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index 2a13730d..724a14bf 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -863,6 +863,83 @@ Examples: } } +// signerPayeeDelegationNote returns a human-readable note explaining the +// ownership delegation when the agent's on-chain registration signer differs +// from the offer's payment payTo wallet, and "" when they match (or either +// is empty). Used by the auto-register flow to surface the split as an +// informational message rather than blocking the registration outright — +// ERC-8004 explicitly supports this separation via setAgentWallet, and x402 +// settlement uses the offer's payTo regardless of the registry's wallet. +// +// The previous behavior (returning fmt.Errorf("registration signer ... does +// not match the payment wallet ...")) made it look like an ERC-8004 spec +// constraint when it was purely an obol-CLI policy choice. See PR notes for +// the full rationale. +func signerPayeeDelegationNote(signer, payTo string) string { + s := strings.TrimSpace(signer) + p := strings.TrimSpace(payTo) + if s == "" || p == "" || strings.EqualFold(s, p) { + return "" + } + return fmt.Sprintf( + "Agent owner (registration signer) %s differs from offer payTo %s. "+ + "ERC-8004 allows this via setAgentWallet; x402 settlement honors payTo regardless. "+ + "Re-point payments later with: obol sell update --pay-to ", + s, p, + ) +} + +// buildSellUpdatePatch assembles the JSON-merge patch body for +// `obol sell update`. Extracted from the Action so the patch shape — the +// thing that actually hits the cluster — is testable without a live ServiceOffer. +// +// Returns the patch map and an error when nothing was provided (the Action +// surfaces this as "nothing to update: pass at least one of ..."). +// +// When --pay-to is set, the wallet must already have been validated by +// x402verifier.ValidateWallet at the call site; this helper does no further +// validation so it stays a pure data shape. +func buildSellUpdatePatch(payTo, chain string, price schemas.PriceTable) (map[string]any, error) { + payment := map[string]any{} + + if payTo = strings.TrimSpace(payTo); payTo != "" { + payment["payTo"] = payTo + } + if chain = strings.TrimSpace(chain); chain != "" { + payment["network"] = chain + } + + if price.PerRequest != "" || price.PerMTok != "" || price.PerHour != "" { + // Null out the unused price keys explicitly so a switch from, e.g., + // perRequest→perMTok doesn't leave the previous key stranded and + // fighting the new one through merge semantics. + p := map[string]any{ + "perRequest": nil, + "perMTok": nil, + "perHour": nil, + } + switch { + case price.PerRequest != "": + p["perRequest"] = price.PerRequest + case price.PerMTok != "": + p["perMTok"] = price.PerMTok + case price.PerHour != "": + p["perHour"] = price.PerHour + } + payment["price"] = p + } + + if len(payment) == 0 { + return nil, errors.New("nothing to update: pass at least one of --per-request / --per-mtok / --per-hour / --pay-to / --chain") + } + + return map[string]any{ + "spec": map[string]any{ + "payment": payment, + }, + }, nil +} + // shouldAutoRegisterSell reports whether the post-create auto-register step // must run for a freshly-applied ServiceOffer spec. Both `obol sell http` and // `obol sell inference` need the same gate: registration must be enabled AND @@ -958,8 +1035,19 @@ func autoRegisterServiceOffer(ctx context.Context, cfg *config.Config, u *ui.UI, } signerAddr := addr.Hex() - if opts.ExpectedOwner != "" && !strings.EqualFold(strings.TrimSpace(opts.ExpectedOwner), strings.TrimSpace(signerAddr)) { - return fmt.Errorf("registration signer %s does not match the payment wallet %s.\nUse a matching signer, omit --wallet so the remote-signer wallet is used, or pass --no-register", signerAddr, opts.ExpectedOwner) + // ERC-8004 treats the agent OWNER (msg.sender at register time) and the + // agent WALLET (settable post-mint via setAgentWallet) as independent + // addresses. x402 settlement honors the offer's spec.payment.payTo + // directly — buyers pay that address regardless of what the registry's + // getAgentWallet returns. So a different signer and payTo is legitimate; + // it is the canonical pattern for "hot signer, cold/multisig payee". + // + // We surface the split as an informational note (not an error) so the + // operator can confirm the delegation was intentional, and so the obvious + // follow-up — `obol sell update --pay-to ` for the payee — is + // discoverable. + if note := signerPayeeDelegationNote(signerAddr, opts.ExpectedOwner); note != "" { + u.Info(note) } agentURI := strings.TrimRight(opts.Endpoint, "/") + "/.well-known/agent-registration.json" @@ -2328,50 +2416,25 @@ Examples: return fmt.Errorf("ServiceOffer %s/%s not found: %w", ns, name, err) } - payment := map[string]any{} - - if wallet := strings.TrimSpace(cmd.String("pay-to")); wallet != "" { + wallet := strings.TrimSpace(cmd.String("pay-to")) + if wallet != "" { if err := x402verifier.ValidateWallet(wallet); err != nil { return err } - payment["payTo"] = wallet - } - - if chain := strings.TrimSpace(cmd.String("chain")); chain != "" { - payment["network"] = chain } - priceSet := cmd.String("price") != "" || cmd.String("per-request") != "" || cmd.String("per-mtok") != "" || cmd.String("per-hour") != "" - if priceSet { - priceTable, err := resolvePriceTable(cmd, true) + var price schemas.PriceTable + if cmd.String("price") != "" || cmd.String("per-request") != "" || cmd.String("per-mtok") != "" || cmd.String("per-hour") != "" { + resolved, err := resolvePriceTable(cmd, true) if err != nil { return err } - - price := map[string]any{ - "perRequest": nil, - "perMTok": nil, - "perHour": nil, - } - switch { - case priceTable.PerRequest != "": - price["perRequest"] = priceTable.PerRequest - case priceTable.PerMTok != "": - price["perMTok"] = priceTable.PerMTok - case priceTable.PerHour != "": - price["perHour"] = priceTable.PerHour - } - payment["price"] = price - } - - if len(payment) == 0 { - return errors.New("nothing to update: pass at least one of --per-request / --per-mtok / --per-hour / --wallet / --chain") + price = resolved } - patch := map[string]any{ - "spec": map[string]any{ - "payment": payment, - }, + patch, err := buildSellUpdatePatch(wallet, cmd.String("chain"), price) + if err != nil { + return err } patchBytes, err := json.Marshal(patch) if err != nil { diff --git a/cmd/obol/sell_test.go b/cmd/obol/sell_test.go index b91861b5..7ec30a05 100644 --- a/cmd/obol/sell_test.go +++ b/cmd/obol/sell_test.go @@ -1000,6 +1000,229 @@ func TestSellInferenceAction_InvokesAutoRegister(t *testing.T) { } } +// TestSignerPayeeDelegationNote pins the contract that obol now treats the +// "registration signer != offer payTo" case as legitimate ERC-8004 ownership +// delegation rather than an error. The helper returns "" when the two match +// (or either is empty) and a single-line informational note otherwise. +// +// Regression context: autoRegisterServiceOffer used to fail with +// +// registration signer 0xA... does not match the payment wallet 0xB... +// +// which read like an ERC-8004 spec constraint but was purely an obol-CLI +// policy. ERC-8004 explicitly supports the split (msg.sender at register +// time owns the agent; setAgentWallet re-points the wallet post-mint), and +// x402 settlement honors the offer's payTo regardless of what the registry +// reports. +func TestSignerPayeeDelegationNote(t *testing.T) { + tests := []struct { + name string + signer string + payTo string + wantNoteHas []string + wantEmpty bool + }{ + { + name: "exact match → no note", + signer: "0xA5d4aF96E3e740383A36c3123A54724dAcB3Df57", + payTo: "0xA5d4aF96E3e740383A36c3123A54724dAcB3Df57", + wantEmpty: true, + }, + { + name: "case-insensitive match → no note", + signer: "0xa5d4af96e3e740383a36c3123a54724dacb3df57", + payTo: "0xA5D4AF96E3E740383A36C3123A54724DACB3DF57", + wantEmpty: true, + }, + { + name: "whitespace tolerance → no note", + signer: " 0xA5d4aF96E3e740383A36c3123A54724dAcB3Df57 ", + payTo: "0xA5d4aF96E3e740383A36c3123A54724dAcB3Df57", + wantEmpty: true, + }, + { + name: "empty payTo → no note (caller didn't request the check)", + signer: "0xA5d4aF96E3e740383A36c3123A54724dAcB3Df57", + payTo: "", + wantEmpty: true, + }, + { + name: "empty signer → no note (impossible in practice; defensive)", + signer: "", + payTo: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + wantEmpty: true, + }, + { + name: "true mismatch → note names both addrs and suggests sell update", + signer: "0xA5d4aF96E3e740383A36c3123A54724dAcB3Df57", + payTo: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + wantNoteHas: []string{ + "0xA5d4aF96E3e740383A36c3123A54724dAcB3Df57", + "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + "obol sell update", + "--pay-to", + }, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := signerPayeeDelegationNote(tc.signer, tc.payTo) + if tc.wantEmpty { + if got != "" { + t.Errorf("note = %q, want empty", got) + } + return + } + for _, sub := range tc.wantNoteHas { + if !strings.Contains(got, sub) { + t.Errorf("note missing %q: %s", sub, got) + } + } + }) + } +} + +// TestAutoRegister_AllowsSignerPayeeMismatch is the source-level guard against +// re-introducing the early-return error that used to reject ERC-8004 +// registrations whenever signer != payTo. The error wording was specific +// ("registration signer ... does not match the payment wallet ..."); we grep +// the function body for it. Anyone who reintroduces that check — even with a +// different error message — must also delete this test, which is a deliberate +// nudge to read the rationale comment first. +func TestAutoRegister_AllowsSignerPayeeMismatch(t *testing.T) { + src, err := os.ReadFile("sell.go") + if err != nil { + t.Fatalf("read sell.go: %v", err) + } + body := string(src) + start := strings.Index(body, "func autoRegisterServiceOffer(") + if start < 0 { + t.Fatal("autoRegisterServiceOffer not found in sell.go") + } + end := strings.Index(body[start+1:], "\nfunc ") + if end < 0 { + t.Fatal("could not delimit autoRegisterServiceOffer body") + } + scope := body[start : start+1+end] + for _, banned := range []string{ + `"registration signer %s does not match the payment wallet`, + `does not match the payment wallet`, + } { + if strings.Contains(scope, banned) { + t.Errorf("autoRegisterServiceOffer must NOT reject signer != payee — ERC-8004 allows "+ + "the split via setAgentWallet and x402 settlement honors payTo directly. "+ + "Banned snippet still present: %q", banned) + } + } + // Positive guard: the soft-notice path must still be present, otherwise + // the operator gets no signal that the addresses diverge. + if !strings.Contains(scope, "signerPayeeDelegationNote(") { + t.Error("autoRegisterServiceOffer must call signerPayeeDelegationNote(...) so operators see when signer ≠ payTo") + } +} + +// TestBuildSellUpdatePatch_PayToOnly pins the `obol sell update --pay-to 0x...` +// shape. The user-facing promise is "change the payee in place" — the patch +// must touch only spec.payment.payTo, not the rest of the offer. +func TestBuildSellUpdatePatch_PayToOnly(t *testing.T) { + const newPayee = "0xB00B00000000000000000000000000000000B00B" + patch, err := buildSellUpdatePatch(newPayee, "", schemas.PriceTable{}) + if err != nil { + t.Fatalf("buildSellUpdatePatch: %v", err) + } + spec, ok := patch["spec"].(map[string]any) + if !ok { + t.Fatalf("patch.spec missing or wrong type: %#v", patch) + } + payment, ok := spec["payment"].(map[string]any) + if !ok { + t.Fatalf("patch.spec.payment missing or wrong type: %#v", spec) + } + if payment["payTo"] != newPayee { + t.Errorf("patch.spec.payment.payTo = %v, want %q", payment["payTo"], newPayee) + } + if _, present := payment["network"]; present { + t.Errorf("--pay-to only patch should NOT touch payment.network; got %#v", payment["network"]) + } + if _, present := payment["price"]; present { + t.Errorf("--pay-to only patch should NOT touch payment.price; got %#v", payment["price"]) + } +} + +// TestBuildSellUpdatePatch_PriceSwitchNullsOldKeys pins the switching contract: +// changing the price model (e.g. perRequest → perMTok) must explicitly null +// the unused keys so merge-patch semantics don't leave a stranded perRequest +// fighting the new perMTok. +func TestBuildSellUpdatePatch_PriceSwitchNullsOldKeys(t *testing.T) { + tests := []struct { + name string + price schemas.PriceTable + setKey string + }{ + {"per-request set", schemas.PriceTable{PerRequest: "0.002"}, "perRequest"}, + {"per-mtok set", schemas.PriceTable{PerMTok: "5.0"}, "perMTok"}, + {"per-hour set", schemas.PriceTable{PerHour: "1.0"}, "perHour"}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + patch, err := buildSellUpdatePatch("", "", tc.price) + if err != nil { + t.Fatalf("buildSellUpdatePatch: %v", err) + } + price := patch["spec"].(map[string]any)["payment"].(map[string]any)["price"].(map[string]any) + for _, key := range []string{"perRequest", "perMTok", "perHour"} { + v := price[key] + if key == tc.setKey { + if v == nil { + t.Errorf("price.%s = nil, want a value", key) + } + } else { + if v != nil { + t.Errorf("price.%s = %#v, want nil (so old key doesn't survive the merge)", key, v) + } + } + } + }) + } +} + +// TestBuildSellUpdatePatch_NoFieldsErrors pins the no-op-protection contract: +// `obol sell update ` with no field flags must error rather than fire +// a no-op kubectl patch. The error message names the flags so the operator +// learns the surface from the failure. +func TestBuildSellUpdatePatch_NoFieldsErrors(t *testing.T) { + _, err := buildSellUpdatePatch("", "", schemas.PriceTable{}) + if err == nil { + t.Fatal("expected error when no update flags are set") + } + for _, sub := range []string{"--per-request", "--per-mtok", "--per-hour", "--pay-to", "--chain"} { + if !strings.Contains(err.Error(), sub) { + t.Errorf("error must name flag %q so the operator learns the surface; got: %v", sub, err) + } + } +} + +// TestSellUpdate_PayToFlagSurface pins the user-facing `obol sell update +// --pay-to 0x...` flag wiring: the same payToFlag() shape used by +// inference + http (so a buyer can muscle-memory `-w` / `--wallet`), plus +// the namespace requirement (the resource is per-namespace, ambiguous +// otherwise). +func TestSellUpdate_PayToFlagSurface(t *testing.T) { + cfg := newTestConfig(t) + cmd := sellCommand(cfg) + update := findSubcommand(t, cmd, "update") + flags := flagMap(update) + + requireFlags(t, flags, + "namespace", "pay-to", "wallet", "recipient", "w", + "chain", "price", "per-request", "per-mtok", "per-hour", + ) + assertFlagRequired(t, flags, "namespace") + assertFlagHasAlias(t, flags, "pay-to", "wallet") + assertFlagHasAlias(t, flags, "pay-to", "recipient") + assertFlagHasAlias(t, flags, "pay-to", "w") +} + func TestBuildDemoServiceOffer_RegisterFlagDrivesEnabled(t *testing.T) { for _, register := range []bool{true, false} { manifest := buildDemoServiceOffer( From 2d36745b50d26f951ba2a56792104348f7315ebe Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 12 May 2026 19:20:16 +0800 Subject: [PATCH 4/6] feat(stack): resume sell-inference offers on stack up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the "cluster comes back but the seller offers don't" gap. After `obol stack down` destroys the k3d cluster, all ServiceOffer custom resources are wiped from etcd. The descriptors on disk at ~/.workspace/config/inference// survive, but nothing replays them when the cluster comes back, so operators had to manually re-run `obol sell inference ` for every offer after every stack up. This wires a resume step into `obol stack up`: after stack.Up returns successfully, the action iterates inference.Store, rebuilds the Service+Endpoints+ServiceOffer for each persisted deployment from the on-disk descriptor, and kubectl-applies them. The foreground host gateway is NOT auto-started — it is an interactive operator action and stack-up shouldn't launch long-running processes — but the cluster side is fully reattached so the operator's eventual `obol sell inference ` rerun hits a "service already healthy" reconcile instead of "build from scratch." Storage model: - inference.Deployment gains ModelName, ServiceNamespace, and Registration fields, persisted with `omitempty` so legacy descriptors written by older binaries still load (the new fields come back as zero values; the resume path either defaults sensibly or refuses with an actionable message). - The inference Action now resolves the registration block once and passes the same map to both store.Create() and the in-process ServiceOffer apply — guaranteeing on-disk state matches what the cluster sees. Resume path: - resumeSellOffers(ctx, cfg, u): lists inference.Store, skips when no kubeconfig (no cluster yet), warns + continues per-offer on errors. - resumeOneInferenceOffer: createHostService + buildInferenceServiceOfferSpec + kubectlApply, no foreground process. Returns an actionable error on missing ModelName so legacy descriptors surface a clear "recreate the offer" prompt rather than producing a broken ServiceOffer. - Wired from cmd/obol/main.go's `stack up` Action after stack.Up. A resume failure is logged as a warning, not propagated — stack-up must succeed even if one descriptor is malformed. Tests: - TestStoreCreate_PersistsResumeFields: pins ModelName, ServiceNamespace, and Registration round-trip through JSON. Without this round-trip, the resume path silently loses operator customizations. - TestStoreCreate_LegacyDescriptorWithoutResumeFields: pins backwards-compatibility — a JSON written by an older binary still loads, with the new fields as zero values. - TestResumeSellOffers_EmptyStoreNoOp: empty store, resume returns nil. - TestResumeSellOffers_DescriptorPresentButNoCluster: descriptor on disk, kubeconfig absent (post-purge, pre-up state) — resume must skip cleanly. - TestResumeOneInferenceOffer_RequiresModelName: legacy descriptor with empty ModelName surfaces an actionable error naming the missing field and the recovery command. - TestResumeOneInferenceOffer_NilDescriptor: defensive nil/empty descriptor guard so one bad entry can't crash the loop. - TestStackUpAction_CallsResumeSellOffers: source-level guard that cmd/obol/main.go's `stack up` Action calls resumeSellOffers AFTER stack.Up. Pinning the order — running resume before stack.Up would see no kubeconfig and skip every offer silently. Operational note: `obol sell http` offers don't yet have an on-disk descriptor (only `obol sell inference` persists), so they aren't covered by this resume pass. Filed as a follow-up — adding a parallel manifest store for `sell http` plus a third resume branch is its own PR. --- cmd/obol/main.go | 17 ++- cmd/obol/sell.go | 180 +++++++++++++++++++++++++------ cmd/obol/sell_test.go | 114 ++++++++++++++++++++ internal/inference/store.go | 19 ++++ internal/inference/store_test.go | 101 +++++++++++++++++ 5 files changed, 400 insertions(+), 31 deletions(-) diff --git a/cmd/obol/main.go b/cmd/obol/main.go index 72055ea3..72b8d4f6 100644 --- a/cmd/obol/main.go +++ b/cmd/obol/main.go @@ -204,7 +204,22 @@ GLOBAL OPTIONS:{{template "visibleFlagTemplate" .}}{{end}} }, }, Action: func(ctx context.Context, cmd *cli.Command) error { - return stack.Up(cfg, getUI(cmd), cmd.Bool("wildcard-dns")) + u := getUI(cmd) + if err := stack.Up(cfg, u, cmd.Bool("wildcard-dns")); err != nil { + return err + } + // Re-apply cluster-side state for locally-persisted + // `obol sell *` offers. ServiceOffer CRs and the + // Service/Endpoints that route to the host gateway + // live in etcd, which is destroyed by `obol stack + // down`, so a fresh `stack up` would otherwise come + // back with the descriptors still on disk but no + // matching cluster resources. Best-effort: a resume + // failure does not block stack-up. + if err := resumeSellOffers(ctx, cfg, u); err != nil { + u.Warnf("Could not resume sell offers: %v", err) + } + return nil }, }, { diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index 724a14bf..f5d35e1a 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -334,24 +334,46 @@ Examples: assetSymbol = "USDC" } + // Resolve the registration block once, here, so we can persist it + // alongside the deployment descriptor. The resume path + // (`obol stack up` after a stack-down) rebuilds the ServiceOffer + // from the on-disk descriptor; without the registration block + // persisted, replays would lose the operator's --register-* + // customizations. + persistedRegistration, _, regErr := buildSellRegistrationConfig(name, sellRegistrationInput{ + NoRegister: cmd.Bool("no-register"), + Name: cmd.String("register-name"), + Description: cmd.String("register-description"), + Image: cmd.String("register-image"), + Skills: cmd.StringSlice("register-skills"), + Domains: cmd.StringSlice("register-domains"), + MetadataPairs: cmd.StringSlice("register-metadata"), + }) + if regErr != nil { + return regErr + } + d := &inference.Deployment{ - Name: name, - EnclaveTag: cmd.String("enclave-tag"), - ListenAddr: cmd.String("listen"), - UpstreamURL: upstreamFlag, - WalletAddress: wallet, - PricePerRequest: perRequest, - PricePerMTok: priceTable.PerMTok, - AssetSymbol: assetSymbol, - Chain: chainName, - FacilitatorURL: cmd.String("facilitator"), - VMMode: cmd.Bool("vm"), - VMImage: cmd.String("vm-image"), - VMCPUs: cmd.Int("vm-cpus"), - VMMemoryMB: cmd.Int("vm-memory"), - VMHostPort: cmd.Int("vm-host-port"), - TEEType: teeType, - ModelHash: modelHash, + Name: name, + EnclaveTag: cmd.String("enclave-tag"), + ListenAddr: cmd.String("listen"), + UpstreamURL: upstreamFlag, + WalletAddress: wallet, + PricePerRequest: perRequest, + PricePerMTok: priceTable.PerMTok, + AssetSymbol: assetSymbol, + Chain: chainName, + FacilitatorURL: cmd.String("facilitator"), + VMMode: cmd.Bool("vm"), + VMImage: cmd.String("vm-image"), + VMCPUs: cmd.Int("vm-cpus"), + VMMemoryMB: cmd.Int("vm-memory"), + VMHostPort: cmd.Int("vm-host-port"), + TEEType: teeType, + ModelHash: modelHash, + ModelName: modelFlag, + ServiceNamespace: "llm", + Registration: persistedRegistration, } if pf := cmd.String("provenance-file"); pf != "" { @@ -412,19 +434,9 @@ Examples: d.NoPaymentGate = false } else { // Create a ServiceOffer CR pointing at the host service. - reg, _, regErr := buildSellRegistrationConfig(name, sellRegistrationInput{ - NoRegister: cmd.Bool("no-register"), - Name: cmd.String("register-name"), - Description: cmd.String("register-description"), - Image: cmd.String("register-image"), - Skills: cmd.StringSlice("register-skills"), - Domains: cmd.StringSlice("register-domains"), - MetadataPairs: cmd.StringSlice("register-metadata"), - }) - if regErr != nil { - return regErr - } - soSpec, err := buildInferenceServiceOfferSpec(d, priceTable, svcNs, port, assetTerms, modelFlag, reg) + // Reuse the persistedRegistration resolved above; both this + // in-process create AND the on-disk descriptor must agree. + soSpec, err := buildInferenceServiceOfferSpec(d, priceTable, svcNs, port, assetTerms, modelFlag, persistedRegistration) if err != nil { return err } @@ -3441,6 +3453,114 @@ func loadProvenance(path string) (*inference.Provenance, error) { // // Kubernetes Endpoints require an IP address, not a hostname. We resolve the // host IP using the same strategy as ollamaHostIPForBackend in internal/stack. +// resumeSellOffers re-applies the cluster-side artifacts (Service + +// Endpoints + ServiceOffer) for every locally-persisted `obol sell +// inference` deployment after `obol stack up` brings a fresh cluster +// online. Without this step the cluster has no record of operator-created +// offers (CRs live in etcd, which is destroyed by `obol stack down`), +// even though the host-side descriptors at +// `/inference//` still exist. +// +// The foreground gateway is NOT restarted here — `obol sell inference` +// is an interactive operator action and we don't want stack-up to launch +// long-running processes. The operator re-runs `obol sell inference +// ` after stack-up to bring the gateway back; this step ensures +// the cluster side is already in place so the gateway hits a "service +// healthy" reconcile instead of "create from scratch". +// +// Best-effort. Per-offer failures emit a warning and the loop continues, +// so one broken descriptor cannot block stack-up. +func resumeSellOffers(ctx context.Context, cfg *config.Config, u *ui.UI) error { + store := inference.NewStore(cfg.ConfigDir) + deployments, err := store.List() + if err != nil { + return fmt.Errorf("list inference deployments: %w", err) + } + if len(deployments) == 0 { + return nil + } + + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + if _, statErr := os.Stat(kubeconfigPath); statErr != nil { + // No cluster — nothing to reattach. + return nil + } + + u.Blank() + u.Infof("Resuming %d locally-persisted sell-inference offer(s)...", len(deployments)) + + var resumed int + for _, d := range deployments { + if err := resumeOneInferenceOffer(cfg, u, d); err != nil { + u.Warnf("resume %s: %v", d.Name, err) + continue + } + resumed++ + u.Successf("Resumed sell-inference offer %q (run `obol sell inference %s` to restart the host gateway)", d.Name, d.Name) + } + + if resumed > 0 { + u.Dim(" Host gateways are not auto-started — re-run `obol sell inference ` in a terminal you can keep open.") + } + _ = ctx // reserved for cancellation support; current resume calls are synchronous and short + return nil +} + +// resumeOneInferenceOffer re-creates the cluster-side artifacts that +// `obol sell inference` would have produced for a single Deployment. Pure +// in the sense that it only consumes the on-disk descriptor; it never +// re-prompts the operator. Returns an error when the descriptor is +// incomplete (no model name, no namespace, no listen port) so the resume +// loop can surface a clear message per-offer. +func resumeOneInferenceOffer(cfg *config.Config, u *ui.UI, d *inference.Deployment) error { + if d == nil || d.Name == "" { + return errors.New("nil or unnamed deployment descriptor") + } + if d.ModelName == "" { + return fmt.Errorf("deployment %q is missing model_name on disk — recreate the offer with `obol sell inference %s --model ...`", d.Name, d.Name) + } + ns := d.ServiceNamespace + if ns == "" { + ns = "llm" // legacy descriptors written before service_namespace was persisted + } + + port := "8402" + if idx := strings.LastIndex(d.ListenAddr, ":"); idx >= 0 && idx+1 < len(d.ListenAddr) { + port = d.ListenAddr[idx+1:] + } + + if err := createHostService(cfg, d.Name, ns, port); err != nil { + return fmt.Errorf("create cluster Service/Endpoints: %w", err) + } + + chainName := d.Chain + assetTerms, err := resolveAssetTermsFor(d.AssetSymbol, &chainName, true) + if err != nil { + return fmt.Errorf("resolve asset terms: %w", err) + } + + pt := schemas.PriceTable{PerRequest: d.PricePerRequest, PerMTok: d.PricePerMTok} + soSpec, err := buildInferenceServiceOfferSpec(d, pt, ns, port, assetTerms, d.ModelName, d.Registration) + if err != nil { + return fmt.Errorf("rebuild ServiceOffer spec: %w", err) + } + + manifest := map[string]any{ + "apiVersion": "obol.org/v1alpha1", + "kind": "ServiceOffer", + "metadata": map[string]any{ + "name": d.Name, + "namespace": ns, + }, + "spec": soSpec, + } + if err := kubectlApply(cfg, manifest); err != nil { + return fmt.Errorf("apply ServiceOffer: %w", err) + } + _ = u // ui is held for caller-side messaging only + return nil +} + func createHostService(cfg *config.Config, name, ns, port string) error { hostIP, err := resolveHostIP(cfg) if err != nil { diff --git a/cmd/obol/sell_test.go b/cmd/obol/sell_test.go index 7ec30a05..bdcd1e1e 100644 --- a/cmd/obol/sell_test.go +++ b/cmd/obol/sell_test.go @@ -1,6 +1,7 @@ package main import ( + "context" "errors" "os" "strings" @@ -10,6 +11,7 @@ import ( "github.com/ObolNetwork/obol-stack/internal/inference" "github.com/ObolNetwork/obol-stack/internal/monetizeapi" "github.com/ObolNetwork/obol-stack/internal/schemas" + "github.com/ObolNetwork/obol-stack/internal/ui" x402verifier "github.com/ObolNetwork/obol-stack/internal/x402" "github.com/urfave/cli/v3" ) @@ -1223,6 +1225,118 @@ func TestSellUpdate_PayToFlagSurface(t *testing.T) { assertFlagHasAlias(t, flags, "pay-to", "w") } +// TestResumeSellOffers_EmptyStoreNoOp pins the "nothing to resume" path: +// stack-up against a workspace with no persisted sell-inference deployments +// must return nil without erroring. The same path also has to handle the +// no-cluster-yet case (no kubeconfig.yaml) gracefully — both are exercised +// by an empty ConfigDir. +func TestResumeSellOffers_EmptyStoreNoOp(t *testing.T) { + cfg := newTestConfig(t) + u := ui.New(false) + if err := resumeSellOffers(context.Background(), cfg, u); err != nil { + t.Fatalf("empty resume must succeed, got: %v", err) + } +} + +// TestResumeSellOffers_DescriptorPresentButNoCluster pins the +// "descriptors-on-disk-but-cluster-not-up-yet" path. Real-world example: +// `obol stack down`, then re-running `obol sell inference` somewhere that +// happens to fail before reaching cluster apply — the descriptor lands on +// disk, the cluster never comes back, and a subsequent `obol stack up` +// against a missing kubeconfig must NOT panic or hard-error. It should be +// a quiet skip until the cluster is available. +func TestResumeSellOffers_DescriptorPresentButNoCluster(t *testing.T) { + cfg := newTestConfig(t) + store := inference.NewStore(cfg.ConfigDir) + if err := store.Create(&inference.Deployment{ + Name: "aeon", + ModelName: "aeon-ultimate", + ServiceNamespace: "llm", + ListenAddr: "0.0.0.0:8402", + WalletAddress: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + PricePerRequest: "0.023", + AssetSymbol: "OBOL", + Chain: "base-sepolia", + }, true); err != nil { + t.Fatalf("Create: %v", err) + } + // No kubeconfig.yaml in cfg.ConfigDir — resume must not attempt + // cluster operations. + u := ui.New(false) + if err := resumeSellOffers(context.Background(), cfg, u); err != nil { + t.Fatalf("resume with descriptor + no cluster must skip cleanly, got: %v", err) + } +} + +// TestResumeOneInferenceOffer_RequiresModelName pins the per-offer error +// for legacy descriptors that were written before ModelName became a +// persisted field. The resume path cannot fabricate a model name out of +// thin air, and silently writing a ServiceOffer with no spec.model.name +// would surface as a controller "ModelReady=False" loop with no actionable +// signal. Operators need a clear "recreate the offer" message instead. +func TestResumeOneInferenceOffer_RequiresModelName(t *testing.T) { + cfg := newTestConfig(t) + u := ui.New(false) + err := resumeOneInferenceOffer(cfg, u, &inference.Deployment{ + Name: "legacy", + ServiceNamespace: "llm", + ListenAddr: ":8402", + // No ModelName. + }) + if err == nil { + t.Fatal("expected error for legacy descriptor with no ModelName") + } + for _, sub := range []string{"model_name", "obol sell inference"} { + if !strings.Contains(err.Error(), sub) { + t.Errorf("error must name the missing field and the recovery command; missing %q: %v", sub, err) + } + } +} + +// TestResumeOneInferenceOffer_NilDescriptor pins the defensive guard for +// the never-supposed-to-happen case of a nil or empty descriptor reaching +// the resume loop. A bug elsewhere that produces such an entry shouldn't +// panic the entire resume pass; one descriptor failing should not block +// the rest. +func TestResumeOneInferenceOffer_NilDescriptor(t *testing.T) { + cfg := newTestConfig(t) + u := ui.New(false) + if err := resumeOneInferenceOffer(cfg, u, nil); err == nil { + t.Fatal("expected error for nil descriptor") + } + if err := resumeOneInferenceOffer(cfg, u, &inference.Deployment{}); err == nil { + t.Fatal("expected error for empty descriptor (no Name)") + } +} + +// TestStackUpAction_CallsResumeSellOffers is a source-level guard against +// silently regressing the stack-up → sell-resume wiring. The whole feature +// hinges on the `stack up` action handler calling resumeSellOffers after +// stack.Up succeeds; without that call, persisted sell-inference offers +// stay on disk forever and never reach the freshly-recreated cluster. +// A future refactor that splits the handler or moves the call must update +// this test, which forces a moment of "why is this here" attention. +func TestStackUpAction_CallsResumeSellOffers(t *testing.T) { + src, err := os.ReadFile("main.go") + if err != nil { + t.Fatalf("read main.go: %v", err) + } + body := string(src) + if !strings.Contains(body, "resumeSellOffers(") { + t.Fatal("cmd/obol/main.go must call resumeSellOffers — without it persisted sell-inference offers never reach a freshly-stacked cluster") + } + // Belt-and-suspenders: assert the call lives after stack.Up so the + // kubeconfig + infrastructure are ready when resume runs. + upIdx := strings.Index(body, "stack.Up(cfg") + resumeIdx := strings.Index(body, "resumeSellOffers(") + if upIdx < 0 || resumeIdx < 0 { + t.Fatalf("expected both stack.Up and resumeSellOffers in main.go; upIdx=%d resumeIdx=%d", upIdx, resumeIdx) + } + if resumeIdx < upIdx { + t.Error("resumeSellOffers must be invoked AFTER stack.Up — running it before the cluster is up will see no kubeconfig and skip every offer") + } +} + func TestBuildDemoServiceOffer_RegisterFlagDrivesEnabled(t *testing.T) { for _, register := range []bool{true, false} { manifest := buildDemoServiceOffer( diff --git a/internal/inference/store.go b/internal/inference/store.go index fe9740cd..f2569b63 100644 --- a/internal/inference/store.go +++ b/internal/inference/store.go @@ -102,6 +102,25 @@ type Deployment struct { // config and passed to the registration document when selling. Provenance *Provenance `json:"provenance,omitempty"` + // ModelName is the upstream model identifier the operator selected via + // `obol sell inference --model ` (e.g. "aeon-ultimate"). Persisted + // so `obol stack up` can rebuild the ServiceOffer's spec.model.name on + // resume without re-prompting the operator. + ModelName string `json:"model_name,omitempty"` + + // ServiceNamespace is the Kubernetes namespace the cluster-routed Service + // + Endpoints + ServiceOffer live in (typically "llm" for inference + // deployments co-located with LiteLLM). Persisted so the resume path + // re-applies into the same namespace. + ServiceNamespace string `json:"service_namespace,omitempty"` + + // Registration is the ERC-8004 registration block the operator passed at + // sell time (--register-*). Persisted as map[string]any (the same shape + // buildSellRegistrationConfig emits) so the resume path can rebuild the + // ServiceOffer's spec.registration verbatim. Empty when the offer was + // created with --no-register. + Registration map[string]any `json:"registration,omitempty"` + // CreatedAt is the RFC3339 timestamp of when this deployment was created. CreatedAt string `json:"created_at"` diff --git a/internal/inference/store_test.go b/internal/inference/store_test.go index bc1fca70..42299d10 100644 --- a/internal/inference/store_test.go +++ b/internal/inference/store_test.go @@ -301,3 +301,104 @@ func TestStoreDirPermissions(t *testing.T) { t.Errorf("config.json permissions: want 0600, got %04o", mode) } } + +// TestStoreCreate_PersistsResumeFields pins the new resume-side fields on +// the inference Deployment descriptor. Without ModelName, ServiceNamespace, +// and Registration persisted on disk, the `obol stack up` resume path +// would lose the operator's --model + --register-* customizations and +// rebuild a stripped-down ServiceOffer (or fail outright on missing +// model_name). The round-trip here is the contract: anything Create() +// writes must come back verbatim from Get(). +func TestStoreCreate_PersistsResumeFields(t *testing.T) { + dir := t.TempDir() + store := inference.NewStore(dir) + + registration := map[string]any{ + "enabled": true, + "name": "Qwen3.6-27B AEON Ultimate", + "description": "Uncensored Qwen3.6-27B abliteration on DGX Spark", + "skills": []any{"llm/inference", "llm/uncensored"}, + "domains": []any{"inference.v1337.org"}, + } + + in := &inference.Deployment{ + Name: "aeon", + EnclaveTag: "com.obol.inference.aeon", + ListenAddr: "0.0.0.0:8402", + UpstreamURL: "http://127.0.0.1:8000", + WalletAddress: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + PricePerRequest: "0.023", + AssetSymbol: "OBOL", + Chain: "base-sepolia", + ModelName: "aeon-ultimate", + ServiceNamespace: "llm", + Registration: registration, + } + if err := store.Create(in, true); err != nil { + t.Fatalf("Create: %v", err) + } + + got, err := store.Get("aeon") + if err != nil { + t.Fatalf("Get: %v", err) + } + + if got.ModelName != "aeon-ultimate" { + t.Errorf("ModelName = %q, want %q", got.ModelName, "aeon-ultimate") + } + if got.ServiceNamespace != "llm" { + t.Errorf("ServiceNamespace = %q, want %q", got.ServiceNamespace, "llm") + } + if got.Registration == nil { + t.Fatal("Registration round-tripped to nil; resume would rebuild offer without operator customizations") + } + if got.Registration["enabled"] != true { + t.Errorf("Registration.enabled = %v, want true", got.Registration["enabled"]) + } + if got.Registration["name"] != "Qwen3.6-27B AEON Ultimate" { + t.Errorf("Registration.name = %v, want %q", got.Registration["name"], "Qwen3.6-27B AEON Ultimate") + } +} + +// TestStoreCreate_LegacyDescriptorWithoutResumeFields pins backwards- +// compatibility: a descriptor written by an older binary (no ModelName, +// no ServiceNamespace, no Registration) must still be readable, with the +// new fields coming back as zero values. The resume path uses the zero +// values to either default sensibly or refuse with an actionable error. +func TestStoreCreate_LegacyDescriptorWithoutResumeFields(t *testing.T) { + dir := t.TempDir() + configDir := dir + "/inference/legacy" + if err := os.MkdirAll(configDir, 0o700); err != nil { + t.Fatalf("mkdir: %v", err) + } + // JSON shape an older binary would have written — no model_name, + // no service_namespace, no registration. + legacyJSON := `{ + "name": "legacy", + "listen_addr": ":8402", + "upstream_url": "http://localhost:11434", + "wallet_address": "0xefab75b7b199bf8512e2d5b379374cb94dfdba47", + "price_per_request": "0.001", + "chain": "base", + "facilitator_url": "https://x402.gcp.obol.tech", + "created_at": "2026-01-01T00:00:00Z" + }` + if err := os.WriteFile(configDir+"/config.json", []byte(legacyJSON), 0o600); err != nil { + t.Fatalf("write: %v", err) + } + + store := inference.NewStore(dir) + got, err := store.Get("legacy") + if err != nil { + t.Fatalf("Get: %v", err) + } + if got.ModelName != "" { + t.Errorf("ModelName = %q, want empty for legacy descriptor", got.ModelName) + } + if got.ServiceNamespace != "" { + t.Errorf("ServiceNamespace = %q, want empty for legacy descriptor", got.ServiceNamespace) + } + if got.Registration != nil { + t.Errorf("Registration = %#v, want nil for legacy descriptor", got.Registration) + } +} From 6efdf3ff4ff21d55e671a251024a3334a0ee1ad8 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 12 May 2026 19:51:26 +0800 Subject: [PATCH 5/6] =?UTF-8?q?feat(stack):=20zero-command=20resume=20?= =?UTF-8?q?=E2=80=94=20auto-start=20gateway=20+=20relax=20storefront=20fil?= =?UTF-8?q?ter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first cut of stack-up resume (prior commit) reattached cluster-side artifacts but stopped short of two things that together left the operator's storefront empty after every stack-up cycle: 1. The foreground x402 gateway never restarted, so UpstreamHealthy=False. 2. Even after a manual gateway restart, the controller's storefront filter required Ready=True — which itself requires Registered=True — and an unfunded agent wallet (or a deliberate "register later" choice) leaves Registered=False AwaitingExternalRegistration. The operationally-usable offer was invisible to the operator's own UI. Both addressed here so `obol stack up` is the only command needed. 1. Auto-start the gateway as a detached host subprocess: - startDetachedInferenceGateway reconstructs the original `obol sell inference --model … --register-* …` invocation from the persisted inference.Deployment and spawns it with Setsid + Process.Release so it survives the parent's exit. - PID + log path live under /sell-inference//. readGatewayPID + processAlive let subsequent stack-ups detect a still-running gateway and skip the relaunch. - Surfaced via a u.Successf with the PID + log path so operators can `tail -f` or `kill $(cat .pid)` without parsing helpers. - sell_proc_unix.go isolates the platform-specific SysProcAttr behind the unix build tag, keeping the rest of the code portable. The long-term shape is to ship the inference gateway as a real helm-managed cluster Pod (it would survive stack-down/up like every other infra piece, and the controller could observe it as a first-class workload). Comment in startDetachedInferenceGateway spells that out so the next person looking at this knows the host subprocess + PID-file plumbing is the deliberate near-term step, not the final form. 2. Relax the storefront filter: - offerOperationallyReady = ModelReady + UpstreamHealthy + PaymentGateReady + RoutePublished. Registered is intentionally NOT in the AND — on-chain ERC-8004 registration is publication metadata, not operational readiness. Aggregate Ready=True is also accepted as a shortcut so existing test fixtures and any external callers that only emit the aggregate signal still work. - offerAwaitingRegistration flags Registered=False AwaitingExternalRegistration specifically, and buildServiceCatalogJSON propagates it as ServiceCatalogEntry.RegistrationPending=true so storefront UIs can badge "registration pending" alongside the usable offer. Tests: - TestBuildResumeGatewayArgs (3 cases): full descriptor incl. registration map, --no-register path, legacy descriptor with no Registration map. Pins positional-name-before-flags ordering so a CLI surface tweak can't silently desync the resume relaunch. - TestReadGatewayPID (8 cases): clean integer, trailing newline, surrounding whitespace, zero rejected, negative rejected, non-numeric rejected, empty rejected, missing-file rejected. Format is one decimal int in ASCII so external tooling (`kill $(cat .pid)`) works without parsing. - TestProcessAlive_SelfAndBogus: self is alive, absurd PID is not. - TestOfferOperationallyReady_IncludesAwaitingExternalRegistration: the headline behavioral fix — an offer with all four ops conditions True + Registered=False(AwaitingExternalRegistration) is operationally ready. - TestOfferOperationallyReady_RejectsRealNotReady: an offer with UpstreamHealthy=False is still excluded — the relax is narrow to the registration gate only. - TestBuildServiceCatalogJSON_IncludesPendingRegistrationOffers: end-to-end through /api/services.json — AwaitingExternalRegistration offers appear with RegistrationPending=true. - TestBuildServiceCatalogJSON_RegistrationPendingFalseForFullyReady: the negative — fully Ready=True offers must NOT carry RegistrationPending or the storefront badge would stick around on healthy offers. Operational footnote: the auto-spawned gateway picks up the same inference.Deployment that originally created the offer. If the descriptor predates this PR and has no ModelName, the resume per-offer warning fires (already pinned by TestResumeOneInferenceOffer_RequiresModelName in the prior commit) — recreate the offer once with the new code so the descriptor has the resume-side fields, after which every subsequent stack down/up cycle reattaches the offer with zero extra commands. --- cmd/obol/sell.go | 188 +++++++++++++++++- cmd/obol/sell_proc_unix.go | 18 ++ cmd/obol/sell_test.go | 184 +++++++++++++++++ internal/schemas/service_catalog.go | 8 + internal/serviceoffercontroller/render.go | 87 ++++++-- .../serviceoffercontroller/render_test.go | 136 +++++++++++++ 6 files changed, 608 insertions(+), 13 deletions(-) create mode 100644 cmd/obol/sell_proc_unix.go diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index f5d35e1a..1e9e7f99 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -3557,10 +3557,196 @@ func resumeOneInferenceOffer(cfg *config.Config, u *ui.UI, d *inference.Deployme if err := kubectlApply(cfg, manifest); err != nil { return fmt.Errorf("apply ServiceOffer: %w", err) } - _ = u // ui is held for caller-side messaging only + + // Auto-start the foreground x402 gateway as a detached background + // subprocess so the offer reaches UpstreamHealthy=True (and therefore + // shows up in /api/services.json) without the operator running an + // extra `obol sell inference` command after every stack-up. + // + // This is the pragmatic shape of the resume feature today. The proper + // long-term path is C from the design doc: build the inference gateway + // as an in-cluster Deployment image, helmfile-managed, so stack-up + // brings it back the same way it brings back Traefik or LiteLLM. That + // removes the host-side subprocess + PID-file plumbing entirely and + // lets the controller observe the gateway as a normal Pod. Tracked as + // a follow-up because the gateway code (internal/inference/gateway.go) + // currently runs in-process with the CLI, not as a packaged image. + if err := startDetachedInferenceGateway(cfg, u, d); err != nil { + u.Warnf("could not auto-start host gateway for %q: %v", d.Name, err) + u.Dim(" Run `obol sell inference " + d.Name + "` manually in a terminal you can keep open.") + } + return nil +} + +// startDetachedInferenceGateway forks `obol sell inference ` as a +// detached background subprocess. The CLI rebuilds the full flag set from +// the persisted Deployment, redirects stdout/stderr to a log file under +// the workspace state dir, and stores the PID alongside so subsequent +// stack-ups can detect a still-running gateway and skip the relaunch. +// +// Skipped silently when a healthy PID is already on disk. Returns an +// error only when launch itself fails (e.g. missing obol binary, log +// file unwritable). Callers should warn-and-continue on failure — a +// missing gateway leaves UpstreamHealthy=False which the controller will +// re-emit clearly, while a hard error here would block the rest of the +// resume loop. +func startDetachedInferenceGateway(cfg *config.Config, u *ui.UI, d *inference.Deployment) error { + if d == nil || d.Name == "" { + return errors.New("nil or unnamed deployment") + } + + stateDir := filepath.Join(cfg.StateDir, "sell-inference", d.Name) + if err := os.MkdirAll(stateDir, 0o755); err != nil { + return fmt.Errorf("create state dir: %w", err) + } + pidFile := filepath.Join(stateDir, "gateway.pid") + logFile := filepath.Join(stateDir, "gateway.log") + + if pid, ok := readGatewayPID(pidFile); ok && processAlive(pid) { + u.Dim(" Gateway already running for " + d.Name + " (pid " + strconv.Itoa(pid) + ")") + return nil + } + + obolBin := filepath.Join(cfg.BinDir, "obol") + if _, statErr := os.Stat(obolBin); statErr != nil { + // Fall back to whatever obol the parent process is running as. + exe, exeErr := os.Executable() + if exeErr != nil { + return fmt.Errorf("locate obol binary: %w", exeErr) + } + obolBin = exe + } + + args := buildResumeGatewayArgs(d) + + logF, err := os.OpenFile(logFile, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644) + if err != nil { + return fmt.Errorf("open log file: %w", err) + } + + cmd := exec.Command(obolBin, args...) + cmd.Stdout = logF + cmd.Stderr = logF + cmd.SysProcAttr = detachedSysProcAttr() + cmd.Env = os.Environ() + + if err := cmd.Start(); err != nil { + _ = logF.Close() + return fmt.Errorf("start gateway subprocess: %w", err) + } + // We don't wait on the child, but releasing the handle ensures the + // parent process can exit cleanly without keeping a zombie reference. + if err := cmd.Process.Release(); err != nil { + u.Dim(" (could not release gateway process handle: " + err.Error() + ")") + } + if err := os.WriteFile(pidFile, []byte(strconv.Itoa(cmd.Process.Pid)), 0o644); err != nil { + u.Warnf("gateway started (pid %d) but could not write %s: %v", cmd.Process.Pid, pidFile, err) + } + u.Successf("Gateway started in background (pid %d, log %s)", cmd.Process.Pid, logFile) + return nil +} + +// buildResumeGatewayArgs reconstructs the `obol sell inference` flag set +// that originally created the offer, from the persisted Deployment. The +// order matches obol sell inference's flag list so a future surface +// reduction of the CLI doesn't silently drop a piece of operator intent +// (CI's source-level guard tests catch that case). +// +// Exposed for testing: callers can compare the reproduced flag set +// against the original `obol sell inference` invocation that wrote the +// descriptor. +func buildResumeGatewayArgs(d *inference.Deployment) []string { + args := []string{"sell", "inference", d.Name} + if d.ModelName != "" { + args = append(args, "--model", d.ModelName) + } + if d.UpstreamURL != "" { + args = append(args, "--upstream", d.UpstreamURL) + } + if d.WalletAddress != "" { + args = append(args, "--pay-to", d.WalletAddress) + } + if d.ListenAddr != "" { + args = append(args, "--listen", d.ListenAddr) + } + if d.Chain != "" { + args = append(args, "--chain", d.Chain) + } + if d.AssetSymbol != "" { + args = append(args, "--token", d.AssetSymbol) + } + if d.PricePerMTok != "" { + args = append(args, "--per-mtok", d.PricePerMTok) + } else if d.PricePerRequest != "" { + args = append(args, "--price", d.PricePerRequest) + } + if d.FacilitatorURL != "" { + args = append(args, "--facilitator", d.FacilitatorURL) + } + // Registration block — rebuild --register-* flags from the persisted map. + if reg, ok := d.Registration["enabled"].(bool); ok && !reg { + args = append(args, "--no-register") + } else if d.Registration != nil { + if v, _ := d.Registration["name"].(string); v != "" { + args = append(args, "--register-name", v) + } + if v, _ := d.Registration["description"].(string); v != "" { + args = append(args, "--register-description", v) + } + if v, _ := d.Registration["image"].(string); v != "" { + args = append(args, "--register-image", v) + } + for _, s := range registrationStringSlice(d.Registration, "skills") { + args = append(args, "--register-skills", s) + } + for _, s := range registrationStringSlice(d.Registration, "domains") { + args = append(args, "--register-domains", s) + } + } + return args +} + +func registrationStringSlice(reg map[string]any, key string) []string { + raw, ok := reg[key] + if !ok { + return nil + } + switch v := raw.(type) { + case []string: + return v + case []any: + out := make([]string, 0, len(v)) + for _, item := range v { + if s, ok := item.(string); ok { + out = append(out, s) + } + } + return out + } return nil } +func readGatewayPID(path string) (int, bool) { + data, err := os.ReadFile(path) + if err != nil { + return 0, false + } + pid, err := strconv.Atoi(strings.TrimSpace(string(data))) + if err != nil || pid <= 0 { + return 0, false + } + return pid, true +} + +func processAlive(pid int) bool { + p, err := os.FindProcess(pid) + if err != nil { + return false + } + // Signal 0 doesn't deliver — just probes existence/permission on Unix. + return p.Signal(syscall.Signal(0)) == nil +} + func createHostService(cfg *config.Config, name, ns, port string) error { hostIP, err := resolveHostIP(cfg) if err != nil { diff --git a/cmd/obol/sell_proc_unix.go b/cmd/obol/sell_proc_unix.go new file mode 100644 index 00000000..e5604a9e --- /dev/null +++ b/cmd/obol/sell_proc_unix.go @@ -0,0 +1,18 @@ +//go:build unix + +package main + +import "syscall" + +// detachedSysProcAttr returns the platform-specific SysProcAttr that +// detaches a spawned child from the parent's process group. On Unix this +// is a new session leader (Setsid: true) so the child survives the +// parent's exit and does not receive SIGHUP / SIGINT propagated from +// the controlling terminal. +// +// Linux/macOS/BSD share the same shape via the unix build tag. If we +// ever ship a Windows obol binary, add sell_proc_windows.go with a +// CREATE_NEW_PROCESS_GROUP equivalent. +func detachedSysProcAttr() *syscall.SysProcAttr { + return &syscall.SysProcAttr{Setsid: true} +} diff --git a/cmd/obol/sell_test.go b/cmd/obol/sell_test.go index bdcd1e1e..15c03b8d 100644 --- a/cmd/obol/sell_test.go +++ b/cmd/obol/sell_test.go @@ -4,6 +4,7 @@ import ( "context" "errors" "os" + "path/filepath" "strings" "testing" @@ -1337,6 +1338,189 @@ func TestStackUpAction_CallsResumeSellOffers(t *testing.T) { } } +// TestBuildResumeGatewayArgs pins the round-trip between an +// inference.Deployment on disk and the `obol sell inference` invocation +// the resume path uses to relaunch the host gateway. If a flag is dropped +// here, an offer that came back from disk would lose part of the +// operator's original intent — most painfully, registration metadata +// that fell out of the relaunch would leave the gateway running with +// stripped-down /.well-known/ content. +func TestBuildResumeGatewayArgs(t *testing.T) { + tests := []struct { + name string + d *inference.Deployment + wantSubs []string + wantNoSub []string + }{ + { + name: "full descriptor (the spark2 aeon offer)", + d: &inference.Deployment{ + Name: "aeon", + ModelName: "aeon-ultimate", + UpstreamURL: "http://127.0.0.1:8000", + WalletAddress: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + ListenAddr: "0.0.0.0:8402", + Chain: "base-sepolia", + AssetSymbol: "OBOL", + PricePerMTok: "23", + PricePerRequest: "0.023", + FacilitatorURL: "https://x402.gcp.obol.tech", + Registration: map[string]any{ + "enabled": true, + "name": "Qwen3.6-27B AEON Ultimate", + "description": "Uncensored Qwen3.6-27B abliteration", + "skills": []any{"llm/inference", "llm/uncensored"}, + "domains": []any{"inference.v1337.org"}, + }, + }, + wantSubs: []string{ + "sell", "inference", "aeon", + "--model", "aeon-ultimate", + "--upstream", "http://127.0.0.1:8000", + "--pay-to", "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + "--listen", "0.0.0.0:8402", + "--chain", "base-sepolia", + "--token", "OBOL", + "--per-mtok", "23", + "--facilitator", "https://x402.gcp.obol.tech", + "--register-name", "Qwen3.6-27B AEON Ultimate", + "--register-description", "Uncensored Qwen3.6-27B abliteration", + "--register-skills", "llm/inference", + "--register-skills", "llm/uncensored", + "--register-domains", "inference.v1337.org", + }, + wantNoSub: []string{ + "--price", // perMTok set, perRequest must not also be passed + "--no-register", + }, + }, + { + name: "no-register descriptor", + d: &inference.Deployment{ + Name: "no-reg", + ModelName: "qwen3:0.6b", + ListenAddr: ":8402", + Registration: map[string]any{"enabled": false}, + }, + wantSubs: []string{ + "--no-register", + }, + wantNoSub: []string{ + "--register-name", + "--register-description", + }, + }, + { + name: "legacy descriptor (no registration map at all)", + d: &inference.Deployment{ + Name: "legacy", + ModelName: "qwen3.5:9b", + ListenAddr: ":8402", + PricePerRequest: "0.001", + }, + wantSubs: []string{"--price", "0.001"}, + wantNoSub: []string{ + "--no-register", + "--register-name", + }, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := buildResumeGatewayArgs(tc.d) + joined := strings.Join(got, " ") + for _, want := range tc.wantSubs { + found := false + for _, a := range got { + if a == want { + found = true + break + } + } + if !found { + t.Errorf("buildResumeGatewayArgs missing %q in %v", want, got) + } + } + for _, banned := range tc.wantNoSub { + for _, a := range got { + if a == banned { + t.Errorf("buildResumeGatewayArgs unexpectedly contains %q in %v", banned, got) + } + } + } + // Order sanity: positional `` comes before any flag. + nameIdx := -1 + firstFlagIdx := -1 + for i, a := range got { + if a == tc.d.Name && nameIdx < 0 { + nameIdx = i + } + if strings.HasPrefix(a, "--") && firstFlagIdx < 0 { + firstFlagIdx = i + } + } + if nameIdx < 0 || firstFlagIdx < 0 || nameIdx > firstFlagIdx { + t.Errorf("name positional must come before flags; got %v", joined) + } + }) + } +} + +// TestReadGatewayPID pins the on-disk PID file format. The file must be a +// single decimal integer in ASCII (no JSON, no trailing newlines that +// confuse trivial readers) so other tools — `tail -f .../gateway.log` and +// `kill $(cat .../gateway.pid)` — work without parsing helpers. +func TestReadGatewayPID(t *testing.T) { + dir := t.TempDir() + tests := []struct { + name string + content string + wantPID int + wantOK bool + }{ + {"clean integer", "12345", 12345, true}, + {"trailing newline tolerated", "12345\n", 12345, true}, + {"surrounding whitespace tolerated", " 12345 \n", 12345, true}, + {"zero rejected", "0", 0, false}, + {"negative rejected", "-1", 0, false}, + {"non-numeric rejected", "abc", 0, false}, + {"empty rejected", "", 0, false}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + path := filepath.Join(dir, tc.name+".pid") + if err := os.WriteFile(path, []byte(tc.content), 0o644); err != nil { + t.Fatalf("write: %v", err) + } + pid, ok := readGatewayPID(path) + if pid != tc.wantPID || ok != tc.wantOK { + t.Errorf("readGatewayPID(%q) = (%d, %v); want (%d, %v)", tc.content, pid, ok, tc.wantPID, tc.wantOK) + } + }) + } + + t.Run("missing file → (0, false)", func(t *testing.T) { + pid, ok := readGatewayPID(filepath.Join(dir, "does-not-exist")) + if pid != 0 || ok { + t.Errorf("readGatewayPID(missing) = (%d, %v); want (0, false)", pid, ok) + } + }) +} + +// TestProcessAlive_SelfAndBogus pins the liveness probe used to decide +// whether the previous gateway is still running. The current process is +// trivially alive; a far-out-of-range PID is not. We don't test +// "definitely-dead-but-recently-existed" cases because those depend on +// the kernel's PID-reuse window and would be flaky. +func TestProcessAlive_SelfAndBogus(t *testing.T) { + if !processAlive(os.Getpid()) { + t.Error("processAlive(self) must be true") + } + if processAlive(99_999_999) { + t.Error("processAlive(absurd pid) must be false") + } +} + func TestBuildDemoServiceOffer_RegisterFlagDrivesEnabled(t *testing.T) { for _, register := range []bool{true, false} { manifest := buildDemoServiceOffer( diff --git a/internal/schemas/service_catalog.go b/internal/schemas/service_catalog.go index cc6e411f..e53085b6 100644 --- a/internal/schemas/service_catalog.go +++ b/internal/schemas/service_catalog.go @@ -31,6 +31,14 @@ type ServiceCatalogEntry struct { Asset *ServiceCatalogAsset `json:"asset,omitempty"` Description string `json:"description"` IsDemo bool `json:"isDemo"` + + // RegistrationPending is true when the offer is operationally ready + // (route published, payment gate active, upstream healthy) but its + // ERC-8004 on-chain registration tx hasn't landed yet. The storefront + // should surface these with a "registration pending" badge so consumers + // know the offer is usable for x402 payments today, even though + // ERC-8004 discovery via the chain still resolves to the prior state. + RegistrationPending bool `json:"registrationPending,omitempty"` } // ServiceCatalogAsset describes the settlement token resolved for a catalog diff --git a/internal/serviceoffercontroller/render.go b/internal/serviceoffercontroller/render.go index ef6c4705..72dcf4d7 100644 --- a/internal/serviceoffercontroller/render.go +++ b/internal/serviceoffercontroller/render.go @@ -776,7 +776,69 @@ func buildSkillCatalogMarkdown(offers []*monetizeapi.ServiceOffer, baseURL strin return strings.Join(lines, "\n") } -// buildServiceCatalogJSON returns a JSON array of ready ServiceOffers for the public storefront. +// offerOperationallyReady reports whether an offer is usable for x402 +// payments today. This is intentionally LOOSER than the controller's +// Ready=True condition: ModelReady + UpstreamHealthy + PaymentGateReady +// + RoutePublished are sufficient. Registered is NOT in the AND. The +// reasoning: ERC-8004 on-chain registration is publication metadata, not +// operational readiness — an offer with the route published, payment gate +// active, and upstream healthy serves buyers correctly regardless of +// whether the on-chain identity has been minted yet. +// +// Used by the storefront catalog (and the skill catalog) so an offer that +// is functionally usable doesn't disappear from the operator's own +// dashboard just because the agent wallet hasn't been funded with gas +// yet. Callers should set ServiceCatalogEntry.RegistrationPending = true +// when the offer's Registered condition is False with reason +// AwaitingExternalRegistration, so storefront UIs can badge it. +func offerOperationallyReady(offer *monetizeapi.ServiceOffer) bool { + if offer == nil { + return false + } + // Backwards-compatible shortcut: the aggregate Ready=True implies all + // the per-condition gates by construction (see controller.go's `ready` + // computation), and existing tests / external callers that only emit + // the aggregate signal still want their offers to appear. + if isConditionTrue(offer.Status, "Ready") { + return true + } + // Fine-grained operational readiness: the four per-condition gates + // that make the offer usable today. Registered is intentionally NOT + // in this AND — see the doc comment on this function and on + // buildServiceCatalogJSON for the rationale. + return isConditionTrue(offer.Status, "ModelReady") && + isConditionTrue(offer.Status, "UpstreamHealthy") && + isConditionTrue(offer.Status, "PaymentGateReady") && + isConditionTrue(offer.Status, "RoutePublished") +} + +// offerAwaitingRegistration reports whether an offer is operationally +// ready but has its on-chain ERC-8004 registration still pending. Used to +// flip ServiceCatalogEntry.RegistrationPending so storefront UIs can show +// a "registration pending" badge alongside the usable offer. +func offerAwaitingRegistration(offer *monetizeapi.ServiceOffer) bool { + if offer == nil { + return false + } + for _, c := range offer.Status.Conditions { + if c.Type == "Registered" && c.Status == "False" && c.Reason == "AwaitingExternalRegistration" { + return true + } + } + return false +} + +// buildServiceCatalogJSON returns a JSON array of operationally-ready +// ServiceOffers for the public storefront feed (/api/services.json). +// +// The filter is operationally-ready (route published, payment gate +// active, upstream healthy) rather than the stricter controller +// Ready=True (which also requires Registered=True). Excluding offers +// whose only False condition is AwaitingExternalRegistration made +// operators' own seller dashboards mysteriously empty after `obol stack +// up` until they funded the agent wallet and ran `obol sell register`. +// That UX failed the "all paid services come back automatically" promise +// of the stack-up resume feature. func buildServiceCatalogJSON(offers []*monetizeapi.ServiceOffer, baseURL string) string { baseURL = strings.TrimRight(baseURL, "/") @@ -785,7 +847,7 @@ func buildServiceCatalogJSON(offers []*monetizeapi.ServiceOffer, baseURL string) if offer == nil || offer.DeletionTimestamp != nil || offer.IsPaused() { continue } - if isConditionTrue(offer.Status, "Ready") { + if offerOperationallyReady(offer) { ready = append(ready, offer) } } @@ -808,16 +870,17 @@ func buildServiceCatalogJSON(offers []*monetizeapi.ServiceOffer, baseURL string) } svc := schemas.ServiceCatalogEntry{ - Name: offer.Name, - Namespace: offer.Namespace, - Type: fallbackOfferType(offer), - Model: modelName, - Endpoint: baseURL + offer.EffectivePath(), - Price: describeOfferPrice(offer), - PayTo: offer.Spec.Payment.PayTo, - Network: offer.Spec.Payment.Network, - Description: desc, - IsDemo: offer.Namespace == "demo", + Name: offer.Name, + Namespace: offer.Namespace, + Type: fallbackOfferType(offer), + Model: modelName, + Endpoint: baseURL + offer.EffectivePath(), + Price: describeOfferPrice(offer), + PayTo: offer.Spec.Payment.PayTo, + Network: offer.Spec.Payment.Network, + Description: desc, + IsDemo: offer.Namespace == "demo", + RegistrationPending: offerAwaitingRegistration(offer), } raw, unit := offerPriceRawAndUnit(offer) diff --git a/internal/serviceoffercontroller/render_test.go b/internal/serviceoffercontroller/render_test.go index ea077860..4a11df62 100644 --- a/internal/serviceoffercontroller/render_test.go +++ b/internal/serviceoffercontroller/render_test.go @@ -992,3 +992,139 @@ func TestSafeName_Truncation(t *testing.T) { t.Error("different long names should produce different childNames") } } + +// TestOfferOperationallyReady_IncludesAwaitingExternalRegistration pins the +// behavioral fix that "operationally ready" no longer requires the on-chain +// ERC-8004 registration to have landed. +func TestOfferOperationallyReady_IncludesAwaitingExternalRegistration(t *testing.T) { + awaiting := &monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: "aeon", Namespace: "llm"}, + Status: monetizeapi.ServiceOfferStatus{ + Conditions: []monetizeapi.Condition{ + {Type: "ModelReady", Status: "True"}, + {Type: "UpstreamHealthy", Status: "True"}, + {Type: "PaymentGateReady", Status: "True"}, + {Type: "RoutePublished", Status: "True"}, + {Type: "Registered", Status: "False", Reason: "AwaitingExternalRegistration"}, + {Type: "Ready", Status: "False", Reason: "Reconciling"}, + }, + }, + } + if !offerOperationallyReady(awaiting) { + t.Fatal("offerOperationallyReady must return true for AwaitingExternalRegistration — the offer is usable for x402 payments today regardless of on-chain identity") + } + if !offerAwaitingRegistration(awaiting) { + t.Fatal("offerAwaitingRegistration must flag this offer so the storefront badges it as registration-pending") + } +} + +// TestOfferOperationallyReady_RejectsRealNotReady pins the narrow scope of +// the relax: an offer whose foreground gateway is down (UpstreamHealthy=False) +// must still be excluded. +func TestOfferOperationallyReady_RejectsRealNotReady(t *testing.T) { + notUsable := &monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: "broken", Namespace: "llm"}, + Status: monetizeapi.ServiceOfferStatus{ + Conditions: []monetizeapi.Condition{ + {Type: "ModelReady", Status: "True"}, + {Type: "UpstreamHealthy", Status: "False", Reason: "Unhealthy"}, + {Type: "PaymentGateReady", Status: "False", Reason: "WaitingForUpstream"}, + {Type: "RoutePublished", Status: "False", Reason: "WaitingForPaymentGate"}, + }, + }, + } + if offerOperationallyReady(notUsable) { + t.Error("offerOperationallyReady must reject an offer with UpstreamHealthy=False") + } + if offerAwaitingRegistration(notUsable) { + t.Error("offerAwaitingRegistration must NOT flag offers whose Registered condition isn't AwaitingExternalRegistration") + } +} + +// TestBuildServiceCatalogJSON_IncludesPendingRegistrationOffers pins the +// end-to-end wiring: an offer that's operationally ready but awaiting +// on-chain registration appears in /api/services.json with +// RegistrationPending=true. +func TestBuildServiceCatalogJSON_IncludesPendingRegistrationOffers(t *testing.T) { + offers := []*monetizeapi.ServiceOffer{ + { + ObjectMeta: metav1.ObjectMeta{Name: "aeon", Namespace: "llm"}, + Spec: monetizeapi.ServiceOfferSpec{ + Type: "inference", + Path: "/services/aeon", + Payment: monetizeapi.ServiceOfferPayment{ + Network: "base-sepolia", + PayTo: "0xeFAb75b7b199bf8512e2d5b379374Cb94dfdBA47", + Price: monetizeapi.ServiceOfferPriceTable{PerMTok: "23"}, + }, + Model: monetizeapi.ServiceOfferModel{Name: "aeon-ultimate"}, + Registration: monetizeapi.ServiceOfferRegistration{ + Enabled: true, Name: "AEON Ultimate", + Description: "Uncensored Qwen3.6-27B", + }, + }, + Status: monetizeapi.ServiceOfferStatus{ + Conditions: []monetizeapi.Condition{ + {Type: "ModelReady", Status: "True"}, + {Type: "UpstreamHealthy", Status: "True"}, + {Type: "PaymentGateReady", Status: "True"}, + {Type: "RoutePublished", Status: "True"}, + {Type: "Registered", Status: "False", Reason: "AwaitingExternalRegistration"}, + }, + }, + }, + } + jsonStr := buildServiceCatalogJSON(offers, "https://inference.example.com") + var services []schemas.ServiceCatalogEntry + if err := json.Unmarshal([]byte(jsonStr), &services); err != nil { + t.Fatalf("invalid JSON: %v\n%s", err, jsonStr) + } + if len(services) != 1 { + t.Fatalf("expected 1 service in catalog, got %d: %+v", len(services), services) + } + if services[0].Name != "aeon" { + t.Errorf("got %q, want aeon", services[0].Name) + } + if !services[0].RegistrationPending { + t.Error("RegistrationPending must be true for AwaitingExternalRegistration offers") + } +} + +// TestBuildServiceCatalogJSON_RegistrationPendingFalseForFullyReady pins +// the negative: an offer that's fully Ready=True (on-chain register tx +// landed) must NOT carry RegistrationPending. +func TestBuildServiceCatalogJSON_RegistrationPendingFalseForFullyReady(t *testing.T) { + offers := []*monetizeapi.ServiceOffer{ + { + ObjectMeta: metav1.ObjectMeta{Name: "fully-ready", Namespace: "llm"}, + Spec: monetizeapi.ServiceOfferSpec{ + Type: "http", + Payment: monetizeapi.ServiceOfferPayment{ + Network: "base", PayTo: "0x1111111111111111111111111111111111111111", + Price: monetizeapi.ServiceOfferPriceTable{PerRequest: "0.001"}, + }, + }, + Status: monetizeapi.ServiceOfferStatus{ + Conditions: []monetizeapi.Condition{ + {Type: "ModelReady", Status: "True"}, + {Type: "UpstreamHealthy", Status: "True"}, + {Type: "PaymentGateReady", Status: "True"}, + {Type: "RoutePublished", Status: "True"}, + {Type: "Registered", Status: "True", Reason: "Active"}, + {Type: "Ready", Status: "True"}, + }, + }, + }, + } + jsonStr := buildServiceCatalogJSON(offers, "https://example.com") + var services []schemas.ServiceCatalogEntry + if err := json.Unmarshal([]byte(jsonStr), &services); err != nil { + t.Fatalf("invalid JSON: %v\n%s", err, jsonStr) + } + if len(services) != 1 { + t.Fatalf("expected 1 service, got %d", len(services)) + } + if services[0].RegistrationPending { + t.Error("RegistrationPending must be false for fully Ready=True offers") + } +} From 9947bb02399b46e12ad8f256352d2d7d63c2e0e8 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 12 May 2026 20:11:15 +0800 Subject: [PATCH 6/6] fix(stack): three cosmetic fixes on top of the zero-command resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surfaced during the live spark2 walkthrough of the prior commit: 1. Gateway PID printed as -1. cmd.Process.Release() on Unix sets p.Pid to -1 as part of its handle teardown, so reading cmd.Process.Pid AFTER Release prints a bogus -1 to the operator and pins -1 in the PID file. Snapshot the PID before the Release call. 2. Stale "Host gateways are not auto-started" footer. Carried over from the pre-auto-spawn version of the resume function. With the gateway now auto-spawned, the message is misleading. Replaced with a pointer to the gateway log path so operators can `tail -f` immediately. 3. /skill.md catalog used the strict Ready=True filter while /api/services.json used the new operationally-ready filter. The two surfaces should stay consistent — an offer that's usable for x402 payments shows up in BOTH the storefront feed and the operator- facing skill catalog. Switched buildSkillCatalogMarkdown to the same offerOperationallyReady predicate. No new tests; the existing TestOfferOperationallyReady_* / catalog tests cover the filter relax for both call sites since they share the predicate. The PID-reading order is a one-line semantic fix; the footer wording isn't worth a test. --- cmd/obol/sell.go | 15 ++++++++++----- internal/serviceoffercontroller/render.go | 8 +++++++- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index 1e9e7f99..8e1d6d8c 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -3496,11 +3496,11 @@ func resumeSellOffers(ctx context.Context, cfg *config.Config, u *ui.UI) error { continue } resumed++ - u.Successf("Resumed sell-inference offer %q (run `obol sell inference %s` to restart the host gateway)", d.Name, d.Name) + u.Successf("Resumed sell-inference offer %q", d.Name) } if resumed > 0 { - u.Dim(" Host gateways are not auto-started — re-run `obol sell inference ` in a terminal you can keep open.") + u.Dim(" Gateways spawned as detached background processes — check /sell-inference//gateway.log for output.") } _ = ctx // reserved for cancellation support; current resume calls are synchronous and short return nil @@ -3634,15 +3634,20 @@ func startDetachedInferenceGateway(cfg *config.Config, u *ui.UI, d *inference.De _ = logF.Close() return fmt.Errorf("start gateway subprocess: %w", err) } + // Snapshot the PID BEFORE Release — on Unix, os.Process.Release sets + // the Pid field to -1 as part of its handle teardown, so reading the + // field afterwards prints a misleading "-1" to the operator and pins + // a bogus -1 in the PID file. Persist + announce the captured value. + gatewayPID := cmd.Process.Pid // We don't wait on the child, but releasing the handle ensures the // parent process can exit cleanly without keeping a zombie reference. if err := cmd.Process.Release(); err != nil { u.Dim(" (could not release gateway process handle: " + err.Error() + ")") } - if err := os.WriteFile(pidFile, []byte(strconv.Itoa(cmd.Process.Pid)), 0o644); err != nil { - u.Warnf("gateway started (pid %d) but could not write %s: %v", cmd.Process.Pid, pidFile, err) + if err := os.WriteFile(pidFile, []byte(strconv.Itoa(gatewayPID)), 0o644); err != nil { + u.Warnf("gateway started (pid %d) but could not write %s: %v", gatewayPID, pidFile, err) } - u.Successf("Gateway started in background (pid %d, log %s)", cmd.Process.Pid, logFile) + u.Successf("Gateway started in background (pid %d, log %s)", gatewayPID, logFile) return nil } diff --git a/internal/serviceoffercontroller/render.go b/internal/serviceoffercontroller/render.go index 72dcf4d7..46f1797a 100644 --- a/internal/serviceoffercontroller/render.go +++ b/internal/serviceoffercontroller/render.go @@ -705,12 +705,18 @@ func offerPublishedForRegistration(offer *monetizeapi.ServiceOffer) bool { func buildSkillCatalogMarkdown(offers []*monetizeapi.ServiceOffer, baseURL string) string { baseURL = strings.TrimRight(baseURL, "/") + // Same operationally-ready filter as buildServiceCatalogJSON — keep the + // two surfaces consistent. An offer that's usable for x402 payments + // (route published, payment gate active, upstream healthy) appears in + // both /skill.md and /api/services.json, with the on-chain ERC-8004 + // registration treated as informational metadata rather than a gating + // signal. See offerOperationallyReady's doc comment for the rationale. var ready []*monetizeapi.ServiceOffer for _, offer := range offers { if offer == nil || offer.DeletionTimestamp != nil || offer.IsPaused() { continue } - if isConditionTrue(offer.Status, "Ready") { + if offerOperationallyReady(offer) { ready = append(ready, offer) } }