From 7a7d51be9c208d7d6d5decd6053f5a1f97b1cd9b Mon Sep 17 00:00:00 2001
From: bussyjd
Date: Fri, 15 May 2026 14:25:47 +0800
Subject: [PATCH 1/3] fix(flows): 1-retry wrapper for agent buy prompt in flow-13/14
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The agent step at flow-13/14 step 46 sends a long single-shot prompt to
the obol-agent (qwen36-fast, ~4B params) telling it to invoke buy.py via
its terminal tool. qwen36-fast occasionally narrates a fabricated failure
(HTTP 404 path-doubling, eRPC DNS error, etc.) instead of actually
running the bash command. When that happens, no PurchaseRequest is
created and step 47 fails with "PurchaseRequest CR not ready" — even
though buy.py was never invoked.

This commit factors the prompt into agent_buy_with_retry() in
lib-dual-stack.sh and replaces both flow-13 and flow-14 step 46 with a
single call. The wrapper:

1. Sends the prompt as before.
2. Polls Bob's hermes-obol-agent namespace for the alice-obol
   PurchaseRequest (PR) for up to 60s.
3. If the PR doesn't appear, prints a LOUD warning box flagging this as
   documented agent unreliability and re-sends the prompt once.
4. If the PR is still absent after the retry, step 47 fails as before.

Net effect: probabilistic single-attempt FAILs become reliable PASSes on
a real flake while still failing loudly on a real regression. The WARN
box on retry is the audit trail — if it fires regularly, the smoke needs
a more reliable LLM (qwen36-deep / qwen36-35b-heretic) or a non-agent
fallback.

Refers: plans/inference-v1337-followup-20260514.md (the v1337 buy
        attempt-5 SIGKILL false-positive was the same flake class)

Saves ~50 lines of duplication between the two flow scripts.
--- flows/flow-13-dual-stack-obol.sh | 26 +------- flows/flow-14-live-obol-base-sepolia.sh | 24 +------- flows/lib-dual-stack.sh | 79 +++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 48 deletions(-) diff --git a/flows/flow-13-dual-stack-obol.sh b/flows/flow-13-dual-stack-obol.sh index 5dca3eb4..5beee05d 100755 --- a/flows/flow-13-dual-stack-obol.sh +++ b/flows/flow-13-dual-stack-obol.sh @@ -899,31 +899,7 @@ pass "Agent discovery prompt issued (success will be confirmed by buy + Purchase # ═════════════════════════════════════════════════════════════════ step "Bob's agent: buy 5 OBOL Permit2 auths from Alice" -buy_response=$(curl -sf --max-time 300 \ - -X POST "http://localhost:${BOB_AGENT_PORT}/v1/chat/completions" \ - -H "Authorization: Bearer $BOB_TOKEN" \ - -H "Content-Type: application/json" \ - -d "{ - \"model\": \"$BOB_AGENT_RUNTIME-agent\", - \"messages\": [{ - \"role\": \"user\", - \"content\": \"Use the buy-x402 skill and your terminal tool. Run exactly once: ERPC_URL=http://erpc.erpc.svc.cluster.local/rpc ERPC_NETWORK=base-sepolia python3 $BOB_OBOL_SKILLS_DIR/buy-x402/scripts/buy.py buy alice-obol --endpoint $TUNNEL_URL/services/alice-obol-inference/v1/chat/completions --model $OBOL_LLM_MODEL --count 5\" - }], - \"max_tokens\": 4000, - \"stream\": false - }" 2>&1 || true) -buy_content=$(extract_assistant_content "$buy_response" 2>/dev/null || true) -echo "${buy_content:0:500}" -# Don't grep buy_content for natural-language confirmation; structural success -# is the PurchaseRequest CR Ready=True poll below. -if [ -z "$(printf '%s' "$buy_content" | tr -d '[:space:]')" ]; then - echo " ! 
Agent returned no final assistant text; confirming purchase via PurchaseRequest CR" -fi -if printf '%s' "$buy_content" | agent_response_refused; then - fail "Agent refused to run buy.py: ${buy_content:0:500}" - emit_metrics; exit 1 -fi -pass "Agent buy prompt issued (success will be confirmed by PurchaseRequest CR)" +agent_buy_with_retry # ═════════════════════════════════════════════════════════════════ # 36-39. PR Ready / LiteLLM rollout / sidecar auths / paid call diff --git a/flows/flow-14-live-obol-base-sepolia.sh b/flows/flow-14-live-obol-base-sepolia.sh index 1531e8f3..12a37620 100755 --- a/flows/flow-14-live-obol-base-sepolia.sh +++ b/flows/flow-14-live-obol-base-sepolia.sh @@ -953,29 +953,7 @@ pass "Agent discovery prompt issued (success will be confirmed by buy + Purchase # ═════════════════════════════════════════════════════════════════ step "Bob's agent: buy 5 OBOL Permit2 auths from Alice" -buy_response=$(curl -sf --max-time 300 \ - -X POST "http://localhost:${BOB_AGENT_PORT}/v1/chat/completions" \ - -H "Authorization: Bearer $BOB_TOKEN" \ - -H "Content-Type: application/json" \ - -d "{ - \"model\": \"$BOB_AGENT_RUNTIME-agent\", - \"messages\": [{ - \"role\": \"user\", - \"content\": \"Use the buy-x402 skill and your terminal tool. Run exactly once: ERPC_URL=http://erpc.erpc.svc.cluster.local/rpc ERPC_NETWORK=base-sepolia python3 $BOB_OBOL_SKILLS_DIR/buy-x402/scripts/buy.py buy alice-obol --endpoint $TUNNEL_URL/services/alice-obol-inference/v1/chat/completions --model $OBOL_LLM_MODEL --count 5\" - }], - \"max_tokens\": 4000, - \"stream\": false - }" 2>&1 || true) -buy_content=$(extract_assistant_content "$buy_response" 2>/dev/null || true) -echo "${buy_content:0:500}" -if [ -z "$(printf '%s' "$buy_content" | tr -d '[:space:]')" ]; then - echo " ! 
Agent returned no final assistant text; confirming purchase via PurchaseRequest CR" -fi -if printf '%s' "$buy_content" | agent_response_refused; then - fail "Agent refused to run buy.py: ${buy_content:0:500}" - emit_metrics; exit 1 -fi -pass "Agent buy prompt issued (success will be confirmed by PurchaseRequest CR)" +agent_buy_with_retry # ═════════════════════════════════════════════════════════════════ # 31-34. PR Ready / LiteLLM rollout / sidecar auths / paid call diff --git a/flows/lib-dual-stack.sh b/flows/lib-dual-stack.sh index cb5cfd66..a88efb76 100644 --- a/flows/lib-dual-stack.sh +++ b/flows/lib-dual-stack.sh @@ -347,6 +347,85 @@ except Exception as e: " 2>&1 || true } +# Send the long single-shot buy prompt to Bob's agent. The prompt expands +# against the caller's environment (BOB_AGENT_PORT, BOB_TOKEN, +# BOB_AGENT_RUNTIME, BOB_OBOL_SKILLS_DIR, TUNNEL_URL, OBOL_LLM_MODEL). +_agent_buy_send_prompt() { + curl -sf --max-time 300 \ + -X POST "http://localhost:${BOB_AGENT_PORT}/v1/chat/completions" \ + -H "Authorization: Bearer $BOB_TOKEN" \ + -H "Content-Type: application/json" \ + -d "{ + \"model\": \"$BOB_AGENT_RUNTIME-agent\", + \"messages\": [{ + \"role\": \"user\", + \"content\": \"Use the buy-x402 skill and your terminal tool. Run exactly once: ERPC_URL=http://erpc.erpc.svc.cluster.local/rpc ERPC_NETWORK=base-sepolia python3 $BOB_OBOL_SKILLS_DIR/buy-x402/scripts/buy.py buy alice-obol --endpoint $TUNNEL_URL/services/alice-obol-inference/v1/chat/completions --model $OBOL_LLM_MODEL --count 5\" + }], + \"max_tokens\": 4000, + \"stream\": false + }" 2>&1 || true +} + +_agent_buy_pr_exists() { + bob kubectl get purchaserequests.obol.org -n "$BOB_AGENT_NS" alice-obol \ + -o name 2>/dev/null | grep -q . +} + +# 1-retry wrapper for the agent buy prompt at flow-13/14 step 46. qwen36-fast +# (4B-class) occasionally narrates a fabricated failure on the long single-shot +# buy prompt instead of actually invoking the bash tool. 
When that happens, no +# PurchaseRequest is created and step 47 fails with "PurchaseRequest CR not +# ready" — even though buy.py was never invoked. See +# plans/inference-v1337-followup-20260514.md. +# +# Strategy: poll for the PR for up to 60s after the first prompt; if absent, +# print a LOUD warning flagging this as agent unreliability and re-send the +# prompt once. If still absent after the retry, step 47 fails as before. +agent_buy_with_retry() { + local response content retried=0 i + + response=$(_agent_buy_send_prompt) + content=$(extract_assistant_content "$response" 2>/dev/null || true) + echo "${content:0:500}" + if [ -z "$(printf '%s' "$content" | tr -d '[:space:]')" ]; then + echo " ! Agent returned no final assistant text; confirming purchase via PurchaseRequest CR" + fi + if printf '%s' "$content" | agent_response_refused; then + fail "Agent refused to run buy.py: ${content:0:500}" + emit_metrics; exit 1 + fi + + # Wait up to 60s for the controller to reconcile the PR. Healthy runs see + # it within ~5s; the long ceiling absorbs cluster-cold-start jitter. + for i in $(seq 1 12); do + _agent_buy_pr_exists && break + sleep 5 + done + + if ! _agent_buy_pr_exists; then + echo "" + echo " ╔════════════════════════════════════════════════════════════════════════╗" + echo " ║ WARN: agent did NOT create a PurchaseRequest after 60s. ║" + echo " ║ Documented qwen36-fast (4B) flake — agent narrates a fabricated ║" + echo " ║ failure instead of invoking buy.py. Re-prompting ONCE. ║" + echo " ║ If this fires regularly, switch to a more reliable LLM (qwen36-deep ║" + echo " ║ / qwen36-35b-heretic) or add a non-agent fallback path. 
║" + echo " ║ Ref: plans/inference-v1337-followup-20260514.md ║" + echo " ╚════════════════════════════════════════════════════════════════════════╝" + echo "" + retried=1 + response=$(_agent_buy_send_prompt) + content=$(extract_assistant_content "$response" 2>/dev/null || true) + echo " RETRY response: ${content:0:500}" + if printf '%s' "$content" | agent_response_refused; then + fail "Agent refused to run buy.py on retry: ${content:0:500}" + emit_metrics; exit 1 + fi + fi + + pass "Agent buy prompt issued (retry=$retried; success will be confirmed by PurchaseRequest CR)" +} + extract_assistant_content() { DUAL_STACK_RESPONSE="$1" python3 - <<'PY' import json From 3cea3fb64373e05856df4b851c77855372126dce Mon Sep 17 00:00:00 2001 From: bussyjd Date: Fri, 15 May 2026 14:34:37 +0800 Subject: [PATCH 2/3] chore(flows): switch default QA LLM from qwen36-fast (4B) to qwen36-deep (27B) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The smaller qwen36-fast was the previous default for OBOL_LLM_MODEL across release-smoke and flow-{03,04,11,13,14} plus buy-external. It's documented as flaky on the long single-shot agent-buy prompt at flow-13/14 step 46 (see the retry-wrapper rationale added in the prior commit, plus plans/inference-v1337-followup-20260514.md). Switching the default to qwen36-deep (27B-class, also served by the same spark1 vLLM endpoint) trades a bit of latency for a much more reliable tool-call behaviour. Operators can still pin the smaller model explicitly via OBOL_LLM_MODEL=qwen36-fast for fast iteration on non-agent flows. 
Files changed: - flows/lib.sh, flows/release-smoke.sh, flows/flow-{03,04,11,13,14}*.sh, flows/buy-external.sh — default value switch - flows/lib-dual-stack.sh — WARN box in agent_buy_with_retry now recommends checking the model is qwen36-deep first; mentions qwen36-35b-heretic as the next escalation - CLAUDE.md, .agents/skills/obol-stack-dev/{SKILL.md,references/*.md} — documentation refreshed Not changed (intentional): - internal/{model,hermes}/*_test.go — qwen36-fast is a test fixture for the rank parser, not a default; switching would invalidate test expectations without changing test intent - plans/post-490-integration-20260513.md — historical record --- .agents/skills/obol-stack-dev/SKILL.md | 2 +- .../obol-stack-dev/references/llm-routing.md | 4 ++-- .../obol-stack-dev/references/paid-flows.md | 2 +- .../obol-stack-dev/references/remote-qa.md | 2 +- CLAUDE.md | 6 ++--- flows/buy-external.sh | 4 ++-- flows/flow-03-inference.sh | 2 +- flows/flow-04-agent.sh | 4 ++-- flows/flow-11-dual-stack.sh | 4 ++-- flows/flow-13-dual-stack-obol.sh | 10 ++++---- flows/flow-14-live-obol-base-sepolia.sh | 4 ++-- flows/lib-dual-stack.sh | 23 +++++++++++-------- flows/lib.sh | 6 +++-- flows/release-smoke.sh | 2 +- 14 files changed, 40 insertions(+), 35 deletions(-) diff --git a/.agents/skills/obol-stack-dev/SKILL.md b/.agents/skills/obol-stack-dev/SKILL.md index 86436796..33ab1661 100644 --- a/.agents/skills/obol-stack-dev/SKILL.md +++ b/.agents/skills/obol-stack-dev/SKILL.md @@ -47,7 +47,7 @@ OBOL_TOKEN_BASE_SEPOLIA=0x0a09371a8b011d5110656ceBCc70603e53FD2c78 **Payment assertion**: don't bypass the agent buy step with a direct script exec. If the agent times out, diagnose Hermes/LiteLLM/model routing — don't relax the assertion. Required evidence: `PurchaseRequest Ready=True` + paid HTTP 200 + on-chain `Transfer` + exact balance deltas. -**QA LLM**: full seller/buyer QA must route Alice and Bob through `OBOL_LLM_ENDPOINT` (OpenAI-compatible vLLM or llama.cpp on the QA host). 
Default `OBOL_LLM_MODEL=qwen36-fast`. Sequence: `obol model setup custom` → `obol model prefer` → one `obol model sync`. Local Ollama and cloud-fallback are **not** acceptable green substitutes for full-flow QA. +**QA LLM**: full seller/buyer QA must route Alice and Bob through `OBOL_LLM_ENDPOINT` (OpenAI-compatible vLLM or llama.cpp on the QA host). Default `OBOL_LLM_MODEL=qwen36-deep` (27B-class). The smaller `qwen36-fast` (~4B) was the previous default but flakes on the long single-shot agent-buy prompt at flow-13/14 step 46 — see the retry-wrapper rationale in `flows/lib-dual-stack.sh::agent_buy_with_retry`. Sequence: `obol model setup custom` → `obol model prefer` → one `obol model sync`. Local Ollama and cloud-fallback are **not** acceptable green substitutes for full-flow QA. **Public vs private routes**: `/services/*`, `/.well-known/agent-registration.json`, `/skill.md`, and `/` (storefront) are public via the tunnel. **NEVER** remove `hostnames: ["obol.stack"]` from frontend or eRPC HTTPRoutes — exposing them publicly is a critical security flaw. diff --git a/.agents/skills/obol-stack-dev/references/llm-routing.md b/.agents/skills/obol-stack-dev/references/llm-routing.md index 6137e336..49e014d1 100644 --- a/.agents/skills/obol-stack-dev/references/llm-routing.md +++ b/.agents/skills/obol-stack-dev/references/llm-routing.md @@ -47,7 +47,7 @@ Canonical user flow for vLLM / sglang / mlx-lm / a remote GPU box. **No ConfigMa obol stack up # Drop auto-detected Ollama entries — they will out-rank the new custom entry. -# Internal/model/rank.go parses ":9b" as 90 deci-billions; "qwen36-fast" (no +# Internal/model/rank.go parses ":9b" as 90 deci-billions; "qwen36-deep" (no # ":Nb" tag) ranks 0. Without removing them, the agent stays on slow host Ollama. 
obol model remove qwen3.5:9b obol model remove qwen3.5:4b @@ -55,7 +55,7 @@ obol model remove qwen3.5:4b obol model setup custom \ --name spark1-vllm \ --endpoint http://192.168.18.23:8000/v1 \ - --model qwen36-fast + --model qwen36-deep # `setup custom` validates the endpoint, patches LiteLLM, and internally calls # syncAgentModels → hermes.Sync → rewrites the default agent's deployment files # with the new primary model. No manual restart needed. diff --git a/.agents/skills/obol-stack-dev/references/paid-flows.md b/.agents/skills/obol-stack-dev/references/paid-flows.md index 4ab4ab82..ee6dfded 100644 --- a/.agents/skills/obol-stack-dev/references/paid-flows.md +++ b/.agents/skills/obol-stack-dev/references/paid-flows.md @@ -64,7 +64,7 @@ The runner has a `warn_unpaid_base_sepolia_rpc` preflight. The CLI scrubs paid-R - Alice ServiceOffer reaches `Ready=True`. - ERC-8004 registration tx published to Base Sepolia (`/.well-known/agent-registration.json` reachable via tunnel for live flows). - Bob `PurchaseRequest` reaches `Ready=True`. -- LiteLLM exposes `paid/` (default `qwen36-fast`). +- LiteLLM exposes `paid/` (default `qwen36-deep`). - Paid inference returns HTTP 200 and **final-answer** content (not reasoning metadata or tool-catalogue text). - On-chain `Transfer(Bob signer → Alice, )` receipt is archived. - Alice balance increases and Bob signer balance decreases by exactly `PAID_AMOUNT` wei (USDC for flow-11, OBOL for flow-13/14). diff --git a/.agents/skills/obol-stack-dev/references/remote-qa.md b/.agents/skills/obol-stack-dev/references/remote-qa.md index 7151c30e..ffae4c9b 100644 --- a/.agents/skills/obol-stack-dev/references/remote-qa.md +++ b/.agents/skills/obol-stack-dev/references/remote-qa.md @@ -60,7 +60,7 @@ Set `OBOL_LLM_MODEL` to an id returned by `/models`. 
cd "$QA" export PATH="$QA/.workspace/bin:$FOUNDRY_BIN:$TOOL_ROOT:$PATH" export OBOL_LLM_ENDPOINT=${OBOL_LLM_ENDPOINT:-http://127.0.0.1:8000/v1} -export OBOL_LLM_MODEL=${OBOL_LLM_MODEL:-qwen36-fast} +export OBOL_LLM_MODEL=${OBOL_LLM_MODEL:-qwen36-deep} ts=$(date +%Y%m%d-%H%M%S) log="$QA/.tmp/flow-14-$ts.log" art="$QA/.tmp/flow-14-$ts-artifacts" diff --git a/CLAUDE.md b/CLAUDE.md index d0740460..04a71396 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -37,7 +37,7 @@ go test -tags integration -v -run TestIntegration_Tunnel_SellDiscoverBuySidecar_ # Release-gate seller/buyer smoke (requires OBOL_LLM_ENDPOINT pointing at OpenAI-compatible vLLM/llama.cpp) RELEASE_SMOKE_INCLUDE_OBOL=true RELEASE_SMOKE_INCLUDE_OBOL_FORK=true \ - OBOL_LLM_ENDPOINT=http://127.0.0.1:8000/v1 OBOL_LLM_MODEL=qwen36-fast \ + OBOL_LLM_ENDPOINT=http://127.0.0.1:8000/v1 OBOL_LLM_MODEL=qwen36-deep \ bash flows/release-smoke.sh just up # obol stack init + up @@ -246,13 +246,13 @@ obol model remove qwen3.5:4b obol model setup custom \ --name spark1-vllm \ --endpoint http://192.168.18.23:8000/v1 \ - --model qwen36-fast + --model qwen36-deep # `setup custom` validates the endpoint, patches LiteLLM, and internally calls # syncAgentModels → hermes.Sync → rewrites the default agent's deployment files # with the new primary model. No manual restart needed. 
# (b) OR keep Ollama and force-promote the custom entry to the head: -obol model prefer qwen36-fast +obol model prefer qwen36-deep obol model sync # propagate to Hermes obol model list # confirm head of model_list diff --git a/flows/buy-external.sh b/flows/buy-external.sh index 002d0d2b..a386312e 100755 --- a/flows/buy-external.sh +++ b/flows/buy-external.sh @@ -60,7 +60,7 @@ # EXTERNAL_PR_TIMEOUT_S default: 300 (5 min) # EXTERNAL_LOG_BLOCKS_BACK default: 30 (~6 min on Base Sepolia at 2s/blk) # OBOL_LLM_ENDPOINT default: http://127.0.0.1:8000/v1 -# OBOL_LLM_MODEL default: qwen36-fast +# OBOL_LLM_MODEL default: qwen36-deep (27B-class) # OBOL_LLM_NAME default: external-llm # # Exit code: 0 on PASS (every step pass), 1 on any FAIL. @@ -106,7 +106,7 @@ EXTERNAL_PR_TIMEOUT_S="${EXTERNAL_PR_TIMEOUT_S:-300}" EXTERNAL_LOG_BLOCKS_BACK="${EXTERNAL_LOG_BLOCKS_BACK:-30}" OBOL_LLM_ENDPOINT="${OBOL_LLM_ENDPOINT:-http://127.0.0.1:8000/v1}" -OBOL_LLM_MODEL="${OBOL_LLM_MODEL:-qwen36-fast}" +OBOL_LLM_MODEL="${OBOL_LLM_MODEL:-qwen36-deep}" OBOL_LLM_NAME="${OBOL_LLM_NAME:-external-llm}" # Resolve OBOL_ROOT before sourcing helpers — lib.sh re-derives it but diff --git a/flows/flow-03-inference.sh b/flows/flow-03-inference.sh index 6fb63761..9233056c 100755 --- a/flows/flow-03-inference.sh +++ b/flows/flow-03-inference.sh @@ -5,7 +5,7 @@ source "$(dirname "$0")/lib.sh" if [ -n "${OBOL_LLM_ENDPOINT:-}" ]; then run_step "Route LiteLLM through QA LLM endpoint" route_llm_via_obol_cli "$OBOL" - LITELLM_MODEL="${OBOL_LLM_MODEL:-qwen36-fast}" + LITELLM_MODEL="${OBOL_LLM_MODEL:-qwen36-deep}" else LITELLM_MODEL="$FLOW_MODEL" diff --git a/flows/flow-04-agent.sh b/flows/flow-04-agent.sh index f70cede0..bb593eb6 100755 --- a/flows/flow-04-agent.sh +++ b/flows/flow-04-agent.sh @@ -110,8 +110,8 @@ fi model_name=$("$OBOL" kubectl get cm hermes-config -n "$NS" -o jsonpath='{.data.config\.yaml}' 2>/dev/null | sed -n 's/^[[:space:]]*default: //p' | tr -d '"' | head -1) [ -n "$model_name" ] || 
model_name="qwen3.5:35b" -if [ -n "${OBOL_LLM_ENDPOINT:-}" ] && [ "$model_name" != "${OBOL_LLM_MODEL:-qwen36-fast}" ]; then - fail "Hermes default model $model_name does not match QA LLM model ${OBOL_LLM_MODEL:-qwen36-fast}" +if [ -n "${OBOL_LLM_ENDPOINT:-}" ] && [ "$model_name" != "${OBOL_LLM_MODEL:-qwen36-deep}" ]; then + fail "Hermes default model $model_name does not match QA LLM model ${OBOL_LLM_MODEL:-qwen36-deep}" cleanup_pid "$PF_PID" emit_metrics exit 0 diff --git a/flows/flow-11-dual-stack.sh b/flows/flow-11-dual-stack.sh index 2f2f2a83..a0a19677 100755 --- a/flows/flow-11-dual-stack.sh +++ b/flows/flow-11-dual-stack.sh @@ -36,7 +36,7 @@ # FLOW11_BOB_HTTP_PORT FLOW11_BOB_HTTP_ALT_PORT # FLOW11_BOB_HTTPS_PORT FLOW11_BOB_HTTPS_ALT_PORT # OBOL_LLM_ENDPOINT required vLLM/llama.cpp/OpenAI-compatible endpoint -# OBOL_LLM_MODEL endpoint model name (default: qwen36-fast) +# OBOL_LLM_MODEL endpoint model name (default: qwen36-deep) source "$(dirname "$0")/lib.sh" # ═════════════════════════════════════════════════════════════════ @@ -60,7 +60,7 @@ BOB_HTTP_ALT_PORT="${FLOW11_BOB_HTTP_ALT_PORT:-$(pick_free_port)}" BOB_HTTPS_PORT="${FLOW11_BOB_HTTPS_PORT:-$(pick_free_port)}" BOB_HTTPS_ALT_PORT="${FLOW11_BOB_HTTPS_ALT_PORT:-$(pick_free_port)}" FACILITATOR_URL="${FLOW11_FACILITATOR_URL:-https://x402.gcp.obol.tech}" -OBOL_LLM_MODEL="${OBOL_LLM_MODEL:-qwen36-fast}" +OBOL_LLM_MODEL="${OBOL_LLM_MODEL:-qwen36-deep}" export OBOL_LLM_MODEL FLOW11_ARTIFACT_DIR="${FLOW11_ARTIFACT_DIR:-$OBOL_ROOT/.tmp/flow-11-$(date +%Y%m%d-%H%M%S)}" if ! 
BASE_SEPOLIA_RPC="$(resolve_base_sepolia_rpc "${FLOW11_BASE_SEPOLIA_RPC:-${BASE_SEPOLIA_RPC:-}}")"; then diff --git a/flows/flow-13-dual-stack-obol.sh b/flows/flow-13-dual-stack-obol.sh index 5beee05d..3b2cea6f 100755 --- a/flows/flow-13-dual-stack-obol.sh +++ b/flows/flow-13-dual-stack-obol.sh @@ -36,7 +36,7 @@ # FLOW13_BOB_HTTP_PORT, _ALT, _HTTPS_PORT, _HTTPS_ALT_PORT # FLOW13_ARTIFACT_DIR where receipts + logs land # OBOL_LLM_ENDPOINT required vLLM/llama.cpp/OpenAI-compatible endpoint -# OBOL_LLM_MODEL endpoint model name (default: qwen36-fast) +# OBOL_LLM_MODEL endpoint model name (default: qwen36-deep, 27B-class) # source "$(dirname "$0")/lib.sh" DUAL_STACK_FLOW_PREFIX="FLOW13" @@ -61,7 +61,7 @@ BOB_HTTP_ALT_PORT="$(dual_stack_env_or_free_port BOB_HTTP_ALT_PORT)" BOB_HTTPS_PORT="$(dual_stack_env_or_free_port BOB_HTTPS_PORT)" BOB_HTTPS_ALT_PORT="$(dual_stack_env_or_free_port BOB_HTTPS_ALT_PORT)" -OBOL_LLM_MODEL="${OBOL_LLM_MODEL:-qwen36-fast}" +OBOL_LLM_MODEL="${OBOL_LLM_MODEL:-qwen36-deep}" export OBOL_LLM_MODEL ANVIL_PORT="${FLOW13_ANVIL_PORT:-$(pick_free_port)}" @@ -923,9 +923,9 @@ buyer_status=$(buyer_sidecar_status) # Mirror flow-14's relaxed assertion. Two reasons to allow remaining>=5 # rather than exact-5: (a) controller may merge into an existing auth # pool on rerun (remaining=10 etc.); (b) the agent prompt asks for -# --count 5, but qwen36-fast occasionally hallucinates --count 1, which -# is an LLM-stochasticity issue not a buy-flow correctness issue. We -# only care that the buy step actually provisioned at least the +# --count 5, but the LLM occasionally hallucinates a different count, +# which is an LLM-stochasticity issue not a buy-flow correctness issue. +# We only care that the buy step actually provisioned at least the # requested count. 
remaining_n=$(echo "$buyer_status" | grep -oE 'remaining=[0-9]+' | head -1 | cut -d= -f2) if [ -n "$remaining_n" ] && [ "$remaining_n" -ge 5 ] 2>/dev/null; then diff --git a/flows/flow-14-live-obol-base-sepolia.sh b/flows/flow-14-live-obol-base-sepolia.sh index 12a37620..4b0f5bb8 100755 --- a/flows/flow-14-live-obol-base-sepolia.sh +++ b/flows/flow-14-live-obol-base-sepolia.sh @@ -43,7 +43,7 @@ # FLOW14_ARTIFACT_DIR where receipts + logs land # FLOW14_BOB_GAS_MIN_WEI default: 100000000000000 # OBOL_LLM_ENDPOINT required vLLM/llama.cpp/OpenAI-compatible endpoint -# OBOL_LLM_MODEL endpoint model name (default: qwen36-fast) +# OBOL_LLM_MODEL endpoint model name (default: qwen36-deep, 27B-class) # # Usage: # ./flows/flow-14-live-obol-base-sepolia.sh @@ -74,7 +74,7 @@ BOB_HTTP_ALT_PORT="$(dual_stack_env_or_free_port BOB_HTTP_ALT_PORT)" BOB_HTTPS_PORT="$(dual_stack_env_or_free_port BOB_HTTPS_PORT)" BOB_HTTPS_ALT_PORT="$(dual_stack_env_or_free_port BOB_HTTPS_ALT_PORT)" -OBOL_LLM_MODEL="${OBOL_LLM_MODEL:-qwen36-fast}" +OBOL_LLM_MODEL="${OBOL_LLM_MODEL:-qwen36-deep}" export OBOL_LLM_MODEL # Live Base Sepolia RPC + public Obol facilitator. No host.k3d.internal pin. diff --git a/flows/lib-dual-stack.sh b/flows/lib-dual-stack.sh index a88efb76..5d6bbea1 100644 --- a/flows/lib-dual-stack.sh +++ b/flows/lib-dual-stack.sh @@ -371,12 +371,13 @@ _agent_buy_pr_exists() { -o name 2>/dev/null | grep -q . } -# 1-retry wrapper for the agent buy prompt at flow-13/14 step 46. qwen36-fast -# (4B-class) occasionally narrates a fabricated failure on the long single-shot -# buy prompt instead of actually invoking the bash tool. When that happens, no -# PurchaseRequest is created and step 47 fails with "PurchaseRequest CR not -# ready" — even though buy.py was never invoked. See -# plans/inference-v1337-followup-20260514.md. +# 1-retry wrapper for the agent buy prompt at flow-13/14 step 46. 
The QA LLM +# (qwen36-deep, 27B-class — see OBOL_LLM_MODEL default) occasionally narrates a +# fabricated failure on the long single-shot buy prompt instead of actually +# invoking the bash tool. When that happens, no PurchaseRequest is created and +# step 47 fails with "PurchaseRequest CR not ready" — even though buy.py was +# never invoked. The smaller qwen36-fast (~4B) flakes much more often; deep is +# the new default for that reason. See plans/inference-v1337-followup-20260514.md. # # Strategy: poll for the PR for up to 60s after the first prompt; if absent, # print a LOUD warning flagging this as agent unreliability and re-send the @@ -406,10 +407,12 @@ agent_buy_with_retry() { echo "" echo " ╔════════════════════════════════════════════════════════════════════════╗" echo " ║ WARN: agent did NOT create a PurchaseRequest after 60s. ║" - echo " ║ Documented qwen36-fast (4B) flake — agent narrates a fabricated ║" - echo " ║ failure instead of invoking buy.py. Re-prompting ONCE. ║" - echo " ║ If this fires regularly, switch to a more reliable LLM (qwen36-deep ║" - echo " ║ / qwen36-35b-heretic) or add a non-agent fallback path. ║" + echo " ║ Documented LLM flake on the long single-shot buy prompt — agent ║" + echo " ║ narrated a fabricated failure instead of invoking buy.py. ║" + echo " ║ Re-prompting ONCE. ║" + echo " ║ If this fires regularly: confirm OBOL_LLM_MODEL=qwen36-deep (default) ║" + echo " ║ not qwen36-fast (4B), or escalate to qwen36-35b-heretic, or add a ║" + echo " ║ non-agent fallback path. ║" echo " ║ Ref: plans/inference-v1337-followup-20260514.md ║" echo " ╚════════════════════════════════════════════════════════════════════════╝" echo "" diff --git a/flows/lib.sh b/flows/lib.sh index 036ed940..a9972f13 100755 --- a/flows/lib.sh +++ b/flows/lib.sh @@ -569,7 +569,9 @@ bootstrap_flow_workspace() { # Activated when OBOL_LLM_ENDPOINT is set (for example, # http://127.0.0.1:8000/v1 on a QA machine). 
The endpoint must be # OpenAI-compatible, such as vLLM or llama.cpp. -# OBOL_LLM_MODEL is the upstream model id (default qwen36-fast). +# OBOL_LLM_MODEL is the upstream model id (default qwen36-deep, 27B-class). +# qwen36-fast (4B) is faster but flakes on long single-shot agent prompts; see +# the flow-13/14 step 46 retry-wrapper rationale in lib-dual-stack.sh. # OBOL_LLM_NAME is the LiteLLM short name registered for the endpoint (default # external-llm). # @@ -588,7 +590,7 @@ route_llm_via_obol_cli() { local model name if [ -n "${OBOL_LLM_ENDPOINT:-}" ]; then - model="${OBOL_LLM_MODEL:-qwen36-fast}" + model="${OBOL_LLM_MODEL:-qwen36-deep}" name="${OBOL_LLM_NAME:-external-llm}" local args=(model setup custom --no-sync --name "$name" --endpoint "$OBOL_LLM_ENDPOINT" --model "$model") diff --git a/flows/release-smoke.sh b/flows/release-smoke.sh index 7621a778..2c3d34d2 100755 --- a/flows/release-smoke.sh +++ b/flows/release-smoke.sh @@ -152,7 +152,7 @@ release-smoke: OBOL_LLM_ENDPOINT must be set when RELEASE_SMOKE_INCLUDE_OBOL=tru Set, for example: export OBOL_LLM_ENDPOINT=http://127.0.0.1:8000/v1 - export OBOL_LLM_MODEL=qwen36-fast # or whatever the endpoint serves + export OBOL_LLM_MODEL=qwen36-deep # 27B-class default; or whatever the endpoint serves See .claude/skills/obol-stack-dev/references/qa-model-envs.md. EOF From 187d82062a0392c7c2c04fa5c6e1f8cc802cc8e5 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Fri, 15 May 2026 14:44:38 +0800 Subject: [PATCH 3/3] chore(gitleaks): allowlist shell-variable Bearer headers (false positive) The authorization-header-value rule fires on `-H "Authorization: Bearer $BOB_TOKEN"` because the broad `\S+` match treats the shell variable as a high-entropy literal. The actual token comes from $BOB_TOKEN at runtime (set elsewhere in the flow), not from the literal source text. Adds a narrowly scoped allowlist regex matching only the shell variable expansion form (`$VAR` / `${VAR}`). 
A genuinely hardcoded Bearer string like `Bearer abc123def456...` still trips the rule because the allowlist regex requires a literal `$`. Triggered on PR #496 (the agent_buy_with_retry helper inherited the existing flow-13/14 step 46 idiom; the original sites in flow-03/04 and buy-external are pre-existing on main and so never appeared in a PR diff scan). --- .gitleaks.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitleaks.toml b/.gitleaks.toml index 3ccd89bf..e6ab0021 100644 --- a/.gitleaks.toml +++ b/.gitleaks.toml @@ -44,6 +44,12 @@ regexes = [ '''test test test test test test test test test test test junk''', # USDC storage slot values (uint256 padded, not secrets) '''0x0{50,}[0-9a-fA-F]{1,14}''', + # Shell variable expansion in HTTP Auth headers — the actual secret + # comes from $BOB_TOKEN / $LITELLM_KEY / etc. at runtime, not from + # the literal source text. Matches `Authorization: Bearer $VAR` and + # `Authorization: Basic ${VAR}` forms only; a hardcoded literal still + # trips the rule because the allowlist regex requires a literal `$`. + '''Authorization:\s+(?:Basic|Bearer)\s+\$\{?[A-Za-z_][A-Za-z0-9_]*''', ] paths = [ # Gitleaks own config