diff --git a/apps/seat-guide-speaker-vercel/.gitignore b/apps/seat-guide-speaker-vercel/.gitignore new file mode 100644 index 0000000000..6096ed2886 --- /dev/null +++ b/apps/seat-guide-speaker-vercel/.gitignore @@ -0,0 +1,2 @@ +.vercel +node_modules diff --git a/apps/seat-guide-speaker-vercel/README.md b/apps/seat-guide-speaker-vercel/README.md new file mode 100644 index 0000000000..65821be143 --- /dev/null +++ b/apps/seat-guide-speaker-vercel/README.md @@ -0,0 +1,43 @@ +# SeatGuide Speaker Vercel App + +This app lets an iPhone mounted on the Go2 act as the SeatGuide speaker. + +Flow: + +1. iPhone opens the deployed page with cellular data. +2. The page polls `/api/latest?device=go2-demo`. +3. Mac/DimOS posts arrival text to `/api/speak`. +4. The iPhone speaks the latest message with the local browser speaker. + +This minimal Vercel version stores only the latest message in serverless memory. +It is enough for quick demos, but can lose messages on cold starts or instance +changes. + +## Deploy + +Create a Vercel project from this directory: + +```bash +cd apps/seat-guide-speaker-vercel +npx vercel +``` + +No Redis or database is required for the quick demo version. + +## iPhone + +Open: + +```text +https://<your-vercel-domain>/?device=go2-demo +``` + +Tap `Enable speaker`. Keep Safari open and unlocked. 
+
+## Mac Test
+
+```bash
+curl -X POST "https://<your-vercel-domain>/api/speak" \
+  -H "content-type: application/json" \
+  -d '{"device":"go2-demo","text":"我已经到了, 请坐。"}'
+```
diff --git a/apps/seat-guide-speaker-vercel/api/[...speaker].js b/apps/seat-guide-speaker-vercel/api/[...speaker].js
new file mode 100644
index 0000000000..8539e114ed
--- /dev/null
+++ b/apps/seat-guide-speaker-vercel/api/[...speaker].js
@@ -0,0 +1,110 @@
+// Latest message per device, kept in process memory. Stored on globalThis so
+// a re-evaluated module reuses the existing Map instead of starting empty.
+const messages = globalThis.__seatGuideSpeakerMessages || new Map();
+globalThis.__seatGuideSpeakerMessages = messages;
+
+/** Write `body` as a JSON response with the given HTTP status. */
+function json(res, status, body) {
+  res.statusCode = status;
+  res.setHeader("content-type", "application/json; charset=utf-8");
+  res.end(JSON.stringify(body));
+}
+
+/**
+ * Normalize a device id: default to "go2-demo", trim, replace every character
+ * outside [a-zA-Z0-9_-] with "-", and cap the length at 80.
+ */
+function sanitizeDevice(value) {
+  const device = String(value || "go2-demo")
+    .trim()
+    .replace(/[^a-zA-Z0-9_-]/g, "-")
+    .slice(0, 80);
+  return device || "go2-demo";
+}
+
+/** Build the error thrown for request bodies the client must fix (HTTP 400). */
+function badRequest(code) {
+  return Object.assign(new Error(code), { statusCode: 400 });
+}
+
+/**
+ * Read the whole request stream and parse it as a JSON object.
+ * An empty (or whitespace-only) body yields {}. Malformed JSON, or JSON that is
+ * not an object (null, array, number, string), throws a 400 "invalid_json" error.
+ */
+async function readBody(req) {
+  const chunks = [];
+  for await (const chunk of req) chunks.push(chunk);
+  const raw = Buffer.concat(chunks).toString("utf8");
+  if (!raw.trim()) return {};
+  let parsed;
+  try {
+    parsed = JSON.parse(raw);
+  } catch {
+    throw badRequest("invalid_json");
+  }
+  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
+    throw badRequest("invalid_json");
+  }
+  return parsed;
+}
+
+/**
+ * POST /api/speak: store `text` (trimmed, capped at 800 chars) as the latest
+ * message for `device`. Responds 405 for other methods, 400 for a missing text
+ * or an unparseable body, and 500 for unexpected failures.
+ */
+async function handleSpeak(req, res) {
+  if (req.method !== "POST") {
+    json(res, 405, { ok: false, error: "method_not_allowed" });
+    return;
+  }
+
+  try {
+    const body = await readBody(req);
+    const text = String(body.text || "").trim();
+    if (!text) {
+      json(res, 400, { ok: false, error: "missing_text" });
+      return;
+    }
+    const device = sanitizeDevice(body.device);
+    const message = {
+      id: `${Date.now()}-${Math.random().toString(16).slice(2)}`,
+      device,
+      text: text.slice(0, 800),
+      createdAt: new Date().toISOString(),
+    };
+    messages.set(device, message);
+    json(res, 200, { ok: true, storage: "memory", message });
+  } catch (error) {
+    // Client mistakes carry statusCode 400; anything else is a server fault.
+    const status = error?.statusCode === 400 ? 400 : 500;
+    json(res, status, { ok: false, error: String(error?.message || error) });
+  }
+}
+
+/** GET /api/latest?device=...: return the stored message for the device, or null. */
+function handleLatest(req, res) {
+  if (req.method !== "GET") {
+    json(res, 405, { ok: false, error: "method_not_allowed" });
+    return;
+  }
+  const url = new URL(req.url, `https://${req.headers.host || "localhost"}`);
+  const device = sanitizeDevice(url.searchParams.get("device"));
+  json(res, 200, { ok: true, device, message: messages.get(device) || null });
+}
+
+/** Vercel entry point: dispatch on the last path segment ("speak" or "latest"). */
+export default async function handler(req, res) {
+  const url = new URL(req.url, `https://${req.headers.host || "localhost"}`);
+  const route = url.pathname.split("/").filter(Boolean).at(-1);
+  if (route === "speak") {
+    await handleSpeak(req, res);
+    return;
+  }
+  if (route === "latest") {
+    handleLatest(req, res);
+    return;
+  }
+  json(res, 404, { ok: false, error: "not_found" });
+}
diff --git a/apps/seat-guide-speaker-vercel/package.json b/apps/seat-guide-speaker-vercel/package.json
new file mode 100644
index 0000000000..6a88b3acdc
--- /dev/null
+++ b/apps/seat-guide-speaker-vercel/package.json
@@ -0,0 +1,9 @@
+{
+  "name": "seat-guide-speaker-vercel",
+  "version": "0.1.0",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "lint": "node --check 'api/[...speaker].js'"
+  }
+}
diff --git a/apps/seat-guide-speaker-vercel/public/index.html
b/apps/seat-guide-speaker-vercel/public/index.html new file mode 100644 index 0000000000..08d906ab4c --- /dev/null +++ b/apps/seat-guide-speaker-vercel/public/index.html @@ -0,0 +1,238 @@ + + + + + + SeatGuide Speaker + + + +
+
+

SeatGuide Speaker

+
audio=locked
+
+
+ + + +
+
+
Messages
+
+
+
+ + + diff --git a/apps/seat-guide-speaker-vercel/vercel.json b/apps/seat-guide-speaker-vercel/vercel.json new file mode 100644 index 0000000000..e27ea66ef7 --- /dev/null +++ b/apps/seat-guide-speaker-vercel/vercel.json @@ -0,0 +1,13 @@ +{ + "headers": [ + { + "source": "/(.*)", + "headers": [ + { + "key": "Cache-Control", + "value": "no-store" + } + ] + } + ] +} diff --git a/bin/demo_seat_guide_hardware_acceptance b/bin/demo_seat_guide_hardware_acceptance new file mode 100755 index 0000000000..01a361bf5b --- /dev/null +++ b/bin/demo_seat_guide_hardware_acceptance @@ -0,0 +1,720 @@ +#!/usr/bin/env bash +set -euo pipefail + +run_dimos() { + if command -v dimos >/dev/null 2>&1; then + dimos "$@" + else + uv run dimos "$@" + fi +} + +log_dir="${SEAT_GUIDE_ACCEPTANCE_LOG_DIR:-logs/seat_guide_acceptance}" +mkdir -p "${log_dir}" +log_file="${log_dir}/$(date +%Y%m%d-%H%M%S).log" +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +acceptance_log_verifier="${script_dir}/demo_seat_guide_verify_acceptance_log" +web_input_url="http://localhost:5555" +active_stream_pid="" +active_stream_file="" + +cleanup_active_stream() { + if [[ -n "${active_stream_pid}" ]] && kill -0 "${active_stream_pid}" >/dev/null 2>&1; then + kill "${active_stream_pid}" >/dev/null 2>&1 || true + wait "${active_stream_pid}" >/dev/null 2>&1 || true + fi + if [[ -n "${active_stream_file}" ]]; then + rm -f "${active_stream_file}" + fi + active_stream_pid="" + active_stream_file="" +} + +trap cleanup_active_stream EXIT +trap 'cleanup_active_stream; exit 130' INT +trap 'cleanup_active_stream; exit 143' TERM + +log() { + printf '%s\n' "$*" | tee -a "${log_file}" +} + +run_and_log() { + log "+ dimos $*" + run_dimos "$@" 2>&1 | tee -a "${log_file}" +} + +run_output="" +run_capture() { + log "+ dimos $*" + if ! 
run_output="$(run_dimos "$@" 2>&1)"; then + printf '%s\n' "${run_output}" | tee -a "${log_file}" + return 1 + fi + printf '%s\n' "${run_output}" | tee -a "${log_file}" +} + +capture_dimos_log() { + local label="$1" + log "" + log "${label}" + log "+ dimos log -n 200" + run_dimos log -n 200 2>&1 | tee -a "${log_file}" || true +} + +verify_acceptance_log() { + log "" + log "Verifying hardware acceptance transcript..." + log "+ ${acceptance_log_verifier} ${log_file}" + local verifier_output + if ! verifier_output="$("${acceptance_log_verifier}" "${log_file}" 2>&1)"; then + printf '%s\n' "${verifier_output}" | tee -a "${log_file}" + log "Hardware acceptance no-go: transcript verifier failed." + log "Transcript saved to: ${log_file}" + exit 3 + fi + printf '%s\n' "${verifier_output}" | tee -a "${log_file}" +} + +extract_goal_sequence() { + sed -n 's/.*goal_sequence=\([0-9][0-9]*\).*/\1/p' <<<"$1" | tail -n 1 +} + +extract_web_input_url() { + sed -n 's/.*url=\(http[^; ]*\).*/\1/p' <<<"$1" | sed 's/\.$//' | tail -n 1 +} + +extract_run_id() { + sed -n 's/^ Run ID:[[:space:]]*//p' <<<"$1" | tail -n 1 +} + +seat_guide_goal_completed_after_sequence() { + local previous_goal_sequence="$1" + local nav_output="$2" + local current_goal_sequence + current_goal_sequence="$(extract_goal_sequence "${nav_output}")" + [[ -n "${current_goal_sequence}" ]] \ + && ((current_goal_sequence > previous_goal_sequence)) \ + && grep -Fq "goal_reached=true" <<<"${nav_output}" +} + +seat_guide_preflight_ready_for_hardware() { + local output="$1" + grep -Fq "SeatGuide preflight ready" <<<"${output}" \ + && grep -Fq "navigation=IDLE" <<<"${output}" +} + +web_input_ready_for_seat_guide() { + local output="$1" + grep -Fq "web=started" <<<"${output}" \ + && grep -Fq "thread=running" <<<"${output}" \ + && grep -Fq "seat_route=seat_guide_direct" <<<"${output}" \ + && grep -Fq "responses=connected" <<<"${output}" \ + && grep -Fq "voice_upload=connected" <<<"${output}" \ + && grep -Fq "stt=connected" 
<<<"${output}" \ + && grep -Fq "human_transport=connected" <<<"${output}" +} + +log_web_input_no_go_details() { + local output="$1" + if ! grep -Fq "web=started" <<<"${output}"; then + log " - WebInput server is not started. Check WebInput module startup before using browser text or microphone input." + fi + if ! grep -Fq "thread=running" <<<"${output}"; then + log " - WebInput server thread is not running. Inspect WebInput startup logs and port binding." + fi + if ! grep -Fq "seat_route=seat_guide_direct" <<<"${output}"; then + log " - WebInput is not directly wired to SeatGuide. Check blueprint injection of SeatGuideSkillContainer into WebInput." + fi + if ! grep -Fq "responses=connected" <<<"${output}"; then + log " - WebInput response stream is missing. Browser feedback cannot prove SeatGuide responses." + fi + if ! grep -Fq "voice_upload=connected" <<<"${output}"; then + log " - WebInput browser audio upload endpoint is not connected. Browser microphone recordings cannot reach the speech-to-text pipeline." + fi + if ! grep -Fq "stt=connected" <<<"${output}"; then + log " - WebInput speech-to-text pipeline is unavailable. Check Whisper dependencies, model loading, and audio pipeline initialization before browser microphone acceptance." + fi + if ! grep -Fq "human_transport=connected" <<<"${output}"; then + log " - WebInput human-input fallback transport is missing. Normal agent text fallback will not work." 
+ fi +} + +camera_provider_ready_for_hardware() { + local output="$1" + grep -Fq "credential=present" <<<"${output}" \ + && grep -Eq "image=[0-9]+x[0-9]+" <<<"${output}" \ + && grep -Fq "image_fresh=true" <<<"${output}" \ + && grep -Eq "camera_info=[0-9]+x[0-9]+" <<<"${output}" \ + && grep -Fq "camera_info_fresh=true" <<<"${output}" \ + && grep -Eq "lidar=[1-9][0-9]* points" <<<"${output}" \ + && grep -Fq "lidar_fresh=true" <<<"${output}" \ + && grep -Fq "odom=(" <<<"${output}" \ + && grep -Fq "odom_fresh=true" <<<"${output}" \ + && grep -Fq "override=inactive" <<<"${output}" \ + && grep -Fq "configured_fallback_seats=0" <<<"${output}" \ + && grep -Fq "configured_fallback_people=0" <<<"${output}" +} + +log_camera_provider_no_go_details() { + local output="$1" + if grep -Fq "credential=missing" <<<"${output}"; then + log " - VLM credential is missing. For qwen, export ALIBABA_API_KEY in the environment used to start the DimOS daemon, then restart the SeatGuide stack. For moondream, verify the local model cache." + fi + if grep -Fq "image=missing" <<<"${output}" || ! grep -Eq "image=[0-9]+x[0-9]+" <<<"${output}"; then + log " - Camera image is missing. Check the Go2 camera stream and confirm the robot is publishing color_image." + fi + if grep -Fq "image_fresh=false" <<<"${output}"; then + log " - Camera image is stale. Restart/fix the camera stream before using visual detections for a live goal." + fi + if grep -Fq "camera_info=missing" <<<"${output}" || ! grep -Eq "camera_info=[0-9]+x[0-9]+" <<<"${output}"; then + log " - Camera calibration is missing. Check GO2Connection.camera_info so 2D detections can be projected into 3D." + fi + if grep -Fq "camera_info_fresh=false" <<<"${output}"; then + log " - Camera calibration timestamps are stale. Restart/fix camera_info publishing before projecting detections into 3D." + fi + if grep -Fq "lidar=missing" <<<"${output}" || ! grep -Eq "lidar=[1-9][0-9]* points" <<<"${output}"; then + log " - LiDAR point cloud is missing. 
Check the Go2 L1 LiDAR stream; SeatGuide needs 3D points for real navigation targets." + fi + if grep -Fq "lidar_fresh=false" <<<"${output}"; then + log " - LiDAR point cloud is stale. Restart/fix the LiDAR stream before using live SeatGuide navigation." + fi + if grep -Fq "odom=missing" <<<"${output}"; then + log " - Odometry is missing. Check localization/odom before sending a map-frame navigation goal." + fi + if grep -Fq "odom_fresh=false" <<<"${output}"; then + log " - Odometry is stale. Restart/fix localization before sending a map-frame navigation goal." + fi + if grep -Fq "override=active" <<<"${output}"; then + log " - Runtime seat-scene override is active. Clear it before hardware acceptance so camera recognition is the source of truth." + fi + if ! grep -Fq "configured_fallback_seats=0" <<<"${output}" \ + || ! grep -Fq "configured_fallback_people=0" <<<"${output}"; then + log " - Configured fallback seats/people are non-zero. Use the SeatGuide hardware blueprint with camera-backed perception and no fallback layout for acceptance." + fi +} + +log_seat_guide_no_go_details() { + local output="$1" + if grep -Fq "navigation=FOLLOWING_PATH" <<<"${output}" \ + || grep -Fq "navigation=RECOVERY" <<<"${output}"; then + log " - Navigation is busy. Wait for the current goal to finish or cancel it, then rerun preflight before any live voice request." + fi + if grep -Fq "navigation=missing" <<<"${output}" \ + || grep -Fq "navigation=error(" <<<"${output}"; then + log " - Navigation is unavailable. Check the NavigationSkillContainer wiring and navigation logs before sending a SeatGuide goal." + fi + if grep -Fq "source=no_camera_image" <<<"${output}" \ + || grep -Fq "perception=no_camera_image" <<<"${output}"; then + log " - SeatGuide has no camera image. Check the Go2 camera stream and turn the robot toward the conference table." 
+ fi + if grep -Fq "source=camera_no_odom" <<<"${output}" \ + || grep -Fq "perception=camera_no_odom" <<<"${output}"; then + log " - SeatGuide has camera frames but no odometry. Fix localization/odom before sending a map-frame navigation goal." + fi + if grep -Fq "source=stale_camera_image" <<<"${output}" \ + || grep -Fq "perception=stale_camera_image" <<<"${output}"; then + log " - SeatGuide camera frames are stale. Restore the live camera stream before sending a SeatGuide goal." + fi + if grep -Fq "source=stale_camera_odom" <<<"${output}" \ + || grep -Fq "perception=stale_camera_odom" <<<"${output}"; then + log " - SeatGuide odometry is stale. Restore localization before sending a map-frame goal." + fi + if grep -Fq "source=camera_no_seats_detected" <<<"${output}" \ + || grep -Fq "perception=camera_no_seats_detected" <<<"${output}" \ + || grep -Fq "no seats" <<<"${output}"; then + log " - SeatGuide cannot see chairs. Reposition the robot toward the long table or inspect VLM chair detections before live navigation." + fi + if grep -Fq "source=camera_detection_error" <<<"${output}" \ + || grep -Fq "perception=camera_detection_error" <<<"${output}"; then + log " - SeatGuide camera detection failed. Check the configured VLM model, required credential or local model cache, and DimOS logs." + fi + if grep -Fq "is not live camera" <<<"${output}" \ + || grep -Fq "source=configured_fallback" <<<"${output}" \ + || grep -Fq "source=runtime_override" <<<"${output}"; then + log " - SeatGuide is using fallback/calibrated coordinates. That is useful for debugging but rejected for official hardware acceptance." + fi + if grep -Fq "no empty seat" <<<"${output}" \ + || grep -Fq "no empty seat available" <<<"${output}"; then + log " - SeatGuide sees seats but none are empty. Move a person/object away from a chair or adjust the occupancy radius/layout before live navigation." 
+ fi +} + +require_hardware_run_registry() { + local status_text="$1" + local run_id + run_id="$(extract_run_id "${status_text}")" + if [[ -z "${run_id}" ]]; then + log "Hardware acceptance no-go: could not parse DimOS run ID from status." + log "Transcript saved to: ${log_file}" + exit 3 + fi + + local state_home="${XDG_STATE_HOME:-${HOME}/.local/state}" + local registry_path="${state_home}/dimos/runs/${run_id}.json" + if [[ ! -f "${registry_path}" ]]; then + log "Hardware acceptance no-go: DimOS run registry entry not found: ${registry_path}" + log "Transcript saved to: ${log_file}" + exit 3 + fi + + log "Hardware run registry: ${registry_path}" + if grep -Fq '"--replay' "${registry_path}" \ + || grep -Eq '"replay"[[:space:]]*:[[:space:]]*true' "${registry_path}"; then + log "Hardware acceptance no-go: running DimOS stack is replay mode." + log "Transcript saved to: ${log_file}" + exit 3 + fi + if grep -Fq '"--simulation' "${registry_path}" \ + || grep -Eq '"simulation"[[:space:]]*:[[:space:]]*true' "${registry_path}" \ + || grep -Eq '"simulation"[[:space:]]*:[[:space:]]*"[^"]+"' "${registry_path}"; then + log "Hardware acceptance no-go: running DimOS stack is simulation mode." + log "Transcript saved to: ${log_file}" + exit 3 + fi + if ! grep -Eq '"blueprint"[[:space:]]*:[[:space:]]*"unitree-go2-seat-guide(-agentic)?"' "${registry_path}"; then + log "Hardware acceptance no-go: running DimOS stack is not a SeatGuide Go2 blueprint." + log "Transcript saved to: ${log_file}" + exit 3 + fi + local blueprint + blueprint="$(sed -n 's/.*"blueprint"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${registry_path}" | tail -n 1)" + log "Hardware blueprint: ${blueprint}" + log "Hardware run mode: hardware." 
+} + +wait_for_navigation_goal_reached() { + local previous_goal_sequence="$1" + local attempts="${SEAT_GUIDE_GOAL_REACHED_ATTEMPTS:-60}" + local interval_s="${SEAT_GUIDE_GOAL_REACHED_INTERVAL_S:-2}" + + for ((attempt = 1; attempt <= attempts; attempt++)); do + log "Checking SeatGuide navigation completion (${attempt}/${attempts})..." + run_capture mcp call seat_guide_navigation_status + local nav_output="${run_output}" + if seat_guide_goal_completed_after_sequence "${previous_goal_sequence}" "${nav_output}"; then + log "SeatGuide navigation goal reached." + return 0 + fi + sleep "${interval_s}" + done + + log "Hardware acceptance no-go: SeatGuide navigation did not report goal_reached=true." + capture_dimos_log "Capturing DimOS log snapshot after navigation completion timeout..." + log "Transcript saved to: ${log_file}" + exit 3 +} + +wait_for_stream_text() { + local stream_file="$1" + local expected="$2" + local timeout_s="$3" + local start_offset="${4:-0}" + local elapsed=0 + + while true; do + if dd if="${stream_file}" bs=1 skip="${start_offset}" 2>/dev/null | grep -Fq "${expected}"; then + return 0 + fi + if ((elapsed >= timeout_s)); then + return 1 + fi + sleep 1 + elapsed=$((elapsed + 1)) + done +} + +check_webinput_preview_route() { + if ! command -v curl >/dev/null 2>&1; then + log "Hardware acceptance no-go: curl is required to verify the WebInput HTTP route." 
+ log "Transcript saved to: ${log_file}" + exit 3 + fi + + local stream_file + stream_file="$(mktemp)" + active_stream_file="${stream_file}" + local stream_pid="" + stop_stream() { + if [[ -n "${stream_pid}" ]] && kill -0 "${stream_pid}" >/dev/null 2>&1; then + kill "${stream_pid}" >/dev/null 2>&1 || true + wait "${stream_pid}" >/dev/null 2>&1 || true + fi + active_stream_pid="" + } + + local response_timeout="${SEAT_GUIDE_WEBINPUT_TEXT_WAIT_S:-20}" + log "+ curl --no-buffer --max-time $((response_timeout + 5)) ${web_input_url}/text_stream/agent_responses" + curl --no-buffer --max-time "$((response_timeout + 5))" -s "${web_input_url}/text_stream/agent_responses" \ + >"${stream_file}" 2>&1 & + stream_pid="$!" + active_stream_pid="${stream_pid}" + sleep 0.5 + local stream_start_bytes + stream_start_bytes="$(wc -c <"${stream_file}" | tr -d ' ')" + + log "+ curl -X POST ${web_input_url}/submit_query --data-urlencode query=预检帮我找一个空位" + local post_output + if ! post_output="$(curl -sS -f -X POST \ + --data-urlencode "query=预检帮我找一个空位" \ + "${web_input_url}/submit_query" 2>&1)"; then + printf '%s\n' "${post_output}" | tee -a "${log_file}" + log "Hardware acceptance no-go: WebInput /submit_query request failed." + log "Transcript saved to: ${log_file}" + stop_stream + rm -f "${stream_file}" + active_stream_file="" + exit 3 + fi + printf '%s\n' "${post_output}" | tee -a "${log_file}" + + local stream_matched=0 + if wait_for_stream_text "${stream_file}" "SeatGuide preflight ready" "${response_timeout}" "${stream_start_bytes}"; then + stream_matched=1 + fi + stop_stream + + log "Captured WebInput agent_responses stream:" + cat "${stream_file}" | tee -a "${log_file}" || true + if [[ "${stream_matched}" != "1" ]]; then + log "Hardware acceptance no-go: WebInput text route did not publish a ready SeatGuide preview response." + capture_dimos_log "Capturing DimOS log snapshot after WebInput text route failure..." 
+ log "Transcript saved to: ${log_file}" + rm -f "${stream_file}" + active_stream_file="" + exit 3 + fi + rm -f "${stream_file}" + active_stream_file="" +} + +check_webinput_voice_preview_route() { + if ! command -v curl >/dev/null 2>&1; then + log "Hardware acceptance no-go: curl is required to verify the WebInput voice route." + log "Transcript saved to: ${log_file}" + exit 3 + fi + + local stream_file + stream_file="$(mktemp)" + active_stream_file="${stream_file}" + local stream_pid="" + stop_stream() { + if [[ -n "${stream_pid}" ]] && kill -0 "${stream_pid}" >/dev/null 2>&1; then + kill "${stream_pid}" >/dev/null 2>&1 || true + wait "${stream_pid}" >/dev/null 2>&1 || true + fi + active_stream_pid="" + } + + local response_timeout="${SEAT_GUIDE_WEBINPUT_VOICE_PREVIEW_WAIT_S:-120}" + log "+ curl --no-buffer --max-time $((response_timeout + 5)) ${web_input_url}/text_stream/agent_responses" + curl --no-buffer --max-time "$((response_timeout + 5))" -s "${web_input_url}/text_stream/agent_responses" \ + >"${stream_file}" 2>&1 & + stream_pid="$!" + active_stream_pid="${stream_pid}" + + cat </dev/null 2>&1; then + log "Hardware acceptance no-go: curl is required to verify the live WebInput voice route." + log "Transcript saved to: ${log_file}" + exit 3 + fi + + local stream_file + stream_file="$(mktemp)" + active_stream_file="${stream_file}" + local stream_pid="" + stop_stream() { + if [[ -n "${stream_pid}" ]] && kill -0 "${stream_pid}" >/dev/null 2>&1; then + kill "${stream_pid}" >/dev/null 2>&1 || true + wait "${stream_pid}" >/dev/null 2>&1 || true + fi + active_stream_pid="" + } + + local response_timeout="${SEAT_GUIDE_WEBINPUT_VOICE_LIVE_WAIT_S:-150}" + log "+ curl --no-buffer --max-time $((response_timeout + 5)) ${web_input_url}/text_stream/agent_responses" + curl --no-buffer --max-time "$((response_timeout + 5))" -s "${web_input_url}/text_stream/agent_responses" \ + >"${stream_file}" 2>&1 & + stream_pid="$!" 
+ active_stream_pid="${stream_pid}" + + cat <&2 + +No running DimOS stack found. +Start the real Go2 SeatGuide stack first, for example: + + dimos run unitree-go2-seat-guide-agentic --robot-ip 192.168.123.161 --daemon +EOF + log "Transcript saved to: ${log_file}" + exit 2 +fi +require_hardware_run_registry "${status_output}" + +log "" +log "Checking SeatGuide hardware acceptance tools..." +if ! tools="$(run_dimos mcp list-tools 2>&1)"; then + printf '%s\n' "${tools}" | tee -a "${log_file}" + log "Hardware acceptance no-go: MCP tools are unavailable." + log "Confirm the running blueprint is unitree-go2-seat-guide or unitree-go2-seat-guide-agentic and includes McpServer." + log "Transcript saved to: ${log_file}" + exit 3 +fi +printf '%s\n' "${tools}" >>"${log_file}" +require_tool "${tools}" "web_input_status" +require_tool "${tools}" "camera_seat_provider_status" +require_tool "${tools}" "seat_guide_status" +require_tool "${tools}" "seat_guide_readiness_report" +require_tool "${tools}" "seat_guide_preflight" +require_tool "${tools}" "seat_guide_navigation_status" +require_tool "${tools}" "preview_seat_request" +require_tool "${tools}" "preview_empty_seat_goal" +require_tool "${tools}" "handle_seat_request" + +log "" +log "Checking SeatGuide module wiring..." +run_capture mcp modules +modules_output="${run_output}" +require_output_contains "${modules_output}" "CameraSeatObservationProvider" "mcp modules" +require_output_contains "${modules_output}" "SeatGuideSkillContainer" "mcp modules" +require_output_contains "${modules_output}" "WebInput" "mcp modules" + +log "" +log "Checking WebInput route..." +run_capture mcp call web_input_status +web_input_output="${run_output}" +if ! web_input_ready_for_seat_guide "${web_input_output}"; then + log "Hardware acceptance no-go: web_input_status was not ready for SeatGuide." 
+ log_web_input_no_go_details "${web_input_output}" + log "Transcript saved to: ${log_file}" + exit 3 +fi +detected_web_input_url="$(extract_web_input_url "${web_input_output}")" +if [[ -n "${detected_web_input_url}" ]]; then + web_input_url="${detected_web_input_url}" +fi +log "Using WebInput URL: ${web_input_url}" + +log "" +log "Checking camera/VLM provider without running detection..." +run_capture mcp call camera_seat_provider_status +camera_output="${run_output}" +if ! camera_provider_ready_for_hardware "${camera_output}"; then + log "Hardware acceptance no-go: camera_seat_provider_status was not hardware ready." + log_camera_provider_no_go_details "${camera_output}" + log "Transcript saved to: ${log_file}" + exit 3 +fi + +log "" +log "Checking current SeatGuide scene..." +run_capture mcp call seat_guide_status +scene_output="${run_output}" +if ! grep -Eq "SeatGuide scene source=camera(_3d)?:" <<<"${scene_output}"; then + log "Hardware acceptance no-go: seat_guide_status did not report live camera perception." + log_seat_guide_no_go_details "${scene_output}" + log "Transcript saved to: ${log_file}" + exit 3 +fi + +log "" +log "Running no-motion readiness report..." +run_capture mcp call seat_guide_readiness_report +readiness_output="${run_output}" +if ! seat_guide_preflight_ready_for_hardware "${readiness_output}"; then + log "Hardware acceptance no-go: seat_guide_readiness_report was not hardware ready." + log_seat_guide_no_go_details "${readiness_output}" + log "Transcript saved to: ${log_file}" + exit 3 +fi + +log "" +log "Running no-motion preflight..." +run_capture mcp call seat_guide_preflight +preflight_output="${run_output}" +if ! seat_guide_preflight_ready_for_hardware "${preflight_output}"; then + log "Hardware acceptance no-go: seat_guide_preflight was not hardware ready." + log_seat_guide_no_go_details "${preflight_output}" + log "Transcript saved to: ${log_file}" + exit 3 +fi + +log "" +log "Previewing spoken Chinese request without moving..." 
+run_capture mcp call preview_seat_request --json-args '{"text": "预检帮我找一个空位"}' +preview_request_output="${run_output}" +if ! seat_guide_preflight_ready_for_hardware "${preview_request_output}"; then + log "Hardware acceptance no-go: preview_seat_request was not hardware ready." + log_seat_guide_no_go_details "${preview_request_output}" + log "Transcript saved to: ${log_file}" + exit 3 +fi + +log "" +log "Previewing selected goal without moving..." +run_capture mcp call preview_empty_seat_goal +preview_goal_output="${run_output}" +if ! grep -Eq "SeatGuide preview source=camera(_3d)?:" <<<"${preview_goal_output}"; then + log "Hardware acceptance no-go: preview_empty_seat_goal did not use live camera perception." + log_seat_guide_no_go_details "${preview_goal_output}" + log "Transcript saved to: ${log_file}" + exit 3 +fi +require_output_contains "${preview_goal_output}" "selected" "preview_empty_seat_goal" +require_output_contains "${preview_goal_output}" "goal=(" "preview_empty_seat_goal" + +log "" +log "Verifying WebInput HTTP text route without moving..." +check_webinput_preview_route + +log "" +log "Verifying browser microphone/Whisper route without moving..." +check_webinput_voice_preview_route + +capture_dimos_log "Capturing DimOS log snapshot after no-motion checks..." + +log "" +log "Capturing SeatGuide goal sequence before live voice request..." +run_capture mcp call seat_guide_navigation_status +before_live_goal_sequence="$(extract_goal_sequence "${run_output}")" +if [[ -z "${before_live_goal_sequence}" ]]; then + log "Hardware acceptance no-go: could not parse SeatGuide goal_sequence before live request." + capture_dimos_log "Capturing DimOS log snapshot after goal_sequence parse failure..." + log "Transcript saved to: ${log_file}" + exit 3 +fi + +cat <<'EOF' | tee -a "${log_file}" + +No-motion checks completed. + +Before live navigation: + - Confirm the Go2 is physically clear to move. 
+ - Automated gates passed for WebInput, browser voice, camera/VLM/odometry, speech, preflight, and goal preview. + +Type LIVE to start the real browser-microphone navigation request. Anything else aborts. +EOF + +read -r confirmation +log "Operator confirmation: ${confirmation}" +if [[ "${confirmation}" != "LIVE" ]]; then + log "Aborted before live navigation." + log "Transcript saved to: ${log_file}" + exit 0 +fi + +log "" +log "Sending live SeatGuide navigation request through browser microphone..." +check_webinput_voice_live_route +wait_for_navigation_goal_reached "${before_live_goal_sequence}" + +log "" +capture_dimos_log "Capturing DimOS log snapshot after live request..." +verify_acceptance_log + +log "" +log "Live request sent. Continue monitoring with: dimos log -f" +log "Transcript saved to: ${log_file}" diff --git a/bin/demo_seat_guide_hardware_bringup b/bin/demo_seat_guide_hardware_bringup new file mode 100755 index 0000000000..e4654416de --- /dev/null +++ b/bin/demo_seat_guide_hardware_bringup @@ -0,0 +1,136 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'EOF' +Usage: bin/demo_seat_guide_hardware_bringup [--robot-ip ] [--detection-model ] [--skip-start] [--skip-smoke] + +Start the real Go2 SeatGuide stack, run no-motion readiness checks, then run +the browser-microphone hardware acceptance flow. + +Options: + --robot-ip Go2 robot IP. Default: 192.168.123.161 + --detection-model + VLM detector. Default: moondream. Use qwen only when + ALIBABA_API_KEY is configured. + --skip-start Use the currently running DimOS stack instead of starting one. + --skip-smoke Skip the no-motion smoke wrapper and go straight to hardware acceptance. + -h, --help Show this help. + +Required environment: + ALIBABA_API_KEY Required only when --detection-model qwen. + +Optional environment: + OPENROUTER_API_KEY or OPENAI_API_KEY + Enables the normal LLM agent path. SeatGuide direct voice + routing and MCP tools still work without it. 
+EOF +} + +run_dimos() { + if command -v dimos >/dev/null 2>&1; then + dimos "$@" + else + uv run dimos "$@" + fi +} + +robot_ip="${SEAT_GUIDE_ROBOT_IP:-192.168.123.161}" +detection_model="${SEAT_GUIDE_DETECTION_MODEL:-moondream}" +skip_start=0 +skip_smoke=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --robot-ip) + if [[ $# -lt 2 ]]; then + echo "Missing value for --robot-ip." >&2 + usage >&2 + exit 2 + fi + robot_ip="$2" + shift 2 + ;; + --detection-model) + if [[ $# -lt 2 ]]; then + echo "Missing value for --detection-model." >&2 + usage >&2 + exit 2 + fi + detection_model="$2" + shift 2 + ;; + --skip-start) + skip_start=1 + shift + ;; + --skip-smoke) + skip_smoke=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage >&2 + exit 2 + ;; + esac +done + +missing=0 +if [[ "${detection_model}" == "qwen" && -z "${ALIBABA_API_KEY:-}" ]]; then + echo "SeatGuide bring-up no-go: ALIBABA_API_KEY is not set for detection_model=qwen." >&2 + missing=1 +fi +if [[ "${missing}" != "0" ]]; then + cat >&2 <<'EOF' +Set the required keys in the same terminal that will start DimOS, for example: + +If you choose --detection-model qwen, also set: + + export ALIBABA_API_KEY=... + +EOF + exit 2 +fi + +if [[ -z "${OPENROUTER_API_KEY:-}" && -z "${OPENAI_API_KEY:-}" ]]; then + echo "SeatGuide bring-up note: no LLM API key is set; normal agent chat will be disabled, but direct SeatGuide voice/MCP routing still works." >&2 +fi + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +if [[ "${skip_start}" != "1" ]]; then + echo "Starting real Go2 SeatGuide stack at robot IP ${robot_ip} with detection_model=${detection_model}..." + run_dimos --robot-ip "${robot_ip}" --detection-model "${detection_model}" run unitree-go2-seat-guide-agentic --daemon +else + echo "Using the currently running DimOS stack." 
+fi + +echo +echo "Current DimOS status:" +run_dimos status + +if [[ "${skip_smoke}" != "1" ]]; then + echo + echo "Running SeatGuide no-motion smoke checks..." + "${script_dir}/demo_seat_guide_smoke" +else + echo + echo "Skipping no-motion smoke checks." +fi + +cat <<'EOF' + +SeatGuide hardware acceptance will now verify the real browser microphone path. +You will need to: + 1. Open the WebInput URL printed by the script. + 2. Allow microphone access. + 3. Type LIVE only when the Go2 is physically clear to move. + 4. Say the prompted Chinese phrases into the browser microphone. +EOF + +"${script_dir}/demo_seat_guide_hardware_acceptance" diff --git a/bin/demo_seat_guide_replay_smoke b/bin/demo_seat_guide_replay_smoke new file mode 100755 index 0000000000..ed67192a98 --- /dev/null +++ b/bin/demo_seat_guide_replay_smoke @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -euo pipefail + +run_dimos() { + if command -v dimos >/dev/null 2>&1; then + dimos "$@" + else + uv run dimos "$@" + fi +} + +if [[ "$(uname -s)" == "Darwin" ]]; then + if ! netstat -nr | awk '/224\.0\.0(\.0)?\/4/ && /lo0/ { found = 1 } END { exit found ? 0 : 1 }'; then + cat >&2 <<'EOF' +SeatGuide replay smoke requires multicast route 224.0.0.0/4 on lo0. + +Run this once in an interactive terminal, then rerun this script: + + sudo route delete -net 224.0.0.0/4 || true + sudo route add -net 224.0.0.0/4 -interface lo0 +EOF + exit 2 + fi +fi + +cleanup() { + run_dimos stop >/dev/null 2>&1 || true +} +trap cleanup EXIT + +echo "Starting SeatGuide replay stack..." +run_dimos --replay run unitree-go2-seat-guide-agentic --daemon + +echo +echo "Running SeatGuide no-motion smoke against replay stack..." +"$(dirname "$0")/demo_seat_guide_smoke" + +echo +echo "SeatGuide replay smoke completed." 
diff --git a/bin/demo_seat_guide_smoke b/bin/demo_seat_guide_smoke new file mode 100755 index 0000000000..d2e99ad1f4 --- /dev/null +++ b/bin/demo_seat_guide_smoke @@ -0,0 +1,135 @@ +#!/usr/bin/env bash + +set -euo pipefail + +run_dimos() { + if command -v dimos >/dev/null 2>&1; then + dimos "$@" + else + uv run dimos "$@" + fi +} + +require_tool() { + local tools="$1" + local tool_name="$2" + if ! grep -q "\"${tool_name}\"" <<<"${tools}"; then + echo "SeatGuide smoke no-go: missing MCP tool '${tool_name}'." >&2 + echo "Confirm the running blueprint is unitree-go2-seat-guide or unitree-go2-seat-guide-agentic and includes SeatGuide, WebInput, and camera provider modules." >&2 + exit 3 + fi +} + +require_output_contains() { + local output="$1" + local expected="$2" + local label="$3" + if ! grep -Fq "${expected}" <<<"${output}"; then + echo "SeatGuide smoke no-go: ${label} did not contain '${expected}'." >&2 + return 1 + fi +} + +extract_run_id() { + sed -n 's/^ Run ID:[[:space:]]*//p' <<<"$1" | tail -n 1 +} + +require_seat_guide_run_registry() { + local status_text="$1" + local run_id + run_id="$(extract_run_id "${status_text}")" + if [[ -z "${run_id}" ]]; then + echo "SeatGuide smoke no-go: could not parse DimOS run ID from status." >&2 + exit 3 + fi + + local state_home="${XDG_STATE_HOME:-${HOME}/.local/state}" + local registry_path="${state_home}/dimos/runs/${run_id}.json" + if [[ ! -f "${registry_path}" ]]; then + echo "SeatGuide smoke no-go: DimOS run registry entry not found: ${registry_path}" >&2 + exit 3 + fi + + if ! grep -Eq '"blueprint"[[:space:]]*:[[:space:]]*"unitree-go2-seat-guide(-agentic)?"' "${registry_path}"; then + echo "SeatGuide smoke no-go: running DimOS stack is not a SeatGuide Go2 blueprint." >&2 + echo "Start unitree-go2-seat-guide-agentic for replay or real hardware acceptance." >&2 + exit 3 + fi +} + +echo "Checking DimOS run status..." 
+status_output="$(run_dimos status)" +printf '%s\n' "${status_output}" +if grep -q "No running DimOS instance" <<<"${status_output}"; then + cat >&2 <<'EOF' + +No running DimOS stack found. +Start a SeatGuide stack first, for example: + + dimos --replay run unitree-go2-seat-guide-agentic --daemon + +or, on real Go2 hardware: + + dimos run unitree-go2-seat-guide-agentic --robot-ip 192.168.123.161 --daemon +EOF + exit 2 +fi +require_seat_guide_run_registry "${status_output}" + +echo +echo "Checking SeatGuide MCP tools..." +if ! tools="$(run_dimos mcp list-tools 2>&1)"; then + printf '%s\n' "${tools}" >&2 + cat >&2 <<'EOF' +SeatGuide smoke no-go: MCP tools are unavailable. +Confirm the running blueprint is unitree-go2-seat-guide or unitree-go2-seat-guide-agentic and includes McpServer. +EOF + exit 3 +fi +require_tool "${tools}" "seat_guide_readiness_report" +require_tool "${tools}" "preview_seat_request" +require_tool "${tools}" "seat_guide_preflight" +require_tool "${tools}" "seat_guide_navigation_status" +require_tool "${tools}" "seat_guide_status" +require_tool "${tools}" "preview_empty_seat_goal" +require_tool "${tools}" "web_input_status" +require_tool "${tools}" "camera_seat_provider_status" + +echo +echo "Checking WebInput voice/text route status..." 
+web_input_output="$(run_dimos mcp call web_input_status)" +printf '%s\n' "${web_input_output}" +require_output_contains "${web_input_output}" "web=started" "web_input_status" +require_output_contains "${web_input_output}" "thread=running" "web_input_status" +require_output_contains "${web_input_output}" "seat_route=seat_guide_direct" "web_input_status" +require_output_contains "${web_input_output}" "responses=connected" "web_input_status" +require_output_contains "${web_input_output}" "voice_upload=connected" "web_input_status" +require_output_contains "${web_input_output}" "stt=connected" "web_input_status" +require_output_contains "${web_input_output}" "human_transport=connected" "web_input_status" + +echo +echo "Checking camera SeatGuide perception provider status..." +run_dimos mcp call camera_seat_provider_status + +echo +echo "Checking current SeatGuide scene..." +run_dimos mcp call seat_guide_status + +echo +echo "Running no-motion readiness report..." +run_dimos mcp call seat_guide_readiness_report + +echo +echo "Running no-motion voice-intent preview..." +run_dimos mcp call preview_seat_request --json-args '{"text": "预检帮我找一个空位"}' + +echo +echo "Previewing selected goal without moving..." +run_dimos mcp call preview_empty_seat_goal + +echo +echo "Checking navigation completion status reader..." +run_dimos mcp call seat_guide_navigation_status + +echo +echo "SeatGuide no-motion smoke completed." diff --git a/bin/demo_seat_guide_verify_acceptance_log b/bin/demo_seat_guide_verify_acceptance_log new file mode 100755 index 0000000000..9c3b575103 --- /dev/null +++ b/bin/demo_seat_guide_verify_acceptance_log @@ -0,0 +1,183 @@ +#!/usr/bin/env bash + +set -euo pipefail + +log_file="${1:-}" +if [[ -z "${log_file}" ]]; then + echo "Usage: $0 " >&2 + exit 2 +fi + +if [[ ! -f "${log_file}" ]]; then + echo "Acceptance log not found: ${log_file}" >&2 + exit 2 +fi + +require_log_contains() { + local expected="$1" + local label="$2" + if ! 
grep -Fq "${expected}" "${log_file}"; then + echo "Acceptance log missing ${label}: ${expected}" >&2 + exit 3 + fi +} + +require_log_matches() { + local expected_regex="$1" + local label="$2" + if ! grep -Eq "${expected_regex}" "${log_file}"; then + echo "Acceptance log missing ${label}: ${expected_regex}" >&2 + exit 3 + fi +} + +require_log_not_contains() { + local forbidden="$1" + local label="$2" + if grep -Fq "${forbidden}" "${log_file}"; then + echo "Acceptance log contains forbidden ${label}: ${forbidden}" >&2 + exit 3 + fi +} + +require_log_count_at_least() { + local expected="$1" + local minimum_count="$2" + local label="$3" + local count + count="$(grep -Fc "${expected}" "${log_file}" || true)" + if ((count < minimum_count)); then + echo "Acceptance log has only ${count}/${minimum_count} ${label}: ${expected}" >&2 + exit 3 + fi +} + +require_log_line_matches() { + local expected_regex="$1" + local label="$2" + if ! grep -Eq "${expected_regex}" "${log_file}"; then + echo "Acceptance log missing ${label}: ${expected_regex}" >&2 + exit 3 + fi +} + +require_log_line_count_at_least() { + local expected_regex="$1" + local minimum_count="$2" + local label="$3" + local count + count="$(grep -Ec "${expected_regex}" "${log_file}" || true)" + if ((count < minimum_count)); then + echo "Acceptance log has only ${count}/${minimum_count} ${label}: ${expected_regex}" >&2 + exit 3 + fi +} + +require_log_order() { + local before="$1" + local after="$2" + local label="$3" + local before_line + local after_line + before_line="$(grep -nF "${before}" "${log_file}" | head -n 1 | cut -d: -f1 || true)" + after_line="$(grep -nF "${after}" "${log_file}" | awk -F: -v before_line="${before_line}" '$1 > before_line { print $1; exit }' || true)" + if [[ -z "${before_line}" || -z "${after_line}" ]]; then + echo "Acceptance log has invalid ${label}: expected '${before}' before '${after}'" >&2 + exit 3 + fi +} + +require_log_order_after() { + local anchor="$1" + local before="$2" + local 
after="$3" + local label="$4" + local anchor_line + local before_line + local after_line + anchor_line="$(grep -nF "${anchor}" "${log_file}" | head -n 1 | cut -d: -f1 || true)" + before_line="$(grep -nF "${before}" "${log_file}" | awk -F: -v anchor_line="${anchor_line}" '$1 > anchor_line { print $1; exit }' || true)" + after_line="$(grep -nF "${after}" "${log_file}" | awk -F: -v before_line="${before_line}" '$1 > before_line { print $1; exit }' || true)" + if [[ -z "${anchor_line}" || -z "${before_line}" || -z "${after_line}" ]]; then + echo "Acceptance log has invalid ${label}: expected '${before}' before '${after}' after '${anchor}'" >&2 + exit 3 + fi +} + +require_log_contains "Hardware run registry:" "hardware run registry" +require_log_contains "Hardware run mode: hardware." "hardware run mode" +require_log_matches "Hardware blueprint: unitree-go2-seat-guide(-agentic)?" "SeatGuide hardware blueprint" +require_log_contains "web=started" "WebInput web server readiness" +require_log_contains "thread=running" "WebInput thread readiness" +require_log_contains "seat_route=seat_guide_direct" "WebInput direct SeatGuide route" +require_log_contains "responses=connected" "WebInput response stream" +require_log_contains "voice_upload=connected" "WebInput browser audio upload route" +require_log_contains "stt=connected" "WebInput speech-to-text pipeline" +require_log_contains "human_transport=connected" "WebInput human transport" +require_log_contains "Using WebInput URL: http" "resolved WebInput URL" +require_log_contains "CameraSeatObservationProvider" "camera perception module" +require_log_contains "SeatGuideSkillContainer" "SeatGuide planner/navigation module" +require_log_contains "WebInput" "voice command intake module" +require_log_matches "image=[0-9]+x[0-9]+" "camera image readiness" +require_log_contains "image_fresh=true" "fresh camera image readiness" +require_log_matches "camera_info=[0-9]+x[0-9]+" "camera calibration readiness" +require_log_contains 
"camera_info_fresh=true" "fresh camera calibration readiness" +require_log_matches "lidar=[1-9][0-9]* points" "LiDAR point cloud readiness" +require_log_contains "lidar_fresh=true" "fresh LiDAR readiness" +require_log_contains "credential=present" "VLM credential readiness" +require_log_contains "odom=(" "odometry readiness" +require_log_contains "odom_fresh=true" "fresh odometry readiness" +require_log_contains "override=inactive" "camera runtime override disabled" +require_log_contains "configured_fallback_seats=0" "camera fallback seats disabled" +require_log_contains "configured_fallback_people=0" "camera fallback people disabled" +require_log_matches "SeatGuide scene source=camera(_3d)?:" "live camera perception" +require_log_contains "SeatGuide preflight ready" "no-motion preflight" +require_log_matches "SeatGuide preview source=camera(_3d)?:" "camera-backed goal preview" +require_log_matches "empty=[0-9]+ occupied=[0-9]+" "SeatGuide occupancy counts" +require_log_contains "Captured WebInput agent_responses stream" "typed WebInput stream" +require_log_contains "Manual no-motion voice gate:" "browser microphone no-motion gate" +require_log_count_at_least "Press Enter here when ready." 
2 "browser microphone readiness prompts" +require_log_contains "Click the microphone button and say: 预检帮我找一个空位" "browser microphone no-motion spoken phrase" +require_log_contains "Captured WebInput voice agent_responses stream" "no-motion voice stream" +require_log_count_at_least "WebInput received text" 3 "WebInput recognized text events" +require_log_line_count_at_least "WebInput received text.*预检帮我找一个空位" 2 "recognized no-motion SeatGuide phrases" +require_log_line_matches 'WebInput received text.*(text=帮我找一个空位|"text"[[:space:]]*:[[:space:]]*"帮我找一个空位")' "recognized live SeatGuide phrase" +require_log_contains "WebInput routing text to SeatGuide preview" "no-motion WebInput SeatGuide route" +require_log_line_matches "WebInput routing text to SeatGuide preview.*预检帮我找一个空位" "no-motion WebInput SeatGuide phrase route" +require_log_contains "Capturing DimOS log snapshot after no-motion checks" "no-motion DimOS log snapshot" +require_log_contains "No-motion checks completed." "no-motion completion marker" +require_log_order "Captured WebInput agent_responses stream" "Manual no-motion voice gate:" "typed preview before no-motion voice gate order" +require_log_order "Manual no-motion voice gate:" "Press Enter here when ready." "no-motion voice gate before readiness prompt order" +require_log_order_after "Manual no-motion voice gate:" "Press Enter here when ready." "Click the microphone button and say: 预检帮我找一个空位" "no-motion readiness before speech order" +require_log_order "Manual no-motion voice gate:" "Captured WebInput voice agent_responses stream" "no-motion voice gate before voice stream order" +require_log_order "WebInput routing text to SeatGuide preview" "Capturing DimOS log snapshot after no-motion checks" "no-motion preview before log snapshot order" +require_log_order "Capturing DimOS log snapshot after no-motion checks" "No-motion checks completed." 
"no-motion snapshot before completion order" +require_log_contains "Operator confirmation: LIVE" "operator live confirmation" +require_log_order "No-motion checks completed." "Operator confirmation: LIVE" "no-motion before live order" +require_log_contains "Live voice navigation gate:" "browser microphone live gate" +require_log_contains "Say: 帮我找一个空位" "browser microphone live spoken phrase" +require_log_contains "Captured live WebInput voice agent_responses stream" "live voice stream" +require_log_contains "WebInput routing text to SeatGuide live request" "live WebInput SeatGuide route" +require_log_line_matches 'WebInput routing text to SeatGuide live request.*(text=帮我找一个空位|"text"[[:space:]]*:[[:space:]]*"帮我找一个空位")' "live WebInput SeatGuide phrase route" +require_log_contains "Navigating to" "live SeatGuide navigation start" +require_log_contains "goal_sequence=" "SeatGuide goal sequence" +require_log_contains "Checking SeatGuide navigation completion" "navigation completion polling" +require_log_contains "goal_reached=true" "navigation completion" +require_log_contains "SeatGuide navigation goal reached" "acceptance completion marker" +require_log_contains "Capturing DimOS log snapshot after live request" "live DimOS log snapshot" +require_log_order "Operator confirmation: LIVE" "Live voice navigation gate:" "LIVE before browser microphone live gate order" +require_log_order "Live voice navigation gate:" "Press Enter here when ready." "live voice gate before readiness prompt order" +require_log_order_after "Live voice navigation gate:" "Press Enter here when ready." 
"Say: 帮我找一个空位" "live readiness before speech order" +require_log_order "Live voice navigation gate:" "WebInput routing text to SeatGuide live request" "live voice gate before live route order" +require_log_order "WebInput routing text to SeatGuide live request" "Navigating to" "live route before navigation order" +require_log_order "Navigating to" "goal_reached=true" "navigation before completion order" +require_log_order "Checking SeatGuide navigation completion" "goal_reached=true" "polling before completion order" +require_log_order "goal_reached=true" "SeatGuide navigation goal reached" "goal reached before completion marker order" +require_log_not_contains "+ dimos mcp call handle_seat_request" "direct MCP live SeatGuide call" +require_log_not_contains "+ dimos mcp call set_seat_scene" "fallback seat scene calibration" +require_log_not_contains "+ dimos mcp call clear_seat_scene_override" "fallback seat scene override clearing" +require_log_not_contains "require_live_perception=false" "fallback live-perception bypass" +require_log_not_contains '"require_live_perception": false' "fallback live-perception bypass" +require_log_not_contains '"require_live_perception":false' "fallback live-perception bypass" + +echo "SeatGuide acceptance log contains all required evidence: ${log_file}" diff --git a/bin/demo_seat_guide_web_camera b/bin/demo_seat_guide_web_camera new file mode 100755 index 0000000000..29eab4428e --- /dev/null +++ b/bin/demo_seat_guide_web_camera @@ -0,0 +1,49 @@ +#!/usr/bin/env -S uv run python +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Run the browser-camera SeatGuide validation page without a robot.""" + +from __future__ import annotations + +import argparse +import webbrowser + +from dimos.web.robot_web_interface import RobotWebInterface + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Run SeatGuide browser-camera validation without connecting to Go2." + ) + parser.add_argument("--host", default="127.0.0.1") + parser.add_argument("--port", type=int, default=5555) + parser.add_argument("--no-open", action="store_true") + return parser.parse_args() + + +def main() -> int: + args = _parse_args() + url = f"http://{args.host}:{args.port}/seat-guide-camera" + server = RobotWebInterface(host=args.host, port=args.port) + print(f"SeatGuide browser-camera validation: {url}") + print("This mode does not connect to Go2. Press Ctrl-C to stop.") + if not args.no_open: + webbrowser.open(url) + server.run() + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/demo_seat_check_webcam.py b/demo_seat_check_webcam.py new file mode 100644 index 0000000000..ad3316768b --- /dev/null +++ b/demo_seat_check_webcam.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Standalone webcam test for the YOLO empty-seat logic (no robot needed). + +Mirrors SeatFinderSkill's detection: YOLO detects chairs/couches and people, +then a seat is "occupied" if a person box overlaps it past a threshold. + +Usage: + .venv/bin/python demo_seat_check_webcam.py [--camera 0] + +Overlay: + green box = empty seat + red box = occupied seat + blue box = person + gray box = other detected object +Keys: q / ESC to quit. +""" + +from __future__ import annotations + +import argparse + +import cv2 + +from dimos.msgs.sensor_msgs.Image import Image +from dimos.perception.detection.detectors.yolo import Yolo2DDetector + +SEAT_CLASSES = ("chair", "couch", "bench") +OCCUPANCY_OVERLAP = 0.2 + + +def is_occupied(seat, persons) -> bool: + sx1, sy1, sx2, sy2 = seat.bbox + seat_area = max(1.0, (sx2 - sx1) * (sy2 - sy1)) + for p in persons: + px1, py1, px2, py2 = p.bbox + iw = max(0.0, min(sx2, px2) - max(sx1, px1)) + ih = max(0.0, min(sy2, py2) - max(sy1, py1)) + if (iw * ih) / seat_area > OCCUPANCY_OVERLAP: + return True + return False + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--camera", type=int, default=0, help="webcam index") + args = parser.parse_args() + + print("Loading YOLO...") + detector = Yolo2DDetector() + + cap = cv2.VideoCapture(args.camera) + if not cap.isOpened(): + raise SystemExit(f"Could not open camera index {args.camera}") + print("Running. green=empty seat, red=occupied, blue=person. 
q/ESC to quit.") + + while True: + ok, frame = cap.read() + if not ok: + break + + detections = detector.process_image(Image.from_opencv(frame)).detections + persons = [d for d in detections if d.name == "person"] + seats = [d for d in detections if d.name in SEAT_CLASSES] + + empty = 0 + for d in detections: + x1, y1, x2, y2 = (int(v) for v in d.bbox) + if d.name in SEAT_CLASSES: + occupied = is_occupied(d, persons) + color = (0, 0, 255) if occupied else (0, 255, 0) + label = f"{d.name} {'OCCUPIED' if occupied else 'EMPTY'} {d.confidence:.2f}" + empty += 0 if occupied else 1 + elif d.name == "person": + color = (255, 0, 0) + label = f"person {d.confidence:.2f}" + else: + color = (150, 150, 150) + label = f"{d.name} {d.confidence:.2f}" + cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2) + cv2.putText( + frame, label, (x1, max(15, y1 - 6)), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2, + ) + + cv2.putText( + frame, f"seats={len(seats)} empty={empty} persons={len(persons)}", + (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 2, + ) + cv2.imshow("YOLO empty-seat check", frame) + if cv2.waitKey(1) & 0xFF in (ord("q"), 27): + break + + cap.release() + cv2.destroyAllWindows() + + +if __name__ == "__main__": + main() diff --git a/dimos/agents/mcp/mcp_adapter.py b/dimos/agents/mcp/mcp_adapter.py index 213bf71e23..489b1ca7ae 100644 --- a/dimos/agents/mcp/mcp_adapter.py +++ b/dimos/agents/mcp/mcp_adapter.py @@ -41,7 +41,7 @@ logger = setup_logger() -DEFAULT_TIMEOUT = 30 +DEFAULT_TIMEOUT = 120 class McpError(Exception): diff --git a/dimos/agents/mcp/mcp_client.py b/dimos/agents/mcp/mcp_client.py index 75b532e9cc..a22a71bb03 100644 --- a/dimos/agents/mcp/mcp_client.py +++ b/dimos/agents/mcp/mcp_client.py @@ -13,6 +13,7 @@ # limitations under the License. 
from collections.abc import Callable +import os from queue import Empty, Queue from threading import Event, RLock, Thread import time @@ -24,6 +25,7 @@ from langchain_core.messages import HumanMessage from langchain_core.messages.base import BaseMessage from langchain_core.tools import StructuredTool +from langchain_openai import ChatOpenAI from langgraph.graph.state import CompiledStateGraph from reactivex.disposable import Disposable @@ -40,6 +42,54 @@ logger = setup_logger() +OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1" + + +def _requires_openai_api_key(model: Any) -> bool: + if not isinstance(model, str): + return False + return model.startswith("gpt-") or model.startswith("openai:") + + +def _uses_openrouter(model: Any) -> bool: + if not isinstance(model, str): + return False + return model.startswith("openrouter:") + + +def _openrouter_model_name(model: str) -> str: + if model.startswith("openrouter:"): + return model.removeprefix("openrouter:") + configured_model = os.getenv("OPENROUTER_MODEL") + if configured_model: + return configured_model + if model.startswith("openai:"): + return f"openai/{model.removeprefix('openai:')}" + if model.startswith("gpt-"): + return f"openai/{model}" + return model + + +def _openrouter_headers() -> dict[str, str] | None: + headers: dict[str, str] = {} + if referer := os.getenv("OPENROUTER_HTTP_REFERER"): + headers["HTTP-Referer"] = referer + if title := os.getenv("OPENROUTER_APP_TITLE"): + headers["X-OpenRouter-Title"] = title + return headers or None + + +def _build_openrouter_model(model: str) -> ChatOpenAI | None: + api_key = os.getenv("OPENROUTER_API_KEY") + if not api_key: + return None + return ChatOpenAI( + model=_openrouter_model_name(model), + api_key=api_key, + base_url=os.getenv("OPENROUTER_BASE_URL", OPENROUTER_BASE_URL), + default_headers=_openrouter_headers(), + ) + class McpClientConfig(ModuleConfig): system_prompt: str | None = SYSTEM_PROMPT @@ -217,6 +267,27 @@ def on_system_modules(self, _modules: 
list[RPCClient]) -> None: from dimos.agents.testing import MockModel model = MockModel(json_path=self.config.model_fixture) + elif isinstance(model, str) and ( + _uses_openrouter(model) + or (_requires_openai_api_key(model) and os.getenv("OPENROUTER_API_KEY")) + ): + openrouter_model = _build_openrouter_model(model) + if openrouter_model is None: + logger.warning( + "McpClient agent disabled because OPENROUTER_API_KEY is not set", + model=model, + n_tools=len(tools), + ) + return + model = openrouter_model + elif _requires_openai_api_key(model) and not os.getenv("OPENAI_API_KEY"): + logger.warning( + "McpClient agent disabled because OPENAI_API_KEY is not set. " + "Set OPENROUTER_API_KEY to use OpenRouter instead.", + model=model, + n_tools=len(tools), + ) + return with self._lock: self._state_graph = create_agent( diff --git a/dimos/agents/mcp/mcp_server.py b/dimos/agents/mcp/mcp_server.py index dbd31f8d87..d0d626197f 100644 --- a/dimos/agents/mcp/mcp_server.py +++ b/dimos/agents/mcp/mcp_server.py @@ -19,7 +19,7 @@ import json import os import time -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, ClassVar from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware @@ -94,6 +94,13 @@ def _handle_tools_list(req_id: Any, skills: list[SkillInfo]) -> dict[str, Any]: return _jsonrpc_result(req_id, {"tools": tools}) +def _module_class_has_skills(module: RPCClient) -> bool: + return any( + callable(attr) and hasattr(attr, "__skill__") + for attr in (getattr(module.actor_class, name, None) for name in dir(module.actor_class)) + ) + + async def _handle_tools_call( req_id: Any, params: dict[str, Any], rpc_calls: dict[str, Any] ) -> dict[str, Any]: @@ -243,6 +250,8 @@ async def event_generator() -> AsyncGenerator[str, None]: class McpServer(Module): + dedicated_worker: ClassVar[bool] = True + _uvicorn_server: uvicorn.Server | None = None _serve_future: concurrent.futures.Future[None] | None = None _tool_stream_cleanup: 
Callable[[], None] | None = None @@ -279,15 +288,26 @@ def stop(self) -> None: def on_system_modules(self, modules: list[RPCClient]) -> None: # TODO: this is a bit hacky, also not thread-safe assert self.rpc is not None - app.state.skills = [ - skill_info for module in modules for skill_info in (module.get_skills() or []) - ] - app.state.rpc_calls = { - skill_info.func_name: RpcCall( - None, self.rpc, skill_info.func_name, skill_info.class_name, [] - ) - for skill_info in app.state.skills - } + skills: list[SkillInfo] = [] + rpc_calls: dict[str, Any] = {} + + for module in modules: + if module.remote_name == self.__class__.__name__: + module_skills = self.get_skills() + for skill_info in module_skills: + rpc_calls[skill_info.func_name] = getattr(self, skill_info.func_name) + else: + if not _module_class_has_skills(module): + continue + module_skills = module.get_skills() or [] + for skill_info in module_skills: + rpc_calls[skill_info.func_name] = RpcCall( + None, self.rpc, skill_info.func_name, skill_info.class_name, [] + ) + skills.extend(module_skills) + + app.state.skills = skills + app.state.rpc_calls = rpc_calls @skill def server_status(self) -> str: diff --git a/dimos/agents/mcp/test_mcp_client.py b/dimos/agents/mcp/test_mcp_client.py index b31637630e..4a1bdd367b 100644 --- a/dimos/agents/mcp/test_mcp_client.py +++ b/dimos/agents/mcp/test_mcp_client.py @@ -17,6 +17,13 @@ from langchain_core.messages import HumanMessage from dimos.agents.annotation import skill +from dimos.agents.mcp.mcp_client import ( + OPENROUTER_BASE_URL, + _build_openrouter_model, + _openrouter_model_name, + _requires_openai_api_key, + _uses_openrouter, +) from dimos.core.module import Module from dimos.msgs.sensor_msgs.Image import Image from dimos.utils.data import get_data @@ -197,3 +204,49 @@ def test_image(agent_setup): assert "cafe" in response assert "stadium" not in response assert "battleship" not in response + + +def test_requires_openai_api_key_for_gpt_models() -> None: + 
assert _requires_openai_api_key("gpt-4o") + assert _requires_openai_api_key("openai:gpt-4o") + assert not _requires_openai_api_key("ollama:llama3") + + +def test_openrouter_model_detection_and_name_mapping(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("OPENROUTER_MODEL", raising=False) + + assert _uses_openrouter("openrouter:anthropic/claude-sonnet-4.5") + assert not _uses_openrouter("gpt-4o") + assert _openrouter_model_name("openrouter:anthropic/claude-sonnet-4.5") == ( + "anthropic/claude-sonnet-4.5" + ) + assert _openrouter_model_name("gpt-4o") == "openai/gpt-4o" + assert _openrouter_model_name("openai:gpt-4o-mini") == "openai/gpt-4o-mini" + + monkeypatch.setenv("OPENROUTER_MODEL", "google/gemini-2.5-flash") + assert _openrouter_model_name("gpt-4o") == "google/gemini-2.5-flash" + + +def test_build_openrouter_model_uses_openrouter_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("OPENROUTER_API_KEY", "test-openrouter-key") + monkeypatch.setenv("OPENROUTER_MODEL", "openai/gpt-4o-mini") + monkeypatch.setenv("OPENROUTER_HTTP_REFERER", "https://example.com") + monkeypatch.setenv("OPENROUTER_APP_TITLE", "DimOS Test") + + model = _build_openrouter_model("gpt-4o") + + assert model is not None + assert model.model_name == "openai/gpt-4o-mini" + assert str(model.openai_api_base).rstrip("/") == OPENROUTER_BASE_URL + assert model.default_headers == { + "HTTP-Referer": "https://example.com", + "X-OpenRouter-Title": "DimOS Test", + } + + +def test_build_openrouter_model_returns_none_without_key( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("OPENROUTER_API_KEY", raising=False) + + assert _build_openrouter_model("gpt-4o") is None diff --git a/dimos/agents/mcp/test_mcp_server.py b/dimos/agents/mcp/test_mcp_server.py index fd514b0643..973f5d467a 100644 --- a/dimos/agents/mcp/test_mcp_server.py +++ b/dimos/agents/mcp/test_mcp_server.py @@ -18,7 +18,7 @@ import json from unittest.mock import MagicMock -from 
dimos.agents.mcp.mcp_server import handle_request +from dimos.agents.mcp.mcp_server import McpServer, app, handle_request from dimos.core.module import SkillInfo @@ -68,6 +68,51 @@ def test_mcp_module_request_flow() -> None: rpc_calls["add"].assert_called_once_with(x=2, y=3) +def test_mcp_server_registers_own_skills_without_self_rpc(monkeypatch) -> None: + class OtherSkills: + def other(self) -> str: + """Other skill.""" + return "ok" + + OtherSkills.other.__skill__ = True # type: ignore[attr-defined] + + schema = json.dumps({"type": "object", "properties": {}}) + server_skill = SkillInfo( + class_name="McpServer", + func_name="server_status", + args_schema=schema, + ) + remote_skill = SkillInfo( + class_name="OtherSkills", + func_name="other", + args_schema=schema, + ) + + server = McpServer.__new__(McpServer) + server.rpc = MagicMock() + monkeypatch.setattr(server, "get_skills", MagicMock(return_value=[server_skill])) + + server_proxy = MagicMock() + server_proxy.remote_name = "McpServer" + server_proxy.get_skills.side_effect = AssertionError("self get_skills must be local") + + remote_proxy = MagicMock() + remote_proxy.remote_name = "OtherSkills" + remote_proxy.actor_class = OtherSkills + remote_proxy.get_skills.return_value = [remote_skill] + + try: + server.on_system_modules([server_proxy, remote_proxy]) + + assert {skill.func_name for skill in app.state.skills} == {"server_status", "other"} + server_proxy.get_skills.assert_not_called() + assert app.state.rpc_calls["server_status"] == server.server_status + assert app.state.rpc_calls["other"]._remote_name == "OtherSkills" + finally: + app.state.skills = [] + app.state.rpc_calls = {} + + def test_mcp_module_injects_progress_token_as_mcp_context() -> None: """When the client sends `_meta.progressToken`, the RPC call receives it as an `_mcp_context` kwarg so the `@skill` wrapper can stash it in the diff --git a/dimos/agents/skills/navigation.py b/dimos/agents/skills/navigation.py index d88bec452e..d2c9cd04ab 
100644 --- a/dimos/agents/skills/navigation.py +++ b/dimos/agents/skills/navigation.py @@ -43,7 +43,7 @@ class NavigationSkillContainer(Module): _skill_started: bool = False _similarity_threshold: float = 0.23 - _spatial_memory: SpatialMemorySpec + _spatial_memory: SpatialMemorySpec | None = None _navigation: NavigationInterfaceSpec _object_tracking: ObjectTrackingSpec | None = None @@ -104,6 +104,9 @@ def tag_location(self, location_name: str) -> str: rotation=(rotation.x, rotation.y, rotation.z), ) + if self._spatial_memory is None: + return "Spatial memory is unavailable, cannot tag location." + if not self._spatial_memory.tag_location(location): return f"Error: Failed to store '{location_name}' in the spatial memory" @@ -144,6 +147,9 @@ def navigate_with_text(self, query: str) -> str: return f"No tagged location called '{query}'. No object in view matching '{query}'. No matching location found in semantic map for '{query}'." def _navigate_by_tagged_location(self, query: str) -> str | None: + if self._spatial_memory is None: + return None + robot_location = self._spatial_memory.query_tagged_location(query) if not robot_location: @@ -227,6 +233,9 @@ def _get_bbox_for_current_frame(self, query: str) -> BBox | None: return get_object_bbox_from_image(self._vl_model, self._latest_image, query) def _navigate_using_semantic_map(self, query: str) -> str: + if self._spatial_memory is None: + return "Spatial memory is unavailable." + results = self._spatial_memory.query_by_text(query) if not results: diff --git a/dimos/agents/skills/seat_finder.py b/dimos/agents/skills/seat_finder.py new file mode 100644 index 0000000000..f8cf00627a --- /dev/null +++ b/dimos/agents/skills/seat_finder.py @@ -0,0 +1,245 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from threading import RLock +import time +from typing import TYPE_CHECKING, Any + +import cv2 +from dimos_lcm.std_msgs import Bool # type: ignore[import-untyped] + +from dimos.agents.annotation import skill +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In, Out +from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.geometry_msgs.Twist import Twist +from dimos.msgs.geometry_msgs.Vector3 import Vector3 +from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo +from dimos.msgs.sensor_msgs.Image import Image, sharpness_window +from dimos.msgs.sensor_msgs.PointCloud2 import PointCloud2 +from dimos.perception.detection.detectors.yolo import Yolo2DDetector +from dimos.perception.detection.type.detection3d.pointcloud import Detection3DPC +from dimos.utils.logging_config import setup_logger +from dimos.utils.reactive import backpressure + +if TYPE_CHECKING: + from reactivex.abc import DisposableBase + + from dimos.perception.detection.type.detection2d.bbox import Detection2DBBox + from dimos.perception.detection.type.detection2d.imageDetections2D import ImageDetections2D + +logger = setup_logger() + +# COCO classes treated as seats, and the fraction of a seat's box that must be +# covered by a person box before we call it occupied. 
SEAT_CLASSES = ("chair", "couch", "bench")
OCCUPANCY_OVERLAP = 0.2

# In-place scan: rotate slowly while the continuous detector watches, until a
# target is found or we've turned roughly all the way around.
SCAN_YAW_RATE = 0.5  # slow, to keep motion blur low [rad/s]
# Publish cmd_vel at ~10 Hz; the Go2 stops between commands if they arrive too
# slowly, so a low rate means it never actually turns.
SCAN_TICK = 0.1  # seconds between cmd_vel publishes [s]
SCAN_DURATION = 14.0  # ~one full revolution at SCAN_YAW_RATE [s]
SCAN_LOG_EVERY = 2.0  # seconds between progress logs [s]


class Config(ModuleConfig):
    """Configuration for :class:`SeatFinderSkill`."""

    # Camera intrinsics handed to Detection3DPC.from_2d when projecting a 2D box.
    camera_info: CameraInfo
    # Sharpest-frame target frequency (Hz). The detector only runs on the
    # crispest frame in each window, which suppresses motion blur.
    detect_freq: float = 5.0


class SeatFinderSkill(Module):
    """Self-contained seat/object finder: scan in place, then navigate to one.

    Detection runs continuously on a sharpness-filtered, backpressured stream
    (motion-robust). When a skill is called the module first stops frontier
    exploration (so it cannot clobber our goal), rotates in place until a target
    is seen, projects it to a 3D pose via the pointcloud, and publishes that pose
    on ``goal_request`` for the A* planner. Owning the whole flow keeps a single
    navigation-goal source, avoiding the explorer/seat-goal conflict.
    """

    config: Config

    color_image: In[Image]
    pointcloud: In[PointCloud2]
    goal_request: Out[PoseStamped]
    detections_image: Out[Image]
    stop_explore_cmd: Out[Bool]
    cmd_vel: Out[Twist]

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self._detector = Yolo2DDetector()
        # Most recent detector output; written by _on_frame, read by the scan
        # loop, always under _lock.
        self._latest: ImageDetections2D | None = None
        self._lock = RLock()
        self._subscription: DisposableBase | None = None

    @rpc
    def start(self) -> None:
        """Subscribe the detector to the sharpness-filtered colour stream."""
        super().start()
        sharp = backpressure(
            sharpness_window(self.config.detect_freq, self.color_image.pure_observable())
        )
        self._subscription = sharp.subscribe(
            on_next=self._on_frame,
            on_error=lambda e: logger.exception("Error in seat detection loop", exc_info=e),
        )

    @rpc
    def stop(self) -> None:
        """Dispose the detection subscription and publish a zero velocity."""
        if self._subscription is not None:
            self._subscription.dispose()
            self._subscription = None
        self.cmd_vel.publish(Twist.zero())
        super().stop()

    def _on_frame(self, image: Image) -> None:
        """Run the detector on one frame, cache the result, publish the overlay."""
        detections = self._detector.process_image(image)
        with self._lock:
            self._latest = detections
        self.detections_image.publish(self._annotate(image, detections.detections))

    @skill
    def find_empty_seat(self) -> str:
        """Look around for an empty seat, chair, or sofa and navigate to it.

        Use this when asked to guide someone to a free seat. The robot turns in
        place to search, then heads to the seat. Do NOT also call exploration or
        other movement tools; this skill owns the search and the motion.
        """
        return self._scan_and_navigate(self._select_empty_seats, "empty seat")

    @skill
    def find_object(self, query: str) -> str:
        """Look around for an object named `query` and navigate next to it.

        Use this to locate and approach a specific item (e.g. "bottle",
        "backpack", "chair"). Matches the YOLO (COCO) class name. The robot turns
        in place to search; do NOT also call exploration or movement tools.
        """
        return self._scan_and_navigate(lambda dets: self._select_by_name(dets, query), query)

    def _scan_and_navigate(self, selector: Any, label: str) -> str:
        """Scan for a match, project the largest one to 3D and publish it as a goal.

        Args:
            selector: Callable taking the detection list of one frame and
                returning the detections that count as a match.
            label: Human-readable target name used in the returned messages.

        Returns:
            A sentence describing the outcome (goal published, nothing found,
            transform missing, or 3D projection failed).
        """
        # Single goal source: stop the frontier explorer so it can't overwrite
        # the goal we are about to publish.
        self.stop_explore_cmd.publish(Bool(data=True))

        detections, candidates = self._scan_in_place(selector)
        logger.info(
            f"SeatFinder: label={label!r} matched={len(candidates)} "
            f"names={[d.name for d in detections.detections] if detections else []}"
        )
        if detections is None or not candidates:
            return f"No {label} found after looking around."

        pointcloud = self.pointcloud.get_next()
        # Look the transform up at the timestamp of the frame that produced the match.
        transform = self.tf.get("camera_optical", pointcloud.frame_id, detections.image.ts, 5.0)
        if not transform:
            return f"Could not resolve the camera transform, cannot locate the {label}."

        # Prefer the match with the largest 2D box.
        best = max(candidates, key=lambda d: d.bbox_2d_volume())
        target3d = Detection3DPC.from_2d(
            best,
            world_pointcloud=pointcloud,
            camera_info=self.config.camera_info,
            world_to_optical_transform=transform,
        )
        if target3d is None:
            return f"Found a {label} but could not compute its 3D position."

        pose = target3d.pose
        self.goal_request.publish(pose)
        return (
            f"Found a {label} at ({pose.position.x:.2f}, {pose.position.y:.2f}). "
            "Navigating there now."
        )

    def _scan_in_place(self, selector: Any) -> tuple[ImageDetections2D | None, list[Detection2DBBox]]:
        """Rotate slowly in place until the continuous detector yields a match
        (or we've turned all the way around). Always stops the robot on exit.

        Returns:
            The detections snapshot that was inspected last (``None`` if the
            detector never produced one) and the matches found in it.
        """
        # Monotonic clock: a wall-clock adjustment mid-scan must not shorten or
        # extend the rotation.
        started = time.monotonic()
        deadline = started + SCAN_DURATION
        next_log = started
        yaw = Twist(
            linear=Vector3(0.0, 0.0, 0.0), angular=Vector3(0.0, 0.0, SCAN_YAW_RATE)
        )
        try:
            while True:
                with self._lock:
                    detections = self._latest
                candidates = selector(detections.detections) if detections is not None else []
                now = time.monotonic()
                if candidates or now >= deadline:
                    return detections, candidates
                if now >= next_log:
                    seen = [d.name for d in detections.detections] if detections else []
                    logger.info(f"SeatFinder scan: rotating, currently seeing {seen}")
                    next_log = now + SCAN_LOG_EVERY
                self.cmd_vel.publish(yaw)
                time.sleep(SCAN_TICK)
        finally:
            self.cmd_vel.publish(Twist.zero())

    def _select_empty_seats(self, detections: list[Detection2DBBox]) -> list[Detection2DBBox]:
        """Return the seat-class detections that no person box marks as occupied."""
        seats = [d for d in detections if d.name in SEAT_CLASSES]
        persons = [d for d in detections if d.name == "person"]
        return [s for s in seats if not self._is_occupied(s, persons)]

    def _select_by_name(
        self, detections: list[Detection2DBBox], query: str
    ) -> list[Detection2DBBox]:
        """Return detections whose class name contains, or is contained in, ``query``.

        Matching is case-insensitive and ignores surrounding whitespace. A blank
        query matches nothing: the empty string is a substring of every class
        name, so without this guard it would select every detection.
        """
        q = query.strip().lower()
        if not q:
            return []
        return [d for d in detections if d.name.lower() in q or q in d.name.lower()]

    def _is_occupied(self, seat: Detection2DBBox, persons: list[Detection2DBBox]) -> bool:
        """True when a single person box covers more than OCCUPANCY_OVERLAP of the seat box."""
        sx1, sy1, sx2, sy2 = seat.bbox
        # Floor the area at 1.0 so a degenerate box cannot divide by zero.
        seat_area = max(1.0, (sx2 - sx1) * (sy2 - sy1))
        for p in persons:
            px1, py1, px2, py2 = p.bbox
            iw = max(0.0, min(sx2, px2) - max(sx1, px1))
            ih = max(0.0, min(sy2, py2) - max(sy1, py1))
            if (iw * ih) / seat_area > OCCUPANCY_OVERLAP:
                return True
        return False

    def _annotate(self, image: Image, detections: list[Detection2DBBox]) -> Image:
        """Draw one labelled box per detection on a copy of ``image``.

        Seats are labelled occupied/EMPTY, people are labelled "person", and
        everything else is drawn in grey with its class name.
        """
        img = image.to_opencv().copy()
        persons = [d for d in detections if d.name == "person"]
        for d in detections:
            x1, y1, x2, y2 = (int(v) for v in d.bbox)
            if d.name in SEAT_CLASSES:
                occupied = self._is_occupied(d, persons)
                color = (0, 0, 255) if occupied else (0, 255, 0)
                text = f"{d.name} {'occupied' if occupied else 'EMPTY'}"
            elif d.name == "person":
                color = (255, 0, 0)
                text = "person"
            else:
                color = (150, 150, 150)
                text = d.name
            cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
            # Keep the label at least 15 px from the top so it stays readable.
            cv2.putText(
                img, text, (x1, max(15, y1 - 6)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2
            )
        return Image.from_opencv(img, ts=image.ts)


__all__ = ["SeatFinderSkill"]
+ +from __future__ import annotations + +from dataclasses import dataclass +import math +import os +from threading import RLock +import time +from typing import Protocol + +from pydantic import Field +from reactivex.disposable import Disposable + +from dimos.agents.annotation import skill +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In +from dimos.models.vl.base import VlModel +from dimos.models.vl.types import VlModelName +from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.geometry_msgs.Quaternion import Quaternion +from dimos.msgs.geometry_msgs.Transform import Transform +from dimos.msgs.geometry_msgs.Vector3 import Vector3 +from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo +from dimos.msgs.sensor_msgs.Image import Image +from dimos.msgs.sensor_msgs.PointCloud2 import PointCloud2 +from dimos.navigation.navigation_spec import NavigationInterfaceSpec +from dimos.perception.detection.detectors.base import Detector +from dimos.perception.detection.type.detection2d.bbox import Detection2DBBox +from dimos.perception.detection.type.detection2d.imageDetections2D import ImageDetections2D +from dimos.perception.detection.type.detection3d.pointcloud import Detection3DPC +from dimos.spec.utils import Spec +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +@dataclass(frozen=True) +class SeatObservation: + """A candidate chair pose in the room map frame.""" + + seat_id: str + x: float + y: float + yaw: float = 0.0 + + +@dataclass(frozen=True) +class PersonObservation: + """A detected person position in the room map frame.""" + + x: float + y: float + + +@dataclass(frozen=True) +class SeatGuideResult: + seat: SeatObservation + goal_x: float + goal_y: float + goal_yaw: float + guidance_summary: str + + +@dataclass(frozen=True) +class SeatSceneObservation: + seats: list[SeatObservation] + people: list[PersonObservation] + robot_x: float = 0.0 
class SeatGuidePlanner:
    """Selects an empty conference room seat and computes the robot guide pose."""

    def __init__(
        self,
        *,
        occupied_radius_m: float = 0.75,
        aisle_offset_m: float = 0.65,
    ) -> None:
        """Store the occupancy radius and the stand-off distance from the seat.

        Args:
            occupied_radius_m: A seat counts as occupied when a person is at
                most this far from it. Must be finite and positive.
            aisle_offset_m: Distance from the seat, along the seat's yaw
                direction, at which the robot goal is placed. Must be finite
                and non-negative.

        Raises:
            ValueError: If either value is non-finite or out of range.
        """
        # NaN slips through ordered comparisons (nan <= 0 is False) and would
        # make every distance test fail, marking occupied seats as empty.
        for name, value in (
            ("occupied_radius_m", occupied_radius_m),
            ("aisle_offset_m", aisle_offset_m),
        ):
            if not math.isfinite(value):
                raise ValueError(f"{name} must be a finite number.")
        if occupied_radius_m <= 0:
            raise ValueError("occupied_radius_m must be positive.")
        if aisle_offset_m < 0:
            raise ValueError("aisle_offset_m cannot be negative.")
        self.occupied_radius_m = occupied_radius_m
        self.aisle_offset_m = aisle_offset_m

    def find_empty_seat(
        self,
        seats: list[SeatObservation],
        people: list[PersonObservation],
        robot_x: float = 0.0,
        robot_y: float = 0.0,
    ) -> SeatGuideResult | None:
        """Pick the unoccupied seat closest to the robot.

        Returns:
            The selected seat with its guide pose and a spoken summary, or
            ``None`` when every seat is occupied (or ``seats`` is empty).
        """
        empty_seats = [seat for seat in seats if not self._is_occupied(seat, people)]
        if not empty_seats:
            return None

        # Ties keep the first seat in input order (min() is stable).
        selected = min(
            empty_seats,
            key=lambda seat: math.hypot(seat.x - robot_x, seat.y - robot_y),
        )
        goal_x, goal_y = self._guide_pose_for(selected)
        return SeatGuideResult(
            seat=selected,
            goal_x=goal_x,
            goal_y=goal_y,
            goal_yaw=selected.yaw,
            guidance_summary=(
                f"I found an empty seat {selected.seat_id}. "
                "Please follow me to the chair beside the table."
            ),
        )

    def _is_occupied(self, seat: SeatObservation, people: list[PersonObservation]) -> bool:
        """True when any person lies within ``occupied_radius_m`` of the seat."""
        return any(
            math.hypot(person.x - seat.x, person.y - seat.y) <= self.occupied_radius_m
            for person in people
        )

    def occupancy_counts(
        self, seats: list[SeatObservation], people: list[PersonObservation]
    ) -> tuple[int, int]:
        """Return ``(empty_count, occupied_count)`` for the given seats."""
        occupied = sum(1 for seat in seats if self._is_occupied(seat, people))
        return len(seats) - occupied, occupied

    def _guide_pose_for(self, seat: SeatObservation) -> tuple[float, float]:
        """Return the seat position shifted by ``aisle_offset_m`` along the seat yaw."""
        offset_x = math.cos(seat.yaw) * self.aisle_offset_m
        offset_y = math.sin(seat.yaw) * self.aisle_offset_m
        return seat.x + offset_x, seat.y + offset_y
+ A chair is considered occupied when a person is within 0.75 meters. + + Args: + seats: Flat chair pose list [x, y, yaw, x, y, yaw, ...]. + people: Flat person position list [x, y, x, y, ...]. + robot_x: Robot x position used to choose the nearest empty seat. + robot_y: Robot y position used to choose the nearest empty seat. + wait_for_arrival: When true, wait for navigation to finish before returning. + arrival_timeout_s: Maximum seconds to wait for arrival. + arrival_poll_s: Delay between navigation status checks. + arrival_message: Text response to return after arrival. + """ + seat_observations = _parse_seats(seats) + person_observations = _parse_people(people) + if not seat_observations: + message = ( + "I cannot see any seats yet. Please face the conference table or calibrate " + "the room layout." + ) + return message + + planner = SeatGuidePlanner() + result = planner.find_empty_seat( + seat_observations, + person_observations, + robot_x=robot_x, + robot_y=robot_y, + ) + if result is None: + message = "I could not find an empty seat in the conference room." + return message + + goal = PoseStamped( + frame_id="map", + position=Vector3(result.goal_x, result.goal_y, 0.0), + orientation=Quaternion.from_euler(Vector3(0.0, 0.0, result.goal_yaw)), + ) + navigation_text, navigation_ok = self._navigation_readiness_text() + if not navigation_ok: + message = ( + f"Found empty seat {result.seat.seat_id}, but navigation is not ready " + f"for a new goal: {navigation_text}." + ) + return message + + previous_goal_reached = self._navigation_goal_reached_or_false() + try: + goal_started = self._navigation.set_goal(goal) + except Exception as exc: + message = ( + f"Found empty seat {result.seat.seat_id}, but navigation raised an error: " + f"{exc}." + ) + return message + + if not goal_started: + message = f"Found empty seat {result.seat.seat_id}, but failed to start navigation." 
+ return message + + self._seat_guide_goal_sequence = getattr(self, "_seat_guide_goal_sequence", 0) + 1 + self._seat_guide_goal_reached_reset_required = previous_goal_reached + navigating_message = ( + f"{result.guidance_summary} Navigating to ({result.goal_x:.2f}, {result.goal_y:.2f})." + ) + if not wait_for_arrival: + return navigating_message + + arrival_result = self._wait_for_arrival( + timeout_s=arrival_timeout_s, + poll_s=arrival_poll_s, + ) + if arrival_result == "arrived": + return f"{navigating_message} {arrival_message}" + if arrival_result == "failed": + message = ( + f"{navigating_message} I found the empty seat, but navigation stopped " + "before reaching it." + ) + return message + + message = ( + f"{navigating_message} I found the empty seat, but navigation did not " + f"finish within {arrival_timeout_s:.0f} seconds." + ) + return message + + @skill + def find_empty_seat_from_scene( + self, + require_live_perception: bool = True, + wait_for_arrival: bool = False, + arrival_timeout_s: float = 60.0, + ) -> str: + """Find an empty chair using the configured conference room observation provider. + + Use this for the SeatGuide demo when perception or a synthetic room provider + is already running as a module. The provider returns chair poses, person + positions, and robot position in the map frame. + + Args: + require_live_perception: When true, only a camera-backed scene can + trigger navigation. Set false only for explicit fallback calibration. + wait_for_arrival: When true, wait for navigation to finish before returning. + arrival_timeout_s: Maximum seconds to wait for arrival. + """ + if self._seat_observation_provider is None: + message = "No seat observation provider is connected." 
@skill
def search_for_empty_seat_from_scene(
    self,
    search_timeout_s: float = 30.0,
    poll_interval_s: float = 0.5,
    require_live_perception: bool = True,
    wait_for_arrival: bool = False,
    arrival_timeout_s: float = 60.0,
) -> str:
    """Move around while scanning for a visible empty chair, then navigate to it.

    Use this when the current camera view has no chair but the Go2 should
    actively search the nearby area. The robot starts exploration, polls the
    SeatGuide camera scene, stops exploration when a live camera seat is
    visible, then sends the normal empty-seat navigation goal.

    Args:
        search_timeout_s: Maximum time to search before stopping exploration.
        poll_interval_s: Delay between camera scene checks during search.
        require_live_perception: When true, only camera-backed detections can
            trigger final navigation.
        wait_for_arrival: When true, wait for navigation to finish before returning.
        arrival_timeout_s: Maximum seconds to wait for arrival.
    """
    # The docstring above is kept verbatim on purpose (presumably it is the
    # tool description exposed by @skill -- confirm before rewording it).
    #
    # Order of strategies implemented below:
    #   1. a direct or relative mover is connected -> delegate to the in-place
    #      rotate scan (12 degrees of scan per second of search budget,
    #      clamped to 30..360 degrees);
    #   2. otherwise, if an explorer is connected -> start exploration and
    #      poll the scene until an empty seat is usable or the budget runs out.
    if self._seat_observation_provider is None:
        return "No seat observation provider is connected."

    if self._direct_mover is not None or self._relative_mover is not None:
        return self.scan_for_empty_seat_from_scene(
            max_turn_degrees=min(360.0, max(30.0, search_timeout_s * 12.0)),
            step_degrees=30.0,
            settle_s=max(0.5, poll_interval_s),
            require_live_perception=require_live_perception,
            wait_for_arrival=wait_for_arrival,
            arrival_timeout_s=arrival_timeout_s,
        )

    if self._explorer is None:
        return (
            "I cannot see any seats yet, and no exploration module is connected "
            "to search for one."
        )

    search_timeout_s = max(1.0, min(search_timeout_s, 120.0))
    poll_interval_s = max(0.2, min(poll_interval_s, 5.0))

    def can_navigate_from(scene: SeatSceneObservation) -> bool:
        # An empty seat only counts when its source satisfies the caller's
        # live-perception requirement.
        return _scene_has_empty_seat(scene) and (
            not require_live_perception or _is_live_camera_source(scene.source)
        )

    def navigate_now() -> str:
        # Single hand-off point so every path forwards the arrival options.
        # Previously the already-visible fast path dropped wait_for_arrival
        # and arrival_timeout_s and always returned without waiting.
        return self.find_empty_seat_from_scene(
            require_live_perception=require_live_perception,
            wait_for_arrival=wait_for_arrival,
            arrival_timeout_s=arrival_timeout_s,
        )

    initial_scene = self._seat_observation_provider.get_seat_scene()
    if can_navigate_from(initial_scene):
        return navigate_now()
    if (
        require_live_perception
        and initial_scene.source != "camera_no_seats_detected"
        and not _is_live_camera_source(initial_scene.source)
    ):
        # Neither a live camera scene nor the "camera sees no seats" marker:
        # searching cannot help, so report why live perception is required.
        return _describe_live_perception_required(initial_scene)

    self._explorer.begin_exploration()
    exploration_active = True
    # Monotonic clock so a wall-clock adjustment cannot cut the search short
    # or keep the explorer running past its budget.
    deadline = time.monotonic() + search_timeout_s
    try:
        while time.monotonic() < deadline:
            scene = self._seat_observation_provider.get_seat_scene()
            if can_navigate_from(scene):
                # Stop exploring before the seat goal is sent.
                self._explorer.end_exploration()
                exploration_active = False
                return navigate_now()
            time.sleep(poll_interval_s)
    finally:
        if exploration_active:
            self._explorer.end_exploration()

    return (
        "I searched but still cannot see an empty seat. Please reposition me "
        "or point the camera toward the conference table."
    )
+ ) + return message + + max_turn_degrees = max(30.0, min(float(max_turn_degrees), 720.0)) + step_degrees = max(10.0, min(abs(float(step_degrees)), 90.0)) + settle_s = max(0.1, min(float(settle_s), 5.0)) + turn_yaw_rate_rad_s = max(0.1, min(abs(float(turn_yaw_rate_rad_s)), 1.5)) + steps = max(1, math.ceil(max_turn_degrees / step_degrees)) + + scene = self._seat_observation_provider.get_seat_scene() + if _scene_has_empty_seat(scene) and ( + not require_live_perception or _is_live_camera_source(scene.source) + ): + return self.find_empty_seat_from_scene( + require_live_perception=require_live_perception, + wait_for_arrival=wait_for_arrival, + arrival_timeout_s=arrival_timeout_s, + ) + if ( + require_live_perception + and not _is_live_camera_source(scene.source) + and scene.source != "camera_no_seats_detected" + ): + message = _describe_live_perception_required(scene) + return message + + for _ in range(steps): + move_result = self._turn_in_place( + degrees=step_degrees, + yaw_rate_rad_s=turn_yaw_rate_rad_s, + ) + if "failed" in move_result.lower() or "cancelled" in move_result.lower(): + message = f"SeatGuide scan stopped because rotation failed: {move_result}." + return message + time.sleep(settle_s) + + scene = self._seat_observation_provider.get_seat_scene() + if _scene_has_empty_seat(scene) and ( + not require_live_perception or _is_live_camera_source(scene.source) + ): + return self.find_empty_seat_from_scene( + require_live_perception=require_live_perception, + wait_for_arrival=wait_for_arrival, + arrival_timeout_s=arrival_timeout_s, + ) + + message = ( + "I rotated in place but still cannot see an empty seat. Please reposition me " + "or point the camera toward the conference table." 
+ ) + return message + + def _turn_in_place(self, *, degrees: float, yaw_rate_rad_s: float) -> str: + if self._direct_mover is not None: + yaw_direction = 1.0 if degrees >= 0 else -1.0 + yaw = yaw_direction * yaw_rate_rad_s + duration = max(0.2, abs(math.radians(degrees)) / yaw_rate_rad_s) + return self._direct_mover.direct_move( + x=0.0, + y=0.0, + yaw=yaw, + duration=duration, + ) + if self._relative_mover is not None: + return self._relative_mover.relative_move( + forward=0.0, + left=0.0, + degrees=degrees, + ) + return "Rotation failed: no movement module is connected." + + @skill + def preview_empty_seat_goal(self) -> str: + """Preview the selected empty seat and navigation goal without moving. + + Use this during real Go2 bring-up after `seat_guide_status` and before + `handle_seat_request` to verify the selected chair and map-frame goal. + This never calls navigation. + """ + if self._seat_observation_provider is None: + return "No seat observation provider is connected." + + scene = self._seat_observation_provider.get_seat_scene() + return _describe_goal_preview(scene) + + @skill + def handle_seat_request( + self, + text: str, + require_live_perception: bool = True, + wait_for_arrival: bool = True, + arrival_timeout_s: float = 60.0, + ) -> str: + """Handle a spoken or typed request to find an empty conference room seat. + + This is the Go2-free voice intake boundary for the SeatGuide demo. Pass + speech-to-text output or typed text here. If the text asks for an empty + seat, this delegates to the configured scene provider and navigation. + + Args: + text: Transcribed or typed user request. + require_live_perception: When true, only a camera-backed scene can + trigger navigation. Set false only for explicit fallback calibration. + wait_for_arrival: When true, wait for navigation to finish before returning. + arrival_timeout_s: Maximum seconds to wait for arrival. 
@skill
def preview_seat_request(self, text: str) -> str:
    """Preview a spoken or typed SeatGuide request without moving.

    Use this to validate the real microphone or typed WebInput path during
    bring-up. If the text asks for an empty seat, this runs the same
    no-motion preflight used before live hardware navigation.

    Args:
        text: Transcribed or typed user request.
    """
    # Only a recognised seat request reaches the preflight; anything else is
    # answered with the fixed "did not hear" reply.
    if parse_seat_guide_intent(text).should_find_seat:
        return self.seat_guide_preflight()
    return "I did not hear a request to find an empty seat."
@skill
def seat_guide_status(self) -> str:
    """Describe the current SeatGuide scene provider state without navigating.

    Use this during bring-up to confirm whether SeatGuide can see chairs and
    people before asking the robot to guide a user to an empty seat.
    """
    provider = self._seat_observation_provider
    if provider is None:
        return "SeatGuide status: no seat observation provider is connected."
    # Read one scene snapshot and render it; navigation is never touched here.
    return _describe_scene(provider.get_seat_scene())
+ + scene = self._seat_observation_provider.get_seat_scene() + status = _describe_scene(scene) + preflight = self._describe_preflight( + scene, require_live_perception=require_live_perception + ) + preview = _describe_goal_preview(scene) + return f"SeatGuide readiness report: {status} | {preflight} | {preview}" + + @skill + def seat_guide_navigation_status(self) -> str: + """Report whether the current SeatGuide navigation goal has completed. + + Use this after a live SeatGuide request to verify the robot did more + than accept a goal. It reads the navigation interface state and + `is_goal_reached()` without sending or canceling any goal. + """ + goal_sequence = getattr(self, "_seat_guide_goal_sequence", 0) + if not hasattr(self, "_navigation") or self._navigation is None: + return ( + "SeatGuide navigation status: navigation=missing; " + f"goal_reached=unknown; goal_sequence={goal_sequence}." + ) + try: + state = self._navigation.get_state() + raw_goal_reached = self._navigation.is_goal_reached() + except Exception as exc: + return ( + f"SeatGuide navigation status: navigation=error({exc}); " + f"goal_reached=unknown; goal_sequence={goal_sequence}." + ) + + reset_suffix = "" + goal_reached = raw_goal_reached + if ( + goal_sequence > 0 + and getattr(self, "_seat_guide_goal_reached_reset_required", False) + ): + if raw_goal_reached: + goal_reached = False + reset_suffix = "; completion_reset=waiting_for_false" + else: + self._seat_guide_goal_reached_reset_required = False + + return ( + f"SeatGuide navigation status: navigation={state.name}; " + f"goal_reached={'true' if goal_reached else 'false'}; " + f"goal_sequence={goal_sequence}{reset_suffix}." 
+ ) + + def _navigation_goal_reached_or_false(self) -> bool: + try: + return self._navigation.is_goal_reached() + except Exception: + return False + + def _wait_for_arrival(self, *, timeout_s: float, poll_s: float) -> str: + timeout_s = max(1.0, min(float(timeout_s), 300.0)) + poll_s = max(0.1, min(float(poll_s), 5.0)) + start = time.time() + deadline = start + timeout_s + saw_non_idle = False + + while time.time() < deadline: + try: + state = self._navigation.get_state() + raw_goal_reached = self._navigation.is_goal_reached() + except Exception: + logger.warning("SeatGuide navigation arrival check failed", exc_info=True) + return "failed" + + goal_reached = raw_goal_reached + if getattr(self, "_seat_guide_goal_reached_reset_required", False): + if raw_goal_reached: + goal_reached = False + else: + self._seat_guide_goal_reached_reset_required = False + + if goal_reached: + return "arrived" + if state.name != "IDLE": + saw_non_idle = True + elif saw_non_idle or time.time() - start >= min(1.0, timeout_s): + return "failed" + + time.sleep(poll_s) + + return "timeout" + + def _navigation_readiness_text(self) -> tuple[str, bool]: + if not hasattr(self, "_navigation") or self._navigation is None: + return "navigation=missing", False + try: + state = self._navigation.get_state() + return f"navigation={state.name}", state.name == "IDLE" + except Exception as exc: + return f"navigation=error({exc})", False + + def _describe_preflight( + self, + scene: SeatSceneObservation, + *, + require_live_perception: bool, + ) -> str: + navigation_text, navigation_ok = self._navigation_readiness_text() + + if not scene.seats: + return ( + "SeatGuide preflight no-go: " + f"{navigation_text}; perception={scene.source} no seats; " + "feedback=phone_or_web." 
+ ) + if require_live_perception and not _is_live_camera_source(scene.source): + return ( + "SeatGuide preflight no-go: " + f"{navigation_text}; perception={scene.source} is not live camera; " + f"seats={len(scene.seats)} people={len(scene.people)}; " + "feedback=phone_or_web." + ) + + planner = SeatGuidePlanner() + empty_count, occupied_count = planner.occupancy_counts(scene.seats, scene.people) + result = planner.find_empty_seat( + scene.seats, + scene.people, + robot_x=scene.robot_x, + robot_y=scene.robot_y, + ) + if result is None: + return ( + "SeatGuide preflight no-go: " + f"{navigation_text}; perception={scene.source}; no empty seat; " + f"empty={empty_count} occupied={occupied_count}; " + "feedback=phone_or_web." + ) + + verdict = "ready" if navigation_ok else "no-go" + return ( + f"SeatGuide preflight {verdict}: {navigation_text}; " + f"perception={scene.source} seats={len(scene.seats)} people={len(scene.people)}; " + f"empty={empty_count} occupied={occupied_count}; " + f"selected={result.seat.seat_id}; " + f"goal=({result.goal_x:.2f}, {result.goal_y:.2f}, yaw={result.goal_yaw:.2f}); " + "feedback=phone_or_web." 
+ ) + + +class SyntheticSeatSceneConfig(ModuleConfig): + seats: list[float] = Field( + default_factory=lambda: [0.0, -1.0, 0.0, 1.5, -1.0, 0.0, 3.0, -1.0, 0.0] + ) + people: list[float] = Field(default_factory=lambda: [0.1, -1.0, 1.6, -1.0]) + robot_x: float = -1.0 + robot_y: float = -1.0 + + +class SyntheticSeatObservationProvider(Module): + """Go2-free conference room observation provider for tests and demos.""" + + config: SyntheticSeatSceneConfig + _scene_override: SeatSceneObservation | None = None + _scene_lock: RLock = RLock() + + @rpc + def start(self) -> None: + super().start() + + @rpc + def stop(self) -> None: + super().stop() + + @rpc + def get_seat_scene(self) -> SeatSceneObservation: + with self._scene_lock: + if self._scene_override is not None: + return self._scene_override + + return _scene_from_flat_config(self.config) + + @skill + def set_seat_scene( + self, + seats: list[float], + people: list[float], + robot_x: float = 0.0, + robot_y: float = 0.0, + ) -> str: + """Configure the synthetic conference room scene at runtime. + + Use this during Go2 bring-up to align the fallback scene with the real + chair layout before calling `handle_seat_request`. Chair poses are flat + [x, y, yaw] triples in the map frame. Person positions are flat [x, y] + pairs in the map frame. + + Args: + seats: Flat chair pose list [x, y, yaw, x, y, yaw, ...]. + people: Flat person position list [x, y, x, y, ...]. + robot_x: Robot x position in the map frame. + robot_y: Robot y position in the map frame. + """ + scene = SeatSceneObservation( + seats=_parse_seats(seats), + people=_parse_people(people), + robot_x=robot_x, + robot_y=robot_y, + source="runtime_override", + ) + with self._scene_lock: + self._scene_override = scene + + people_word = "person" if len(scene.people) == 1 else "people" + return f"Configured {len(scene.seats)} seats and {len(scene.people)} {people_word}." 
+ + @skill + def clear_seat_scene_override(self) -> str: + """Clear the runtime synthetic scene and return to configured defaults.""" + with self._scene_lock: + self._scene_override = None + return "Cleared synthetic seat scene override." + + +class CameraSeatSceneConfig(SyntheticSeatSceneConfig): + seats: list[float] = Field(default_factory=list) + people: list[float] = Field(default_factory=list) + detection_model: VlModelName | None = None + fast_detector_enabled: bool = True + fast_detector_model_name: str = "yolo11n.pt" + vlm_fallback_enabled: bool = False + chair_distance_m: float = 2.0 + lateral_span_m: float = 3.0 + max_input_age_s: float = 5.0 + + +class CameraSeatObservationProvider(Module): + """Camera-backed conference room observation provider for SeatGuide.""" + + config: CameraSeatSceneConfig + color_image: In[Image] + camera_info: In[CameraInfo] + lidar: In[PointCloud2] + odom: In[PoseStamped] + + _latest_image: Image | None = None + _latest_camera_info: CameraInfo | None = None + _latest_lidar: PointCloud2 | None = None + _latest_odom: PoseStamped | None = None + _vl_model: VlModel | None = None + _fast_detector: Detector | None = None + _scene_override: SeatSceneObservation | None = None + _scene_lock: RLock = RLock() + + @rpc + def start(self) -> None: + super().start() + self.register_disposable(Disposable(self.color_image.subscribe(self._on_color_image))) + self.register_disposable(Disposable(self.camera_info.subscribe(self._on_camera_info))) + self.register_disposable(Disposable(self.lidar.subscribe(self._on_lidar))) + self.register_disposable(Disposable(self.odom.subscribe(self._on_odom))) + + @rpc + def stop(self) -> None: + super().stop() + + def _on_color_image(self, image: Image) -> None: + with self._scene_lock: + self._latest_image = image + + def _on_camera_info(self, camera_info: CameraInfo) -> None: + with self._scene_lock: + self._latest_camera_info = camera_info + + def _on_lidar(self, lidar: PointCloud2) -> None: + with 
self._scene_lock: + self._latest_lidar = lidar + + def _on_odom(self, odom: PoseStamped) -> None: + with self._scene_lock: + self._latest_odom = odom + + @rpc + def get_seat_scene(self) -> SeatSceneObservation: + with self._scene_lock: + if self._scene_override is not None: + return self._scene_override + latest_image = self._latest_image + latest_odom = self._latest_odom + latest_camera_info = getattr(self, "_latest_camera_info", None) + latest_lidar = getattr(self, "_latest_lidar", None) + + if latest_image is None: + return _scene_from_flat_config(self.config, source="no_camera_image") + if latest_odom is None: + return _scene_from_flat_config(self.config, source="camera_no_odom") + if _message_age_s(latest_image.ts) > self.config.max_input_age_s: + return _scene_from_flat_config(self.config, source="stale_camera_image") + if _message_age_s(latest_odom.ts) > self.config.max_input_age_s: + return _scene_from_flat_config(self.config, source="stale_camera_odom") + + try: + detected_scene = self._detect_scene_from_image( + latest_image, + latest_odom, + camera_info=latest_camera_info, + lidar=latest_lidar, + ) + except Exception: + logger.warning( + "Failed to detect conference room seats from camera image", exc_info=True + ) + return _scene_from_flat_config(self.config, source="camera_detection_error") + + if not detected_scene.seats: + return _scene_from_flat_config(self.config, source="camera_no_seats_detected") + + return detected_scene + + @skill + def set_seat_scene( + self, + seats: list[float], + people: list[float], + robot_x: float = 0.0, + robot_y: float = 0.0, + ) -> str: + """Configure the fallback conference room scene at runtime. + + Use this during Go2 bring-up when visual detection is unavailable or + unreliable. Chair poses are flat [x, y, yaw] triples in the map frame. + Person positions are flat [x, y] pairs in the map frame. + + Args: + seats: Flat chair pose list [x, y, yaw, x, y, yaw, ...]. 
+ people: Flat person position list [x, y, x, y, ...]. + robot_x: Robot x position in the map frame. + robot_y: Robot y position in the map frame. + """ + scene = SeatSceneObservation( + seats=_parse_seats(seats), + people=_parse_people(people), + robot_x=robot_x, + robot_y=robot_y, + source="runtime_override", + ) + with self._scene_lock: + self._scene_override = scene + + people_word = "person" if len(scene.people) == 1 else "people" + return f"Configured {len(scene.seats)} seats and {len(scene.people)} {people_word}." + + @skill + def clear_seat_scene_override(self) -> str: + """Clear the runtime fallback scene and return to camera detection/defaults.""" + with self._scene_lock: + self._scene_override = None + return "Cleared camera seat scene override." + + @skill + def camera_seat_provider_status(self) -> str: + """Report camera-backed SeatGuide perception readiness without running detection. + + Use this during Go2 bring-up before `seat_guide_status` to check whether + camera frames and odometry are arriving, whether the VLM credential path + is configured, and whether a runtime fallback override is active. 
+ """ + with self._scene_lock: + override_active = self._scene_override is not None + latest_image = self._latest_image + latest_camera_info = getattr(self, "_latest_camera_info", None) + latest_lidar = getattr(self, "_latest_lidar", None) + latest_odom = self._latest_odom + image_text = ( + f"image={latest_image.width}x{latest_image.height}" + if latest_image is not None + else "image=missing" + ) + odom_text = ( + f"odom=({latest_odom.x:.2f}, {latest_odom.y:.2f}, " + f"yaw={latest_odom.yaw:.2f})" + if latest_odom is not None + else "odom=missing" + ) + camera_info_text = ( + f"camera_info={latest_camera_info.width}x{latest_camera_info.height}" + if latest_camera_info is not None + else "camera_info=missing" + ) + lidar_text = ( + f"lidar={len(latest_lidar)} points" if latest_lidar is not None else "lidar=missing" + ) + image_fresh_text = _freshness_text( + latest_image.ts if latest_image is not None else None, + self.config.max_input_age_s, + ) + camera_info_fresh_text = _freshness_text( + latest_camera_info.ts if latest_camera_info is not None else None, + self.config.max_input_age_s, + ) + lidar_fresh_text = _freshness_text( + latest_lidar.ts if latest_lidar is not None else None, + self.config.max_input_age_s, + ) + odom_fresh_text = _freshness_text( + latest_odom.ts if latest_odom is not None else None, + self.config.max_input_age_s, + ) + detection_model = self._detection_model() + credential_text = self._credential_status_for(detection_model) + detector_text = "fast_detector=yolo" if self.config.fast_detector_enabled else "fast_detector=off" + return ( + "CameraSeatObservationProvider status: " + f"{image_text}; image_fresh={image_fresh_text}; " + f"{camera_info_text}; camera_info_fresh={camera_info_fresh_text}; " + f"{lidar_text}; lidar_fresh={lidar_fresh_text}; " + f"{odom_text}; odom_fresh={odom_fresh_text}; " + f"{detector_text}; " + f"detection_model={detection_model}; " + f"{credential_text}; override={'active' if override_active else 'inactive'}; " + 
f"configured_fallback_seats={len(_parse_seats(self.config.seats))}; " + f"configured_fallback_people={len(_parse_people(self.config.people))}." + ) + + def _detect_scene_from_image( + self, + image: Image, + odom: PoseStamped | None = None, + *, + camera_info: CameraInfo | None = None, + lidar: PointCloud2 | None = None, + ) -> SeatSceneObservation: + chair_detections, person_detections = self._detect_chairs_and_people(image) + robot_x, robot_y, robot_yaw = self._robot_pose_for_detection(odom) + transform = None + if camera_info is not None and lidar is not None: + transform = self.tf.get("camera_optical", lidar.frame_id, image.ts, 1.0) + seats: list[SeatObservation] = [] + people: list[PersonObservation] = [] + used_3d = False + + for detection in chair_detections: + seat_id = f"seat_{len(seats) + 1}" + seat = None + if camera_info is not None and lidar is not None and transform is not None: + seat = _bbox_to_seat_observation_3d( + seat_id=seat_id, + detection=detection, + camera_info=camera_info, + lidar=lidar, + world_to_optical_transform=transform, + robot_x=robot_x, + robot_y=robot_y, + ) + if seat is not None: + used_3d = True + seats.append(seat) + continue + + seats.append( + _bbox_to_seat_observation( + seat_id=seat_id, + bbox=detection.bbox, + image_width=image.width, + robot_x=robot_x, + robot_y=robot_y, + robot_yaw=robot_yaw, + distance_m=self.config.chair_distance_m, + lateral_span_m=self.config.lateral_span_m, + ) + ) + for detection in person_detections: + person = None + if camera_info is not None and lidar is not None and transform is not None: + person = _bbox_to_person_observation_3d( + detection=detection, + camera_info=camera_info, + lidar=lidar, + world_to_optical_transform=transform, + ) + if person is not None: + used_3d = True + people.append(person) + continue + + people.append( + _bbox_to_person_observation( + bbox=detection.bbox, + image_width=image.width, + robot_x=robot_x, + robot_y=robot_y, + robot_yaw=robot_yaw, + 
distance_m=self.config.chair_distance_m, + lateral_span_m=self.config.lateral_span_m, + ) + ) + + return SeatSceneObservation( + seats=seats, + people=people, + robot_x=robot_x, + robot_y=robot_y, + source="camera_3d" if used_3d else "camera", + ) + + def _robot_pose_for_detection( + self, odom: PoseStamped | None = None + ) -> tuple[float, float, float]: + if odom is None: + return self.config.robot_x, self.config.robot_y, 0.0 + return odom.x, odom.y, odom.yaw + + def _get_vl_model(self) -> VlModel: + if self._vl_model is not None: + return self._vl_model + detection_model = self._detection_model() + if detection_model == "qwen" and not os.getenv("ALIBABA_API_KEY"): + raise ValueError( + "CameraSeatObservationProvider detection_model=qwen requires ALIBABA_API_KEY" + ) + + from dimos.models.vl.create import create + + self._vl_model = create(detection_model) + return self._vl_model + + def _detect_chairs_and_people( + self, image: Image + ) -> tuple[list[Detection2DBBox], list[Detection2DBBox]]: + if getattr(self, "_vl_model", None) is not None: + return self._detect_chairs_and_people_with_vlm(image) + + if self.config.fast_detector_enabled: + try: + detections = self._get_fast_detector().process_image(image) + chairs = _detections_named(detections, {"chair"}) + people = _detections_named(detections, {"person"}) + if chairs or people or not self.config.vlm_fallback_enabled: + return chairs, people + except Exception: + logger.warning("SeatGuide fast detector failed; falling back to VLM", exc_info=True) + if not self.config.vlm_fallback_enabled: + return [], [] + + return self._detect_chairs_and_people_with_vlm(image) + + def _detect_chairs_and_people_with_vlm( + self, image: Image + ) -> tuple[list[Detection2DBBox], list[Detection2DBBox]]: + vl_model = self._get_vl_model() + chair_detections = vl_model.query_detections(image, "chair").detections + person_detections = vl_model.query_detections(image, "person").detections + return list(chair_detections), 
list(person_detections) + + def _get_fast_detector(self) -> Detector: + if getattr(self, "_fast_detector", None) is not None: + return self._fast_detector + + from dimos.perception.detection.detectors.yolo import Yolo2DDetector + + self._fast_detector = Yolo2DDetector(model_name=self.config.fast_detector_model_name) + return self._fast_detector + + def _detection_model(self) -> VlModelName: + return self.config.detection_model or self.config.g.detection_model + + def _credential_status_for(self, detection_model: VlModelName) -> str: + if detection_model == "qwen" and not os.getenv("ALIBABA_API_KEY"): + return "credential=missing" + return "credential=present" + + +def _parse_seats(values: list[float]) -> list[SeatObservation]: + if len(values) % 3 != 0: + raise ValueError("seats must be a flat list of [x, y, yaw] triples.") + return [ + SeatObservation( + seat_id=f"seat_{index + 1}", + x=float(values[offset]), + y=float(values[offset + 1]), + yaw=float(values[offset + 2]), + ) + for index, offset in enumerate(range(0, len(values), 3)) + ] + + +def _parse_people(values: list[float]) -> list[PersonObservation]: + if len(values) % 2 != 0: + raise ValueError("people must be a flat list of [x, y] pairs.") + return [ + PersonObservation(x=float(values[offset]), y=float(values[offset + 1])) + for offset in range(0, len(values), 2) + ] + + +def _flatten_seats(seats: list[SeatObservation]) -> list[float]: + values: list[float] = [] + for seat in seats: + values.extend([seat.x, seat.y, seat.yaw]) + return values + + +def _flatten_people(people: list[PersonObservation]) -> list[float]: + values: list[float] = [] + for person in people: + values.extend([person.x, person.y]) + return values + + +def _detections_named( + detections: ImageDetections2D[Detection2DBBox], + names: set[str], +) -> list[Detection2DBBox]: + return [ + detection + for detection in detections.detections + if detection.name.strip().lower() in names + ] + + +def _scene_from_flat_config( + config: 
SyntheticSeatSceneConfig, + *, + source: str = "configured_fallback", +) -> SeatSceneObservation: + return SeatSceneObservation( + seats=_parse_seats(config.seats), + people=_parse_people(config.people), + robot_x=config.robot_x, + robot_y=config.robot_y, + source=source, + ) + + +def _describe_scene(scene: SeatSceneObservation) -> str: + if not scene.seats: + return ( + f"SeatGuide scene source={scene.source}: no seats visible or configured; " + f"{len(scene.people)} people detected." + ) + seats = ", ".join( + f"{seat.seat_id}=({seat.x:.2f}, {seat.y:.2f}, yaw={seat.yaw:.2f})" + for seat in scene.seats + ) + people = ", ".join(f"({person.x:.2f}, {person.y:.2f})" for person in scene.people) + people_text = people if people else "none" + return ( + f"SeatGuide scene source={scene.source}: {len(scene.seats)} seats [{seats}], " + f"{len(scene.people)} people [{people_text}], " + f"robot=({scene.robot_x:.2f}, {scene.robot_y:.2f})." + ) + + +def _describe_goal_preview(scene: SeatSceneObservation) -> str: + if not scene.seats: + return f"SeatGuide preview source={scene.source}: no seats visible or configured." + + planner = SeatGuidePlanner() + empty_count, occupied_count = planner.occupancy_counts(scene.seats, scene.people) + result = planner.find_empty_seat( + scene.seats, + scene.people, + robot_x=scene.robot_x, + robot_y=scene.robot_y, + ) + if result is None: + return ( + f"SeatGuide preview source={scene.source}: no empty seat available; " + f"empty={empty_count} occupied={occupied_count}." + ) + + return ( + f"SeatGuide preview source={scene.source}: selected {result.seat.seat_id} " + f"empty={empty_count} occupied={occupied_count} " + f"seat=({result.seat.x:.2f}, {result.seat.y:.2f}, yaw={result.seat.yaw:.2f}) " + f"goal=({result.goal_x:.2f}, {result.goal_y:.2f}, yaw={result.goal_yaw:.2f})." 
+ ) + + +def _scene_has_empty_seat(scene: SeatSceneObservation) -> bool: + if not scene.seats: + return False + planner = SeatGuidePlanner() + return ( + planner.find_empty_seat( + scene.seats, + scene.people, + robot_x=scene.robot_x, + robot_y=scene.robot_y, + ) + is not None + ) + + +def _describe_live_perception_required(scene: SeatSceneObservation) -> str: + advice_by_source = { + "no_camera_image": "check camera stream wiring and face the conference table", + "camera_no_odom": "check localization/odometry before sending a map-frame goal", + "stale_camera_image": "camera frames are stale; restore the live camera stream", + "stale_camera_odom": "odometry is stale; restore localization before sending a goal", + "camera_no_seats_detected": "turn the robot toward the chairs or adjust the detector", + "camera_detection_error": "check VLM/API key setup and logs", + "configured_fallback": "use require_live_perception=false only for explicit fallback calibration", + "runtime_override": "use require_live_perception=false only for explicit fallback calibration", + } + advice = advice_by_source.get(scene.source, "run seat_guide_status and inspect perception") + return ( + "SeatGuide requires live camera perception before navigation; " + f"source={scene.source}; seats={len(scene.seats)}; people={len(scene.people)}; " + f"robot=({scene.robot_x:.2f}, {scene.robot_y:.2f}); next={advice}." 
+ ) + + +def _is_live_camera_source(source: str) -> bool: + return source in {"camera", "camera_3d"} + + +def _message_age_s(ts: float) -> float: + return max(0.0, time.time() - ts) + + +def _freshness_text(ts: float | None, max_age_s: float) -> str: + if ts is None: + return "missing" + return "true" if _message_age_s(ts) <= max_age_s else "false" + + +def _bbox_center_x(bbox: tuple[float, float, float, float], image_width: int) -> float: + left = max(0.0, min(float(bbox[0]), float(bbox[2]), float(image_width))) + right = max(0.0, min(max(float(bbox[0]), float(bbox[2])), float(image_width))) + return (left + right) / 2.0 + + +def _bbox_to_lateral_offset( + bbox: tuple[float, float, float, float], image_width: int, lateral_span_m: float +) -> float: + if image_width <= 0: + return 0.0 + normalized_x = (_bbox_center_x(bbox, image_width) / image_width) - 0.5 + return normalized_x * lateral_span_m + + +def _bbox_to_seat_observation( + *, + seat_id: str, + bbox: tuple[float, float, float, float], + image_width: int, + robot_x: float, + robot_y: float, + robot_yaw: float, + distance_m: float, + lateral_span_m: float, +) -> SeatObservation: + x, y = _camera_relative_to_map( + forward_m=distance_m, + lateral_m=_bbox_to_lateral_offset(bbox, image_width, lateral_span_m), + robot_x=robot_x, + robot_y=robot_y, + robot_yaw=robot_yaw, + ) + return SeatObservation( + seat_id=seat_id, + x=x, + y=y, + yaw=robot_yaw, + ) + + +def _bbox_to_person_observation( + *, + bbox: tuple[float, float, float, float], + image_width: int, + robot_x: float, + robot_y: float, + robot_yaw: float, + distance_m: float, + lateral_span_m: float, +) -> PersonObservation: + x, y = _camera_relative_to_map( + forward_m=distance_m, + lateral_m=_bbox_to_lateral_offset(bbox, image_width, lateral_span_m), + robot_x=robot_x, + robot_y=robot_y, + robot_yaw=robot_yaw, + ) + return PersonObservation(x=x, y=y) + + +def _bbox_to_seat_observation_3d( + *, + seat_id: str, + detection: Detection2DBBox, + camera_info: 
CameraInfo, + lidar: PointCloud2, + world_to_optical_transform: Transform, + robot_x: float, + robot_y: float, +) -> SeatObservation | None: + x, y = _bbox_to_map_xy_3d( + detection=detection, + camera_info=camera_info, + lidar=lidar, + world_to_optical_transform=world_to_optical_transform, + ) + if x is None or y is None: + return None + return SeatObservation( + seat_id=seat_id, + x=x, + y=y, + yaw=math.atan2(robot_y - y, robot_x - x), + ) + + +def _bbox_to_person_observation_3d( + *, + detection: Detection2DBBox, + camera_info: CameraInfo, + lidar: PointCloud2, + world_to_optical_transform: Transform, +) -> PersonObservation | None: + x, y = _bbox_to_map_xy_3d( + detection=detection, + camera_info=camera_info, + lidar=lidar, + world_to_optical_transform=world_to_optical_transform, + ) + if x is None or y is None: + return None + return PersonObservation(x=x, y=y) + + +def _bbox_to_map_xy_3d( + *, + detection: Detection2DBBox, + camera_info: CameraInfo, + lidar: PointCloud2, + world_to_optical_transform: Transform, +) -> tuple[float | None, float | None]: + detection_3d = Detection3DPC.from_2d( + det=detection, + world_pointcloud=lidar, + camera_info=camera_info, + world_to_optical_transform=world_to_optical_transform, + filters=[], + ) + if detection_3d is None: + return None, None + points, _ = detection_3d.pointcloud.as_numpy() + if len(points) == 0: + return None, None + xy = _robust_detection_xy(points) + return float(xy[0]), float(xy[1]) + + +def _robust_detection_xy(points: object) -> tuple[float, float]: + import numpy as np + + point_array = np.asarray(points, dtype=float) + finite = point_array[np.isfinite(point_array).all(axis=1)] + if len(finite) == 0: + raise ValueError("detection pointcloud has no finite points") + if finite.shape[1] >= 3: + height_threshold = np.percentile(finite[:, 2], 20) + above_floor = finite[finite[:, 2] >= height_threshold] + if len(above_floor) > 0: + finite = above_floor + xy = np.median(finite[:, :2], axis=0) + return 
float(xy[0]), float(xy[1]) + + +def _camera_relative_to_map( + *, + forward_m: float, + lateral_m: float, + robot_x: float, + robot_y: float, + robot_yaw: float, +) -> tuple[float, float]: + return ( + robot_x + math.cos(robot_yaw) * forward_m - math.sin(robot_yaw) * lateral_m, + robot_y + math.sin(robot_yaw) * forward_m + math.cos(robot_yaw) * lateral_m, + ) + + +def parse_seat_guide_intent(text: str) -> SeatGuideIntent: + normalized = " ".join(text.strip().lower().split()) + if not normalized: + return SeatGuideIntent(should_find_seat=False, normalized_text="") + + english_seat_words = ("seat", "chair", "place to sit", "empty place") + english_find_words = ("find", "look for", "take me", "guide me", "show me") + chinese_seat_words = ("座位", "椅子", "空位", "位置") + chinese_find_words = ("找", "带我", "帮我", "引导", "去") + + should_find_seat = ( + any(word in normalized for word in english_seat_words) + and any(word in normalized for word in english_find_words) + ) or ( + any(word in normalized for word in chinese_seat_words) + and any(word in normalized for word in chinese_find_words) + ) + return SeatGuideIntent(should_find_seat=should_find_seat, normalized_text=normalized) + + +def is_seat_guide_preview_request(text: str) -> bool: + normalized = text.casefold() + preview_words = ( + "preview", + "preflight", + "dry run", + "test", + "check", + "预检", + "测试", + "先看", + "检查", + "不要动", + "别动", + ) + return parse_seat_guide_intent(text).should_find_seat and any( + word in normalized for word in preview_words + ) diff --git a/dimos/agents/skills/seat_planner.py b/dimos/agents/skills/seat_planner.py new file mode 100644 index 0000000000..a6ce6f164f --- /dev/null +++ b/dimos/agents/skills/seat_planner.py @@ -0,0 +1,239 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""On-demand YOLO empty-seat picker for the manual-map → auto-find demo. + +Flow: + 1. Operator drives the Go2 manually (Rerun click-to-goal or teleop) to build + the voxel map and bring the seats into view. + 2. Operator runs `dimos mcp call find_empty_seat_now` from another terminal. + 3. This skill grabs the latest YOLO detections, picks an unoccupied seat, + projects it to a 3D pose via the world voxel cloud, and publishes the pose + to `goal_request`. A* draws the path; if MovementManager is enabled the + robot walks there. + +YOLO runs continuously so the annotated image is always available in the +viewer. The skill itself never publishes cmd_vel and never rotates. 
+""" + +from __future__ import annotations + +import math +import threading +from threading import RLock +from typing import TYPE_CHECKING, Any + +import cv2 +from reactivex.disposable import Disposable + +from dimos.agents.annotation import skill +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In, Out +from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.geometry_msgs.Quaternion import Quaternion +from dimos.msgs.geometry_msgs.Vector3 import Vector3, make_vector3 +from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo +from dimos.msgs.sensor_msgs.Image import Image, sharpness_window +from dimos.msgs.sensor_msgs.PointCloud2 import PointCloud2 +from dimos.perception.detection.detectors.yolo import Yolo2DDetector +from dimos.perception.detection.type.detection3d.pointcloud import Detection3DPC +from dimos.utils.logging_config import setup_logger +from dimos.utils.reactive import backpressure + +if TYPE_CHECKING: + from reactivex.abc import DisposableBase + + from dimos.perception.detection.type.detection2d.bbox import Detection2DBBox + from dimos.perception.detection.type.detection2d.imageDetections2D import ImageDetections2D + +logger = setup_logger() + +SEAT_CLASSES = ("chair", "couch", "bench") +OCCUPANCY_OVERLAP = 0.2 + +# Spoken / typed phrases that trigger find_empty_seat_now via /human_input. +# Matched case-insensitively as substrings, so partial Whisper transcriptions +# still fire (e.g. "椅子まで行って" matches "椅子"). 
SEAT_TRIGGER_KEYWORDS = ("椅子", "空席", "席まで", "chair", "seat", "vacant")


class Config(ModuleConfig):
    """Settings for SeatPlanner."""

    # Camera intrinsics passed to Detection3DPC.from_2d when a seat bbox is
    # projected into the point cloud.
    camera_info: CameraInfo
    detect_freq: float = 5.0  # YOLO inference rate on sharpest frame [Hz]


class SeatPlanner(Module):
    """Find an empty seat in the camera view and publish a navigation goal for it.

    Frames from `color_image` are run through a 2D YOLO detector; the most
    recent detections are cached and an annotated copy of the frame is
    published on `detections_image`. A goal is published on `goal_request`
    either when a skill is invoked or when text arriving on `human_input`
    contains one of SEAT_TRIGGER_KEYWORDS.
    """

    config: Config

    color_image: In[Image]
    pointcloud: In[PointCloud2]
    human_input: In[str]
    goal_request: Out[PoseStamped]
    detections_image: Out[Image]

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self._detector = Yolo2DDetector()
        # Latest detector output; written by _on_frame, read by find_empty_seat_now.
        self._latest: ImageDetections2D | None = None
        self._lock = RLock()  # guards self._latest
        self._subscription: DisposableBase | None = None
        # Held while a voice-triggered search is running so triggers do not pile up.
        self._voice_busy = threading.Lock()

    @rpc
    def start(self) -> None:
        super().start()
        # Detect on frames chosen by sharpness_window at config.detect_freq.
        sharp = backpressure(
            sharpness_window(self.config.detect_freq, self.color_image.pure_observable())
        )
        self._subscription = sharp.subscribe(
            on_next=self._on_frame,
            on_error=lambda e: logger.exception("SeatPlanner detection error", exc_info=e),
        )
        self.register_disposable(Disposable(self.human_input.subscribe(self._on_human_input)))

    @rpc
    def stop(self) -> None:
        if self._subscription is not None:
            self._subscription.dispose()
            self._subscription = None
        super().stop()

    def _on_human_input(self, text: str) -> None:
        """Start a background seat search when `text` contains a trigger keyword.

        Matching is a case-insensitive substring test against
        SEAT_TRIGGER_KEYWORDS. Only one voice-triggered search runs at a time;
        triggers that arrive while one is running are logged and dropped.
        """
        lower = text.lower()
        if not any(kw.lower() in lower for kw in SEAT_TRIGGER_KEYWORDS):
            return
        if not self._voice_busy.acquire(blocking=False):
            logger.info(f"SeatPlanner: voice trigger ignored, already running ({text!r})")
            return
        logger.info(f"SeatPlanner: voice trigger fired by {text!r}")
        try:
            threading.Thread(
                target=self._voice_worker, daemon=True, name="SeatPlannerVoice"
            ).start()
        except BaseException:
            # The worker never ran, so its `finally` cannot release the guard.
            # Free it here, otherwise every later trigger would be ignored.
            self._voice_busy.release()
            raise

    def _voice_worker(self) -> None:
        """Thread body for a voice trigger: run the search, then free the guard."""
        try:
            result = self.find_empty_seat_now()
            logger.info(f"SeatPlanner: voice trigger result: {result}")
        except Exception:
            # Record the failure in the module log instead of leaving it to the
            # default thread excepthook.
            logger.exception("SeatPlanner: voice trigger failed")
        finally:
            self._voice_busy.release()

    def _on_frame(self, image: Image) -> None:
        """Run detection on one frame, cache the result and publish the overlay."""
        detections = self._detector.process_image(image)
        with self._lock:
            self._latest = detections
        self.detections_image.publish(self._annotate(image, detections.detections))

    @skill
    def navigate_to_point(self, x: float, y: float, yaw_deg: float = 0.0) -> str:
        """Publish a world-frame goal pose so the A* planner drives the robot there.

        Args:
            x: target X in the `map` frame [m]
            y: target Y in the `map` frame [m]
            yaw_deg: final heading in degrees (0 = +X), default 0
        """
        # Coerce once so the pose and the status message use the same numbers.
        # Formatting the raw arguments with ":.2f" raised ValueError for numeric
        # strings, after the goal had already been published.
        x, y, yaw_deg = float(x), float(y), float(yaw_deg)
        pose = PoseStamped(
            position=make_vector3(x, y, 0.0),
            orientation=Quaternion.from_euler(Vector3(0.0, 0.0, math.radians(yaw_deg))),
            frame_id="map",
        )
        self.goal_request.publish(pose)
        msg = f"Published waypoint goal at ({x:.2f}, {y:.2f}, yaw={yaw_deg:.0f}deg)."
        logger.info(f"SeatPlanner: {msg}")
        return msg

    @skill
    def find_empty_seat_now(self) -> str:
        """Pick an empty seat in the current camera view and publish a 3D goal.

        Uses the latest YOLO detections (chair/couch/bench minus person-overlap)
        and the world voxel cloud. Returns immediately; downstream A* plans the
        path. The robot only walks there if MovementManager is enabled.
        """
        with self._lock:
            detections = self._latest

        if detections is None:
            return "No detections yet — wait a second after the camera comes up."

        seats = [d for d in detections.detections if d.name in SEAT_CLASSES]
        persons = [d for d in detections.detections if d.name == "person"]
        empty = [s for s in seats if not self._is_occupied(s, persons)]

        seen = [d.name for d in detections.detections]
        logger.info(
            f"SeatPlanner: seen={seen} seats={len(seats)} persons={len(persons)} empty={len(empty)}"
        )

        if not empty:
            return f"No empty seat in view. Saw {len(seats)} seat(s), {len(persons)} person(s)."

        try:
            pointcloud = self.pointcloud.get_next(timeout=2.0)
        except Exception as e:
            return f"Pointcloud unavailable: {e}"

        # Look the transform up at the timestamp of the frame the detections came from.
        transform = self.tf.get("camera_optical", pointcloud.frame_id, detections.image.ts, 2.0)
        if not transform:
            return "Camera transform unavailable — drive the robot a bit then retry."

        # Prefer the empty seat with the largest 2D bounding box.
        best = max(empty, key=lambda d: d.bbox_2d_volume())
        target3d = Detection3DPC.from_2d(
            best,
            world_pointcloud=pointcloud,
            camera_info=self.config.camera_info,
            world_to_optical_transform=transform,
        )
        if target3d is None:
            return "Found an empty seat but 3D projection failed (no cloud points in bbox)."

        pose = target3d.pose
        self.goal_request.publish(pose)
        msg = f"Published goal at ({pose.position.x:.2f}, {pose.position.y:.2f})."
        logger.info(f"SeatPlanner: {msg}")
        return msg

    def _is_occupied(self, seat: Detection2DBBox, persons: list[Detection2DBBox]) -> bool:
        """Return True if any person box covers more than OCCUPANCY_OVERLAP of the seat box.

        The ratio is intersection area divided by the seat's own bbox area.
        """
        sx1, sy1, sx2, sy2 = seat.bbox
        # Clamp to 1.0 so a degenerate seat box cannot cause a division by zero.
        seat_area = max(1.0, (sx2 - sx1) * (sy2 - sy1))
        for p in persons:
            px1, py1, px2, py2 = p.bbox
            iw = max(0.0, min(sx2, px2) - max(sx1, px1))
            ih = max(0.0, min(sy2, py2) - max(sy1, py1))
            if (iw * ih) / seat_area > OCCUPANCY_OVERLAP:
                return True
        return False

    def _annotate(self, image: Image, detections: list[Detection2DBBox]) -> Image:
        """Return a copy of `image` with a labelled box drawn for every detection.

        Seats are labelled "occupied" or "EMPTY" and coloured differently for
        each state; people and all other classes get their own fixed colours.
        """
        img = image.to_opencv().copy()
        persons = [d for d in detections if d.name == "person"]
        for d in detections:
            x1, y1, x2, y2 = (int(v) for v in d.bbox)
            # NOTE(review): colour tuples presumably follow OpenCV's BGR order
            # (red = occupied, green = empty, blue = person) — confirm against
            # Image.to_opencv.
            if d.name in SEAT_CLASSES:
                occupied = self._is_occupied(d, persons)
                color = (0, 0, 255) if occupied else (0, 255, 0)
                text = f"{d.name} {'occupied' if occupied else 'EMPTY'}"
            elif d.name == "person":
                color = (255, 0, 0)
                text = "person"
            else:
                color = (150, 150, 150)
                text = d.name
            cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
            # Keep the label inside the frame when the box touches the top edge.
            cv2.putText(
                img, text, (x1, max(15, y1 - 6)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2
            )
        return Image.from_opencv(img, ts=image.ts)


__all__ = ["SeatPlanner"]
a/dimos/agents/skills/test_seat_guide.py b/dimos/agents/skills/test_seat_guide.py new file mode 100644 index 0000000000..6ac25f9271 --- /dev/null +++ b/dimos/agents/skills/test_seat_guide.py @@ -0,0 +1,4295 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import json +import math +import os +from pathlib import Path +import subprocess +import sys +from threading import RLock +from types import SimpleNamespace +from typing import Any + +from fastapi.testclient import TestClient +import numpy as np +import pytest + +from dimos.agents.mcp.mcp_server import handle_request +import dimos.agents.skills.seat_guide as seat_guide_module +from dimos.agents.skills.seat_guide import ( + CameraSeatObservationProvider, + CameraSeatSceneConfig, + PersonObservation, + SeatGuidePlanner, + SeatGuideRequestSpec, + SeatGuideSkillContainer, + SeatObservation, + SeatSceneObservation, + SyntheticSeatObservationProvider, + SyntheticSeatSceneConfig, + _flatten_people, + _flatten_seats, + _parse_people, + _parse_seats, + is_seat_guide_preview_request, + parse_seat_guide_intent, +) +from dimos.agents.system_prompt import SYSTEM_PROMPT +import dimos.agents.web_human_input as web_human_input_module +from dimos.agents.web_human_input import WebInput, _create_whisper_node +from dimos.core.coordination.blueprints import autoconnect +from dimos.core.coordination.module_coordinator import ModuleCoordinator +from dimos.core.core import rpc +from 
dimos.core.global_config import GlobalConfig +from dimos.core.module import Module +from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.geometry_msgs.Quaternion import Quaternion +from dimos.msgs.geometry_msgs.Vector3 import Vector3 +from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo +from dimos.msgs.sensor_msgs.Image import Image +from dimos.msgs.sensor_msgs.PointCloud2 import PointCloud2 +from dimos.navigation.base import NavigationState +from dimos.perception.detection.type.detection2d.bbox import Detection2DBBox +from dimos.perception.detection.type.detection2d.imageDetections2D import ImageDetections2D +from dimos.web.robot_web_interface import RobotWebInterface + +REPO_ROOT = Path(__file__).parents[3] +ACCEPTANCE_LOG_VERIFIER = REPO_ROOT / "bin" / "demo_seat_guide_verify_acceptance_log" +HARDWARE_ACCEPTANCE_SCRIPT = REPO_ROOT / "bin" / "demo_seat_guide_hardware_acceptance" +HARDWARE_BRINGUP_SCRIPT = REPO_ROOT / "bin" / "demo_seat_guide_hardware_bringup" +SMOKE_SCRIPT = REPO_ROOT / "bin" / "demo_seat_guide_smoke" +REPLAY_SMOKE_SCRIPT = REPO_ROOT / "bin" / "demo_seat_guide_replay_smoke" +SEAT_GUIDE_DOC = REPO_ROOT / "docs" / "agents" / "seat_guide_modules.md" +SEAT_GUIDE_SCRIPTS = [ + SMOKE_SCRIPT, + REPLAY_SMOKE_SCRIPT, + HARDWARE_BRINGUP_SCRIPT, + HARDWARE_ACCEPTANCE_SCRIPT, + ACCEPTANCE_LOG_VERIFIER, +] + + +class FakeNavigation: + def __init__( + self, + *, + accepts_goal: bool = True, + raises_on_goal: bool = False, + state: NavigationState = NavigationState.IDLE, + goal_reached: bool = False, + ) -> None: + self.accepts_goal = accepts_goal + self.raises_on_goal = raises_on_goal + self.state = state + self.goal_reached = goal_reached + self.goal: PoseStamped | None = None + + def set_goal(self, goal: PoseStamped) -> bool: + if self.raises_on_goal: + raise RuntimeError("planner unavailable") + self.goal = goal + return self.accepts_goal + + def get_state(self) -> NavigationState: + return self.state + + def 
is_goal_reached(self) -> bool: + return self.goal_reached + + def cancel_goal(self) -> bool: + return True + + +class SequencedNavigation(FakeNavigation): + def __init__( + self, + *, + states: list[NavigationState], + goal_reached_values: list[bool], + ) -> None: + super().__init__(state=states[-1], goal_reached=goal_reached_values[-1]) + self.states = states + self.goal_reached_values = goal_reached_values + self.state_calls = 0 + self.goal_reached_calls = 0 + + def get_state(self) -> NavigationState: + index = min(self.state_calls, len(self.states) - 1) + self.state_calls += 1 + return self.states[index] + + def is_goal_reached(self) -> bool: + index = min(self.goal_reached_calls, len(self.goal_reached_values) - 1) + self.goal_reached_calls += 1 + return self.goal_reached_values[index] + + +class FakeSeatObservationProvider: + def __init__(self, scene: SeatSceneObservation) -> None: + self.scene = scene + + def get_seat_scene(self) -> SeatSceneObservation: + return self.scene + + +class CountingSeatObservationProvider(FakeSeatObservationProvider): + def __init__(self, scene: SeatSceneObservation) -> None: + super().__init__(scene) + self.calls = 0 + + def get_seat_scene(self) -> SeatSceneObservation: + self.calls += 1 + return super().get_seat_scene() + + +class SequenceSeatObservationProvider: + def __init__(self, scenes: list[SeatSceneObservation]) -> None: + self.scenes = scenes + self.calls = 0 + + def get_seat_scene(self) -> SeatSceneObservation: + index = min(self.calls, len(self.scenes) - 1) + self.calls += 1 + return self.scenes[index] + + +class FakeExplorer: + def __init__(self) -> None: + self.begin_calls = 0 + self.end_calls = 0 + + def begin_exploration(self) -> str: + self.begin_calls += 1 + return "Started exploration skill." + + def end_exploration(self) -> str: + self.end_calls += 1 + return "Stopped exploration." 
class FakeRelativeMover:
    """Test double for a relative-move skill; records each requested move."""

    def __init__(self, result: str = "Navigation goal reached") -> None:
        self.result = result
        self.moves: list[tuple[float, float, float]] = []

    def relative_move(
        self,
        forward: float = 0.0,
        left: float = 0.0,
        degrees: float = 0.0,
        x: float = 0.0,
        y: float = 0.0,
        duration: float = 0.0,
    ) -> str:
        """Record (forward, left, degrees) and return the canned result string."""
        # x, y and duration are accepted but not recorded.
        self.moves.append((forward, left, degrees))
        return self.result


class FakeDirectMover:
    """Test double for a direct-velocity mover; records each requested move."""

    def __init__(self, result: str = "Direct move sent") -> None:
        self.result = result
        self.moves: list[tuple[float, float, float, float]] = []

    def direct_move(
        self,
        x: float,
        y: float = 0.0,
        yaw: float = 0.0,
        duration: float = 1.0,
    ) -> str:
        """Record (x, y, yaw, duration) and return the canned result string."""
        self.moves.append((x, y, yaw, duration))
        return self.result


class FakeSeatGuideRequest:
    """Test double for the seat-guide request handler; can be told to raise."""

    def __init__(self, *, raises: bool = False) -> None:
        self.requests: list[str] = []
        self.preview_requests: list[str] = []
        self.raises = raises

    def handle_seat_request(self, text: str) -> str:
        """Record the text, then raise RuntimeError or return "handled"."""
        self.requests.append(text)
        if self.raises:
            raise RuntimeError("seat guide unavailable")
        return "handled"

    def preview_seat_request(self, text: str) -> str:
        """Record the text, then raise RuntimeError or return "previewed"."""
        self.preview_requests.append(text)
        if self.raises:
            raise RuntimeError("seat guide unavailable")
        return "previewed"


class FakeHumanTransport:
    """Test double for a publish-only transport; collects published strings."""

    def __init__(self) -> None:
        self.published: list[str] = []

    def publish(self, text: str) -> None:
        self.published.append(text)


class FakeWebInterface:
    """Minimal stand-in exposing only `port` and `audio_subject`."""

    port = 5555
    audio_subject = SimpleNamespace()


class FakeThread:
    """Stand-in for a thread object whose liveness is fixed at construction."""

    def __init__(self, *, alive: bool) -> None:
        self.alive = alive

    def is_alive(self) -> bool:
        return self.alive


class FakeAgentResponses:
    """Observer-style sink that collects every string passed to on_next."""

    def __init__(self) -> None:
        self.published: list[str] = []

    def on_next(self, text: str) -> None:
        self.published.append(text)


class FakeLogger:
    """Logger double that records (event, kwargs) for each info() call."""

    def __init__(self) -> None:
        self.info_calls: list[tuple[str, dict[str, Any]]] = []

    def info(self, event: str, **kwargs: Any) -> None:
        self.info_calls.append((event, kwargs))


class FakeVlModel:
    """Vision-language model double returning canned detections per query string."""

    def __init__(self, detections_by_query: dict[str, list[SimpleNamespace]]) -> None:
        self._detections_by_query = detections_by_query
        self.queries: list[str] = []

    def query_detections(self, image: Image, query: str) -> SimpleNamespace:
        """Record the query and return its canned detections (empty list if unknown)."""
        self.queries.append(query)
        return SimpleNamespace(detections=self._detections_by_query.get(query, []))


class FakeFastDetector:
    """Detector double that returns the same detections for every image."""

    def __init__(self, detections: list[Any]) -> None:
        self._detections = detections
        self.calls = 0

    def process_image(self, image: Image) -> ImageDetections2D:
        """Count the call and wrap a copy of the canned detections for `image`."""
        self.calls += 1
        return ImageDetections2D(image, list(self._detections))


class OdomMutatingFakeVlModel(FakeVlModel):
    """FakeVlModel that feeds the provider a new odometry pose during each query."""

    def __init__(
        self,
        detections_by_query: dict[str, list[SimpleNamespace]],
        provider: CameraSeatObservationProvider,
    ) -> None:
        super().__init__(detections_by_query)
        self._provider = provider

    def query_detections(self, image: Image, query: str) -> SimpleNamespace:
        # Push a far-away pose into the provider before answering, so the
        # provider's odometry changes while a detection query is in progress.
        self._provider._on_odom(
            PoseStamped(
                frame_id="map",
                position=Vector3(100.0, 200.0, 0.0),
                orientation=Quaternion.from_euler(Vector3(0.0, 0.0, 1.0)),
            )
        )
        return super().query_detections(image, query)


# Navigation module double: accepts every goal, always reports IDLE and
# "not reached", and exposes the last goal's (x, y) over RPC.
class RecordingNavigation(Module):
    _last_goal: PoseStamped | None = None

    @rpc
    def set_goal(self, goal: PoseStamped) -> bool:
        self._last_goal = goal
        return True

    @rpc
    def get_state(self) -> NavigationState:
        return NavigationState.IDLE

    @rpc
    def is_goal_reached(self) -> bool:
        return False

    @rpc
    def cancel_goal(self) -> bool:
        return True

    @rpc
    def get_last_goal_xy(self) -> tuple[float, float] | None:
        # None until a goal has been set.
        if self._last_goal is None:
            return None
        return self._last_goal.position.x, self._last_goal.position.y


def test_planner_selects_nearest_empty_seat() -> None:
    """The planner skips the seat with a person beside it and picks the closer free seat."""
    planner = SeatGuidePlanner(occupied_radius_m=0.75, aisle_offset_m=0.5)
    seats = [
        SeatObservation("occupied_near", x=1.0, y=0.0, yaw=0.0),
        SeatObservation("empty_far", x=5.0, y=0.0, yaw=0.0),
        SeatObservation("empty_near", x=2.0, y=0.0, yaw=math.pi / 2),
    ]
    people = [PersonObservation(x=1.2, y=0.1)]

    result = planner.find_empty_seat(seats, people, robot_x=0.0, robot_y=0.0)

    assert result is not None
    assert result.seat.seat_id == "empty_near"
    assert result.goal_x == pytest.approx(2.0)
    assert result.goal_y == pytest.approx(0.5)
    assert result.goal_yaw == pytest.approx(math.pi / 2)
    assert planner.occupancy_counts(seats, people) == (2, 1)


def test_planner_returns_none_when_all_seats_are_occupied() -> None:
    """With a person next to every seat the planner has nothing to return."""
    planner = SeatGuidePlanner()
    seats = [
        SeatObservation("left", x=0.0, y=0.0),
        SeatObservation("right", x=1.0, y=0.0),
    ]
    people = [
        PersonObservation(x=0.1, y=0.0),
        PersonObservation(x=1.1, y=0.0),
    ]

    assert planner.find_empty_seat(seats, people) is None


def test_skill_sets_navigation_goal_for_empty_seat() -> None:
    """find_empty_seat with flat seat/people lists sends a map-frame goal to navigation."""
    # __new__ bypasses __init__; only the attributes each test needs are set.
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation()
    skill._navigation = fake_navigation

    message = skill.find_empty_seat(
        seats=[0.0, 0.0, 0.0, 2.0, 0.0, math.pi],
        people=[0.2, 0.1],
        robot_x=0.0,
        robot_y=0.0,
    )

    assert "seat_2" in message
    assert fake_navigation.goal is not None
    assert fake_navigation.goal.frame_id == "map"
    assert fake_navigation.goal.position.x == pytest.approx(1.35)
    assert fake_navigation.goal.position.y == pytest.approx(0.0)
    assert "goal_sequence=1" in skill.seat_guide_navigation_status()


def test_navigation_status_ignores_stale_goal_reached_until_reset_seen() -> None:
    """A goal_reached=True left over from before the new goal is not reported as arrival."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation(goal_reached=True)
    skill._navigation = fake_navigation

    message = skill.find_empty_seat(seats=[1.0, 0.0, 0.0], people=[])

    assert "Navigating to" in message
    # Navigation still says "reached", but the status reports false until a
    # False reading has been seen for this goal.
    assert skill.seat_guide_navigation_status() == (
        "SeatGuide navigation status: navigation=IDLE; goal_reached=false; "
        "goal_sequence=1; completion_reset=waiting_for_false."
    )

    fake_navigation.goal_reached = False
    assert skill.seat_guide_navigation_status() == (
        "SeatGuide navigation status: navigation=IDLE; goal_reached=false; "
        "goal_sequence=1."
    )

    # After the reset was observed, a True reading is reported as true.
    fake_navigation.goal_reached = True
    assert skill.seat_guide_navigation_status() == (
        "SeatGuide navigation status: navigation=IDLE; goal_reached=true; "
        "goal_sequence=1."
    )


def test_skill_reports_navigation_failure() -> None:
    """A rejected goal yields a failure message and leaves goal_sequence at 0."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    skill._navigation = FakeNavigation(accepts_goal=False)

    message = skill.find_empty_seat(seats=[1.0, 0.0, 0.0], people=[])

    assert message == "Found empty seat seat_1, but failed to start navigation."
    assert "goal_sequence=0" in skill.seat_guide_navigation_status()


def test_skill_reports_navigation_exception() -> None:
    """An exception from set_goal is turned into a message containing its text."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation(raises_on_goal=True)
    skill._navigation = fake_navigation

    message = skill.find_empty_seat(seats=[1.0, 0.0, 0.0], people=[])

    assert (
        message
        == "Found empty seat seat_1, but navigation raised an error: planner unavailable."
    )
    assert fake_navigation.goal is None


def test_skill_refuses_to_override_active_navigation_goal() -> None:
    """No goal is sent while navigation reports FOLLOWING_PATH."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation(state=NavigationState.FOLLOWING_PATH)
    skill._navigation = fake_navigation

    message = skill.find_empty_seat(seats=[1.0, 0.0, 0.0], people=[])

    assert message == (
        "Found empty seat seat_1, but navigation is not ready for a new goal: "
        "navigation=FOLLOWING_PATH."
    )
    assert fake_navigation.goal is None


def test_skill_uses_connected_observation_provider() -> None:
    """find_empty_seat_from_scene takes seats and people from the provider's scene."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation()
    skill._navigation = fake_navigation
    skill._seat_observation_provider = FakeSeatObservationProvider(
        SeatSceneObservation(
            seats=[
                SeatObservation("occupied", x=0.0, y=0.0, yaw=0.0),
                SeatObservation("empty", x=1.0, y=0.0, yaw=0.0),
            ],
            people=[PersonObservation(x=0.1, y=0.0)],
            robot_x=0.0,
            robot_y=0.0,
        )
    )

    message = skill.find_empty_seat_from_scene(require_live_perception=False)

    assert "seat_2" in message
    assert fake_navigation.goal is not None
    assert fake_navigation.goal.position.x == pytest.approx(1.65)


def test_skill_reports_missing_observation_provider() -> None:
    """Without a provider the scene-based skill returns an explanatory message."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    skill._seat_observation_provider = None

    assert skill.find_empty_seat_from_scene() == "No seat observation provider is connected."


def test_skill_reports_no_visible_seats_separately() -> None:
    """An empty seat list produces its own "cannot see any seats" message."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    skill._navigation = FakeNavigation()

    message = skill.find_empty_seat(seats=[], people=[])

    assert message == (
        "I cannot see any seats yet. Please face the conference table or calibrate "
        "the room layout."
    )


def test_handle_seat_request_delegates_to_scene_provider() -> None:
    """A seat request in natural language navigates using the provider's scene."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation()
    skill._navigation = fake_navigation
    skill._seat_observation_provider = FakeSeatObservationProvider(
        SeatSceneObservation(
            seats=[
                SeatObservation("occupied", x=0.0, y=0.0, yaw=0.0),
                SeatObservation("empty", x=1.0, y=0.0, yaw=0.0),
            ],
            people=[PersonObservation(x=0.1, y=0.0)],
            source="camera",
        )
    )

    message = skill.handle_seat_request(
        "Please help me find an empty seat",
        wait_for_arrival=False,
    )

    assert "seat_2" in message
    assert fake_navigation.goal is not None
    assert fake_navigation.goal.position.x == pytest.approx(1.65)


def test_handle_seat_request_waits_and_reports_when_arrived() -> None:
    """With wait_for_arrival the reply includes the arrival announcement."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    # Scripted readings: idle, then following the path, then idle with goal reached.
    fake_navigation = SequencedNavigation(
        states=[NavigationState.IDLE, NavigationState.FOLLOWING_PATH, NavigationState.IDLE],
        goal_reached_values=[False, False, True],
    )
    skill._navigation = fake_navigation
    skill._seat_observation_provider = FakeSeatObservationProvider(
        SeatSceneObservation(
            seats=[SeatObservation("empty", x=1.0, y=0.0, yaw=0.0)],
            people=[],
            source="camera_3d",
        )
    )

    message = skill.handle_seat_request(
        "Please help me find an empty seat",
        wait_for_arrival=True,
        arrival_timeout_s=2.0,
    )

    assert "我已经到了, 空椅子在我右边, 请坐。" in message


def test_handle_seat_request_reports_when_navigation_stops_before_arrival() -> None:
    """If navigation returns to IDLE without reaching the goal, the reply says so."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = SequencedNavigation(
        states=[NavigationState.IDLE, NavigationState.FOLLOWING_PATH, NavigationState.IDLE],
        goal_reached_values=[False, False, False],
    )
    skill._navigation = fake_navigation
    skill._seat_observation_provider = FakeSeatObservationProvider(
        SeatSceneObservation(
            seats=[SeatObservation("empty", x=1.0, y=0.0, yaw=0.0)],
            people=[],
            source="camera_3d",
        )
    )

    message = skill.handle_seat_request(
        "Please help me find an empty seat",
        wait_for_arrival=True,
        arrival_timeout_s=2.0,
    )

    assert "navigation stopped before reaching it" in message


def test_handle_seat_request_requires_live_camera_by_default() -> None:
    """A scene whose source is configured_fallback is refused by default."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation()
    skill._navigation = fake_navigation
    skill._seat_observation_provider = FakeSeatObservationProvider(
        SeatSceneObservation(
            seats=[SeatObservation("fallback", x=1.0, y=0.0, yaw=0.0)],
            people=[],
            source="configured_fallback",
        )
    )

    message = skill.handle_seat_request(
        "Please help me find an empty seat",
        wait_for_arrival=False,
    )

    assert message == (
        "SeatGuide requires live camera perception before navigation; "
        "source=configured_fallback; seats=1; people=0; robot=(0.00, 0.00); "
        "next=use require_live_perception=false only for explicit fallback calibration."
    )
    assert fake_navigation.goal is None


def test_handle_seat_request_reports_camera_detection_error_next_step() -> None:
    """A camera_detection_error scene is refused with a source-specific next step."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation()
    skill._navigation = fake_navigation
    skill._seat_observation_provider = FakeSeatObservationProvider(
        SeatSceneObservation(
            seats=[SeatObservation("fallback", x=1.0, y=0.0, yaw=0.0)],
            people=[PersonObservation(x=0.0, y=0.0)],
            robot_x=-1.0,
            robot_y=2.0,
            source="camera_detection_error",
        )
    )

    message = skill.handle_seat_request(
        "Please help me find an empty seat",
        wait_for_arrival=False,
    )

    assert message == (
        "SeatGuide requires live camera perception before navigation; "
        "source=camera_detection_error; seats=1; people=1; robot=(-1.00, 2.00); "
        "next=check VLM/API key setup and logs."
    )
    assert fake_navigation.goal is None


def test_handle_seat_request_can_explicitly_allow_fallback_calibration() -> None:
    """require_live_perception=False lets a configured_fallback scene drive navigation."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation()
    skill._navigation = fake_navigation
    skill._seat_observation_provider = FakeSeatObservationProvider(
        SeatSceneObservation(
            seats=[SeatObservation("fallback", x=1.0, y=0.0, yaw=0.0)],
            people=[],
            source="configured_fallback",
        )
    )

    message = skill.handle_seat_request(
        "Please help me find an empty seat",
        require_live_perception=False,
        wait_for_arrival=False,
    )

    assert "seat_1" in message
    assert fake_navigation.goal is not None
    assert fake_navigation.goal.position.x == pytest.approx(1.65)


def test_handle_seat_request_searches_when_no_seat_visible_then_navigates() -> None:
    """With only an explorer available, exploration runs until a seat appears."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation()
    fake_explorer = FakeExplorer()
    skill._navigation = fake_navigation
    skill._explorer = fake_explorer
    # The provider returns these scenes in order, one per observation request.
    skill._seat_observation_provider = SequenceSeatObservationProvider(
        [
            SeatSceneObservation(seats=[], people=[], source="camera_no_seats_detected"),
            SeatSceneObservation(seats=[], people=[], source="camera_no_seats_detected"),
            SeatSceneObservation(
                seats=[SeatObservation("visible", x=1.0, y=0.0, yaw=0.0)],
                people=[],
                source="camera_3d",
            ),
        ]
    )

    message = skill.handle_seat_request(
        "Please help me find an empty seat",
        wait_for_arrival=False,
    )

    assert "seat_1" in message
    assert fake_explorer.begin_calls == 1
    assert fake_explorer.end_calls == 1
    assert fake_navigation.goal is not None
    assert fake_navigation.goal.position.x == pytest.approx(1.65)


def test_handle_seat_request_prefers_rotate_scan_when_no_seat_visible() -> None:
    """When a relative mover is present it is used to rotate; the explorer stays idle."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation()
    fake_explorer = FakeExplorer()
    fake_relative_mover = FakeRelativeMover()
    skill._navigation = fake_navigation
    skill._explorer = fake_explorer
    skill._relative_mover = fake_relative_mover
    skill._seat_observation_provider = SequenceSeatObservationProvider(
        [
            SeatSceneObservation(seats=[], people=[], source="camera_no_seats_detected"),
            SeatSceneObservation(seats=[], people=[], source="camera_no_seats_detected"),
            SeatSceneObservation(
                seats=[SeatObservation("visible", x=1.0, y=0.0, yaw=0.0)],
                people=[],
                source="camera_3d",
            ),
            SeatSceneObservation(
                seats=[SeatObservation("visible", x=1.0, y=0.0, yaw=0.0)],
                people=[],
                source="camera_3d",
            ),
        ]
    )

    message = skill.handle_seat_request(
        "Please help me find an empty seat",
        wait_for_arrival=False,
    )

    assert "seat_1" in message
    assert fake_relative_mover.moves == [(0.0, 0.0, 30.0)]
    assert fake_explorer.begin_calls == 0
    assert fake_explorer.end_calls == 0
    assert fake_navigation.goal is not None
    assert fake_navigation.goal.position.x == pytest.approx(1.65)


def test_handle_seat_request_uses_rotate_scan_without_explorer() -> None:
    """The rotate scan works when no explorer attribute is set at all."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation()
    fake_relative_mover = FakeRelativeMover()
    skill._navigation = fake_navigation
    skill._relative_mover = fake_relative_mover
    skill._seat_observation_provider = SequenceSeatObservationProvider(
        [
            SeatSceneObservation(seats=[], people=[], source="camera_no_seats_detected"),
            SeatSceneObservation(seats=[], people=[], source="camera_no_seats_detected"),
            SeatSceneObservation(
                seats=[SeatObservation("visible", x=1.0, y=0.0, yaw=0.0)],
                people=[],
                source="camera_3d",
            ),
            SeatSceneObservation(
                seats=[SeatObservation("visible", x=1.0, y=0.0, yaw=0.0)],
                people=[],
                source="camera_3d",
            ),
        ]
    )

    message = skill.handle_seat_request(
        "Please help me find an empty seat",
        wait_for_arrival=False,
    )

    assert "seat_1" in message
    assert fake_relative_mover.moves == [(0.0, 0.0, 30.0)]
    assert fake_navigation.goal is not None


def test_scan_for_empty_seat_prefers_direct_turn_over_relative_navigation() -> None:
    """With both movers present, the scan turns via direct_move and never relative_move."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation()
    fake_direct_mover = FakeDirectMover()
    fake_relative_mover = FakeRelativeMover()
    skill._navigation = fake_navigation
    skill._direct_mover = fake_direct_mover
    skill._relative_mover = fake_relative_mover
    skill._seat_observation_provider = SequenceSeatObservationProvider(
        [
            SeatSceneObservation(seats=[], people=[], source="camera_no_seats_detected"),
            SeatSceneObservation(
                seats=[SeatObservation("visible", x=1.0, y=0.0, yaw=0.0)],
                people=[],
                source="camera_3d",
            ),
            SeatSceneObservation(
                seats=[SeatObservation("visible", x=1.0, y=0.0, yaw=0.0)],
                people=[],
                source="camera_3d",
            ),
        ]
    )

    message = skill.scan_for_empty_seat_from_scene(
        max_turn_degrees=30.0,
        step_degrees=30.0,
        settle_s=0.1,
        turn_yaw_rate_rad_s=0.5,
    )

    assert "seat_1" in message
    assert len(fake_direct_mover.moves) == 1
    x, y, yaw, duration = fake_direct_mover.moves[0]
    assert x == 0.0
    assert y == 0.0
    assert yaw == pytest.approx(0.5)
    # Expected duration is the step angle in radians divided by the yaw rate.
    assert duration == pytest.approx(math.radians(30.0) / 0.5)
    assert fake_relative_mover.moves == []
    assert fake_navigation.goal is not None


def test_handle_seat_request_searches_when_visible_seats_are_occupied() -> None:
    """Seeing only occupied seats also triggers a search, which then finds a free seat."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation()
    fake_explorer = FakeExplorer()
    skill._navigation = fake_navigation
    skill._explorer = fake_explorer
    skill._seat_observation_provider = SequenceSeatObservationProvider(
        [
            SeatSceneObservation(
                seats=[SeatObservation("occupied", x=1.0, y=0.0, yaw=0.0)],
                people=[PersonObservation(x=1.1, y=0.0)],
                source="camera_3d",
            ),
            SeatSceneObservation(
                seats=[SeatObservation("occupied", x=1.0, y=0.0, yaw=0.0)],
                people=[PersonObservation(x=1.1, y=0.0)],
                source="camera_3d",
            ),
            SeatSceneObservation(
                seats=[
                    SeatObservation("occupied", x=1.0, y=0.0, yaw=0.0),
                    SeatObservation("empty", x=2.0, y=0.0, yaw=0.0),
                ],
                people=[PersonObservation(x=1.1, y=0.0)],
                source="camera_3d",
            ),
        ]
    )

    message = skill.handle_seat_request(
        "Please help me find an empty seat",
        wait_for_arrival=False,
    )

    assert "seat_2" in message
    assert fake_explorer.begin_calls == 1
    assert fake_explorer.end_calls == 1
    assert fake_navigation.goal is not None
    assert fake_navigation.goal.position.x == pytest.approx(2.65)


def test_scan_for_empty_seat_rotates_until_empty_seat_visible() -> None:
    """The scan issues 30-degree turns until the provider reports an empty seat."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation()
    fake_relative_mover = FakeRelativeMover()
    skill._navigation = fake_navigation
    skill._relative_mover = fake_relative_mover
    skill._seat_observation_provider = SequenceSeatObservationProvider(
        [
            SeatSceneObservation(seats=[], people=[], source="camera_no_seats_detected"),
            SeatSceneObservation(seats=[], people=[], source="camera_no_seats_detected"),
            SeatSceneObservation(
                seats=[SeatObservation("visible", x=2.0, y=0.0, yaw=0.0)],
                people=[],
                source="camera_3d",
            ),
            SeatSceneObservation(
                seats=[SeatObservation("visible", x=2.0, y=0.0, yaw=0.0)],
                people=[],
                source="camera_3d",
            ),
        ]
    )

    message = skill.scan_for_empty_seat_from_scene(
        max_turn_degrees=90.0,
        step_degrees=30.0,
        settle_s=0.1,
    )

    assert "seat_1" in message
    assert fake_relative_mover.moves == [(0.0, 0.0, 30.0), (0.0, 0.0, 30.0)]
    assert fake_navigation.goal is not None
    assert fake_navigation.goal.position.x == pytest.approx(2.65)


def test_scan_for_empty_seat_reports_rotation_failure() -> None:
    """A failure string from the mover stops the scan and is echoed in the reply."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation()
    fake_relative_mover = FakeRelativeMover("Navigation was cancelled or failed")
    skill._navigation = fake_navigation
    skill._relative_mover = fake_relative_mover
    skill._seat_observation_provider = FakeSeatObservationProvider(
        SeatSceneObservation(seats=[], people=[], source="camera_no_seats_detected")
    )

    message = skill.scan_for_empty_seat_from_scene(
        max_turn_degrees=30.0,
        step_degrees=30.0,
        settle_s=0.1,
    )

    assert message == (
        "SeatGuide scan stopped because rotation failed: "
        "Navigation was cancelled or failed."
    )
    assert fake_relative_mover.moves == [(0.0, 0.0, 30.0)]
    assert fake_navigation.goal is None


def test_search_for_empty_seat_stops_exploration_on_timeout() -> None:
    """When no seat ever appears, exploration is started once and stopped once."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)
    fake_navigation = FakeNavigation()
    fake_explorer = FakeExplorer()
    skill._navigation = fake_navigation
    skill._explorer = fake_explorer
    skill._seat_observation_provider = FakeSeatObservationProvider(
        SeatSceneObservation(seats=[], people=[], source="camera_no_seats_detected")
    )

    message = skill.search_for_empty_seat_from_scene(
        search_timeout_s=1.0,
        poll_interval_s=0.2,
    )

    assert message == (
        "I searched but still cannot see an empty seat. Please reposition me "
        "or point the camera toward the conference table."
    )
    assert fake_explorer.begin_calls == 1
    assert fake_explorer.end_calls == 1
    assert fake_navigation.goal is None


def test_handle_seat_request_rejects_unrelated_text() -> None:
    """Text that is not a seat request gets a fixed "did not hear" reply."""
    skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer)

    message = skill.handle_seat_request("what time is the meeting")

    assert message == "I did not hear a request to find an empty seat."
+ + +def test_preview_seat_request_runs_preflight_without_navigating() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + fake_navigation = FakeNavigation() + skill._navigation = fake_navigation + skill._seat_observation_provider = FakeSeatObservationProvider( + SeatSceneObservation( + seats=[SeatObservation("empty", x=2.0, y=0.0, yaw=0.0)], + people=[], + robot_x=0.0, + robot_y=0.0, + source="camera", + ) + ) + + message = skill.preview_seat_request("预检帮我找一个空位") + + assert "SeatGuide preflight ready" in message + assert fake_navigation.goal is None + + +def test_seat_guide_status_describes_current_scene() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + skill._seat_observation_provider = FakeSeatObservationProvider( + SeatSceneObservation( + seats=[SeatObservation("seat_1", x=1.0, y=2.0, yaw=0.5)], + people=[PersonObservation(x=1.1, y=2.0)], + robot_x=-1.0, + robot_y=0.0, + source="camera", + ) + ) + + message = skill.seat_guide_status() + + assert message == ( + "SeatGuide scene source=camera: 1 seats [seat_1=(1.00, 2.00, yaw=0.50)], " + "1 people [(1.10, 2.00)], robot=(-1.00, 0.00)." + ) + + +def test_seat_guide_status_reports_missing_provider() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + skill._seat_observation_provider = None + + assert ( + skill.seat_guide_status() + == "SeatGuide status: no seat observation provider is connected." 
+ ) + + +def test_preview_empty_seat_goal_does_not_navigate() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + fake_navigation = FakeNavigation() + skill._navigation = fake_navigation + skill._seat_observation_provider = FakeSeatObservationProvider( + SeatSceneObservation( + seats=[ + SeatObservation("occupied", x=0.0, y=0.0, yaw=0.0), + SeatObservation("empty", x=2.0, y=0.0, yaw=0.0), + ], + people=[PersonObservation(x=0.1, y=0.0)], + robot_x=0.0, + robot_y=0.0, + source="camera", + ) + ) + + message = skill.preview_empty_seat_goal() + + assert message == ( + "SeatGuide preview source=camera: selected empty " + "empty=1 occupied=1 " + "seat=(2.00, 0.00, yaw=0.00) goal=(2.65, 0.00, yaw=0.00)." + ) + assert fake_navigation.goal is None + + +def test_preview_empty_seat_goal_reports_no_seats() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + skill._seat_observation_provider = FakeSeatObservationProvider( + SeatSceneObservation(seats=[], people=[], source="no_camera_image") + ) + + assert ( + skill.preview_empty_seat_goal() + == "SeatGuide preview source=no_camera_image: no seats visible or configured." + ) + + +def test_preview_empty_seat_goal_reports_no_empty_seat() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + skill._seat_observation_provider = FakeSeatObservationProvider( + SeatSceneObservation( + seats=[SeatObservation("occupied", x=0.0, y=0.0, yaw=0.0)], + people=[PersonObservation(x=0.1, y=0.0)], + source="camera", + ) + ) + + assert ( + skill.preview_empty_seat_goal() + == "SeatGuide preview source=camera: no empty seat available; empty=0 occupied=1." 
+ ) + + +def test_seat_guide_preflight_reports_ready_without_navigating() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + fake_navigation = FakeNavigation() + skill._navigation = fake_navigation + skill._seat_observation_provider = FakeSeatObservationProvider( + SeatSceneObservation( + seats=[ + SeatObservation("occupied", x=0.0, y=0.0, yaw=0.0), + SeatObservation("empty", x=2.0, y=0.0, yaw=0.0), + ], + people=[PersonObservation(x=0.1, y=0.0)], + robot_x=0.0, + robot_y=0.0, + source="camera", + ) + ) + + message = skill.seat_guide_preflight() + + assert message == ( + "SeatGuide preflight ready: navigation=IDLE; perception=camera seats=2 people=1; " + "empty=1 occupied=1; selected=empty; " + "goal=(2.65, 0.00, yaw=0.00); feedback=phone_or_web." + ) + assert fake_navigation.goal is None + + +def test_seat_guide_preflight_reports_no_go_without_provider() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + skill._navigation = FakeNavigation() + skill._seat_observation_provider = None + + assert ( + skill.seat_guide_preflight() + == "SeatGuide preflight no-go: navigation=IDLE; perception=missing; feedback=phone_or_web." + ) + + +def test_seat_guide_preflight_reports_no_go_without_visible_seats() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + skill._navigation = FakeNavigation() + skill._seat_observation_provider = FakeSeatObservationProvider( + SeatSceneObservation(seats=[], people=[], source="camera_detection_error") + ) + + assert ( + skill.seat_guide_preflight() + == "SeatGuide preflight no-go: navigation=IDLE; perception=camera_detection_error " + "no seats; feedback=phone_or_web." 
+ ) + + +def test_seat_guide_preflight_reports_no_empty_seat_counts() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + skill._navigation = FakeNavigation() + skill._seat_observation_provider = FakeSeatObservationProvider( + SeatSceneObservation( + seats=[ + SeatObservation("left", x=0.0, y=0.0, yaw=0.0), + SeatObservation("right", x=1.0, y=0.0, yaw=0.0), + ], + people=[ + PersonObservation(x=0.1, y=0.0), + PersonObservation(x=1.1, y=0.0), + ], + source="camera", + ) + ) + + assert skill.seat_guide_preflight() == ( + "SeatGuide preflight no-go: navigation=IDLE; perception=camera; " + "no empty seat; empty=0 occupied=2; feedback=phone_or_web." + ) + + +def test_seat_guide_preflight_reports_no_go_when_navigation_is_busy() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + fake_navigation = FakeNavigation(state=NavigationState.FOLLOWING_PATH) + skill._navigation = fake_navigation + skill._seat_observation_provider = FakeSeatObservationProvider( + SeatSceneObservation( + seats=[SeatObservation("empty", x=2.0, y=0.0, yaw=0.0)], + people=[], + source="camera", + ) + ) + + message = skill.seat_guide_preflight() + + assert message == ( + "SeatGuide preflight no-go: navigation=FOLLOWING_PATH; " + "perception=camera seats=1 people=0; empty=1 occupied=0; selected=empty; " + "goal=(2.65, 0.00, yaw=0.00); feedback=phone_or_web." 
+ ) + assert fake_navigation.goal is None + + +def test_seat_guide_preflight_requires_live_camera_by_default() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + fake_navigation = FakeNavigation() + skill._navigation = fake_navigation + skill._seat_observation_provider = FakeSeatObservationProvider( + SeatSceneObservation( + seats=[SeatObservation("fallback", x=2.0, y=0.0, yaw=0.0)], + people=[], + source="configured_fallback", + ) + ) + + message = skill.seat_guide_preflight() + + assert message == ( + "SeatGuide preflight no-go: navigation=IDLE; perception=configured_fallback " + "is not live camera; seats=1 people=0; feedback=phone_or_web." + ) + assert fake_navigation.goal is None + + +def test_seat_guide_preflight_can_explicitly_allow_fallback_calibration() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + fake_navigation = FakeNavigation() + skill._navigation = fake_navigation + skill._seat_observation_provider = FakeSeatObservationProvider( + SeatSceneObservation( + seats=[SeatObservation("fallback", x=2.0, y=0.0, yaw=0.0)], + people=[], + source="configured_fallback", + ) + ) + + message = skill.seat_guide_preflight(require_live_perception=False) + + assert "SeatGuide preflight ready" in message + assert "perception=configured_fallback" in message + assert fake_navigation.goal is None + + +def test_seat_guide_readiness_report_combines_no_motion_checks() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + fake_navigation = FakeNavigation() + skill._navigation = fake_navigation + skill._seat_observation_provider = FakeSeatObservationProvider( + SeatSceneObservation( + seats=[ + SeatObservation("occupied", x=0.0, y=0.0, yaw=0.0), + SeatObservation("empty", x=2.0, y=0.0, yaw=0.0), + ], + people=[PersonObservation(x=0.1, y=0.0)], + robot_x=0.0, + robot_y=0.0, + source="camera", + ) + ) + + message = skill.seat_guide_readiness_report() + + assert message.startswith("SeatGuide 
readiness report: SeatGuide scene source=camera") + assert "SeatGuide preflight ready" in message + assert "empty=1 occupied=1" in message + assert "SeatGuide preview source=camera" in message + assert fake_navigation.goal is None + + +def test_seat_guide_readiness_report_keeps_fallback_no_go_by_default() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + fake_navigation = FakeNavigation() + skill._navigation = fake_navigation + skill._seat_observation_provider = FakeSeatObservationProvider( + SeatSceneObservation( + seats=[SeatObservation("fallback", x=2.0, y=0.0, yaw=0.0)], + people=[], + source="configured_fallback", + ) + ) + + message = skill.seat_guide_readiness_report() + + assert "SeatGuide preflight no-go" in message + assert "perception=configured_fallback is not live camera" in message + assert "SeatGuide preview source=configured_fallback" in message + assert fake_navigation.goal is None + + +def test_seat_guide_readiness_report_reads_scene_once() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + provider = CountingSeatObservationProvider( + SeatSceneObservation( + seats=[SeatObservation("empty", x=2.0, y=0.0, yaw=0.0)], + people=[], + source="camera", + ) + ) + skill._navigation = FakeNavigation() + skill._seat_observation_provider = provider + + skill.seat_guide_readiness_report() + + assert provider.calls == 1 + + +def test_seat_guide_navigation_status_reports_goal_reached() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + skill._navigation = FakeNavigation(goal_reached=True) + + assert skill.seat_guide_navigation_status() == ( + "SeatGuide navigation status: navigation=IDLE; goal_reached=true; goal_sequence=0." 
+ ) + + +def test_seat_guide_navigation_status_reports_busy_goal_not_reached() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + skill._navigation = FakeNavigation( + state=NavigationState.FOLLOWING_PATH, + goal_reached=False, + ) + + assert skill.seat_guide_navigation_status() == ( + "SeatGuide navigation status: navigation=FOLLOWING_PATH; " + "goal_reached=false; goal_sequence=0." + ) + + +def test_seat_guide_navigation_status_reports_missing_navigation() -> None: + skill = SeatGuideSkillContainer.__new__(SeatGuideSkillContainer) + skill._navigation = None + + assert skill.seat_guide_navigation_status() == ( + "SeatGuide navigation status: navigation=missing; goal_reached=unknown; " + "goal_sequence=0." + ) + + +def test_web_input_routes_seat_voice_text_directly_to_seat_guide() -> None: + web_input = WebInput.__new__(WebInput) + fake_seat_guide = FakeSeatGuideRequest() + fake_transport = FakeHumanTransport() + fake_agent_responses = FakeAgentResponses() + cloud_posts: list[str] = [] + web_input._seat_guide = fake_seat_guide + web_input._human_transport = fake_transport + web_input._agent_responses = fake_agent_responses + web_input._post_cloud_speaker = cloud_posts.append + + web_input._route_text("帮我找一个空位") + + assert fake_seat_guide.requests == ["帮我找一个空位"] + assert fake_seat_guide.preview_requests == [] + assert fake_transport.published == [] + assert fake_agent_responses.published == ["handled"] + assert cloud_posts == ["handled"] + + +def test_web_input_logs_live_seat_guide_route_for_voice_bringup( + monkeypatch: pytest.MonkeyPatch, +) -> None: + web_input = WebInput.__new__(WebInput) + fake_seat_guide = FakeSeatGuideRequest() + fake_transport = FakeHumanTransport() + fake_agent_responses = FakeAgentResponses() + fake_logger = FakeLogger() + cloud_posts: list[str] = [] + web_input._seat_guide = fake_seat_guide + web_input._human_transport = fake_transport + web_input._agent_responses = fake_agent_responses + 
web_input._post_cloud_speaker = cloud_posts.append + monkeypatch.setattr(web_human_input_module, "logger", fake_logger) + + web_input._route_text("帮我找一个空位") + + assert fake_logger.info_calls == [ + ("WebInput received text", {"text": "帮我找一个空位"}), + ("WebInput routing text to SeatGuide live request", {"text": "帮我找一个空位"}), + ] + + +def test_web_input_routes_preview_seat_voice_text_without_navigation_request( + monkeypatch: pytest.MonkeyPatch, +) -> None: + web_input = WebInput.__new__(WebInput) + fake_seat_guide = FakeSeatGuideRequest() + fake_transport = FakeHumanTransport() + fake_agent_responses = FakeAgentResponses() + fake_logger = FakeLogger() + cloud_posts: list[str] = [] + web_input._seat_guide = fake_seat_guide + web_input._human_transport = fake_transport + web_input._agent_responses = fake_agent_responses + web_input._post_cloud_speaker = cloud_posts.append + monkeypatch.setattr(web_human_input_module, "logger", fake_logger) + + web_input._route_text("预检帮我找一个空位") + + assert fake_seat_guide.preview_requests == ["预检帮我找一个空位"] + assert fake_seat_guide.requests == [] + assert fake_transport.published == [] + assert fake_agent_responses.published == ["previewed"] + assert cloud_posts == [] + assert fake_logger.info_calls == [ + ("WebInput received text", {"text": "预检帮我找一个空位"}), + ( + "WebInput routing text to SeatGuide preview", + {"text": "预检帮我找一个空位"}, + ), + ] + + +def test_web_input_submit_query_http_route_reaches_seat_guide_preview() -> None: + web_input = WebInput.__new__(WebInput) + fake_seat_guide = FakeSeatGuideRequest() + fake_transport = FakeHumanTransport() + fake_agent_responses = FakeAgentResponses() + web_input._seat_guide = fake_seat_guide + web_input._human_transport = fake_transport + web_input._agent_responses = fake_agent_responses + + interface = RobotWebInterface(port=5555) + subscription = interface.query_stream.subscribe(web_input._route_text) + + try: + response = TestClient(interface.app).post( + "/submit_query", data={"query": 
"预检帮我找一个空位"} + ) + finally: + subscription.dispose() + + assert response.status_code == 200 + assert response.json() == {"success": True, "message": "Query received"} + assert fake_seat_guide.preview_requests == ["预检帮我找一个空位"] + assert fake_seat_guide.requests == [] + assert fake_transport.published == [] + assert fake_agent_responses.published == ["previewed"] + + +def test_web_input_phone_speaker_test_publishes_response() -> None: + web_input = WebInput.__new__(WebInput) + fake_agent_responses = FakeAgentResponses() + web_input._agent_responses = fake_agent_responses + + message = web_input.phone_speaker_test("i am here") + + assert message == "Phone speaker test sent: i am here; local=sent; cloud=not_configured" + assert fake_agent_responses.published == ["i am here"] + + +def test_web_input_phone_speaker_test_posts_cloud_response( + monkeypatch: pytest.MonkeyPatch, +) -> None: + class FakeResponse: + def raise_for_status(self) -> None: + return None + + calls: list[dict[str, Any]] = [] + + def fake_post(url: str, **kwargs: Any) -> FakeResponse: + calls.append({"url": url, **kwargs}) + return FakeResponse() + + monkeypatch.setenv("SEAT_GUIDE_SPEAKER_URL", "https://speaker.example") + monkeypatch.setenv("SEAT_GUIDE_SPEAKER_TOKEN", "test-token") + monkeypatch.setenv("SEAT_GUIDE_SPEAKER_DEVICE", "go2-lab") + monkeypatch.setattr(web_human_input_module.requests, "post", fake_post) + + web_input = WebInput.__new__(WebInput) + web_input._agent_responses = None + + message = web_input.phone_speaker_test("i am here") + + assert message == "Phone speaker test sent: i am here; local=missing; cloud=sent" + assert calls == [ + { + "url": "https://speaker.example/api/speak", + "json": {"device": "go2-lab", "text": "i am here"}, + "headers": { + "content-type": "application/json", + "authorization": "Bearer test-token", + }, + "timeout": 5.0, + } + ] + + +def test_web_input_phone_seat_request_routes_and_publishes_response() -> None: + web_input = WebInput.__new__(WebInput) + 
fake_seat_guide = FakeSeatGuideRequest() + fake_agent_responses = FakeAgentResponses() + web_input._seat_guide = fake_seat_guide + web_input._agent_responses = fake_agent_responses + + message = web_input.phone_seat_request("帮我找一个空位") + + assert message == "handled" + assert fake_seat_guide.requests == ["帮我找一个空位"] + assert fake_agent_responses.published == ["handled"] + + +def test_seat_guide_speaker_page_exposes_phone_speaker_stream() -> None: + interface = RobotWebInterface( + port=5555, + text_streams={"agent_responses": web_human_input_module.rx.subject.Subject()}, + ) + + response = TestClient(interface.app).get("/seat-guide-speaker") + + assert response.status_code == 200 + assert "/text_stream/agent_responses" in response.text + assert "speechSynthesis" in response.text + assert "Enable speaker" in response.text + + +def test_seat_guide_camera_detect_frame_returns_object_description() -> None: + class FakeSeatGuideCameraModel: + def query(self, image: Image, query: str) -> str: + return "The image shows a laptop and a coffee mug." + + def query_detections(self, image: Image, query: str, **kwargs: Any) -> SimpleNamespace: + detections = [SimpleNamespace(bbox=(10.0, 10.0, 40.0, 40.0))] if query == "chair" else [] + return SimpleNamespace(detections=detections) + + interface = RobotWebInterface(port=5555) + interface._seat_guide_model = FakeSeatGuideCameraModel() + + result = interface._detect_seat_guide_frame( + np.zeros((60, 80, 3), dtype=np.uint8), + detector="moondream", + ) + + assert result["detector"] == "moondream2" + assert result["description"] == "The image shows a laptop and a coffee mug." 
+ assert result["chairs"] == 1 + assert result["people"] == 0 + assert result["empty"] == 1 + + +def test_seat_guide_camera_detect_frame_uses_yolo_detector() -> None: + class FakeSeatGuideYoloDetector: + def process_image(self, image: Image) -> ImageDetections2D: + return ImageDetections2D( + image, + [ + Detection2DBBox( + bbox=(10.0, 10.0, 40.0, 40.0), + track_id=1, + class_id=56, + confidence=0.9, + name="chair", + ts=image.ts, + image=image, + ), + Detection2DBBox( + bbox=(12.0, 12.0, 38.0, 38.0), + track_id=2, + class_id=0, + confidence=0.8, + name="person", + ts=image.ts, + image=image, + ), + ], + ) + + interface = RobotWebInterface(port=5555) + interface._seat_guide_yolo_detector = FakeSeatGuideYoloDetector() + + result = interface._detect_seat_guide_frame(np.zeros((60, 80, 3), dtype=np.uint8)) + + assert result["detector"] == "yolo11n" + assert result["chairs"] == 1 + assert result["people"] == 1 + assert result["empty"] == 0 + + +def test_web_input_upload_audio_http_route_emits_audio_event( + monkeypatch: pytest.MonkeyPatch, +) -> None: + audio_events: list[Any] = [] + audio_subject = web_human_input_module.rx.subject.Subject() + audio_subject.subscribe(audio_events.append) + + decoded_audio = np.array([0.1, -0.1, 0.0], dtype=np.float32) + monkeypatch.setattr( + "dimos.web.dimos_interface.api.server.FastAPIServer._decode_audio", + staticmethod(lambda raw: (decoded_audio, 16000)), + ) + + interface = RobotWebInterface(port=5555, audio_subject=audio_subject) + response = TestClient(interface.app).post( + "/upload_audio", + files={"file": ("seat-guide.webm", b"browser audio", "audio/webm")}, + ) + + assert response.status_code == 200 + assert response.json() == {"success": True} + assert len(audio_events) == 1 + event = audio_events[0] + assert event.sample_rate == 16000 + assert event.channels == 1 + assert np.array_equal(event.data, decoded_audio) + + +def test_web_input_upload_audio_requires_configured_voice_subject() -> None: + interface = 
RobotWebInterface(port=5555, audio_subject=None) + response = TestClient(interface.app).post( + "/upload_audio", + files={"file": ("seat-guide.webm", b"browser audio", "audio/webm")}, + ) + + assert response.status_code == 400 + assert response.json() == { + "success": False, + "message": "Voice input not configured", + } + + +def test_web_input_upload_audio_rejects_decode_failures( + monkeypatch: pytest.MonkeyPatch, +) -> None: + audio_events: list[Any] = [] + audio_subject = web_human_input_module.rx.subject.Subject() + audio_subject.subscribe(audio_events.append) + monkeypatch.setattr( + "dimos.web.dimos_interface.api.server.FastAPIServer._decode_audio", + staticmethod(lambda raw: (None, None)), + ) + + interface = RobotWebInterface(port=5555, audio_subject=audio_subject) + response = TestClient(interface.app).post( + "/upload_audio", + files={"file": ("seat-guide.webm", b"not audio", "audio/webm")}, + ) + + assert response.status_code == 400 + assert response.json() == {"success": False, "message": "Unable to decode audio"} + assert audio_events == [] + + +def test_web_input_falls_back_to_agent_path_when_seat_guide_route_fails() -> None: + web_input = WebInput.__new__(WebInput) + fake_seat_guide = FakeSeatGuideRequest(raises=True) + fake_transport = FakeHumanTransport() + fake_agent_responses = FakeAgentResponses() + web_input._seat_guide = fake_seat_guide + web_input._human_transport = fake_transport + web_input._agent_responses = fake_agent_responses + + web_input._route_text("帮我找一个空位") + + assert fake_seat_guide.requests == ["帮我找一个空位"] + assert fake_transport.published == ["帮我找一个空位"] + assert fake_agent_responses.published == [] + + +def test_web_input_keeps_unrelated_voice_text_on_agent_path( + monkeypatch: pytest.MonkeyPatch, +) -> None: + web_input = WebInput.__new__(WebInput) + fake_seat_guide = FakeSeatGuideRequest() + fake_transport = FakeHumanTransport() + fake_logger = FakeLogger() + web_input._seat_guide = fake_seat_guide + 
web_input._human_transport = fake_transport + monkeypatch.setattr(web_human_input_module, "logger", fake_logger) + + web_input._route_text("what time is the meeting") + + assert fake_seat_guide.requests == [] + assert fake_transport.published == ["what time is the meeting"] + assert fake_logger.info_calls == [ + ("WebInput received text", {"text": "what time is the meeting"}), + ("WebInput routing text to agent path", {"text": "what time is the meeting"}), + ] + + +def test_web_input_status_reports_not_started_state() -> None: + web_input = WebInput.__new__(WebInput) + web_input._web_interface = None + web_input._thread = None + web_input._seat_guide = None + web_input._agent_responses = None + web_input._stt_node = None + web_input._stt_error = None + web_input._human_transport = None + + assert web_input.web_input_status() == ( + "WebInput status: web=not_started; thread=not_running; " + "seat_route=agent_only; responses=missing; voice_upload=missing; " + "stt=missing; human_transport=missing; url=unavailable." + ) + + +def test_web_input_status_reports_seat_guide_direct_route() -> None: + web_input = WebInput.__new__(WebInput) + web_input._web_interface = FakeWebInterface() + web_input._thread = FakeThread(alive=True) + web_input._seat_guide = FakeSeatGuideRequest() + web_input._agent_responses = FakeAgentResponses() + web_input._stt_node = SimpleNamespace() + web_input._stt_error = None + web_input._human_transport = FakeHumanTransport() + + assert web_input.web_input_status() == ( + "WebInput status: web=started; thread=running; " + "seat_route=seat_guide_direct; responses=connected; " + "voice_upload=connected; stt=connected; human_transport=connected; " + "url=http://localhost:5555." 
+ ) + + +def test_web_input_status_reports_stt_initialization_error() -> None: + web_input = WebInput.__new__(WebInput) + web_input._web_interface = FakeWebInterface() + web_input._thread = FakeThread(alive=True) + web_input._seat_guide = FakeSeatGuideRequest() + web_input._agent_responses = FakeAgentResponses() + web_input._stt_node = None + web_input._stt_error = "RuntimeError: whisper missing" + web_input._human_transport = FakeHumanTransport() + + assert web_input.web_input_status() == ( + "WebInput status: web=started; thread=running; " + "seat_route=seat_guide_direct; responses=connected; " + "voice_upload=connected; stt=error(RuntimeError: whisper missing); " + "human_transport=connected; url=http://localhost:5555." + ) + + +def test_web_input_status_reports_missing_browser_voice_upload() -> None: + web_input = WebInput.__new__(WebInput) + web_input._web_interface = SimpleNamespace(port=5555, audio_subject=None) + web_input._thread = FakeThread(alive=True) + web_input._seat_guide = FakeSeatGuideRequest() + web_input._agent_responses = FakeAgentResponses() + web_input._stt_node = SimpleNamespace() + web_input._stt_error = None + web_input._human_transport = FakeHumanTransport() + + assert web_input.web_input_status() == ( + "WebInput status: web=started; thread=running; " + "seat_route=seat_guide_direct; responses=connected; " + "voice_upload=missing; stt=connected; human_transport=connected; " + "url=http://localhost:5555." 
+ ) + + +def test_web_input_whisper_auto_detects_language(monkeypatch: pytest.MonkeyPatch) -> None: + created_modelopts: list[dict[str, Any] | None] = [] + + class FakeWhisperNode: + def __init__( + self, + model: str = "base", + modelopts: dict[str, Any] | None = None, + ) -> None: + created_modelopts.append(modelopts) + + monkeypatch.setitem( + sys.modules, + "dimos.stream.audio.stt.node_whisper", + SimpleNamespace(WhisperNode=FakeWhisperNode), + ) + + _create_whisper_node() + + assert created_modelopts == [{"fp16": False}] + assert "language" not in created_modelopts[0] + + +def test_autoconnect_injects_scene_provider_and_navigation() -> None: + blueprint = autoconnect( + SeatGuideSkillContainer.blueprint(), + SyntheticSeatObservationProvider.blueprint( + seats=[0.0, 0.0, 0.0, 2.0, 0.0, 0.0], + people=[0.1, 0.0], + robot_x=0.0, + robot_y=0.0, + ), + RecordingNavigation.blueprint(), + ) + coordinator = ModuleCoordinator.build(blueprint, {"g": {"viewer": "none"}}) + + try: + seat_guide = coordinator.get_instance(SeatGuideSkillContainer) + navigation = coordinator.get_instance(RecordingNavigation) + + message = seat_guide.handle_seat_request( + "Please find me an empty seat", + require_live_perception=False, + wait_for_arrival=False, + ) + + assert "seat_2" in message + assert navigation.get_last_goal_xy() == pytest.approx((2.65, 0.0)) + finally: + coordinator.stop() + + +def test_autoconnect_uses_runtime_configured_synthetic_scene() -> None: + blueprint = autoconnect( + SeatGuideSkillContainer.blueprint(), + SyntheticSeatObservationProvider.blueprint( + seats=[0.0, 0.0, 0.0], + people=[], + robot_x=0.0, + robot_y=0.0, + ), + RecordingNavigation.blueprint(), + ) + coordinator = ModuleCoordinator.build(blueprint, {"g": {"viewer": "none"}}) + + try: + seat_guide = coordinator.get_instance(SeatGuideSkillContainer) + provider = coordinator.get_instance(SyntheticSeatObservationProvider) + navigation = coordinator.get_instance(RecordingNavigation) + + 
provider.set_seat_scene( + seats=[0.0, 0.0, 0.0, 4.0, 0.0, 0.0], + people=[0.1, 0.0], + robot_x=0.0, + robot_y=0.0, + ) + message = seat_guide.handle_seat_request( + "Please find me an empty seat", + require_live_perception=False, + wait_for_arrival=False, + ) + + assert "seat_2" in message + assert navigation.get_last_goal_xy() == pytest.approx((4.65, 0.0)) + finally: + coordinator.stop() + + +def test_autoconnect_wires_seat_guide_without_robot_speaker() -> None: + blueprint = autoconnect( + SeatGuideSkillContainer.blueprint(), + SyntheticSeatObservationProvider.blueprint( + seats=[0.0, 0.0, 0.0, 2.0, 0.0, 0.0], + people=[0.1, 0.0], + robot_x=0.0, + robot_y=0.0, + ), + RecordingNavigation.blueprint(), + ) + coordinator = ModuleCoordinator.build(blueprint, {"g": {"viewer": "none"}}) + + try: + seat_guide = coordinator.get_instance(SeatGuideSkillContainer) + + message = seat_guide.handle_seat_request( + "Please find me an empty seat", + require_live_perception=False, + wait_for_arrival=False, + ) + + assert "seat_2" in message + finally: + coordinator.stop() + + +def test_seat_guide_exposes_agent_friendly_skills() -> None: + skill = SeatGuideSkillContainer() + try: + skill_infos = {info.func_name: json.loads(info.args_schema) for info in skill.get_skills()} + finally: + skill._close_module() + + assert "handle_seat_request" in skill_infos + assert "preview_seat_request" in skill_infos + assert "find_empty_seat_from_scene" in skill_infos + assert "seat_guide_preflight" in skill_infos + assert "seat_guide_readiness_report" in skill_infos + assert "seat_guide_navigation_status" in skill_infos + assert "preview_empty_seat_goal" in skill_infos + assert "seat_guide_status" in skill_infos + assert "camera_seat_provider_status" not in skill_infos + + request_schema = skill_infos["handle_seat_request"] + assert "spoken or typed request" in request_schema["description"] + assert request_schema["properties"] == { + "text": {"title": "Text", "type": "string"}, + 
"require_live_perception": { + "default": True, + "title": "Require Live Perception", + "type": "boolean", + }, + "wait_for_arrival": { + "default": True, + "title": "Wait For Arrival", + "type": "boolean", + }, + "arrival_timeout_s": { + "default": 60.0, + "title": "Arrival Timeout S", + "type": "number", + }, + } + assert request_schema["required"] == ["text"] + + preview_request_schema = skill_infos["preview_seat_request"] + assert "without moving" in preview_request_schema["description"] + assert preview_request_schema["properties"] == {"text": {"title": "Text", "type": "string"}} + assert preview_request_schema["required"] == ["text"] + + scene_schema = skill_infos["find_empty_seat_from_scene"] + assert "observation provider" in scene_schema["description"] + assert scene_schema["properties"] == { + "require_live_perception": { + "default": True, + "title": "Require Live Perception", + "type": "boolean", + }, + "wait_for_arrival": { + "default": False, + "title": "Wait For Arrival", + "type": "boolean", + }, + "arrival_timeout_s": { + "default": 60.0, + "title": "Arrival Timeout S", + "type": "number", + }, + } + + preflight_schema = skill_infos["seat_guide_preflight"] + assert "no-motion" in preflight_schema["description"] + assert preflight_schema["properties"] == { + "require_live_perception": { + "default": True, + "title": "Require Live Perception", + "type": "boolean", + } + } + + readiness_schema = skill_infos["seat_guide_readiness_report"] + assert "readiness checks" in readiness_schema["description"] + assert readiness_schema["properties"] == { + "require_live_perception": { + "default": True, + "title": "Require Live Perception", + "type": "boolean", + } + } + + navigation_status_schema = skill_infos["seat_guide_navigation_status"] + assert "navigation goal has completed" in navigation_status_schema["description"] + assert navigation_status_schema.get("properties", {}) == {} + + preview_schema = skill_infos["preview_empty_seat_goal"] + assert "without 
moving" in preview_schema["description"] + assert preview_schema.get("properties", {}) == {} + + status_schema = skill_infos["seat_guide_status"] + assert "without navigating" in status_schema["description"] + assert status_schema.get("properties", {}) == {} + + +def test_web_input_exposes_bringup_status_skill() -> None: + web_input = WebInput() + try: + skill_infos = { + info.func_name: json.loads(info.args_schema) for info in web_input.get_skills() + } + finally: + web_input._close_module() + + assert "web_input_status" in skill_infos + status_schema = skill_infos["web_input_status"] + assert "voice and text routing readiness" in status_schema["description"] + assert status_schema.get("properties", {}) == {} + + +def test_camera_provider_exposes_bringup_status_skill() -> None: + provider = CameraSeatObservationProvider() + try: + skill_infos = { + info.func_name: json.loads(info.args_schema) for info in provider.get_skills() + } + finally: + provider._close_module() + + assert "camera_seat_provider_status" in skill_infos + status_schema = skill_infos["camera_seat_provider_status"] + assert "perception readiness" in status_schema["description"] + assert status_schema.get("properties", {}) == {} + + +def test_seat_guide_mcp_request_flow_without_go2() -> None: + skill = SeatGuideSkillContainer() + fake_navigation = FakeNavigation() + skill._navigation = fake_navigation + skill._seat_observation_provider = FakeSeatObservationProvider( + SeatSceneObservation( + seats=[ + SeatObservation("occupied", x=0.0, y=0.0, yaw=0.0), + SeatObservation("empty", x=2.0, y=0.0, yaw=0.0), + ], + people=[PersonObservation(x=0.1, y=0.0)], + robot_x=0.0, + robot_y=0.0, + source="configured_fallback", + ) + ) + + try: + skills = skill.get_skills() + rpc_calls = {info.func_name: getattr(skill, info.func_name) for info in skills} + + response = asyncio.run( + handle_request({"method": "tools/list", "id": 1}, skills, rpc_calls) + ) + tool_names = {tool["name"] for tool in 
response["result"]["tools"]} + assert { + "seat_guide_status", + "preview_seat_request", + "seat_guide_preflight", + "seat_guide_readiness_report", + "seat_guide_navigation_status", + "preview_empty_seat_goal", + "handle_seat_request", + } <= tool_names + + response = asyncio.run( + handle_request( + { + "method": "tools/call", + "id": 2, + "params": {"name": "seat_guide_status", "arguments": {}}, + }, + skills, + rpc_calls, + ) + ) + assert "source=configured_fallback" in response["result"]["content"][0]["text"] + + response = asyncio.run( + handle_request( + { + "method": "tools/call", + "id": 3, + "params": { + "name": "seat_guide_preflight", + "arguments": {"require_live_perception": False}, + }, + }, + skills, + rpc_calls, + ) + ) + assert "SeatGuide preflight ready" in response["result"]["content"][0]["text"] + assert fake_navigation.goal is None + + response = asyncio.run( + handle_request( + { + "method": "tools/call", + "id": 4, + "params": { + "name": "seat_guide_readiness_report", + "arguments": {"require_live_perception": False}, + }, + }, + skills, + rpc_calls, + ) + ) + assert "SeatGuide readiness report" in response["result"]["content"][0]["text"] + assert fake_navigation.goal is None + + response = asyncio.run( + handle_request( + { + "method": "tools/call", + "id": 5, + "params": {"name": "preview_empty_seat_goal", "arguments": {}}, + }, + skills, + rpc_calls, + ) + ) + assert "goal=(2.65, 0.00" in response["result"]["content"][0]["text"] + assert fake_navigation.goal is None + + response = asyncio.run( + handle_request( + { + "method": "tools/call", + "id": 6, + "params": { + "name": "preview_seat_request", + "arguments": {"text": "预检帮我找一个空位"}, + }, + }, + skills, + rpc_calls, + ) + ) + assert "is not live camera" in response["result"]["content"][0]["text"] + assert fake_navigation.goal is None + + response = asyncio.run( + handle_request( + { + "method": "tools/call", + "id": 7, + "params": { + "name": "handle_seat_request", + "arguments": { + 
"text": "帮我找一个空位", + "require_live_perception": False, + }, + }, + }, + skills, + rpc_calls, + ) + ) + assert "seat_2" in response["result"]["content"][0]["text"] + assert fake_navigation.goal is not None + assert fake_navigation.goal.position.x == pytest.approx(2.65) + + fake_navigation.goal_reached = True + response = asyncio.run( + handle_request( + { + "method": "tools/call", + "id": 8, + "params": {"name": "seat_guide_navigation_status", "arguments": {}}, + }, + skills, + rpc_calls, + ) + ) + assert "goal_reached=true" in response["result"]["content"][0]["text"] + finally: + skill._close_module() + + +def test_seat_guide_go2_blueprints_include_real_runtime_modules() -> None: + from dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_seat_guide import ( + unitree_go2_seat_guide, + ) + from dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_seat_guide_agentic import ( + unitree_go2_seat_guide_agentic, + ) + + agentic_modules = {atom.module.__name__ for atom in unitree_go2_seat_guide_agentic.blueprints} + direct_modules = {atom.module.__name__ for atom in unitree_go2_seat_guide.blueprints} + + assert { + "GO2Connection", + "McpServer", + "McpClient", + "CameraSeatObservationProvider", + "SeatGuideSkillContainer", + "WebInput", + } <= agentic_modules + assert { + "GO2Connection", + "McpServer", + "CameraSeatObservationProvider", + "SeatGuideSkillContainer", + "WebInput", + } <= direct_modules + assert "UnitreeSpeakSkill" not in agentic_modules + assert "UnitreeSpeakSkill" not in direct_modules + assert "SpatialMemory" not in agentic_modules + assert "SpatialMemory" not in direct_modules + assert "PersonFollowSkillContainer" not in agentic_modules + assert "PersonFollowSkillContainer" not in direct_modules + assert "McpClient" not in direct_modules + + +def test_go2_seat_guide_blueprints_wire_web_input_to_seat_guide() -> None: + from dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_seat_guide import ( + unitree_go2_seat_guide, + ) + from 
dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_seat_guide_agentic import ( + unitree_go2_seat_guide_agentic, + ) + + for blueprint in (unitree_go2_seat_guide, unitree_go2_seat_guide_agentic): + web_input_atom = next( + atom for atom in blueprint.blueprints if atom.module is WebInput + ) + seat_guide_refs = [ + ref for ref in web_input_atom.module_refs if ref.name == "_seat_guide" + ] + + assert len(seat_guide_refs) == 1 + assert seat_guide_refs[0].spec is SeatGuideRequestSpec + assert seat_guide_refs[0].optional + + +def test_synthetic_observation_provider_returns_configured_scene() -> None: + provider = SyntheticSeatObservationProvider.__new__(SyntheticSeatObservationProvider) + provider.config = SyntheticSeatSceneConfig( + seats=[0.0, 1.0, 0.0, 2.0, 1.0, 0.5], + people=[0.1, 1.0], + robot_x=-1.0, + robot_y=1.0, + ) + + scene = provider.get_seat_scene() + + assert scene.seats == [ + SeatObservation("seat_1", 0.0, 1.0, 0.0), + SeatObservation("seat_2", 2.0, 1.0, 0.5), + ] + assert scene.people == [PersonObservation(0.1, 1.0)] + assert scene.robot_x == -1.0 + assert scene.robot_y == 1.0 + assert scene.source == "configured_fallback" + + +def test_synthetic_observation_provider_runtime_override() -> None: + provider = SyntheticSeatObservationProvider.__new__(SyntheticSeatObservationProvider) + provider.config = SyntheticSeatSceneConfig( + seats=[0.0, 0.0, 0.0], + people=[], + robot_x=0.0, + robot_y=0.0, + ) + provider._scene_override = None + + message = provider.set_seat_scene( + seats=[1.0, 2.0, 0.0, 3.0, 2.0, 0.0], + people=[1.1, 2.0], + robot_x=-1.0, + robot_y=2.0, + ) + scene = provider.get_seat_scene() + + assert message == "Configured 2 seats and 1 person." 
+ assert scene.seats == [ + SeatObservation("seat_1", 1.0, 2.0, 0.0), + SeatObservation("seat_2", 3.0, 2.0, 0.0), + ] + assert scene.people == [PersonObservation(1.1, 2.0)] + assert scene.robot_x == -1.0 + assert scene.robot_y == 2.0 + assert scene.source == "runtime_override" + + assert provider.clear_seat_scene_override() == "Cleared synthetic seat scene override." + assert provider.get_seat_scene().seats == [SeatObservation("seat_1", 0.0, 0.0, 0.0)] + + +def test_camera_observation_provider_detects_scene_from_image() -> None: + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig( + seats=[9.0, 9.0, 0.0], + people=[], + robot_x=0.0, + robot_y=0.0, + chair_distance_m=2.0, + lateral_span_m=4.0, + ) + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = Image.from_numpy(np.zeros((100, 100, 3), dtype=np.uint8)) + provider._latest_odom = PoseStamped( + frame_id="map", + position=Vector3(0.0, 0.0, 0.0), + orientation=Quaternion.from_euler(Vector3(0.0, 0.0, 0.0)), + ) + fake_vl_model = FakeVlModel( + { + "chair": [ + SimpleNamespace(name="chair", bbox=(10.0, 20.0, 30.0, 80.0)), + SimpleNamespace(name="chair", bbox=(70.0, 20.0, 90.0, 80.0)), + ], + "person": [ + SimpleNamespace(name="person", bbox=(12.0, 10.0, 32.0, 90.0)), + ], + } + ) + provider._vl_model = fake_vl_model + + scene = provider.get_seat_scene() + + assert fake_vl_model.queries == ["chair", "person"] + assert [seat.seat_id for seat in scene.seats] == ["seat_1", "seat_2"] + assert [seat.x for seat in scene.seats] == pytest.approx([2.0, 2.0]) + assert [seat.y for seat in scene.seats] == pytest.approx([-1.2, 1.2]) + assert [seat.yaw for seat in scene.seats] == pytest.approx([0.0, 0.0]) + assert [person.x for person in scene.people] == pytest.approx([2.0]) + assert [person.y for person in scene.people] == pytest.approx([-1.12]) + assert scene.source == "camera" + + +def 
test_camera_observation_provider_prefers_fast_detector() -> None: + image = Image.from_numpy(np.zeros((100, 100, 3), dtype=np.uint8)) + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig( + seats=[], + people=[], + robot_x=0.0, + robot_y=0.0, + chair_distance_m=2.0, + lateral_span_m=4.0, + ) + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = image + provider._latest_odom = PoseStamped( + frame_id="map", + position=Vector3(0.0, 0.0, 0.0), + orientation=Quaternion.from_euler(Vector3(0.0, 0.0, 0.0)), + ) + provider._vl_model = None + provider._fast_detector = FakeFastDetector( + [ + Detection2DBBox( + bbox=(40.0, 20.0, 60.0, 80.0), + track_id=1, + class_id=56, + confidence=0.9, + name="chair", + ts=image.ts, + image=image, + ), + Detection2DBBox( + bbox=(80.0, 20.0, 100.0, 80.0), + track_id=2, + class_id=0, + confidence=0.8, + name="person", + ts=image.ts, + image=image, + ), + Detection2DBBox( + bbox=(10.0, 10.0, 20.0, 20.0), + track_id=3, + class_id=0, + confidence=0.7, + name="book", + ts=image.ts, + image=image, + ), + ] + ) + + scene = provider.get_seat_scene() + + assert provider._fast_detector.calls == 1 + assert len(scene.seats) == 1 + assert len(scene.people) == 1 + assert scene.source == "camera" + + +def test_camera_observation_provider_empty_fast_detector_does_not_fallback_to_vlm() -> None: + image = Image.from_numpy(np.zeros((100, 100, 3), dtype=np.uint8)) + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig( + seats=[], + people=[], + robot_x=0.0, + robot_y=0.0, + ) + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = image + provider._latest_odom = PoseStamped( + frame_id="map", + position=Vector3(0.0, 0.0, 0.0), + orientation=Quaternion.from_euler(Vector3(0.0, 0.0, 0.0)), + ) + provider._vl_model = None + provider._fast_detector = 
FakeFastDetector([]) + + scene = provider.get_seat_scene() + + assert provider._fast_detector.calls == 1 + assert scene.source == "camera_no_seats_detected" + assert scene.seats == [] + + +def test_camera_observation_provider_projects_detections_with_lidar( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(seat_guide_module.time, "time", lambda: 11.0) + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig( + seats=[], + people=[], + robot_x=0.0, + robot_y=0.0, + chair_distance_m=2.0, + lateral_span_m=4.0, + ) + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = Image.from_numpy(np.zeros((100, 100, 3), dtype=np.uint8), ts=10.0) + provider._latest_camera_info = CameraInfo(width=100, height=100, ts=10.0) + provider._latest_lidar = PointCloud2.from_numpy( + np.array([[4.0, 1.0, 0.4], [4.2, 1.2, 0.5]], dtype=np.float32), + frame_id="world", + timestamp=10.0, + ) + provider._latest_odom = PoseStamped( + ts=10.0, + frame_id="map", + position=Vector3(1.0, 1.0, 0.0), + orientation=Quaternion.from_euler(Vector3(0.0, 0.0, 0.0)), + ) + provider._vl_model = FakeVlModel( + { + "chair": [SimpleNamespace(name="chair", bbox=(40.0, 20.0, 60.0, 80.0))], + "person": [], + } + ) + provider._tf = SimpleNamespace(get=lambda *args: SimpleNamespace()) + + def fake_from_2d(**kwargs: Any) -> SimpleNamespace: + return SimpleNamespace( + pointcloud=PointCloud2.from_numpy( + np.array( + [[3.0, 2.0, 0.1], [5.0, 4.0, 0.8], [5.2, 4.2, 0.9]], + dtype=np.float32, + ), + frame_id="world", + timestamp=10.0, + ) + ) + + monkeypatch.setattr(seat_guide_module.Detection3DPC, "from_2d", fake_from_2d) + + scene = provider.get_seat_scene() + + assert scene.source == "camera_3d" + assert scene.seats[0].x == pytest.approx(5.1) + assert scene.seats[0].y == pytest.approx(4.1) + assert scene.seats[0].yaw == pytest.approx(math.atan2(1.0 - 4.1, 1.0 - 5.1)) + + +def 
test_camera_observation_provider_projects_detections_from_latest_odom() -> None: + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig( + seats=[], + people=[], + robot_x=0.0, + robot_y=0.0, + chair_distance_m=2.0, + lateral_span_m=4.0, + ) + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = Image.from_numpy(np.zeros((100, 100, 3), dtype=np.uint8)) + provider._latest_odom = PoseStamped( + frame_id="map", + position=Vector3(10.0, 20.0, 0.0), + orientation=Quaternion.from_euler(Vector3(0.0, 0.0, math.pi / 2)), + ) + provider._vl_model = FakeVlModel( + { + "chair": [SimpleNamespace(name="chair", bbox=(40.0, 20.0, 60.0, 80.0))], + "person": [SimpleNamespace(name="person", bbox=(90.0, 20.0, 100.0, 80.0))], + } + ) + + scene = provider.get_seat_scene() + + assert scene.robot_x == pytest.approx(10.0) + assert scene.robot_y == pytest.approx(20.0) + assert len(scene.seats) == 1 + assert scene.seats[0].seat_id == "seat_1" + assert scene.seats[0].x == pytest.approx(10.0) + assert scene.seats[0].y == pytest.approx(22.0) + assert scene.seats[0].yaw == pytest.approx(math.pi / 2) + assert len(scene.people) == 1 + assert scene.people[0].x == pytest.approx(8.2) + assert scene.people[0].y == pytest.approx(22.0) + + +def test_camera_observation_provider_uses_one_odom_snapshot_per_detection() -> None: + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig( + seats=[], + people=[], + robot_x=0.0, + robot_y=0.0, + chair_distance_m=2.0, + lateral_span_m=4.0, + ) + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = Image.from_numpy(np.zeros((100, 100, 3), dtype=np.uint8)) + provider._latest_odom = PoseStamped( + frame_id="map", + position=Vector3(10.0, 20.0, 0.0), + orientation=Quaternion.from_euler(Vector3(0.0, 0.0, 0.0)), + ) + provider._vl_model = 
OdomMutatingFakeVlModel( + { + "chair": [SimpleNamespace(name="chair", bbox=(40.0, 20.0, 60.0, 80.0))], + "person": [], + }, + provider, + ) + + scene = provider.get_seat_scene() + + assert scene.robot_x == pytest.approx(10.0) + assert scene.robot_y == pytest.approx(20.0) + assert scene.seats[0].x == pytest.approx(12.0) + assert scene.seats[0].y == pytest.approx(20.0) + assert provider._latest_odom.x == pytest.approx(100.0) + + +def test_camera_observation_provider_clamps_detection_bbox_to_image_width() -> None: + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig( + seats=[], + people=[], + robot_x=0.0, + robot_y=0.0, + chair_distance_m=2.0, + lateral_span_m=4.0, + ) + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = Image.from_numpy(np.zeros((100, 100, 3), dtype=np.uint8)) + provider._latest_odom = PoseStamped( + frame_id="map", + position=Vector3(0.0, 0.0, 0.0), + orientation=Quaternion.from_euler(Vector3(0.0, 0.0, 0.0)), + ) + provider._vl_model = FakeVlModel( + { + "chair": [ + SimpleNamespace(name="chair", bbox=(-20.0, 20.0, 20.0, 80.0)), + SimpleNamespace(name="chair", bbox=(80.0, 20.0, 140.0, 80.0)), + SimpleNamespace(name="chair", bbox=(80.0, 20.0, 20.0, 80.0)), + ], + "person": [], + } + ) + + scene = provider.get_seat_scene() + + assert [seat.x for seat in scene.seats] == pytest.approx([2.0, 2.0, 2.0]) + assert [seat.y for seat in scene.seats] == pytest.approx([-1.6, 1.6, 0.0]) + + +def test_camera_observation_provider_falls_back_without_image() -> None: + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig( + seats=[1.0, 2.0, 0.0], + people=[1.1, 2.0], + robot_x=-1.0, + robot_y=2.0, + ) + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = None + provider._latest_odom = None + + scene = provider.get_seat_scene() + + assert 
scene.seats == [SeatObservation("seat_1", 1.0, 2.0, 0.0)] + assert scene.people == [PersonObservation(1.1, 2.0)] + assert scene.robot_x == -1.0 + assert scene.robot_y == 2.0 + assert scene.source == "no_camera_image" + + +def test_camera_observation_provider_requires_odom_for_camera_source() -> None: + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig( + seats=[1.0, 2.0, 0.0], + people=[1.1, 2.0], + robot_x=-1.0, + robot_y=2.0, + ) + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = Image.from_numpy(np.zeros((100, 100, 3), dtype=np.uint8)) + provider._latest_odom = None + provider._vl_model = FakeVlModel({"chair": [], "person": []}) + + scene = provider.get_seat_scene() + + assert scene.seats == [SeatObservation("seat_1", 1.0, 2.0, 0.0)] + assert scene.people == [PersonObservation(1.1, 2.0)] + assert scene.robot_x == -1.0 + assert scene.robot_y == 2.0 + assert scene.source == "camera_no_odom" + assert provider._vl_model.queries == [] + + +def test_camera_observation_provider_rejects_stale_image( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(seat_guide_module.time, "time", lambda: 200.0) + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig(max_input_age_s=5.0) + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = Image.from_numpy( + np.zeros((100, 100, 3), dtype=np.uint8), ts=190.0 + ) + provider._latest_odom = PoseStamped( + ts=199.0, + frame_id="map", + position=Vector3(0.0, 0.0, 0.0), + orientation=Quaternion.from_euler(Vector3(0.0, 0.0, 0.0)), + ) + provider._vl_model = FakeVlModel( + {"chair": [SimpleNamespace(bbox=(40, 20, 60, 80))], "person": []} + ) + + scene = provider.get_seat_scene() + + assert scene.source == "stale_camera_image" + assert provider._vl_model.queries == [] + + +def 
test_camera_observation_provider_rejects_stale_odom( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(seat_guide_module.time, "time", lambda: 200.0) + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig(max_input_age_s=5.0) + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = Image.from_numpy( + np.zeros((100, 100, 3), dtype=np.uint8), ts=199.0 + ) + provider._latest_odom = PoseStamped( + ts=190.0, + frame_id="map", + position=Vector3(0.0, 0.0, 0.0), + orientation=Quaternion.from_euler(Vector3(0.0, 0.0, 0.0)), + ) + provider._vl_model = FakeVlModel( + {"chair": [SimpleNamespace(bbox=(40, 20, 60, 80))], "person": []} + ) + + scene = provider.get_seat_scene() + + assert scene.source == "stale_camera_odom" + assert provider._vl_model.queries == [] + + +def test_camera_seat_provider_status_reports_missing_runtime_inputs( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("ALIBABA_API_KEY", raising=False) + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig( + seats=[1.0, 2.0, 0.0], + people=[1.1, 2.0], + ) + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = None + provider._latest_odom = None + + assert provider.camera_seat_provider_status() == ( + "CameraSeatObservationProvider status: image=missing; image_fresh=missing; " + "camera_info=missing; camera_info_fresh=missing; " + "lidar=missing; lidar_fresh=missing; " + "odom=missing; odom_fresh=missing; fast_detector=yolo; " + "detection_model=moondream; " + "credential=present; override=inactive; " + "configured_fallback_seats=1; configured_fallback_people=1." 
+ ) + + +def test_camera_seat_provider_status_reports_live_inputs_and_credentials( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("ALIBABA_API_KEY", "test-key") + monkeypatch.setattr(seat_guide_module.time, "time", lambda: 101.0) + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig(seats=[], people=[]) + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = Image.from_numpy( + np.zeros((120, 160, 3), dtype=np.uint8), ts=100.0 + ) + provider._latest_camera_info = CameraInfo(width=160, height=120, ts=100.0) + provider._latest_lidar = PointCloud2.from_numpy( + np.array([[1.0, 2.0, 0.5]], dtype=np.float32), + frame_id="world", + timestamp=100.0, + ) + provider._latest_odom = PoseStamped( + ts=100.0, + frame_id="map", + position=Vector3(1.0, 2.0, 0.0), + orientation=Quaternion.from_euler(Vector3(0.0, 0.0, 0.5)), + ) + + assert provider.camera_seat_provider_status() == ( + "CameraSeatObservationProvider status: image=160x120; " + "image_fresh=true; camera_info=160x120; camera_info_fresh=true; " + "lidar=1 points; lidar_fresh=true; odom=(1.00, 2.00, yaw=0.50); " + "odom_fresh=true; fast_detector=yolo; detection_model=moondream; " + "credential=present; override=inactive; configured_fallback_seats=0; " + "configured_fallback_people=0." 
+ ) + + +def test_camera_seat_provider_status_reports_missing_qwen_credential( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("ALIBABA_API_KEY", raising=False) + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig( + g=GlobalConfig(detection_model="moondream"), + detection_model="qwen", + ) + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = None + provider._latest_odom = None + + assert "detection_model=qwen; credential=missing" in ( + provider.camera_seat_provider_status() + ) + + +def test_camera_seat_provider_status_reports_runtime_override() -> None: + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig(seats=[], people=[]) + provider._scene_override = SeatSceneObservation( + seats=[SeatObservation("manual", 1.0, 0.0, 0.0)], + people=[], + source="runtime_override", + ) + provider._scene_lock = RLock() + provider._latest_image = None + provider._latest_odom = None + + assert "override=active" in provider.camera_seat_provider_status() + + +def test_camera_observation_provider_runtime_override_source() -> None: + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig() + provider._scene_override = None + provider._scene_lock = RLock() + + provider.set_seat_scene( + seats=[1.0, 2.0, 0.0], + people=[1.1, 2.0], + robot_x=-1.0, + robot_y=2.0, + ) + scene = provider.get_seat_scene() + + assert scene.source == "runtime_override" + assert scene.seats == [SeatObservation("seat_1", 1.0, 2.0, 0.0)] + + +def test_camera_observation_provider_reports_empty_camera_detection_source() -> None: + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig( + seats=[1.0, 2.0, 0.0], + people=[], + robot_x=-1.0, + robot_y=2.0, + ) + 
provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = Image.from_numpy(np.zeros((100, 100, 3), dtype=np.uint8)) + provider._latest_odom = PoseStamped( + frame_id="map", + position=Vector3(0.0, 0.0, 0.0), + orientation=Quaternion.from_euler(Vector3(0.0, 0.0, 0.0)), + ) + provider._vl_model = FakeVlModel({"chair": [], "person": []}) + + scene = provider.get_seat_scene() + + assert scene.seats == [SeatObservation("seat_1", 1.0, 2.0, 0.0)] + assert scene.source == "camera_no_seats_detected" + + +def test_camera_observation_provider_reports_detection_error_source() -> None: + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig( + seats=[1.0, 2.0, 0.0], + people=[], + robot_x=-1.0, + robot_y=2.0, + ) + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = Image.from_numpy(np.zeros((100, 100, 3), dtype=np.uint8)) + provider._latest_odom = PoseStamped( + frame_id="map", + position=Vector3(0.0, 0.0, 0.0), + orientation=Quaternion.from_euler(Vector3(0.0, 0.0, 0.0)), + ) + + def raise_detection_error( + image: Image, odom: PoseStamped | None = None + ) -> SeatSceneObservation: + raise RuntimeError("vlm unavailable") + + provider._detect_scene_from_image = raise_detection_error + + scene = provider.get_seat_scene() + + assert scene.seats == [SeatObservation("seat_1", 1.0, 2.0, 0.0)] + assert scene.source == "camera_detection_error" + + +def test_camera_observation_provider_reports_missing_qwen_key_as_detection_error( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("ALIBABA_API_KEY", raising=False) + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig( + seats=[1.0, 2.0, 0.0], + people=[], + robot_x=-1.0, + robot_y=2.0, + detection_model="qwen", + vlm_fallback_enabled=True, + ) + provider._scene_override = None + provider._scene_lock = RLock() + 
provider._latest_image = Image.from_numpy(np.zeros((100, 100, 3), dtype=np.uint8)) + provider._latest_odom = PoseStamped( + frame_id="map", + position=Vector3(0.0, 0.0, 0.0), + orientation=Quaternion.from_euler(Vector3(0.0, 0.0, 0.0)), + ) + provider._vl_model = None + + scene = provider.get_seat_scene() + + assert scene.seats == [SeatObservation("seat_1", 1.0, 2.0, 0.0)] + assert scene.source == "camera_detection_error" + + +def test_camera_observation_provider_does_not_fake_default_scene_without_image() -> None: + provider = CameraSeatObservationProvider.__new__(CameraSeatObservationProvider) + provider.config = CameraSeatSceneConfig() + provider._scene_override = None + provider._scene_lock = RLock() + provider._latest_image = None + + scene = provider.get_seat_scene() + + assert scene.seats == [] + assert scene.people == [] + assert scene.source == "no_camera_image" + + +def test_parse_flat_observation_lists() -> None: + assert _parse_seats([1.0, 2.0, 0.1, 3.0, 4.0, 0.2]) == [ + SeatObservation("seat_1", 1.0, 2.0, 0.1), + SeatObservation("seat_2", 3.0, 4.0, 0.2), + ] + assert _parse_people([1.0, 2.0, 3.0, 4.0]) == [ + PersonObservation(1.0, 2.0), + PersonObservation(3.0, 4.0), + ] + assert _flatten_seats([SeatObservation("ignored", 1.0, 2.0, 0.1)]) == [1.0, 2.0, 0.1] + assert _flatten_people([PersonObservation(1.0, 2.0)]) == [1.0, 2.0] + + +def test_parse_flat_observation_lists_reject_bad_lengths() -> None: + with pytest.raises(ValueError, match="triples"): + _parse_seats([1.0, 2.0]) + + with pytest.raises(ValueError, match="pairs"): + _parse_people([1.0]) + + +@pytest.mark.parametrize( + "text", + [ + "Please find me an empty seat", + "guide me to a chair", + "帮我找一个空位", + "带我去座位", + "我要找到一个位置", + "帮我找到附近的位置, 然后找到一个空位置", + ], +) +def test_parse_seat_guide_intent_accepts_seat_requests(text: str) -> None: + intent = parse_seat_guide_intent(text) + + assert intent.should_find_seat + assert intent.normalized_text + + +@pytest.mark.parametrize( + "text", + [ + 
"preview find me an empty seat", + "preflight guide me to a chair", + "预检帮我找一个空位", + "测试带我去座位", + "不要动, 先帮我找一个位置", + ], +) +def test_parse_seat_guide_preview_request_detects_no_motion_requests(text: str) -> None: + assert is_seat_guide_preview_request(text) + + +@pytest.mark.parametrize( + "text", + [ + "", + "start patrol", + "what is on the table", + "会议什么时候开始", + ], +) +def test_parse_seat_guide_intent_rejects_other_requests(text: str) -> None: + assert not parse_seat_guide_intent(text).should_find_seat + + +def _complete_acceptance_transcript() -> str: + return """Hardware run registry: /tmp/dimos/runs/hardware.json +Hardware run mode: hardware. +Hardware blueprint: unitree-go2-seat-guide-agentic +WebInput status: web=started; thread=running; seat_route=seat_guide_direct; responses=connected; voice_upload=connected; stt=connected; human_transport=connected; url=http://localhost:5555. +Using WebInput URL: http://localhost:5555 +{"modules": {"CameraSeatObservationProvider": ["camera_seat_provider_status"], "SeatGuideSkillContainer": ["seat_guide_status"], "WebInput": ["web_input_status"]}} +image=160x120 +image_fresh=true +camera_info=160x120 +camera_info_fresh=true +lidar=1200 points +lidar_fresh=true +credential=present +odom=(1.00, 2.00, yaw=0.50) +odom_fresh=true +override=inactive +configured_fallback_seats=0 +configured_fallback_people=0 +SeatGuide scene source=camera: 2 seats [seat_1=(1.00, 2.00, yaw=0.00)], 0 people [none], robot=(1.00, 2.00). +SeatGuide preflight ready: navigation=IDLE; perception=camera seats=2 people=0; empty=2 occupied=0; selected=seat_1; goal=(1.65, 2.00, yaw=0.00); feedback=phone_or_web. +SeatGuide preview source=camera: selected seat_1 empty=2 occupied=0 seat=(1.00, 2.00, yaw=0.00) goal=(1.65, 2.00, yaw=0.00). +Captured WebInput agent_responses stream +Manual no-motion voice gate: +Press Enter here when ready. 
+Click the microphone button and say: 预检帮我找一个空位 +Captured WebInput voice agent_responses stream +WebInput received text text=预检帮我找一个空位 +WebInput received text text=预检帮我找一个空位 +WebInput received text text=帮我找一个空位 +WebInput routing text to SeatGuide preview text=预检帮我找一个空位 +Capturing DimOS log snapshot after no-motion checks +No-motion checks completed. +Operator confirmation: LIVE +Live voice navigation gate: +Press Enter here when ready. +Say: 帮我找一个空位 +Captured live WebInput voice agent_responses stream +WebInput routing text to SeatGuide live request text=帮我找一个空位 +Navigating to +goal_sequence=1 +Checking SeatGuide navigation completion +goal_reached=true +SeatGuide navigation goal reached +Capturing DimOS log snapshot after live request +""" + + +def test_acceptance_log_verifier_accepts_complete_hardware_transcript( + tmp_path: Path, +) -> None: + log_file = tmp_path / "acceptance.log" + log_file.write_text(_complete_acceptance_transcript()) + + result = subprocess.run( + ["bash", str(ACCEPTANCE_LOG_VERIFIER), str(log_file)], + check=False, + text=True, + capture_output=True, + ) + + assert result.returncode == 0 + assert "contains all required evidence" in result.stdout + + +def test_acceptance_log_verifier_accepts_earlier_stale_goal_reached_status( + tmp_path: Path, +) -> None: + log_file = tmp_path / "acceptance.log" + log_file.write_text( + _complete_acceptance_transcript().replace( + "Operator confirmation: LIVE\n", + "goal_reached=true\nOperator confirmation: LIVE\n", + 1, + ) + ) + + result = subprocess.run( + ["bash", str(ACCEPTANCE_LOG_VERIFIER), str(log_file)], + check=False, + text=True, + capture_output=True, + ) + + assert result.returncode == 0 + assert "contains all required evidence" in result.stdout + + +@pytest.mark.parametrize( + ("missing_text", "expected_error"), + [ + ("Hardware run mode: hardware.\n", "hardware run mode"), + ("Hardware run registry: /tmp/dimos/runs/hardware.json\n", "hardware run registry"), + ( + "Hardware blueprint: 
unitree-go2-seat-guide-agentic\n", + "SeatGuide hardware blueprint", + ), + ("voice_upload=connected; ", "WebInput browser audio upload route"), + ("stt=connected; ", "WebInput speech-to-text pipeline"), + ("image=160x120\n", "camera image readiness"), + ("image_fresh=true\n", "fresh camera image readiness"), + ("camera_info=160x120\n", "camera calibration readiness"), + ("camera_info_fresh=true\n", "fresh camera calibration readiness"), + ("lidar=1200 points\n", "LiDAR point cloud readiness"), + ("lidar_fresh=true\n", "fresh LiDAR readiness"), + ("odom_fresh=true\n", "fresh odometry readiness"), + ("override=inactive\n", "camera runtime override disabled"), + ("configured_fallback_seats=0\n", "camera fallback seats disabled"), + ("configured_fallback_people=0\n", "camera fallback people disabled"), + ( + '"SeatGuideSkillContainer": ["seat_guide_status"], ', + "SeatGuide planner/navigation module", + ), + ( + "Capturing DimOS log snapshot after no-motion checks\n", + "no-motion DimOS log snapshot", + ), + ("No-motion checks completed.\n", "no-motion completion marker"), + ("Manual no-motion voice gate:\n", "browser microphone no-motion gate"), + ("Press Enter here when ready.\n", "browser microphone readiness prompts"), + ( + "Click the microphone button and say: 预检帮我找一个空位\n", + "browser microphone no-motion spoken phrase", + ), + ("Live voice navigation gate:\n", "browser microphone live gate"), + ("Say: 帮我找一个空位\n", "browser microphone live spoken phrase"), + ( + "WebInput routing text to SeatGuide live request text=帮我找一个空位\n", + "live WebInput SeatGuide route", + ), + ( + "WebInput received text text=预检帮我找一个空位\n", + "WebInput recognized text events", + ), + ( + "Capturing DimOS log snapshot after live request\n", + "live DimOS log snapshot", + ), + ("goal_reached=true\n", "navigation completion"), + ], +) +def test_acceptance_log_verifier_rejects_missing_required_evidence( + tmp_path: Path, + missing_text: str, + expected_error: str, +) -> None: + log_file = 
tmp_path / "acceptance.log" + log_file.write_text(_complete_acceptance_transcript().replace(missing_text, "", 1)) + + result = subprocess.run( + ["bash", str(ACCEPTANCE_LOG_VERIFIER), str(log_file)], + check=False, + text=True, + capture_output=True, + ) + + assert result.returncode == 3 + assert expected_error in result.stderr + + +def test_acceptance_log_verifier_rejects_missing_occupancy_counts( + tmp_path: Path, +) -> None: + log_file = tmp_path / "acceptance.log" + log_file.write_text( + _complete_acceptance_transcript() + .replace("empty=2 occupied=0; ", "") + .replace("empty=2 occupied=0 ", "") + ) + + result = subprocess.run( + ["bash", str(ACCEPTANCE_LOG_VERIFIER), str(log_file)], + check=False, + text=True, + capture_output=True, + ) + + assert result.returncode == 3 + assert "SeatGuide occupancy counts" in result.stderr + + +def test_acceptance_log_verifier_rejects_wrong_recognized_live_phrase( + tmp_path: Path, +) -> None: + log_file = tmp_path / "acceptance.log" + log_file.write_text( + _complete_acceptance_transcript().replace( + "WebInput received text text=帮我找一个空位\n", + "WebInput received text text=帮我找一个垃圾桶\n", + 1, + ) + ) + + result = subprocess.run( + ["bash", str(ACCEPTANCE_LOG_VERIFIER), str(log_file)], + check=False, + text=True, + capture_output=True, + ) + + assert result.returncode == 3 + assert "recognized live SeatGuide phrase" in result.stderr + + +def test_acceptance_log_verifier_rejects_wrong_routed_preview_phrase( + tmp_path: Path, +) -> None: + log_file = tmp_path / "acceptance.log" + log_file.write_text( + _complete_acceptance_transcript().replace( + "WebInput routing text to SeatGuide preview text=预检帮我找一个空位\n", + "WebInput routing text to SeatGuide preview text=预检帮我找一个垃圾桶\n", + 1, + ) + ) + + result = subprocess.run( + ["bash", str(ACCEPTANCE_LOG_VERIFIER), str(log_file)], + check=False, + text=True, + capture_output=True, + ) + + assert result.returncode == 3 + assert "no-motion WebInput SeatGuide phrase route" in result.stderr + 
+ +def test_acceptance_log_verifier_rejects_live_before_no_motion_completion( + tmp_path: Path, +) -> None: + log_file = tmp_path / "acceptance.log" + log_file.write_text( + _complete_acceptance_transcript().replace( + "No-motion checks completed.\nOperator confirmation: LIVE\n", + "Operator confirmation: LIVE\nNo-motion checks completed.\n", + 1, + ) + ) + + result = subprocess.run( + ["bash", str(ACCEPTANCE_LOG_VERIFIER), str(log_file)], + check=False, + text=True, + capture_output=True, + ) + + assert result.returncode == 3 + assert "no-motion before live order" in result.stderr + + +def test_acceptance_log_verifier_rejects_no_motion_completion_before_snapshot( + tmp_path: Path, +) -> None: + log_file = tmp_path / "acceptance.log" + log_file.write_text( + _complete_acceptance_transcript().replace( + "Capturing DimOS log snapshot after no-motion checks\nNo-motion checks completed.\n", + "No-motion checks completed.\nCapturing DimOS log snapshot after no-motion checks\n", + 1, + ) + ) + + result = subprocess.run( + ["bash", str(ACCEPTANCE_LOG_VERIFIER), str(log_file)], + check=False, + text=True, + capture_output=True, + ) + + assert result.returncode == 3 + assert "no-motion snapshot before completion order" in result.stderr + + +def test_acceptance_log_verifier_rejects_no_motion_speech_before_readiness_prompt( + tmp_path: Path, +) -> None: + log_file = tmp_path / "acceptance.log" + log_file.write_text( + _complete_acceptance_transcript().replace( + "Press Enter here when ready.\nClick the microphone button and say: 预检帮我找一个空位\n", + "Click the microphone button and say: 预检帮我找一个空位\nPress Enter here when ready.\n", + 1, + ) + ) + + result = subprocess.run( + ["bash", str(ACCEPTANCE_LOG_VERIFIER), str(log_file)], + check=False, + text=True, + capture_output=True, + ) + + assert result.returncode == 3 + assert "no-motion readiness before speech order" in result.stderr + + +def test_acceptance_log_verifier_rejects_live_speech_before_live_readiness_prompt( + tmp_path: 
Path, +) -> None: + log_file = tmp_path / "acceptance.log" + log_file.write_text( + _complete_acceptance_transcript().replace( + "Press Enter here when ready.\nSay: 帮我找一个空位\n", + "Say: 帮我找一个空位\nPress Enter here when ready.\n", + 1, + ) + ) + + result = subprocess.run( + ["bash", str(ACCEPTANCE_LOG_VERIFIER), str(log_file)], + check=False, + text=True, + capture_output=True, + ) + + assert result.returncode == 3 + assert "live readiness before speech order" in result.stderr + + +def test_acceptance_log_verifier_rejects_navigation_before_live_route( + tmp_path: Path, +) -> None: + log_file = tmp_path / "acceptance.log" + log_file.write_text( + _complete_acceptance_transcript().replace( + "WebInput routing text to SeatGuide live request text=帮我找一个空位\nNavigating to\n", + "Navigating to\nWebInput routing text to SeatGuide live request text=帮我找一个空位\n", + 1, + ) + ) + + result = subprocess.run( + ["bash", str(ACCEPTANCE_LOG_VERIFIER), str(log_file)], + check=False, + text=True, + capture_output=True, + ) + + assert result.returncode == 3 + assert "live route before navigation order" in result.stderr + + +def test_acceptance_log_verifier_rejects_goal_reached_before_polling( + tmp_path: Path, +) -> None: + log_file = tmp_path / "acceptance.log" + log_file.write_text( + _complete_acceptance_transcript().replace( + "Navigating to\ngoal_sequence=1\nChecking SeatGuide navigation completion\ngoal_reached=true\n", + "Navigating to\ngoal_reached=true\ngoal_sequence=1\nChecking SeatGuide navigation completion\n", + 1, + ) + ) + + result = subprocess.run( + ["bash", str(ACCEPTANCE_LOG_VERIFIER), str(log_file)], + check=False, + text=True, + capture_output=True, + ) + + assert result.returncode == 3 + assert "polling before completion order" in result.stderr + + +def test_acceptance_log_verifier_rejects_direct_mcp_live_request( + tmp_path: Path, +) -> None: + log_file = tmp_path / "acceptance.log" + log_file.write_text( + _complete_acceptance_transcript() + + '\n+ dimos mcp call 
handle_seat_request --json-args \'{"text": "帮我找一个空位"}\'\n' + ) + + result = subprocess.run( + ["bash", str(ACCEPTANCE_LOG_VERIFIER), str(log_file)], + check=False, + text=True, + capture_output=True, + ) + + assert result.returncode == 3 + assert "direct MCP live SeatGuide call" in result.stderr + + +@pytest.mark.parametrize( + ("forbidden_text", "expected_error"), + [ + ( + '+ dimos mcp call set_seat_scene --json-args \'{"seats": [0, 0, 0], "people": []}\'', + "fallback seat scene calibration", + ), + ( + "+ dimos mcp call clear_seat_scene_override", + "fallback seat scene override clearing", + ), + ( + 'dimos mcp call seat_guide_preflight --json-args \'{"require_live_perception": false}\'', + "fallback live-perception bypass", + ), + ( + 'dimos mcp call seat_guide_preflight --json-args \'{"require_live_perception":false}\'', + "fallback live-perception bypass", + ), + ( + "dimos mcp call seat_guide_preflight --arg require_live_perception=false", + "fallback live-perception bypass", + ), + ], +) +def test_acceptance_log_verifier_rejects_fallback_calibration_evidence( + tmp_path: Path, + forbidden_text: str, + expected_error: str, +) -> None: + log_file = tmp_path / "acceptance.log" + log_file.write_text(_complete_acceptance_transcript() + f"\n{forbidden_text}\n") + + result = subprocess.run( + ["bash", str(ACCEPTANCE_LOG_VERIFIER), str(log_file)], + check=False, + text=True, + capture_output=True, + ) + + assert result.returncode == 3 + assert expected_error in result.stderr + + +def _extract_bash_function(source: str, name: str) -> str: + lines = source.splitlines() + start = lines.index(f"{name}() {{") + body = [lines[start]] + for line in lines[start + 1 :]: + body.append(line) + if line == "}": + return "\n".join(body) + raise AssertionError(f"Could not extract bash function {name}") + + +def _run_hardware_registry_guard( + tmp_path: Path, + *, + run_id: str, + registry: dict[str, Any], +) -> subprocess.CompletedProcess[str]: + state_dir = tmp_path / "state" + 
registry_dir = state_dir / "dimos" / "runs" + registry_dir.mkdir(parents=True) + (registry_dir / f"{run_id}.json").write_text(json.dumps(registry)) + log_file = tmp_path / "acceptance.log" + + script_source = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + wrapper = tmp_path / "guard.sh" + wrapper.write_text( + "\n".join( + [ + "set -euo pipefail", + _extract_bash_function(script_source, "log"), + _extract_bash_function(script_source, "extract_run_id"), + _extract_bash_function( + script_source, "require_hardware_run_registry" + ), + 'log_file="$1"', + 'XDG_STATE_HOME="$2"', + f"require_hardware_run_registry $' Run ID: {run_id}\\n'", + ] + ) + ) + + return subprocess.run( + ["bash", str(wrapper), str(log_file), str(state_dir)], + check=False, + text=True, + capture_output=True, + ) + + +def _run_smoke_registry_guard( + tmp_path: Path, + *, + run_id: str, + registry: dict[str, Any], +) -> subprocess.CompletedProcess[str]: + state_dir = tmp_path / "state" + registry_dir = state_dir / "dimos" / "runs" + registry_dir.mkdir(parents=True) + (registry_dir / f"{run_id}.json").write_text(json.dumps(registry)) + + script_source = SMOKE_SCRIPT.read_text() + wrapper = tmp_path / "smoke_guard.sh" + wrapper.write_text( + "\n".join( + [ + "set -euo pipefail", + _extract_bash_function(script_source, "extract_run_id"), + _extract_bash_function( + script_source, "require_seat_guide_run_registry" + ), + 'XDG_STATE_HOME="$1"', + f"require_seat_guide_run_registry $' Run ID: {run_id}\\n'", + ] + ) + ) + + return subprocess.run( + ["bash", str(wrapper), str(state_dir)], + check=False, + text=True, + capture_output=True, + ) + + +def _run_web_input_url_extract(tmp_path: Path, status_text: str) -> str: + script_source = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + wrapper = tmp_path / "extract_url.sh" + wrapper.write_text( + "\n".join( + [ + "set -euo pipefail", + _extract_bash_function(script_source, "extract_web_input_url"), + 'extract_web_input_url "$1"', + ] + ) + ) + result = subprocess.run( + 
["bash", str(wrapper), status_text], + check=True, + text=True, + capture_output=True, + ) + return result.stdout.strip() + + +def _run_goal_sequence_extract(tmp_path: Path, status_text: str) -> str: + script_source = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + wrapper = tmp_path / "extract_goal_sequence.sh" + wrapper.write_text( + "\n".join( + [ + "set -euo pipefail", + _extract_bash_function(script_source, "extract_goal_sequence"), + 'extract_goal_sequence "$1"', + ] + ) + ) + result = subprocess.run( + ["bash", str(wrapper), status_text], + check=True, + text=True, + capture_output=True, + ) + return result.stdout.strip() + + +def _run_goal_completion_check( + tmp_path: Path, + *, + previous_sequence: int, + status_text: str, +) -> subprocess.CompletedProcess[str]: + script_source = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + wrapper = tmp_path / "goal_completed.sh" + wrapper.write_text( + "\n".join( + [ + "set -euo pipefail", + _extract_bash_function(script_source, "extract_goal_sequence"), + _extract_bash_function( + script_source, "seat_guide_goal_completed_after_sequence" + ), + 'seat_guide_goal_completed_after_sequence "$1" "$2"', + ] + ) + ) + return subprocess.run( + ["bash", str(wrapper), str(previous_sequence), status_text], + check=False, + text=True, + capture_output=True, + ) + + +def _run_preflight_ready_check( + tmp_path: Path, + status_text: str, +) -> subprocess.CompletedProcess[str]: + script_source = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + wrapper = tmp_path / "preflight_ready.sh" + wrapper.write_text( + "\n".join( + [ + "set -euo pipefail", + _extract_bash_function( + script_source, "seat_guide_preflight_ready_for_hardware" + ), + 'seat_guide_preflight_ready_for_hardware "$1"', + ] + ) + ) + return subprocess.run( + ["bash", str(wrapper), status_text], + check=False, + text=True, + capture_output=True, + ) + + +def _run_web_input_ready_check( + tmp_path: Path, + status_text: str, +) -> subprocess.CompletedProcess[str]: + script_source = 
HARDWARE_ACCEPTANCE_SCRIPT.read_text() + wrapper = tmp_path / "web_input_ready.sh" + wrapper.write_text( + "\n".join( + [ + "set -euo pipefail", + _extract_bash_function(script_source, "web_input_ready_for_seat_guide"), + 'web_input_ready_for_seat_guide "$1"', + ] + ) + ) + return subprocess.run( + ["bash", str(wrapper), status_text], + check=False, + text=True, + capture_output=True, + ) + + +def _run_web_input_no_go_details( + tmp_path: Path, + status_text: str, +) -> subprocess.CompletedProcess[str]: + script_source = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + wrapper = tmp_path / "web_input_details.sh" + wrapper.write_text( + "\n".join( + [ + "set -euo pipefail", + _extract_bash_function(script_source, "log"), + _extract_bash_function(script_source, "log_web_input_no_go_details"), + 'log_file="$1"', + 'log_web_input_no_go_details "$2"', + ] + ) + ) + return subprocess.run( + ["bash", str(wrapper), str(tmp_path / "web_input.log"), status_text], + check=False, + text=True, + capture_output=True, + ) + + +def _run_camera_provider_ready_check( + tmp_path: Path, + status_text: str, +) -> subprocess.CompletedProcess[str]: + script_source = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + wrapper = tmp_path / "camera_ready.sh" + wrapper.write_text( + "\n".join( + [ + "set -euo pipefail", + _extract_bash_function( + script_source, "camera_provider_ready_for_hardware" + ), + 'camera_provider_ready_for_hardware "$1"', + ] + ) + ) + return subprocess.run( + ["bash", str(wrapper), status_text], + check=False, + text=True, + capture_output=True, + ) + + +def _run_camera_provider_no_go_details( + tmp_path: Path, + status_text: str, +) -> subprocess.CompletedProcess[str]: + script_source = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + wrapper = tmp_path / "camera_details.sh" + wrapper.write_text( + "\n".join( + [ + "set -euo pipefail", + _extract_bash_function(script_source, "log"), + _extract_bash_function( + script_source, "log_camera_provider_no_go_details" + ), + 'log_file="$1"', + 
'log_camera_provider_no_go_details "$2"', + ] + ) + ) + return subprocess.run( + ["bash", str(wrapper), str(tmp_path / "camera.log"), status_text], + check=False, + text=True, + capture_output=True, + ) + + +def _run_seat_guide_no_go_details( + tmp_path: Path, + status_text: str, +) -> subprocess.CompletedProcess[str]: + script_source = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + wrapper = tmp_path / "seat_guide_details.sh" + wrapper.write_text( + "\n".join( + [ + "set -euo pipefail", + _extract_bash_function(script_source, "log"), + _extract_bash_function(script_source, "log_seat_guide_no_go_details"), + 'log_file="$1"', + 'log_seat_guide_no_go_details "$2"', + ] + ) + ) + return subprocess.run( + ["bash", str(wrapper), str(tmp_path / "seat_guide.log"), status_text], + check=False, + text=True, + capture_output=True, + ) + + +def _run_stream_wait_check( + tmp_path: Path, + stream_text: str, + expected_text: str, + start_offset: int = 0, +) -> subprocess.CompletedProcess[str]: + stream_file = tmp_path / "stream.txt" + stream_file.write_text(stream_text) + script_source = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + wrapper = tmp_path / "stream_wait.sh" + wrapper.write_text( + "\n".join( + [ + "set -euo pipefail", + _extract_bash_function(script_source, "wait_for_stream_text"), + 'wait_for_stream_text "$1" "$2" 0 "$3"', + ] + ) + ) + return subprocess.run( + ["bash", str(wrapper), str(stream_file), expected_text, str(start_offset)], + check=False, + text=True, + capture_output=True, + ) + + +@pytest.mark.parametrize( + ("status_text", "expected_url"), + [ + ( + "WebInput status: web=started; url=http://localhost:5555.", + "http://localhost:5555", + ), + ( + "WebInput status: web=started; url=http://127.0.0.1:6001.", + "http://127.0.0.1:6001", + ), + ( + "WebInput status: web=not_started; url=unavailable.", + "", + ), + ], +) +def test_hardware_acceptance_extracts_web_input_url( + tmp_path: Path, + status_text: str, + expected_url: str, +) -> None: + assert 
_run_web_input_url_extract(tmp_path, status_text) == expected_url + + +@pytest.mark.parametrize( + ("status_text", "expected_sequence"), + [ + ( + "SeatGuide navigation status: navigation=IDLE; goal_reached=false; goal_sequence=0.", + "0", + ), + ( + "old goal_sequence=1\nnew goal_sequence=2", + "2", + ), + ( + "SeatGuide navigation status: navigation=missing; goal_reached=unknown.", + "", + ), + ], +) +def test_hardware_acceptance_extracts_goal_sequence( + tmp_path: Path, + status_text: str, + expected_sequence: str, +) -> None: + assert _run_goal_sequence_extract(tmp_path, status_text) == expected_sequence + + +@pytest.mark.parametrize( + ("previous_sequence", "status_text", "expected_returncode"), + [ + ( + 1, + "SeatGuide navigation status: navigation=IDLE; goal_reached=true; goal_sequence=2.", + 0, + ), + ( + 1, + "SeatGuide navigation status: navigation=IDLE; goal_reached=true; goal_sequence=1.", + 1, + ), + ( + 1, + "SeatGuide navigation status: navigation=FOLLOWING_PATH; goal_reached=false; goal_sequence=2.", + 1, + ), + ( + 1, + "SeatGuide navigation status: navigation=missing; goal_reached=unknown.", + 1, + ), + ], +) +def test_hardware_acceptance_goal_completion_requires_new_reached_goal( + tmp_path: Path, + previous_sequence: int, + status_text: str, + expected_returncode: int, +) -> None: + result = _run_goal_completion_check( + tmp_path, + previous_sequence=previous_sequence, + status_text=status_text, + ) + + assert result.returncode == expected_returncode + + +@pytest.mark.parametrize( + ("status_text", "expected_returncode"), + [ + ( + "SeatGuide preflight ready: navigation=IDLE; perception=camera seats=2 people=0; selected=seat_1; goal=(1.65, 2.00, yaw=0.00); feedback=phone_or_web.", + 0, + ), + ( + "SeatGuide preflight ready: navigation=FOLLOWING_PATH; perception=camera seats=2 people=0; selected=seat_1; goal=(1.65, 2.00, yaw=0.00); feedback=phone_or_web.", + 1, + ), + ( + "SeatGuide preflight no-go: navigation=IDLE; perception=camera no seats; 
feedback=phone_or_web.", + 1, + ), + ], +) +def test_hardware_acceptance_preflight_requires_navigation_ready( + tmp_path: Path, + status_text: str, + expected_returncode: int, +) -> None: + result = _run_preflight_ready_check(tmp_path, status_text) + + assert result.returncode == expected_returncode + + +@pytest.mark.parametrize( + ("status_text", "expected_returncode"), + [ + ( + "WebInput status: web=started; thread=running; seat_route=seat_guide_direct; responses=connected; voice_upload=connected; stt=connected; human_transport=connected; url=http://localhost:5555.", + 0, + ), + ( + "WebInput status: web=started; thread=not_running; seat_route=seat_guide_direct; responses=connected; voice_upload=connected; stt=connected; human_transport=connected; url=http://localhost:5555.", + 1, + ), + ( + "WebInput status: web=started; thread=running; seat_route=agent_only; responses=connected; voice_upload=connected; stt=connected; human_transport=connected; url=http://localhost:5555.", + 1, + ), + ( + "WebInput status: web=started; thread=running; seat_route=seat_guide_direct; responses=missing; voice_upload=connected; stt=connected; human_transport=connected; url=http://localhost:5555.", + 1, + ), + ( + "WebInput status: web=started; thread=running; seat_route=seat_guide_direct; responses=connected; voice_upload=missing; stt=connected; human_transport=connected; url=http://localhost:5555.", + 1, + ), + ( + "WebInput status: web=started; thread=running; seat_route=seat_guide_direct; responses=connected; voice_upload=connected; stt=missing; human_transport=connected; url=http://localhost:5555.", + 1, + ), + ( + "WebInput status: web=started; thread=running; seat_route=seat_guide_direct; responses=connected; voice_upload=connected; stt=error(RuntimeError: whisper missing); human_transport=connected; url=http://localhost:5555.", + 1, + ), + ( + "WebInput status: web=started; thread=running; seat_route=seat_guide_direct; responses=connected; voice_upload=connected; 
stt=connected; human_transport=missing; url=http://localhost:5555.", + 1, + ), + ], +) +def test_hardware_acceptance_web_input_requires_complete_voice_route( + tmp_path: Path, + status_text: str, + expected_returncode: int, +) -> None: + result = _run_web_input_ready_check(tmp_path, status_text) + + assert result.returncode == expected_returncode + + +def test_hardware_acceptance_web_input_no_go_details_are_actionable( + tmp_path: Path, +) -> None: + status_text = ( + "WebInput status: web=not_started; thread=not_running; seat_route=agent_only; " + "responses=missing; voice_upload=missing; stt=missing; human_transport=missing; url=unavailable." + ) + + result = _run_web_input_no_go_details(tmp_path, status_text) + + assert result.returncode == 0 + assert "WebInput server is not started" in result.stdout + assert "server thread is not running" in result.stdout + assert "not directly wired to SeatGuide" in result.stdout + assert "response stream is missing" in result.stdout + assert "browser audio upload endpoint is not connected" in result.stdout + assert "speech-to-text pipeline is unavailable" in result.stdout + assert "fallback transport is missing" in result.stdout + + +@pytest.mark.parametrize( + ("status_text", "expected_returncode"), + [ + ( + "CameraSeatObservationProvider status: image=160x120; image_fresh=true; camera_info=160x120; camera_info_fresh=true; lidar=1200 points; lidar_fresh=true; odom=(1.00, 2.00, yaw=0.50); odom_fresh=true; detection_model=qwen; credential=present; override=inactive; configured_fallback_seats=0; configured_fallback_people=0.", + 0, + ), + ( + "CameraSeatObservationProvider status: image=160x120; image_fresh=true; odom=(1.00, 2.00, yaw=0.50); odom_fresh=true; detection_model=qwen; credential=missing; override=inactive; configured_fallback_seats=0; configured_fallback_people=0.", + 1, + ), + ( + "CameraSeatObservationProvider status: image=missing; image_fresh=missing; odom=(1.00, 2.00, yaw=0.50); odom_fresh=true; 
detection_model=qwen; credential=present; override=inactive; configured_fallback_seats=0; configured_fallback_people=0.", + 1, + ), + ( + "CameraSeatObservationProvider status: image=160x120; image_fresh=true; odom=missing; odom_fresh=missing; detection_model=qwen; credential=present; override=inactive; configured_fallback_seats=0; configured_fallback_people=0.", + 1, + ), + ( + "CameraSeatObservationProvider status: image=160x120; image_fresh=false; odom=(1.00, 2.00, yaw=0.50); odom_fresh=true; detection_model=qwen; credential=present; override=inactive; configured_fallback_seats=0; configured_fallback_people=0.", + 1, + ), + ( + "CameraSeatObservationProvider status: image=160x120; image_fresh=true; odom=(1.00, 2.00, yaw=0.50); odom_fresh=false; detection_model=qwen; credential=present; override=inactive; configured_fallback_seats=0; configured_fallback_people=0.", + 1, + ), + ( + "CameraSeatObservationProvider status: detection_model=qwen; credential=present; override=inactive; configured_fallback_seats=0; configured_fallback_people=0.", + 1, + ), + ( + "CameraSeatObservationProvider status: image=160x120; image_fresh=true; odom=(1.00, 2.00, yaw=0.50); odom_fresh=true; detection_model=qwen; credential=present; override=active; configured_fallback_seats=0; configured_fallback_people=0.", + 1, + ), + ( + "CameraSeatObservationProvider status: image=160x120; image_fresh=true; odom=(1.00, 2.00, yaw=0.50); odom_fresh=true; detection_model=qwen; credential=present; override=inactive; configured_fallback_seats=1; configured_fallback_people=0.", + 1, + ), + ( + "CameraSeatObservationProvider status: image=160x120; image_fresh=true; odom=(1.00, 2.00, yaw=0.50); odom_fresh=true; detection_model=qwen; credential=present; override=inactive; configured_fallback_seats=0; configured_fallback_people=1.", + 1, + ), + ], +) +def test_hardware_acceptance_camera_provider_requires_live_inputs( + tmp_path: Path, + status_text: str, + expected_returncode: int, +) -> None: + result = 
_run_camera_provider_ready_check(tmp_path, status_text) + + assert result.returncode == expected_returncode + + +def test_hardware_acceptance_camera_provider_no_go_details_are_actionable( + tmp_path: Path, +) -> None: + status_text = ( + "CameraSeatObservationProvider status: image=missing; image_fresh=false; " + "odom=missing; odom_fresh=false; detection_model=qwen; credential=missing; " + "override=active; " + "configured_fallback_seats=1; configured_fallback_people=1." + ) + + result = _run_camera_provider_no_go_details(tmp_path, status_text) + + assert result.returncode == 0 + assert "ALIBABA_API_KEY" in result.stdout + assert "Camera image is missing" in result.stdout + assert "Camera image is stale" in result.stdout + assert "Odometry is missing" in result.stdout + assert "Odometry is stale" in result.stdout + assert "Runtime seat-scene override is active" in result.stdout + assert "Configured fallback seats/people are non-zero" in result.stdout + + +def test_hardware_acceptance_seat_guide_no_go_details_are_actionable( + tmp_path: Path, +) -> None: + status_text = ( + "SeatGuide readiness report: SeatGuide scene source=stale_camera_image: " + "no seats visible or configured; 0 people detected. | " + "SeatGuide preflight no-go: navigation=FOLLOWING_PATH; " + "perception=stale_camera_odom no seats; feedback=phone_or_web. | " + "SeatGuide preflight no-go: perception=camera_detection_error no seats. | " + "SeatGuide preview source=configured_fallback: no empty seat available." 
+ ) + + result = _run_seat_guide_no_go_details(tmp_path, status_text) + + assert result.returncode == 0 + assert "Navigation is busy" in result.stdout + assert "cannot see chairs" in result.stdout + assert "camera frames are stale" in result.stdout + assert "odometry is stale" in result.stdout + assert "camera detection failed" in result.stdout + assert "fallback/calibrated coordinates" in result.stdout + assert "none are empty" in result.stdout + + +def test_hardware_acceptance_logs_seat_guide_details_for_preflight_failures() -> None: + script = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + + assert "log_seat_guide_no_go_details()" in script + assert "seat_guide_status did not report live camera perception" in script + assert script.count('log_seat_guide_no_go_details "${') >= 4 + + +def test_hardware_acceptance_has_actionable_mcp_tools_failure_message() -> None: + script = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + + assert 'if ! tools="$(run_dimos mcp list-tools 2>&1)"; then' in script + assert "Hardware acceptance no-go: MCP tools are unavailable." in script + assert "Hardware acceptance no-go: missing MCP tool" in script + assert "SeatGuide, WebInput, and camera provider modules" in script + assert "unitree-go2-seat-guide-agentic and includes McpServer" in script + assert 'require_tool "${tools}" "speak"' not in script + assert "SeatGuide audio check. I can guide you to an empty seat." not in script + assert "Operator audio confirmation" not in script + assert "Transcript saved to: ${log_file}" in script + + +def test_hardware_acceptance_missing_stack_reports_transcript_path() -> None: + script = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + missing_stack_block = script.split( + 'if grep -q "No running DimOS instance" <<<"${status_output}"; then', + maxsplit=1, + )[1].split("exit 2", maxsplit=1)[0] + + assert "No running DimOS stack found." 
in missing_stack_block + assert "dimos run unitree-go2-seat-guide-agentic --robot-ip" in missing_stack_block + assert 'log "Transcript saved to: ${log_file}"' in missing_stack_block + + +@pytest.mark.parametrize( + ("stream_text", "expected_text", "expected_returncode"), + [ + ("event: message\ndata: SeatGuide preflight ready\n", "SeatGuide preflight ready", 0), + ("event: message\ndata: Navigating to (1.00, 2.00)\n", "Navigating to", 0), + ("event: message\ndata: still thinking\n", "Navigating to", 1), + ], +) +def test_hardware_acceptance_waits_for_webinput_stream_text( + tmp_path: Path, + stream_text: str, + expected_text: str, + expected_returncode: int, +) -> None: + result = _run_stream_wait_check(tmp_path, stream_text, expected_text) + + assert result.returncode == expected_returncode + + +def test_hardware_acceptance_stream_wait_ignores_text_before_start_offset( + tmp_path: Path, +) -> None: + stale_text = "event: message\ndata: Navigating to stale goal\n" + result = _run_stream_wait_check( + tmp_path, + stale_text + "event: message\ndata: still waiting\n", + "Navigating to", + start_offset=len(stale_text), + ) + + assert result.returncode == 1 + + +def test_hardware_acceptance_stream_gates_use_offset_wait_result() -> None: + script = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + + assert 'grep -q "SeatGuide preflight ready" "${stream_file}"' not in script + assert 'grep -q "Navigating to" "${stream_file}"' not in script + assert script.count('if [[ "${stream_matched}" != "1" ]]; then') == 3 + + +def test_hardware_acceptance_text_stream_failures_cleanup() -> None: + script = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + + post_failure_block = script.split( + 'log "Hardware acceptance no-go: WebInput /submit_query request failed."', + maxsplit=1, + )[1].split("exit 3", maxsplit=1)[0] + text_wait_failure_block = script.split( + 'log "Hardware acceptance no-go: WebInput text route did not publish a ready SeatGuide preview response."', + maxsplit=1, + )[1].split("exit 
3", maxsplit=1)[0] + + assert "stop_stream" in post_failure_block + assert 'rm -f "${stream_file}"' in post_failure_block + assert 'rm -f "${stream_file}"' in text_wait_failure_block + + +def test_hardware_acceptance_has_interrupt_stream_cleanup_trap() -> None: + script = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + + assert "cleanup_active_stream()" in script + assert "trap cleanup_active_stream EXIT" in script + assert "trap 'cleanup_active_stream; exit 130' INT" in script + assert "trap 'cleanup_active_stream; exit 143' TERM" in script + assert script.count('active_stream_file="${stream_file}"') == 3 + assert script.count('active_stream_pid="${stream_pid}"') == 3 + + +def test_hardware_acceptance_auto_verifies_transcript_after_live_request() -> None: + script = HARDWARE_ACCEPTANCE_SCRIPT.read_text() + + assert 'acceptance_log_verifier="${script_dir}/demo_seat_guide_verify_acceptance_log"' in script + assert "verify_acceptance_log()" in script + assert 'log "+ ${acceptance_log_verifier} ${log_file}"' in script + live_tail = script.split( + 'capture_dimos_log "Capturing DimOS log snapshot after live request..."', + maxsplit=1, + )[1] + assert live_tail.index("verify_acceptance_log") < live_tail.index( + 'log "Live request sent. Continue monitoring with: dimos log -f"' + ) + + +@pytest.mark.parametrize( + "blueprint", + ["unitree-go2-seat-guide", "unitree-go2-seat-guide-agentic"], +) +def test_hardware_acceptance_registry_guard_accepts_hardware_run( + tmp_path: Path, + blueprint: str, +) -> None: + result = _run_hardware_registry_guard( + tmp_path, + run_id="hardware", + registry={ + "run_id": "hardware", + "blueprint": blueprint, + "cli_args": [blueprint], + "config_overrides": {"replay": False, "simulation": ""}, + "original_argv": ["dimos", "run", blueprint], + }, + ) + + assert result.returncode == 0 + assert "Hardware run mode: hardware." 
in result.stdout + assert f"Hardware blueprint: {blueprint}" in result.stdout + + +@pytest.mark.parametrize( + ("registry", "expected_output"), + [ + ( + {"cli_args": ["--replay=true"], "config_overrides": {}}, + "replay mode", + ), + ( + {"cli_args": [], "config_overrides": {"replay": True}}, + "replay mode", + ), + ( + {"cli_args": [], "config_overrides": {}, "replay": True}, + "replay mode", + ), + ( + {"cli_args": ["--simulation=dimsim"], "config_overrides": {}}, + "simulation mode", + ), + ( + {"cli_args": [], "config_overrides": {"simulation": "customsim"}}, + "simulation mode", + ), + ( + {"cli_args": [], "config_overrides": {"simulation": True}}, + "simulation mode", + ), + ( + {"cli_args": [], "config_overrides": {}, "simulation": True}, + "simulation mode", + ), + ( + { + "blueprint": "unitree-go2-agentic", + "cli_args": ["unitree-go2-agentic"], + "config_overrides": {"replay": False, "simulation": ""}, + }, + "not a SeatGuide Go2 blueprint", + ), + ], +) +def test_hardware_acceptance_registry_guard_rejects_non_hardware_runs( + tmp_path: Path, + registry: dict[str, Any], + expected_output: str, +) -> None: + registry = { + "run_id": "not-hardware", + "original_argv": ["dimos", "run", "unitree-go2-seat-guide-agentic"], + **registry, + } + + result = _run_hardware_registry_guard( + tmp_path, + run_id="not-hardware", + registry=registry, + ) + + assert result.returncode == 3 + assert expected_output in result.stdout + + +@pytest.mark.parametrize( + "tool_name", + ["seat_guide_status", "preview_empty_seat_goal"], +) +def test_no_motion_smoke_calls_scene_and_goal_preview(tool_name: str) -> None: + script = SMOKE_SCRIPT.read_text() + + assert f'require_tool "${{tools}}" "{tool_name}"' in script + assert f"run_dimos mcp call {tool_name}" in script + + +def test_no_motion_smoke_requires_web_input_voice_path_ready() -> None: + script = SMOKE_SCRIPT.read_text() + + assert "require_output_contains()" in script + assert "web_input_output=" in script + for expected in 
[ + "web=started", + "thread=running", + "seat_route=seat_guide_direct", + "responses=connected", + "voice_upload=connected", + "stt=connected", + "human_transport=connected", + ]: + assert f'require_output_contains "${{web_input_output}}" "{expected}"' in script + + +def test_no_motion_smoke_has_actionable_missing_stack_and_mcp_messages() -> None: + script = SMOKE_SCRIPT.read_text() + + assert 'status_output="$(run_dimos status)"' in script + assert "No running DimOS stack found." in script + assert "dimos --replay run unitree-go2-seat-guide-agentic --daemon" in script + assert "dimos run unitree-go2-seat-guide-agentic --robot-ip" in script + assert "SeatGuide smoke no-go: MCP tools are unavailable." in script + assert "SeatGuide smoke no-go: missing MCP tool" in script + assert "SeatGuide, WebInput, and camera provider modules" in script + assert "includes McpServer" in script + + +@pytest.mark.parametrize( + "blueprint", + ["unitree-go2-seat-guide", "unitree-go2-seat-guide-agentic"], +) +def test_no_motion_smoke_registry_guard_accepts_seat_guide_stack( + tmp_path: Path, + blueprint: str, +) -> None: + result = _run_smoke_registry_guard( + tmp_path, + run_id="seat-guide-smoke", + registry={ + "run_id": "seat-guide-smoke", + "blueprint": blueprint, + "original_argv": ["dimos", "run", blueprint], + }, + ) + + assert result.returncode == 0 + + +def test_no_motion_smoke_registry_guard_rejects_general_go2_stack( + tmp_path: Path, +) -> None: + result = _run_smoke_registry_guard( + tmp_path, + run_id="general-go2", + registry={ + "run_id": "general-go2", + "blueprint": "unitree-go2-agentic", + "original_argv": ["dimos", "run", "unitree-go2-agentic"], + }, + ) + + assert result.returncode == 3 + assert "not a SeatGuide Go2 blueprint" in result.stderr + + +def test_replay_smoke_starts_seat_guide_stack_and_runs_no_motion_smoke() -> None: + script = REPLAY_SMOKE_SCRIPT.read_text() + + assert "run_dimos --replay run unitree-go2-seat-guide-agentic --daemon" in script + 
assert "demo_seat_guide_smoke" in script + assert "unitree-go2-agentic" not in script.replace( + "unitree-go2-seat-guide-agentic", "" + ) + + +def test_hardware_bringup_starts_real_stack_then_runs_smoke_and_acceptance() -> None: + script = HARDWARE_BRINGUP_SCRIPT.read_text() + + assert 'robot_ip="${SEAT_GUIDE_ROBOT_IP:-192.168.123.161}"' in script + assert 'detection_model="${SEAT_GUIDE_DETECTION_MODEL:-moondream}"' in script + assert ( + 'run_dimos --robot-ip "${robot_ip}" --detection-model "${detection_model}" ' + "run unitree-go2-seat-guide-agentic --daemon" + ) in script + assert "demo_seat_guide_smoke" in script + assert "demo_seat_guide_hardware_acceptance" in script + assert "unitree-go2-agentic" not in script.replace( + "unitree-go2-seat-guide-agentic", "" + ) + + +def test_hardware_bringup_requires_real_perception_and_allows_no_agent_key() -> None: + script = HARDWARE_BRINGUP_SCRIPT.read_text() + + assert 'ALIBABA_API_KEY' in script + assert 'OPENROUTER_API_KEY' in script + assert 'OPENAI_API_KEY' in script + assert ( + "SeatGuide bring-up no-go: ALIBABA_API_KEY is not set for detection_model=qwen." + in script + ) + assert "neither OPENROUTER_API_KEY nor OPENAI_API_KEY is set" not in script + assert "direct SeatGuide voice/MCP routing still works" in script + assert "TTS speech feedback will be unavailable" not in script + + +def test_hardware_bringup_allows_existing_stack_and_smoke_skip() -> None: + script = HARDWARE_BRINGUP_SCRIPT.read_text() + + assert "--skip-start" in script + assert "--skip-smoke" in script + assert "Using the currently running DimOS stack." in script + assert "Skipping no-motion smoke checks." 
in script + + +@pytest.mark.parametrize("script_path", SEAT_GUIDE_SCRIPTS) +def test_seat_guide_demo_scripts_are_directly_executable(script_path: Path) -> None: + assert script_path.read_text().startswith("#!/usr/bin/env bash") + assert os.access(script_path, os.X_OK) + + +def test_seat_guide_doc_does_not_recommend_rejected_general_go2_stack() -> None: + doc = SEAT_GUIDE_DOC.read_text() + + assert "bin/demo_seat_guide_hardware_bringup --robot-ip" in doc + assert ( + "dimos --robot-ip 192.168.123.161 --detection-model moondream " + "run unitree-go2-seat-guide-agentic --daemon" + ) in doc + assert "dimos --replay run unitree-go2-seat-guide-agentic --daemon" in doc + assert "dimos run unitree-go2-agentic --robot-ip" not in doc + + +def test_seat_guide_doc_has_parallel_hardware_day_checklist() -> None: + doc = SEAT_GUIDE_DOC.read_text() + + assert "### Parallel hardware-day checklist" in doc + for track in [ + "Voice intake", + "Perception", + "Planner", + "Navigation", + "Phone feedback", + "Acceptance evidence", + ]: + assert f"| {track} |" in doc + assert "bin/demo_seat_guide_verify_acceptance_log " in doc + + +def test_seat_guide_doc_describes_smoke_webinput_gate() -> None: + doc = SEAT_GUIDE_DOC.read_text() + + smoke_section = doc.split("bin/demo_seat_guide_smoke", maxsplit=1)[0].rsplit( + "Run the no-motion smoke script", maxsplit=1 + )[1] + for expected in [ + "web=started", + "thread=running", + "seat_route=seat_guide_direct", + "responses=connected", + "voice_upload=connected", + "stt=connected", + "human_transport=connected", + ]: + assert expected in smoke_section + + +def test_seat_guide_doc_keeps_direct_mcp_out_of_live_bringup_commands() -> None: + doc = SEAT_GUIDE_DOC.read_text() + no_motion_section = doc.split("Run the real voice path:", maxsplit=1)[0] + + assert "Run the no-motion readiness path" in no_motion_section + assert "dimos mcp call handle_seat_request --json-args" not in no_motion_section + assert "verifier rejects that path" in doc + + +def 
test_go2_system_prompt_mentions_seat_guide_flow() -> None: + assert "handle_seat_request" in SYSTEM_PROMPT + assert "seat_guide_status" in SYSTEM_PROMPT + assert "camera_seat_provider_status" in SYSTEM_PROMPT + assert "web_input_status" in SYSTEM_PROMPT + assert "phone_speaker_test" in SYSTEM_PROMPT + assert "preview_empty_seat_goal" in SYSTEM_PROMPT + assert "seat_guide_navigation_status" in SYSTEM_PROMPT + assert "goal_reached=true" in SYSTEM_PROMPT + assert "STT pipeline is connected" in SYSTEM_PROMPT + assert "set_seat_scene" in SYSTEM_PROMPT + assert "Do not claim live chair/person perception is active" in SYSTEM_PROMPT diff --git a/dimos/agents/skills/test_unitree_skill_container.py b/dimos/agents/skills/test_unitree_skill_container.py index 30bf6139e8..b90292e310 100644 --- a/dimos/agents/skills/test_unitree_skill_container.py +++ b/dimos/agents/skills/test_unitree_skill_container.py @@ -20,6 +20,9 @@ from dimos.core.core import rpc from dimos.core.module import Module from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.geometry_msgs.Quaternion import Quaternion +from dimos.msgs.geometry_msgs.Twist import Twist +from dimos.msgs.geometry_msgs.Vector3 import Vector3 from dimos.navigation.base import NavigationState from dimos.robot.unitree.unitree_skill_container import _UNITREE_COMMANDS, UnitreeSkillContainer @@ -43,6 +46,10 @@ def cancel_goal(self) -> bool: class StubGO2Connection(Module): + @rpc + def move(self, twist: Twist, duration: float = 0.0) -> bool: + return True + @rpc def publish_request(self, topic: str, data: dict[str, Any]) -> dict[Any, Any]: return {} @@ -70,3 +77,16 @@ def test_did_you_mean() -> None: suggestions = difflib.get_close_matches("Pounce", _UNITREE_COMMANDS.keys(), n=3, cutoff=0.6) assert "FrontPounce" in suggestions assert "Pose" in suggestions + + +def test_relative_move_accepts_velocity_style_aliases() -> None: + skill = UnitreeSkillContainer.__new__(UnitreeSkillContainer) + current_pose = PoseStamped( 
+ position=Vector3(1.0, 2.0, 0.0), + orientation=Quaternion.from_euler(Vector3(0.0, 0.0, 0.0)), + ) + + goal = skill._generate_new_goal(current_pose, forward=0.4, left=-0.2, degrees=0.0) + + assert goal.position.x == 1.4 + assert goal.position.y == 1.8 diff --git a/dimos/agents/system_prompt.py b/dimos/agents/system_prompt.py index 54f713f538..15bbd34986 100644 --- a/dimos/agents/system_prompt.py +++ b/dimos/agents/system_prompt.py @@ -32,6 +32,21 @@ - During `start_exploration`, avoid calling other skills except `stop_movement`. - Always run `execute_sport_command("RecoveryStand")` after dynamic movements (flips, jumps, sit) before navigating. +## SeatGuide Flow +- If the user asks for an empty seat, chair, or place to sit in a conference room, call `handle_seat_request` with the user's exact request text. +- If the user asks to preview, preflight, test, or check a SeatGuide request without moving, call `preview_seat_request` instead. +- `handle_seat_request` uses the configured conference room scene provider, requires live camera perception by default, selects an empty seat, starts navigation, and returns feedback text for the web or phone relay. +- Use `seat_guide_readiness_report` as the first no-motion hardware check; it combines scene status, preflight, and goal preview. +- Use `seat_guide_preflight` before the first real hardware run; by default it requires live camera perception and checks navigation and the selected goal without moving. +- Use `seat_guide_status` during bring-up or uncertainty to inspect visible/configured seats and people without moving. +- Use `camera_seat_provider_status` during bring-up to confirm camera frames, odometry, input freshness, VLM credentials, and fallback/override state before running detection. +- Use `web_input_status` during bring-up to confirm browser microphone/text input is running, browser audio upload is connected, the STT pipeline is connected, and requests are routed directly to SeatGuide. 
+- Use `phone_speaker_test` during bring-up when a phone speaker relay is mounted on the robot; Go2 body audio is not a required SeatGuide path. +- Use `preview_empty_seat_goal` before live navigation during bring-up to inspect the selected chair and map-frame goal without moving. +- After a live SeatGuide request starts navigation, use `seat_guide_navigation_status` to verify `goal_reached=true` before claiming the task is complete. +- If the room layout has not been calibrated yet, use `set_seat_scene` with map-frame chair poses and person positions; only pass `require_live_perception=false` for explicit fallback calibration. +- Do not claim live chair/person perception is active unless a real perception-backed scene provider has been configured. + ## GPS Navigation Flow For outdoor/GPS-based navigation: 1. Use `get_gps_position_for_queries` to look up coordinates for landmarks diff --git a/dimos/agents/web_human_input.py b/dimos/agents/web_human_input.py index 0a4fe7c3f3..fe76bdd05d 100644 --- a/dimos/agents/web_human_input.py +++ b/dimos/agents/web_human_input.py @@ -12,12 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from threading import Thread -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import reactivex as rx import reactivex.operators as ops - +import requests + +from dimos.agents.annotation import skill +from dimos.agents.skills.seat_guide import ( + SeatGuideRequestSpec, + is_seat_guide_preview_request, + parse_seat_guide_intent, +) from dimos.constants import DEFAULT_THREAD_JOIN_TIMEOUT from dimos.core.core import rpc from dimos.core.module import Module @@ -32,10 +40,25 @@ logger = setup_logger() +def _create_whisper_node(): + # Do not force English here. SeatGuide's primary demo phrase is Chinese, and + # Whisper can auto-detect language when `language` is omitted. 
+ from dimos.stream.audio.stt.node_whisper import WhisperNode + + return WhisperNode(modelopts={"fp16": False}) + + class WebInput(Module): _web_interface: RobotWebInterface | None = None _thread: Thread | None = None _human_transport: pLCMTransport[str] | None = None + _seat_guide: SeatGuideRequestSpec | None = None + _agent_responses: rx.subject.Subject[str] | None = None + _stt_node: Any | None = None + _stt_error: str | None = None + _speaker_cloud_url_env = "SEAT_GUIDE_SPEAKER_URL" + _speaker_cloud_token_env = "SEAT_GUIDE_SPEAKER_TOKEN" + _speaker_cloud_device_env = "SEAT_GUIDE_SPEAKER_DEVICE" @rpc def start(self) -> None: @@ -45,31 +68,31 @@ def start(self) -> None: audio_subject: rx.subject.Subject[AudioEvent] = rx.subject.Subject() + self._agent_responses = rx.subject.Subject() self._web_interface = RobotWebInterface( port=5555, - text_streams={"agent_responses": rx.subject.Subject()}, + text_streams={"agent_responses": self._agent_responses}, audio_subject=audio_subject, ) - normalizer = AudioNormalizer() - - # Here to prevent unwanted imports in the file. - from dimos.stream.audio.stt.node_whisper import WhisperNode - - stt_node = WhisperNode() + unsub = self._web_interface.query_stream.subscribe(self._route_text) + self.register_disposable(unsub) - # Connect audio pipeline: browser audio → normalizer → whisper - normalizer.consume_audio(audio_subject.pipe(ops.share())) - stt_node.consume_audio(normalizer.emit_audio()) + try: + normalizer = AudioNormalizer() + stt_node = _create_whisper_node() + self._stt_node = stt_node + self._stt_error = None - # Subscribe to both text input sources - # 1. Direct text from web interface - unsub = self._web_interface.query_stream.subscribe(self._human_transport.publish) - self.register_disposable(unsub) + normalizer.consume_audio(audio_subject.pipe(ops.share())) + stt_node.consume_audio(normalizer.emit_audio()) - # 2. 
Transcribed text from STT - unsub = stt_node.emit_text().subscribe(self._human_transport.publish) - self.register_disposable(unsub) + unsub = stt_node.emit_text().subscribe(self._route_text) + self.register_disposable(unsub) + except Exception as exc: + self._stt_node = None + self._stt_error = f"{type(exc).__name__}: {exc}" + logger.exception("WebInput speech-to-text pipeline unavailable") self._thread = Thread(target=self._web_interface.run, daemon=True) self._thread.start() @@ -85,3 +108,131 @@ def stop(self) -> None: if self._human_transport: self._human_transport.lcm.stop() super().stop() + + @skill + def web_input_status(self) -> str: + """Report WebInput voice and text routing readiness. + + Use this during Go2 bring-up to confirm the browser microphone/text + entry point is running, SeatGuide direct routing is connected, and + SeatGuide responses can be streamed back to the web UI. + """ + web_state = "started" if self._web_interface is not None else "not_started" + thread_state = ( + "running" + if self._thread is not None and self._thread.is_alive() + else "not_running" + ) + seat_route = ( + "seat_guide_direct" if self._seat_guide is not None else "agent_only" + ) + response_stream = ( + "connected" if self._agent_responses is not None else "missing" + ) + voice_upload = ( + "connected" + if self._web_interface is not None + and getattr(self._web_interface, "audio_subject", None) is not None + else "missing" + ) + if self._stt_node is not None: + stt_state = "connected" + elif getattr(self, "_stt_error", None): + stt_state = f"error({self._stt_error})" + else: + stt_state = "missing" + human_transport = ( + "connected" if self._human_transport is not None else "missing" + ) + url = ( + f"http://localhost:{self._web_interface.port}" + if self._web_interface is not None + else "unavailable" + ) + return ( + f"WebInput status: web={web_state}; thread={thread_state}; " + f"seat_route={seat_route}; responses={response_stream}; " + 
f"voice_upload={voice_upload}; stt={stt_state}; " + f"human_transport={human_transport}; url={url}." + ) + + @skill + def phone_speaker_test(self, text: str = "SeatGuide speaker test.") -> str: + """Send a test message to the browser or phone speaker page. + + Args: + text: Text to speak on the connected browser or phone speaker page. + """ + if self._agent_responses is None: + local_result = "local=missing" + else: + self._publish_agent_response(text) + local_result = "local=sent" + cloud_result = self._post_cloud_speaker(text) + return f"Phone speaker test sent: {text}; {local_result}; {cloud_result}" + + @skill + def phone_seat_request(self, text: str = "Find an empty seat.") -> str: + """Route a SeatGuide request and speak the result on the phone page. + + Args: + text: SeatGuide request text, for example asking for an empty seat. + """ + if self._seat_guide is None: + return "SeatGuide direct route is not connected." + response = self._seat_guide.handle_seat_request(text) + self._publish_agent_response(response) + self._post_cloud_speaker(response) + return response + + def _route_text(self, text: str) -> None: + logger.info("WebInput received text", text=text) + if parse_seat_guide_intent(text).should_find_seat and self._seat_guide is not None: + try: + if is_seat_guide_preview_request(text): + logger.info("WebInput routing text to SeatGuide preview", text=text) + response = self._seat_guide.preview_seat_request(text) + else: + logger.info("WebInput routing text to SeatGuide live request", text=text) + response = self._seat_guide.handle_seat_request(text) + self._publish_agent_response(response) + if not is_seat_guide_preview_request(text): + self._post_cloud_speaker(response) + return + except Exception: + logger.exception( + "SeatGuide direct route failed; publishing text to normal agent path" + ) + + if self._human_transport is None: + logger.warning("Dropping human input because human transport is not initialized") + return + logger.info("WebInput routing 
text to agent path", text=text) + self._human_transport.publish(text) + + def _publish_agent_response(self, text: str) -> None: + if self._agent_responses is None: + return + self._agent_responses.on_next(text) + + def _post_cloud_speaker(self, text: str) -> str: + base_url = os.environ.get(self._speaker_cloud_url_env) + if not base_url: + return "cloud=not_configured" + token = os.environ.get(self._speaker_cloud_token_env) + device = os.environ.get(self._speaker_cloud_device_env, "go2-demo") + headers = {"content-type": "application/json"} + if token: + headers["authorization"] = f"Bearer {token}" + try: + response = requests.post( + f"{base_url.rstrip('/')}/api/speak", + json={"device": device, "text": text}, + headers=headers, + timeout=5.0, + ) + response.raise_for_status() + except requests.RequestException as exc: + logger.warning("Cloud phone speaker post failed", error=str(exc)) + return f"cloud=error({type(exc).__name__})" + return "cloud=sent" diff --git a/dimos/constants.py b/dimos/constants.py index d849f4aaf3..5b83bce636 100644 --- a/dimos/constants.py +++ b/dimos/constants.py @@ -45,6 +45,11 @@ DEFAULT_CAPACITY_COLOR_IMAGE = 1920 * 1080 * 3 # Default depth image size: 1280x720 frame * 4 (float32 size) DEFAULT_CAPACITY_DEPTH_IMAGE = 1280 * 720 * 4 +# Fixed-capacity SHM channels must be sized before the first message arrives. +# These defaults cover current Go2 replay and navigation payloads while keeping +# large local streams off UDP multicast. 
+DEFAULT_CAPACITY_POINTCLOUD = 64 * 1024 * 1024 +DEFAULT_CAPACITY_OCCUPANCY_GRID = 16 * 1024 * 1024 # From https://github.com/lcm-proj/lcm.git LCM_MAX_CHANNEL_NAME_LENGTH = 63 diff --git a/dimos/core/coordination/module_coordinator.py b/dimos/core/coordination/module_coordinator.py index 60c41a6ba1..d0593b8e02 100644 --- a/dimos/core/coordination/module_coordinator.py +++ b/dimos/core/coordination/module_coordinator.py @@ -30,7 +30,14 @@ from dimos.core.global_config import GlobalConfig, global_config from dimos.core.module import ModuleBase, ModuleSpec from dimos.core.resource import Resource -from dimos.core.transport import LCMTransport, PubSubTransport, pLCMTransport +from dimos.core.transport import ( + JpegShmTransport, + LCMTransport, + PubSubTransport, + SHMTransport, + pLCMTransport, + pSHMTransport, +) from dimos.spec.utils import is_spec, spec_annotation_compliance, spec_structural_compliance from dimos.utils.generic import short_id from dimos.utils.logging_config import setup_logger @@ -279,6 +286,9 @@ def _connect_streams(self, blueprint: Blueprint) -> None: module=module.__name__, transport=transport.__class__.__name__, ) + # SHM streams are concrete transport objects, not LCM topics. Forward + # them to Rerun after stream wiring has resolved the transport registry. + _configure_rerun_bridge_visual_transports(self) @classmethod def build( @@ -584,6 +594,31 @@ def _get_transport_for(blueprint: Blueprint, name: str, stream_type: type) -> Pu return transport +def _configure_rerun_bridge_visual_transports(coordinator: ModuleCoordinator) -> None: + """Send resolved SHM transports to an active Rerun bridge. + + RerunBridgeModule subscribes to configured pubsubs directly. For SHM + streams, the coordinator forwards the concrete transport objects after + stream wiring has selected them. 
+ """ + from dimos.visualization.rerun.bridge import RerunBridgeModule + + if RerunBridgeModule not in coordinator._deployed_modules: + return + + # LCM transports are already visible through RerunBridgeModule.config.pubsubs. + transports = [ + transport + for transport in coordinator._transport_registry.values() + if isinstance(transport, SHMTransport | pSHMTransport | JpegShmTransport) + ] + if not transports: + return + + bridge = coordinator.get_instance(RerunBridgeModule) + bridge.set_visual_transports(transports) + + def _verify_no_name_conflicts(blueprint: Blueprint) -> None: name_to_types: dict[Any, set[type]] = defaultdict(set) name_to_modules: dict[Any, list[tuple[type, type]]] = defaultdict(list) diff --git a/dimos/core/transport.py b/dimos/core/transport.py index 6435003758..de2ced9cbb 100644 --- a/dimos/core/transport.py +++ b/dimos/core/transport.py @@ -163,14 +163,23 @@ def stop(self) -> None: class pSHMTransport(PubSubTransport[T]): + """Pickled shared-memory transport for local Python object streams.""" + _started: bool = False def __init__(self, topic: str, **kwargs) -> None: # type: ignore[no-untyped-def] super().__init__(topic) + self._kwargs = kwargs self.shm = PickleSharedMemory(**kwargs) def __reduce__(self): # type: ignore[no-untyped-def] - return (pSHMTransport, (self.topic,)) + # Preserve sizing options such as default_capacity when the coordinator + # sends this transport to workers or to Rerun. 
+ return (pSHMTransport, (self.topic,), self._kwargs) + + def __setstate__(self, state: dict[str, Any]) -> None: + self._kwargs = state + self.shm = PickleSharedMemory(**state) def broadcast(self, _, msg) -> None: # type: ignore[no-untyped-def] if not self._started: @@ -193,14 +202,23 @@ def stop(self) -> None: class SHMTransport(PubSubTransport[T]): + """Raw bytes shared-memory transport for local fixed-size payloads.""" + _started: bool = False def __init__(self, topic: str, **kwargs) -> None: # type: ignore[no-untyped-def] super().__init__(topic) + self._kwargs = kwargs self.shm = BytesSharedMemory(**kwargs) def __reduce__(self): # type: ignore[no-untyped-def] - return (SHMTransport, (self.topic,)) + # Preserve sizing options such as default_capacity when the coordinator + # sends this transport to workers or to Rerun. + return (SHMTransport, (self.topic,), self._kwargs) + + def __setstate__(self, state: dict[str, Any]) -> None: + self._kwargs = state + self.shm = BytesSharedMemory(**state) def broadcast(self, _, msg) -> None: # type: ignore[no-untyped-def] if not self._started: @@ -223,6 +241,8 @@ def stop(self) -> None: class JpegShmTransport(PubSubTransport[T]): + """JPEG-compressed shared-memory transport for local image streams.""" + _started: bool = False def __init__(self, topic: str, quality: int = 75, **kwargs) -> None: # type: ignore[no-untyped-def] @@ -233,9 +253,19 @@ def __init__(self, topic: str, quality: int = 75, **kwargs) -> None: # type: ig self.shm = JpegSharedMemory(quality=quality, **kwargs) self.quality = quality + self._kwargs = kwargs def __reduce__(self): # type: ignore[no-untyped-def] - return (JpegShmTransport, (self.topic, self.quality)) + # Preserve quality and sizing options when crossing worker boundaries. 
+ return (JpegShmTransport, (self.topic, self.quality), self._kwargs) + + def __setstate__(self, state: dict[str, Any]) -> None: + from dimos.protocol.pubsub.impl.jpeg_shm import ( + JpegSharedMemory, + ) # deferred to avoid pulling in Image/cv2/rerun + + self._kwargs = state + self.shm = JpegSharedMemory(quality=self.quality, **state) def broadcast(self, _, msg) -> None: # type: ignore[no-untyped-def] if not self._started: diff --git a/dimos/models/vl/moondream.py b/dimos/models/vl/moondream.py index e3cfe744ce..b181a92879 100644 --- a/dimos/models/vl/moondream.py +++ b/dimos/models/vl/moondream.py @@ -36,8 +36,9 @@ class MoondreamConfig(HuggingFaceModelConfig, VlModelConfig): """Configuration for MoondreamVlModel.""" model_name: str = "vikhyatk/moondream2" - dtype: torch.dtype = torch.bfloat16 + dtype: torch.dtype = torch.float32 auto_resize: tuple[int, int] | None = MOONDREAM_DEFAULT_AUTO_RESIZE + compile_model: bool = False class MoondreamVlModel(HuggingFaceModel, VlModel): @@ -52,7 +53,8 @@ def _model(self) -> AutoModelForCausalLM: trust_remote_code=self.config.trust_remote_code, torch_dtype=self.config.dtype, ).to(self.config.device) - model.compile() + if self.config.compile_model: + model.compile() return model def _to_pil(self, image: Image | np.ndarray[Any, Any]) -> PILImage.Image: diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index 6fbf0138bb..ae3dc8c057 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -101,11 +101,17 @@ "unitree-go2-coordinator": "dimos.robot.unitree.go2.blueprints.basic.unitree_go2_coordinator:unitree_go2_coordinator", "unitree-go2-detection": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2_detection:unitree_go2_detection", "unitree-go2-fleet": "dimos.robot.unitree.go2.blueprints.basic.unitree_go2_fleet:unitree_go2_fleet", + "unitree-go2-guide": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_guide:unitree_go2_guide", "unitree-go2-keyboard-teleop": 
"dimos.robot.unitree.go2.blueprints.basic.unitree_go2_keyboard_teleop:unitree_go2_keyboard_teleop", "unitree-go2-markers": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2:unitree_go2_markers", "unitree-go2-memory": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2:unitree_go2_memory", "unitree-go2-relocalization": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2:unitree_go2_relocalization", "unitree-go2-ros": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2_ros:unitree_go2_ros", + "unitree-go2-seat-demo": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_seat_demo:unitree_go2_seat_demo", + "unitree-go2-seat-demo-record": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_seat_demo_record:unitree_go2_seat_demo_record", + "unitree-go2-seat-demo-reuse": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_seat_demo_reuse:unitree_go2_seat_demo_reuse", + "unitree-go2-seat-guide": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_seat_guide:unitree_go2_seat_guide", + "unitree-go2-seat-guide-agentic": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_seat_guide_agentic:unitree_go2_seat_guide_agentic", "unitree-go2-security": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_security:unitree_go2_security", "unitree-go2-spatial": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2_spatial:unitree_go2_spatial", "unitree-go2-temporal-memory": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_temporal_memory:unitree_go2_temporal_memory", @@ -129,6 +135,7 @@ "b-box-navigation-module": "dimos.navigation.bbox_navigation.BBoxNavigationModule", "b1-connection-module": "dimos.robot.unitree.b1.connection.B1ConnectionModule", "camera-module": "dimos.hardware.sensors.camera.module.CameraModule", + "camera-seat-observation-provider": "dimos.agents.skills.seat_guide.CameraSeatObservationProvider", "cartesian-motion-controller": "dimos.manipulation.control.servo_control.cartesian_motion_controller.CartesianMotionController", 
"control-coordinator": "dimos.control.coordinator.ControlCoordinator", "cost-mapper": "dimos.mapping.costmapper.CostMapper", @@ -200,12 +207,16 @@ "replanning-a-star-planner": "dimos.navigation.replanning_a_star.module.ReplanningAStarPlanner", "rerun-bridge-module": "dimos.visualization.rerun.bridge.RerunBridgeModule", "rerun-web-socket-server": "dimos.visualization.rerun.websocket_server.RerunWebSocketServer", + "seat-finder-skill": "dimos.agents.skills.seat_finder.SeatFinderSkill", + "seat-guide-skill-container": "dimos.agents.skills.seat_guide.SeatGuideSkillContainer", + "seat-planner": "dimos.agents.skills.seat_planner.SeatPlanner", "security-module": "dimos.experimental.security_demo.security_module.SecurityModule", "semantic-search": "dimos.memory2.module.SemanticSearch", "simple-phone-teleop": "dimos.teleop.phone.phone_extensions.SimplePhoneTeleop", "simple-planner": "dimos.navigation.nav_stack.modules.simple_planner.simple_planner.SimplePlanner", "spatial-memory": "dimos.perception.spatial_perception.SpatialMemory", "speak-skill": "dimos.agents.skills.speak_skill.SpeakSkill", + "synthetic-seat-observation-provider": "dimos.agents.skills.seat_guide.SyntheticSeatObservationProvider", "tare-planner": "dimos.navigation.nav_stack.modules.tare_planner.tare_planner.TarePlanner", "temporal-memory": "dimos.perception.experimental.temporal_memory.temporal_memory.TemporalMemory", "terrain-analysis": "dimos.navigation.nav_stack.modules.terrain_analysis.terrain_analysis.TerrainAnalysis", diff --git a/dimos/robot/unitree/go2/blueprints/agentic/_seat_guide_agentic.py b/dimos/robot/unitree/go2/blueprints/agentic/_seat_guide_agentic.py new file mode 100644 index 0000000000..66e98713c7 --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/agentic/_seat_guide_agentic.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# Copyright 2026 Dimensional Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dimos.agents.skills.navigation import NavigationSkillContainer +from dimos.agents.skills.seat_guide import CameraSeatObservationProvider, SeatGuideSkillContainer +from dimos.agents.web_human_input import WebInput +from dimos.core.coordination.blueprints import autoconnect +from dimos.robot.unitree.unitree_skill_container import UnitreeSkillContainer + +_seat_guide_agentic = autoconnect( + NavigationSkillContainer.blueprint(), + CameraSeatObservationProvider.blueprint(), + SeatGuideSkillContainer.blueprint(), + UnitreeSkillContainer.blueprint(), + WebInput.blueprint(), +) + +__all__ = ["_seat_guide_agentic"] diff --git a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_guide.py b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_guide.py new file mode 100644 index 0000000000..7a5450ec1d --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_guide.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Slim "guide dog" blueprint: navigate a leashed person to an empty seat. + +Trimmed from unitree-go2-agentic to keep GPU/compute light for guiding. Drops +the heavy modules that are unused here: SecurityModule (EdgeTAM, eager CUDA +load), SpatialMemory (CLIP), PerceiveLoopSkill and PersonFollowSkill (we lead, +not follow). Keeps the base nav stack, the agent, voice I/O and SeatFinder. + +SeatFinder runs its own continuous, sharpness-filtered YOLO stream and publishes +an annotated frame on ``/seatfinder/detections`` so the viewer can show it. +""" + +from dimos.agents.mcp.mcp_client import McpClient +from dimos.agents.mcp.mcp_server import McpServer +from dimos.agents.skills.seat_finder import SeatFinderSkill +from dimos.agents.skills.speak_skill import SpeakSkill +from dimos.agents.web_human_input import WebInput +from dimos.core.coordination.blueprints import autoconnect +from dimos.core.transport import LCMTransport +from dimos.msgs.sensor_msgs.Image import Image +from dimos.robot.unitree.go2.blueprints.smart.unitree_go2 import unitree_go2 +from dimos.robot.unitree.go2.connection import GO2Connection +from dimos.robot.unitree.unitree_skill_container import UnitreeSkillContainer + +unitree_go2_guide = ( + autoconnect( + unitree_go2, + McpServer.blueprint(), + McpClient.blueprint(), + SeatFinderSkill.blueprint(camera_info=GO2Connection.camera_info_static), + UnitreeSkillContainer.blueprint(), + WebInput.blueprint(), + SpeakSkill.blueprint(), + ) + .remappings( + [ + # 3D projection needs a world-frame cloud; use the VoxelGrid map + # (the raw GO2 /pointcloud is not populated here), like Detection3D. 
+ (SeatFinderSkill, "pointcloud", "global_map"), + ] + ) + .transports( + { + ("detections_image", SeatFinderSkill): LCMTransport( + "/seatfinder/detections", Image + ), + } + ) + .global_config(n_workers=8, robot_model="unitree_go2") +) + +__all__ = ["unitree_go2_guide"] diff --git a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_seat_demo.py b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_seat_demo.py new file mode 100644 index 0000000000..d50c6881ee --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_seat_demo.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Manual-map → on-demand YOLO seat-find demo (no LLM). + +Operator flow: + 1. Launch this blueprint. Rerun opens with the live camera + global map. + 2. Drive the Go2 manually — click-to-goal on the map in Rerun, or use + keyboard teleop — to explore and build the voxel map. + 3. From another terminal, trigger detection on demand: + dimos mcp call find_empty_seat_now + SeatPlanner picks an empty seat in the current view, projects it to 3D, + and publishes goal_request. A* draws the path and (if MovementManager is + enabled) the robot walks there. + +McpServer exposes the @skill over HTTP; McpClient (LLM agent) is not included. 
+""" + +from dimos.agents.mcp.mcp_server import McpServer +from dimos.agents.skills.seat_planner import SeatPlanner +from dimos.agents.web_human_input import WebInput +from dimos.core.coordination.blueprints import autoconnect +from dimos.core.transport import LCMTransport +from dimos.msgs.sensor_msgs.Image import Image +from dimos.robot.unitree.go2.blueprints.smart.unitree_go2 import unitree_go2 +from dimos.robot.unitree.go2.connection import GO2Connection + +unitree_go2_seat_demo = ( + autoconnect( + unitree_go2, + McpServer.blueprint(), + WebInput.blueprint(), + SeatPlanner.blueprint(camera_info=GO2Connection.camera_info_static), + ) + .remappings( + [ + (SeatPlanner, "pointcloud", "global_map"), + ] + ) + .transports( + { + ("detections_image", SeatPlanner): LCMTransport( + "/seatplanner/detections", Image + ), + } + ) + .global_config(n_workers=6, robot_model="unitree_go2") +) + +__all__ = ["unitree_go2_seat_demo"] diff --git a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_seat_demo_record.py b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_seat_demo_record.py new file mode 100644 index 0000000000..9f80b5a557 --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_seat_demo_record.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Seat-demo + recording: makes a reusable map while you map manually. 
+ +Same skills as `unitree-go2-seat-demo` but adds Go2Memory recording on top +of the plain smart stack — *without* MarkerTfModule so we don't get TF spam +from missing AprilTags. LiDAR/odom/color get written to `recording_go2.db` +for later premap export. + +Operator flow (paired with `unitree-go2-seat-demo-reuse`): + 1. dimos run unitree-go2-seat-demo-record + → manually click-to-goal in Rerun and walk the robot over the area. + 2. Ctrl+C to stop (recording is flushed to recording_go2.db in the cwd). + 3. dimos export-premap recording_go2 + → produces data/recording_go2_twopass_map.pc2.lcm + 4. Next session: use `unitree-go2-seat-demo-reuse` with that premap. +""" + +from dimos.agents.mcp.mcp_server import McpServer +from dimos.agents.skills.seat_planner import SeatPlanner +from dimos.agents.web_human_input import WebInput +from dimos.core.coordination.blueprints import autoconnect +from dimos.core.transport import LCMTransport +from dimos.msgs.sensor_msgs.Image import Image +from dimos.robot.unitree.go2.blueprints.smart.unitree_go2 import Go2Memory, unitree_go2 +from dimos.robot.unitree.go2.connection import GO2Connection + +unitree_go2_seat_demo_record = ( + autoconnect( + unitree_go2, + Go2Memory.blueprint(), + McpServer.blueprint(), + WebInput.blueprint(), + SeatPlanner.blueprint(camera_info=GO2Connection.camera_info_static), + ) + .remappings( + [ + (SeatPlanner, "pointcloud", "global_map"), + ] + ) + .transports( + { + ("detections_image", SeatPlanner): LCMTransport( + "/seatplanner/detections", Image + ), + } + ) + .global_config(n_workers=8, robot_model="unitree_go2") +) + +__all__ = ["unitree_go2_seat_demo_record"] diff --git a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_seat_demo_reuse.py b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_seat_demo_reuse.py new file mode 100644 index 0000000000..312fe6545d --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_seat_demo_reuse.py @@ -0,0 +1,63 @@ +#!/usr/bin/env 
python3 +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Seat-demo on top of a previously-recorded premap. + +Same skills as `unitree-go2-seat-demo` but layered on `unitree_go2_relocalization` +so a saved premap is loaded and the live scan gets ICP-aligned to it. Once +relocalize succeeds you can navigate by map-frame coordinates without +re-walking the room. + +Operator flow: + 1. dimos run unitree-go2-seat-demo-reuse \\ + -o relocalizationmodule.map_file= \\ + -o relocalizationmodule.publish_loaded_map=true + 2. Wait for `relocalize: fitness=... TF 'world' -> 'map' published` in the log. + 3. Click-to-goal in Rerun / dimos mcp call navigate_to_point / voice + "椅子まで行って" — same as the no-map demo. 
+""" + +from dimos.agents.mcp.mcp_server import McpServer +from dimos.agents.skills.seat_planner import SeatPlanner +from dimos.agents.web_human_input import WebInput +from dimos.core.coordination.blueprints import autoconnect +from dimos.core.transport import LCMTransport +from dimos.msgs.sensor_msgs.Image import Image +from dimos.robot.unitree.go2.blueprints.smart.unitree_go2 import unitree_go2_relocalization +from dimos.robot.unitree.go2.connection import GO2Connection + +unitree_go2_seat_demo_reuse = ( + autoconnect( + unitree_go2_relocalization, + McpServer.blueprint(), + WebInput.blueprint(), + SeatPlanner.blueprint(camera_info=GO2Connection.camera_info_static), + ) + .remappings( + [ + (SeatPlanner, "pointcloud", "global_map"), + ] + ) + .transports( + { + ("detections_image", SeatPlanner): LCMTransport( + "/seatplanner/detections", Image + ), + } + ) + .global_config(n_workers=8, robot_model="unitree_go2") +) + +__all__ = ["unitree_go2_seat_demo_reuse"] diff --git a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_seat_guide.py b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_seat_guide.py new file mode 100644 index 0000000000..ba93341858 --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_seat_guide.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dimos.agents.mcp.mcp_server import McpServer +from dimos.core.coordination.blueprints import autoconnect +from dimos.robot.unitree.go2.blueprints.agentic._seat_guide_agentic import _seat_guide_agentic +from dimos.robot.unitree.go2.blueprints.smart.unitree_go2 import unitree_go2 + +unitree_go2_seat_guide = autoconnect( + unitree_go2, + McpServer.blueprint(), + _seat_guide_agentic, +).global_config(n_workers=10) + +__all__ = ["unitree_go2_seat_guide"] diff --git a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_seat_guide_agentic.py b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_seat_guide_agentic.py new file mode 100644 index 0000000000..02dfd6c32b --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_seat_guide_agentic.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dimos.agents.mcp.mcp_client import McpClient +from dimos.agents.mcp.mcp_server import McpServer +from dimos.core.coordination.blueprints import autoconnect +from dimos.robot.unitree.go2.blueprints.agentic._seat_guide_agentic import _seat_guide_agentic +from dimos.robot.unitree.go2.blueprints.smart.unitree_go2 import unitree_go2 + +unitree_go2_seat_guide_agentic = autoconnect( + unitree_go2, + McpServer.blueprint(), + McpClient.blueprint(), + _seat_guide_agentic, +).global_config(n_workers=10) + +__all__ = ["unitree_go2_seat_guide_agentic"] diff --git a/dimos/robot/unitree/go2/blueprints/basic/unitree_go2_basic.py b/dimos/robot/unitree/go2/blueprints/basic/unitree_go2_basic.py index 96a291163d..ea315119c5 100644 --- a/dimos/robot/unitree/go2/blueprints/basic/unitree_go2_basic.py +++ b/dimos/robot/unitree/go2/blueprints/basic/unitree_go2_basic.py @@ -14,29 +14,47 @@ # See the License for the specific language governing permissions and # limitations under the License. -import platform from typing import Any -from dimos.constants import DEFAULT_CAPACITY_COLOR_IMAGE +from dimos.constants import ( + DEFAULT_CAPACITY_COLOR_IMAGE, + DEFAULT_CAPACITY_OCCUPANCY_GRID, + DEFAULT_CAPACITY_POINTCLOUD, +) from dimos.core.coordination.blueprints import autoconnect from dimos.core.global_config import global_config from dimos.core.transport import pSHMTransport +from dimos.msgs.nav_msgs.OccupancyGrid import OccupancyGrid from dimos.msgs.sensor_msgs.Image import Image +from dimos.msgs.sensor_msgs.PointCloud2 import PointCloud2 from dimos.robot.unitree.go2.connection import GO2Connection from dimos.visualization.vis_module import vis_module -# Mac has some issue with high bandwidth UDP, so we use pSHMTransport for color_image -# actually we can use pSHMTransport for all platforms, and for all streams -# TODO need a global transport toggle on blueprints/global config -_mac_transports: dict[tuple[str, type], pSHMTransport[Image]] = { +# Route large local replay and mapping 
streams through SHM on every platform. +# Small control/status streams continue to use the default LCM transport. +_local_high_bandwidth_transports: dict[tuple[str, type], pSHMTransport[Any]] = { ("color_image", Image): pSHMTransport( - "color_image", default_capacity=DEFAULT_CAPACITY_COLOR_IMAGE + "/color_image", default_capacity=DEFAULT_CAPACITY_COLOR_IMAGE + ), + ("lidar", PointCloud2): pSHMTransport("/lidar", default_capacity=DEFAULT_CAPACITY_POINTCLOUD), + ("pointcloud", PointCloud2): pSHMTransport( + "/pointcloud", default_capacity=DEFAULT_CAPACITY_POINTCLOUD + ), + ("global_map", PointCloud2): pSHMTransport( + "/global_map", default_capacity=DEFAULT_CAPACITY_POINTCLOUD + ), + ("merged_map", PointCloud2): pSHMTransport( + "/merged_map", default_capacity=DEFAULT_CAPACITY_POINTCLOUD + ), + ("global_costmap", OccupancyGrid): pSHMTransport( + "/global_costmap", default_capacity=DEFAULT_CAPACITY_OCCUPANCY_GRID + ), + ("navigation_costmap", OccupancyGrid): pSHMTransport( + "/navigation_costmap", default_capacity=DEFAULT_CAPACITY_OCCUPANCY_GRID ), } -_transports_base = ( - autoconnect() if platform.system() == "Linux" else autoconnect().transports(_mac_transports) -) +_transports_base = autoconnect().transports(_local_high_bandwidth_transports) def _convert_camera_info(camera_info: Any) -> Any: diff --git a/dimos/robot/unitree/go2/connection.py b/dimos/robot/unitree/go2/connection.py index 5568a473ef..2e5ac52996 100644 --- a/dimos/robot/unitree/go2/connection.py +++ b/dimos/robot/unitree/go2/connection.py @@ -307,7 +307,7 @@ def _publish_tf(self, msg: PoseStamped) -> None: def publish_camera_info(self) -> None: while True: - self.camera_info.publish(self.camera_info_static) + self.camera_info.publish(self.camera_info_static.with_ts(time.time())) time.sleep(1.0) @rpc diff --git a/dimos/robot/unitree/go2/connection_spec.py b/dimos/robot/unitree/go2/connection_spec.py index dd6aab9c40..0a2614a95f 100644 --- a/dimos/robot/unitree/go2/connection_spec.py +++ 
b/dimos/robot/unitree/go2/connection_spec.py @@ -14,8 +14,10 @@ from typing import Any, Protocol +from dimos.msgs.geometry_msgs.Twist import Twist from dimos.spec.utils import Spec class GO2ConnectionSpec(Spec, Protocol): + def move(self, twist: Twist, duration: float = 0.0) -> bool: ... def publish_request(self, topic: str, data: dict[str, Any]) -> dict[Any, Any]: ... diff --git a/dimos/robot/unitree/unitree_skill_container.py b/dimos/robot/unitree/unitree_skill_container.py index 88194473e6..bacb5584e4 100644 --- a/dimos/robot/unitree/unitree_skill_container.py +++ b/dimos/robot/unitree/unitree_skill_container.py @@ -26,6 +26,7 @@ from dimos.core.module import Module from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped from dimos.msgs.geometry_msgs.Quaternion import Quaternion +from dimos.msgs.geometry_msgs.Twist import Twist from dimos.msgs.geometry_msgs.Vector3 import Vector3 from dimos.navigation.base import NavigationState from dimos.navigation.navigation_spec import NavigationInterfaceSpec @@ -208,15 +209,49 @@ def stop(self) -> None: super().stop() @skill - def relative_move(self, forward: float = 0.0, left: float = 0.0, degrees: float = 0.0) -> str: + def direct_move( + self, x: float, y: float = 0.0, yaw: float = 0.0, duration: float = 1.0 + ) -> str: + """Move the Go2 with direct velocity commands for hardware bring-up. + + Use this before navigation-based SeatGuide tests to verify that the robot + can receive and execute low-level movement commands. Keep values small + during bring-up. + + Args: + x: Forward velocity in meters per second. Negative moves backward. + y: Left velocity in meters per second. Negative moves right. + yaw: Counter-clockwise yaw velocity in radians per second. Negative turns right. + duration: How long to keep sending the command in seconds. 
+ """ + x, y, yaw, duration = float(x), float(y), float(yaw), float(duration) + twist = Twist(linear=Vector3(x, y, 0.0), angular=Vector3(0.0, 0.0, yaw)) + if self._connection.move(twist, duration=duration): + return f"Direct move sent: x={x}, y={y}, yaw={yaw}, duration={duration}." + return "Direct move failed to send." + + @skill + def relative_move( + self, + forward: float = 0.0, + left: float = 0.0, + degrees: float = 0.0, + x: float = 0.0, + y: float = 0.0, + duration: float = 0.0, + ) -> str: """Move the robot relative to its current position. The `degrees` arguments refers to the rotation the robot should be at the end, relative to its current rotation. + The `x`, `y`, and `duration` arguments are accepted for compatibility with + velocity-style movement requests; `x` maps to `forward`, `y` maps to `left`, + and `duration` is ignored because this skill sends a relative navigation goal. Example calls: # Move to a point that's 2 meters forward and 1 to the right. relative_move(forward=2, left=-1, degrees=0) + relative_move(x=2, y=-1, degrees=0) # Move back 1 meter, while still facing the same direction. 
relative_move(forward=-1, left=0, degrees=0) @@ -228,6 +263,11 @@ def relative_move(self, forward: float = 0.0, left: float = 0.0, degrees: float relative_move(forward=0, left=3, degrees=90) """ forward, left, degrees = float(forward), float(left), float(degrees) + x, y = float(x), float(y) + if forward == 0.0 and x != 0.0: + forward = x + if left == 0.0 and y != 0.0: + left = y tf = self.tf.get("world", "base_link") if tf is None: diff --git a/dimos/visualization/rerun/bridge.py b/dimos/visualization/rerun/bridge.py index 2f5fb1efa9..eb00f23730 100644 --- a/dimos/visualization/rerun/bridge.py +++ b/dimos/visualization/rerun/bridge.py @@ -43,6 +43,7 @@ from dimos.core.core import rpc from dimos.core.module import Module, ModuleConfig +from dimos.core.transport import PubSubTransport from dimos.protocol.pubsub.impl.lcmpubsub import LCM from dimos.protocol.pubsub.patterns import Glob, pattern_matches from dimos.protocol.pubsub.spec import SubscribeAllCapable @@ -164,7 +165,10 @@ def _default_blueprint() -> Blueprint: class Config(ModuleConfig): + # Pubsubs cover discoverable sources such as LCM. visual_transports is + # populated by the coordinator for concrete local streams such as SHM. pubsubs: list[SubscribeAllCapable[Any, Any]] = field(default_factory=lambda: [LCM()]) + visual_transports: list[PubSubTransport[Any]] = field(default_factory=list) visual_override: dict[Glob | str, Callable[[Any], Archetype] | None] = field( default_factory=dict @@ -186,10 +190,12 @@ class Config(ModuleConfig): class RerunBridgeModule(Module): - """Bridge that logs messages from pubsubs to Rerun. + """Bridge that logs transport messages to Rerun. - Spawns its own Rerun viewer and subscribes to all topics on each provided - pubsub. Any message that has a to_rerun() method is automatically logged. + Spawns its own Rerun viewer and subscribes to configured pubsubs and + explicit visual transports. 
Pubsubs cover discoverable transports such as + LCM; visual_transports covers concrete local transports such as SHM. + Any message that has a to_rerun() method is automatically logged. Example: from dimos.protocol.pubsub.impl.lcmpubsub import LCM @@ -215,6 +221,8 @@ def __init__(self, **kwargs: Any) -> None: self._last_log = {} self._override_cache: dict[str, Callable[[Any], RerunData | None]] = {} self._frame_attached: dict[str, str] = {} + self._subscribed_visual_transport_topics: set[str] = set() + self._started = False @property def host(self) -> str: @@ -265,13 +273,56 @@ def composed(msg: Any) -> RerunData | None: return composed def _get_entity_path(self, topic: Any) -> str: + """Map a transport topic to a Rerun entity path. + + LCM topics usually already include a leading slash and a type suffix. + SHM topics are plain strings. Normalize both forms so visual overrides + such as "world/color_image" match consistently. + """ if self.config.topic_to_entity: return self.config.topic_to_entity(topic) topic_str = getattr(topic, "name", None) or str(topic) topic_str = topic_str.split("#")[0] # strip LCM topic suffix + if not topic_str.startswith("/"): + topic_str = f"/{topic_str}" return f"{self.config.entity_prefix}{topic_str}" + @rpc + def set_visual_transports(self, transports: list[PubSubTransport[Any]]) -> None: + """Replace explicit visual transports and subscribe when running. + + The coordinator calls this after stream wiring and after loading + additional blueprints into an existing coordinator. 
+ """ + self.config.visual_transports = transports + if self._started: + self._subscribe_visual_transports() + + def _subscribe_visual_transports(self) -> None: + """Attach to configured SHM streams once per topic.""" + for transport in self.config.visual_transports: + topic = str(getattr(transport, "topic", "")) + if not topic or topic in self._subscribed_visual_transport_topics: + continue + self._subscribed_visual_transport_topics.add(topic) + if hasattr(transport, "start"): + transport.start() + # If subscribe raises, the bridge still owns cleanup for the transport it started. + self.register_disposable(Disposable(transport.stop)) + transport_topic = getattr(transport, "topic", topic) + + def on_visual_message(msg: Any, transport_topic: Any = transport_topic) -> None: + self._on_message(msg, transport_topic) + + unsub = transport.subscribe( + # Capture the current topic so callbacks keep the correct + # entity path even as this loop advances to the next transport. + on_visual_message + ) + if unsub is not None: + self.register_disposable(Disposable(unsub)) + def _on_message(self, msg: Any, topic: Any) -> None: """Handle incoming message - log to rerun.""" @@ -306,6 +357,7 @@ def _on_message(self, msg: Any, topic: Any) -> None: @rpc def start(self) -> None: super().start() + self._started = True logger.info("Rerun bridge starting") @@ -397,6 +449,8 @@ def start(self) -> None: unsub = pubsub.subscribe_all(self._on_message) self.register_disposable(Disposable(unsub)) + self._subscribe_visual_transports() + for pubsub in self.config.pubsubs: if hasattr(pubsub, "stop"): self.register_disposable(Disposable(pubsub.stop)) # type: ignore[union-attr] @@ -506,8 +560,10 @@ def log_blueprint_graph(self, dot_code: str, module_names: list[str]) -> None: @rpc def stop(self) -> None: + self._started = False self._override_cache.clear() self._frame_attached.clear() + self._subscribed_visual_transport_topics.clear() super().stop() diff --git 
a/dimos/web/dimos_interface/api/server.py b/dimos/web/dimos_interface/api/server.py index b73a1e5fdb..64f7857362 100644 --- a/dimos/web/dimos_interface/api/server.py +++ b/dimos/web/dimos_interface/api/server.py @@ -26,6 +26,7 @@ # Fast Api & Uvicorn import asyncio +import base64 # For audio processing import io @@ -47,6 +48,7 @@ from reactivex.disposable import SingleAssignmentDisposable import soundfile as sf # type: ignore[import-untyped] from sse_starlette.sse import EventSourceResponse +from starlette.concurrency import run_in_threadpool import uvicorn from dimos.core.global_config import global_config @@ -101,6 +103,9 @@ def __init__( # type: ignore[no-untyped-def] self.query_subject = rx.subject.Subject() # type: ignore[var-annotated] self.query_stream = self.query_subject.pipe(ops.share()) self.audio_subject = audio_subject + self._seat_guide_model = None + self._seat_guide_yolo_detector = None + self._seat_guide_model_lock = Lock() for key in self.streams: if self.streams[key] is not None: @@ -314,6 +319,47 @@ async def upload_audio(file: UploadFile = File(...)): # type: ignore[no-untyped print(f"Failed to process uploaded audio: {e}") return JSONResponse(status_code=500, content={"success": False, "message": str(e)}) + @self.app.get("/seat-guide-camera", response_class=HTMLResponse) + async def seat_guide_camera(): # type: ignore[no-untyped-def] + """Browser-camera SeatGuide validation page.""" + return HTMLResponse(self._seat_guide_camera_page()) + + @self.app.get("/seat-guide-speaker", response_class=HTMLResponse) + async def seat_guide_speaker(): # type: ignore[no-untyped-def] + """Phone speaker page for SeatGuide arrival notifications.""" + return HTMLResponse(self._seat_guide_speaker_page()) + + @self.app.post("/seat_guide/detect_frame") + async def seat_guide_detect_frame(request: Request): # type: ignore[no-untyped-def] + """Detect chairs, people, and empty seats from a browser camera frame.""" + try: + payload = await request.json() + image_data = 
str(payload.get("image", "")) + if "," in image_data: + image_data = image_data.split(",", 1)[1] + if not image_data: + return JSONResponse( + status_code=400, + content={"success": False, "message": "Missing image data"}, + ) + encoded = base64.b64decode(image_data) + frame = cv2.imdecode(np.frombuffer(encoded, dtype=np.uint8), cv2.IMREAD_COLOR) + if frame is None: + return JSONResponse( + status_code=400, + content={"success": False, "message": "Unable to decode image"}, + ) + detector = str(payload.get("detector", "yolo")).strip().lower() + result = await run_in_threadpool( + self._detect_seat_guide_frame, + frame, + detector, + ) + return JSONResponse({"success": True, **result}) + except Exception as e: + print(f"SeatGuide camera detection failed: {e}") + return JSONResponse(status_code=500, content={"success": False, "message": str(e)}) + # Unitree API endpoints @self.app.get("/unitree/status") async def unitree_status(): # type: ignore[no-untyped-def] @@ -353,6 +399,709 @@ async def text_stream(key: str): # type: ignore[no-untyped-def] for key in self.streams: self.app.get(f"/video_feed/{key}")(self.create_video_feed_route(key)) # type: ignore[no-untyped-call] + def _detect_seat_guide_frame( + self, frame: np.ndarray, detector: str = "yolo" + ) -> dict[str, object]: + """Run local SeatGuide chair/person detection on one browser camera frame.""" + if detector == "moondream": + return self._detect_seat_guide_frame_moondream(frame) + if detector != "yolo": + raise ValueError(f"Unsupported SeatGuide detector: {detector}") + return self._detect_seat_guide_frame_yolo(frame) + + def _detect_seat_guide_frame_yolo(self, frame: np.ndarray) -> dict[str, object]: + """Run fast local YOLO chair/person detection on one browser camera frame.""" + import torch + + from dimos.msgs.sensor_msgs.Image import Image, ImageFormat + from dimos.perception.detection.detectors.yolo import Yolo2DDetector + + height, width = frame.shape[:2] + rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 
+ image = Image.from_numpy( + rgb, + format=ImageFormat.RGB, + frame_id="browser_camera", + ts=time.time(), + ) + + with self._seat_guide_model_lock: + if self._seat_guide_yolo_detector is None: + device = "cpu" + if torch.backends.mps.is_available(): + device = "mps" + elif torch.cuda.is_available(): + device = "cuda" + self._seat_guide_yolo_detector = Yolo2DDetector(device=device) + detections = self._seat_guide_yolo_detector.process_image(image) + + chair_detections = [ + detection + for detection in detections.detections + if detection.name.strip().lower() == "chair" + ] + person_detections = [ + detection + for detection in detections.detections + if detection.name.strip().lower() == "person" + ] + return self._seat_guide_detection_response( + frame=frame, + width=width, + height=height, + chair_boxes=[tuple(detection.bbox) for detection in chair_detections], + person_boxes=[tuple(detection.bbox) for detection in person_detections], + detector="yolo11n", + description="YOLO realtime mode detects chairs and people without semantic captioning.", + ) + + def _detect_seat_guide_frame_moondream(self, frame: np.ndarray) -> dict[str, object]: + """Run local Moondream chair/person detection on one browser camera frame.""" + import torch + + from dimos.models.vl.moondream import MoondreamVlModel + from dimos.msgs.sensor_msgs.Image import Image, ImageFormat + + height, width = frame.shape[:2] + rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + image = Image.from_numpy( + rgb, + format=ImageFormat.RGB, + frame_id="browser_camera", + ts=time.time(), + ) + + with self._seat_guide_model_lock: + if self._seat_guide_model is None: + device = "cpu" + if torch.backends.mps.is_available(): + device = "mps" + elif torch.cuda.is_available(): + device = "cuda" + self._seat_guide_model = MoondreamVlModel( + compile_model=False, + device=device, + ) + model = self._seat_guide_model + object_description = model.query( + image, + "In one short sentence, name the main visible objects in 
this image.", + ).strip() + chair_detections = model.query_detections(image, "chair", max_objects=10).detections + person_detections = model.query_detections(image, "person", max_objects=10).detections + return self._seat_guide_detection_response( + frame=frame, + width=width, + height=height, + chair_boxes=[tuple(detection.bbox) for detection in chair_detections], + person_boxes=[tuple(detection.bbox) for detection in person_detections], + detector="moondream2", + description=object_description, + ) + + def _seat_guide_detection_response( + self, + *, + frame: np.ndarray, + width: int, + height: int, + chair_boxes: list[tuple[float, float, float, float]], + person_boxes: list[tuple[float, float, float, float]], + detector: str, + description: str, + ) -> dict[str, object]: + people = person_boxes + person_centers = [self._bbox_center(bbox) for bbox in people] + + seats: list[dict[str, object]] = [] + annotated = frame.copy() + for index, bbox in enumerate(chair_boxes, start=1): + occupied = any( + self._bbox_contains_point( + self._expanded_bbox(bbox, width, height, fraction=0.15), + center, + ) + for center in person_centers + ) + color = (0, 0, 220) if occupied else (0, 180, 0) + label = f"occupied chair {index}" if occupied else f"EMPTY SEAT {index}" + self._draw_detection_box(annotated, bbox, label, color) + seats.append( + { + "id": f"seat_{index}", + "status": "occupied" if occupied else "empty", + "bbox": [round(value, 1) for value in bbox], + } + ) + + for index, bbox in enumerate(people, start=1): + self._draw_detection_box(annotated, bbox, f"person {index}", (220, 120, 0)) + + empty_count = sum(1 for seat in seats if seat["status"] == "empty") + ok, png = cv2.imencode(".png", annotated) + if not ok: + raise RuntimeError("Unable to encode annotated image") + + return { + "detector": detector, + "description": description, + "chairs": len(seats), + "people": len(people), + "empty": empty_count, + "seats": seats, + "annotated_image": 
"data:image/png;base64," + + base64.b64encode(png.tobytes()).decode("ascii"), + } + + @staticmethod + def _bbox_center(bbox: tuple[float, float, float, float]) -> tuple[float, float]: + x1, y1, x2, y2 = bbox + return (x1 + x2) / 2.0, (y1 + y2) / 2.0 + + @staticmethod + def _expanded_bbox( + bbox: tuple[float, float, float, float], + width: int, + height: int, + *, + fraction: float, + ) -> tuple[float, float, float, float]: + x1, y1, x2, y2 = bbox + pad_x = max(0.0, (x2 - x1) * fraction) + pad_y = max(0.0, (y2 - y1) * fraction) + return ( + max(0.0, x1 - pad_x), + max(0.0, y1 - pad_y), + min(float(width), x2 + pad_x), + min(float(height), y2 + pad_y), + ) + + @staticmethod + def _bbox_contains_point( + bbox: tuple[float, float, float, float], point: tuple[float, float] + ) -> bool: + x1, y1, x2, y2 = bbox + x, y = point + return x1 <= x <= x2 and y1 <= y <= y2 + + @staticmethod + def _draw_detection_box( + image: np.ndarray, + bbox: tuple[float, float, float, float], + label: str, + color: tuple[int, int, int], + ) -> None: + x1, y1, x2, y2 = [round(value) for value in bbox] + cv2.rectangle(image, (x1, y1), (x2, y2), color, 3) + (text_w, text_h), baseline = cv2.getTextSize( + label, + cv2.FONT_HERSHEY_SIMPLEX, + 0.65, + 2, + ) + top = max(0, y1 - text_h - baseline - 8) + cv2.rectangle(image, (x1, top), (x1 + text_w + 8, top + text_h + baseline + 8), color, -1) + cv2.putText( + image, + label, + (x1 + 4, top + text_h + 3), + cv2.FONT_HERSHEY_SIMPLEX, + 0.65, + (255, 255, 255), + 2, + cv2.LINE_AA, + ) + + @staticmethod + def _seat_guide_camera_page() -> str: + return """ + + + + + SeatGuide Camera Validation + + + +
+

SeatGuide Camera Validation

+
camera=starting
+
+
+
+
Browser Camera-
+ +
+ + + + +
+
Requesting camera access...
+
+
+
Detection Resultobjects=waiting chairs=0 people=0 empty=0
+ Annotated detection result +
No detection run yet.
+
+
+ + + +""" + + @staticmethod + def _seat_guide_speaker_page() -> str: + return """ + + + + + SeatGuide Phone Speaker + + + +
+
+
+

SeatGuide Phone Speaker

+
+
audio=locked
+
+
+ + +
+
+
Agent responses
+
+
+
+ + +""" + @staticmethod def _ensure_certs(certs_dir: Path) -> tuple[str, str]: """Return (cert_path, key_path), generating self-signed certs if needed. diff --git a/docs/agents/seat_guide_modules.md b/docs/agents/seat_guide_modules.md new file mode 100644 index 0000000000..7f5804d218 --- /dev/null +++ b/docs/agents/seat_guide_modules.md @@ -0,0 +1,421 @@ +# SeatGuide Dog module split + +This is the first demo-oriented module plan for the conference room seat-finding +hackathon idea. The goal is to keep each boundary testable without a Go2 while +the default Go2 path uses real browser/Whisper voice input, camera-backed VLM +seat/person recognition (YOLO fast path, VLM fallback), robot navigation, and phone/web feedback. + +## Demo-critical flow + +User asks: "Find me an empty seat." + +The system scans a conference room with one long table, detects chairs and +people, selects the nearest reachable empty chair, navigates beside it, and +publishes a short instruction to the web or phone relay. + +End-to-end demo data path: + +1. User speaks or types a SeatGuide request in the browser, or calls + `phone_seat_request`. +2. `WebInput` routes matching SeatGuide text directly to `handle_seat_request`; + unrelated text stays on the normal agent path. +3. `CameraSeatObservationProvider` reads the latest RGB frame, camera + calibration, LiDAR/pointcloud, and odometry. It detects chairs/people and + produces map-frame `SeatSceneObservation` data. If no chair is visible but a + direct/relative mover is wired, SeatGuide rotates in place and checks again. +4. `SeatGuidePlanner` classifies occupied chairs, selects the nearest empty + chair, and computes the guide pose beside that chair in the `map` frame. +5. `NavigationInterfaceSpec.set_goal()` receives the guide pose; SeatGuide waits + for `goal_reached=true` when live requests use the default arrival wait. +6. 
After navigation completes or fails, `WebInput` publishes the result to the + web response stream and posts the same result to the configured phone speaker + relay. The phone plays the message; Go2 body audio is not used. + +## Modules + +| Module | Owner boundary | Input | Output | Can build in parallel | Current status | +| --- | --- | --- | --- | --- | --- | +| Voice Command Intake | Converts speech or typed text into the `find_empty_seat` intent. | Browser microphone, web input, or agent text | SeatGuide request intent | Yes | `WebInput` routes SeatGuide voice/text directly to `handle_seat_request()` | +| Seat Perception | Detects chair poses and person positions in the conference room. | Go2 RGB camera frames/replay frames plus odometry | `SeatSceneObservation` in map frame | Yes | `CameraSeatObservationProvider` implemented with VLM detection, odom-backed map projection, and explicit calibrated fallback | +| Seat Occupancy Planner | Decides which chairs are empty and picks the nearest empty chair. | Chair poses, person positions, robot pose | Selected seat and guide pose | Yes | Implemented in `SeatGuidePlanner` | +| Guide Navigation | Sends the robot to a pose beside the selected chair and reports completion. | Guide pose in `map` frame | Navigation goal plus `goal_reached` status | Yes | Uses `NavigationInterfaceSpec` | +| Phone/Web Feedback | Tells the user what was found and where to follow. | Planner result and navigation status | Web response text or phone speaker relay | Yes | `WebInput` exposes the response stream and phone speaker relay | +| Acceptance Harness | Runs the same flow without Go2 hardware. | Fixed synthetic or recorded image layout | Test result and expected goal pose | Yes | Covered by unit tests | + +## Current software boundary + +`dimos.agents.skills.seat_guide.SeatGuideSkillContainer` is the integration +module exposed to the agent. It supports direct debug input and provider-backed +real perception input. 
+ +Direct skill arguments: + +- `seats`: flat `[x, y, yaw, x, y, yaw, ...]` chair poses in the `map` frame +- `people`: flat `[x, y, x, y, ...]` person positions in the `map` frame +- `robot_x`, `robot_y`: robot position for nearest-seat selection + +Provider-backed scene: + +- `SeatObservationProviderSpec.get_seat_scene()` +- `CameraSeatObservationProvider` subscribes to `color_image` and `odom`, runs YOLO fast detection for `chair` and `person` by default, and converts image-space detections to an approximate map-frame scene using the latest robot pose +- `camera_seat_provider_status()` reports camera frame, odometry, input freshness, VLM credential, runtime override, and fallback configuration readiness without running VLM detection +- `CameraSeatObservationProvider.set_seat_scene()` remains available as explicit runtime calibration/fallback when camera/VLM detection is unavailable +- `SyntheticSeatObservationProvider` remains for repeatable Go2-free tests and demos +- `unitree-go2-seat-guide` and `unitree-go2-seat-guide-agentic` include `CameraSeatObservationProvider` so the default SeatGuide bring-up path uses real camera recognition +- the default runtime path uses YOLO `yolo11n.pt` as the fast chair/person detector. 
`moondream` and `qwen` remain VLM fallback options when `vlm_fallback_enabled` is turned on; if `qwen` is selected, missing `ALIBABA_API_KEY` makes SeatGuide report `camera_detection_error` instead of silently treating missing credentials as a real no-seat observation + +Voice/text intake: + +- `parse_seat_guide_intent(text)` recognizes simple English and Chinese seat-finding requests +- `handle_seat_request(text)` rejects unrelated text or delegates to `find_empty_seat_from_scene()`; by default it only navigates from live `camera` perception +- `preview_seat_request(text)` validates a spoken/typed SeatGuide request and runs no-motion preflight instead of navigation +- `seat_guide_readiness_report()` runs scene status, live-perception preflight, and goal preview in one no-motion report +- `seat_guide_preflight()` checks navigation state, scene source, empty/occupied seat counts, and selected seat/goal without moving; hardware acceptance requires `navigation=IDLE`, it only passes live `camera` perception by default, and fallback calibration must be allowed explicitly with `require_live_perception=false` +- `seat_guide_status()` reports the current scene source, visible/configured seats, people, and robot pose without navigating +- `preview_empty_seat_goal()` runs the same planner and reports empty/occupied seat counts, the selected chair, and map-frame navigation goal without moving +- `seat_guide_navigation_status()` reports navigation state and `goal_reached` after a live SeatGuide request, so acceptance can prove the robot completed the task rather than only accepted a goal; if navigation already reported reached before the SeatGuide goal was sent, status waits to see `goal_reached=false` once before accepting a later `true` +- `WebInput` uses browser audio -> `WhisperNode` speech-to-text with language auto-detection; matching English or Chinese SeatGuide requests are routed directly to `handle_seat_request()` instead of waiting for the LLM to choose a tool, and 
the returned SeatGuide status is published to the web `agent_responses` text stream +- `web_input_status()` reports whether the browser voice/text entry point, browser audio upload endpoint, SeatGuide direct route, web response stream, STT pipeline, and `/human_input` fallback transport are connected +- unrelated text still goes through the normal `/human_input` agent path + +Scene source values used by `seat_guide_status()`: + +- `camera`: the camera detector (YOLO by default, VLM fallback when enabled) detected at least one chair from the latest camera frame. +- `runtime_override`: operator-provided calibration from `set_seat_scene()`. +- `configured_fallback`: blueprint/static fallback scene. +- `no_camera_image`: no camera frame has arrived yet; check camera stream wiring. +- `camera_no_odom`: camera frames arrived but localization/odometry is missing; live navigation is no-go because SeatGuide cannot produce a trustworthy map-frame goal. +- `stale_camera_image`: camera frames are too old to prove live perception; restore the camera stream before live navigation. +- `stale_camera_odom`: odometry is too old to produce a trustworthy map-frame goal; restore localization before live navigation. +- `camera_no_seats_detected`: camera frames arrived, but the detector found no chairs; turn the robot toward the table or calibrate fallback. +- `camera_detection_error`: camera detection raised an error; inspect logs/model setup or calibrate fallback. + +When live navigation is requested from a non-`camera` source, SeatGuide refuses +to move and reports the source, seat/person counts, robot pose, and a specific +next step. This is intentional: fallback coordinates can be useful for +calibration, but they should not be mistaken for real chair/person recognition. + +Phone/web feedback: + +- SeatGuide returns result/failure text and publishes it to the web `agent_responses` stream. +- If a phone is mounted on the Go2, the phone can open the SeatGuide speaker page or cloud relay and play the latest message through the phone speaker.
+- The Go2 body-audio path is intentionally not part of SeatGuide acceptance because hardware testing showed it is not a reliable output device. + +This keeps perception independent from navigation. A later detector can return +the same scene contract without changing the planner tests. + +## Unit-test acceptance + +The Go2-free acceptance path is: + +1. Build a synthetic long-table room layout. +2. Mark chairs occupied when a person is within 0.75 meters. +3. Select the nearest empty chair to the robot. +4. Compute a navigation pose beside the chair. +5. Verify a fake navigator receives the expected `PoseStamped`. +6. Verify `SyntheticSeatObservationProvider` can feed the same flow without + direct skill arguments. +7. Verify text requests such as "Please find me an empty seat" and "帮我找一个空位" + route into the same provider-backed flow. +8. Verify no-motion text requests such as "预检帮我找一个空位" route to `preview_seat_request()` and do not call navigation. +9. Verify `CameraSeatObservationProvider` converts camera/VLM chair/person detections into a map-frame `SeatSceneObservation` using latest odometry when available. +10. Verify `WebInput` creates Whisper without forcing English, so Chinese phrases such as "帮我找一个空位" can be transcribed by language auto-detection. +11. Verify `seat_guide_status()` can diagnose whether the scene came from camera detection, runtime calibration, or configured fallback before navigation is attempted. +12. Verify `seat_guide_preflight()` reports navigation/perception readiness without calling navigation, and that fallback scenes are no-go unless explicitly allowed. +13. Verify `seat_guide_readiness_report()` combines status, preflight, and preview without calling navigation. +14. Verify `handle_seat_request()` refuses fallback scenes by default and only navigates with fallback when `require_live_perception=false` is explicitly passed. +15. 
Verify `preview_empty_seat_goal()` reports the selected chair and map-frame goal without calling navigation. +16. Verify `seat_guide_navigation_status()` reports `goal_reached=true/false` and missing navigation without sending or canceling a goal. +17. Verify the SeatGuide MCP JSON-RPC path can list `seat_guide_status`, run preflight/readiness, preview the goal, call `handle_seat_request` with Chinese text, and report `goal_reached=true` without Go2 hardware. + +Run: + +```bash +uv run pytest dimos/agents/skills/test_seat_guide.py +``` + +## Simulation acceptance target + +If a simulator is available, use the same synthetic or camera-derived layout as the unit test and +run the Go2 agentic stack in replay or sim mode. The minimum sim/replay acceptance is +that browser/Whisper text reaches SeatGuide, `CameraSeatObservationProvider` +returns a scene from camera frames or an explicitly calibrated fallback, and navigation receives +a goal. + +Current replay evidence: + +- `dimos --replay run unitree-go2-seat-guide-agentic` starts with MCP tools exposed. +- `dimos mcp list-tools` includes `set_seat_scene`, `find_empty_seat_from_scene`, `preview_seat_request`, `seat_guide_readiness_report`, `seat_guide_preflight`, `seat_guide_navigation_status`, `preview_empty_seat_goal`, `handle_seat_request`, `seat_guide_status`, `server_status`, and `list_modules`. +- `web_input_status` reports `seat_route=seat_guide_direct`, `voice_upload=connected`, and `stt=connected` when browser/Whisper input is wired directly to SeatGuide. +- `camera_seat_provider_status` reports whether Go2 camera frames, odometry, input freshness, and VLM credentials are ready before detection is attempted. +- Calling `set_seat_scene` followed by `handle_seat_request` returned: `I found an empty seat seat_3. Please follow me to the chair beside the table. 
Navigating to (3.65, -1.00).` +- Posting typed text to `http://localhost:5555/submit_query` with `query=帮我找一个空位` returned success and triggered the same SeatGuide navigation path. +- In replay, the configured fallback goal produced a planner warning `No path found to the goal`; this confirms a navigation goal was sent, but not that the fallback coordinates are reachable in the replay map. + +## Tomorrow Go2 bring-up path + +The intended real-perception bring-up path is: + +1. Place chairs in the Go2 camera view. +2. Start `unitree-go2-seat-guide-agentic` with the normal MCP server. +3. Run `seat_guide_status`, `seat_guide_preflight`, and `preview_empty_seat_goal` before any live motion. +4. If preflight reports `navigation=FOLLOWING_PATH` or `navigation=RECOVERY`, wait for the current action to finish or cancel it before asking SeatGuide to send a new goal. +5. Optionally call `set_seat_scene()` only if camera/VLM detection is unavailable or needs explicit fallback calibration. +6. Send or speak: "Please find me an empty seat" / "帮我找一个空位". +7. `WebInput` transcribes browser microphone audio with Whisper language auto-detection and routes matching text to `handle_seat_request(text)`. +8. `handle_seat_request()` parses the intent, reads the camera-backed scene, picks + the nearest empty chair, and calls `NavigationInterfaceSpec.set_goal()`. + +`set_seat_scene()` is now an explicit fallback/calibration tool, not the primary path. + +### Parallel hardware-day checklist + +Use this split when the Go2 is connected so the team can debug independently +before any live motion: + +| Track | Owner checks | Passing evidence | No-go action | +| --- | --- | --- | --- | +| Voice intake | Browser page opens; microphone permission granted; Chinese preview phrase reaches WebInput.
| `web_input_status` shows `web=started`, `thread=running`, `seat_route=seat_guide_direct`, `responses=connected`, `voice_upload=connected`, `stt=connected`, `human_transport=connected`; acceptance log shows `WebInput received text` for `预检帮我找一个空位`. | Fix browser/microphone/WebInput before touching navigation. | +| Perception | Go2 camera frame, odometry, and the YOLO fast detector are live; no fallback scene is active. | `camera_seat_provider_status` shows `image=x`, `image_fresh=true`, `odom=(...)`, `odom_fresh=true`, `fast_detector=yolo`, `override=inactive`, `configured_fallback_seats=0`, `configured_fallback_people=0`; `seat_guide_status` starts with `SeatGuide scene source=camera:`. | Turn robot toward the table, restore stale camera/odom streams, or explicitly mark fallback calibration as non-acceptance. | +| Planner | Empty/occupied counts and selected goal make sense before motion. | `seat_guide_preflight`, `seat_guide_readiness_report`, and `preview_empty_seat_goal` report `empty=N occupied=N`, `selected=...`, and `goal=(...)` without sending a goal. | Adjust camera view or chair/person layout before live voice. | +| Navigation | Robot is idle before SeatGuide sends the live goal and reports completion after it. | Preflight has `navigation=IDLE`; after live voice, `seat_guide_navigation_status` reports a new `goal_sequence` and `goal_reached=true`. | Wait/cancel existing navigation or inspect navigation logs; do not rerun live voice until idle. | +| Phone feedback | The web response stream is visible, and a mounted phone can play messages if audible feedback is required. | `web_input_status` shows `responses=connected`; optional phone relay checks can use `phone_speaker_test`. | Keep the phone speaker page open on the mounted phone; do not depend on Go2 body audio. | +| Acceptance evidence | The run is hardware, not replay/sim, and uses the SeatGuide blueprint. 
| `bin/demo_seat_guide_hardware_acceptance` records the run registry, no-motion gates, browser microphone gates, camera source, speech output check plus operator heard confirmation, ordered WebInput route logs, and `goal_reached=true`; `bin/demo_seat_guide_verify_acceptance_log <transcript.log>` passes. | Treat failures as real no-go evidence; do not replace them with direct MCP live calls. | + +### Bring-up commands + +One-command real Go2 bring-up: + +```bash +# Optional: enables the normal LLM agent path. SeatGuide direct voice/MCP +# routing still works without it. +export OPENROUTER_API_KEY=... +export OPENROUTER_MODEL=openai/gpt-4o-mini +bin/demo_seat_guide_hardware_bringup --robot-ip 192.168.123.161 +``` + +This starts `unitree-go2-seat-guide-agentic`, runs the no-motion smoke checks, +then launches the hardware acceptance flow. Use `--skip-start` if the stack is +already running, or `--skip-smoke` only when repeating acceptance after a known +passing smoke run. + +Start the SeatGuide-focused Go2 stack. This keeps the real Go2 base, +navigation, camera, browser/Whisper voice input, MCP server, and SeatGuide +modules, while avoiding unrelated CUDA-only security demo modules: + +```bash +dimos --robot-ip 192.168.123.161 --detection-model moondream run unitree-go2-seat-guide-agentic --daemon +``` + +For local replay without a real robot: + +```bash +dimos --replay run unitree-go2-seat-guide-agentic --daemon +``` + +On macOS, DimOS replay and MCP require multicast on loopback for cross-process +LCM/RPC. If startup asks for this route and cannot run sudo non-interactively, +run it once in a terminal before replay or hardware bring-up: + +```bash +sudo route delete -net 224.0.0.0/4 || true +sudo route add -net 224.0.0.0/4 -interface lo0 +``` + +Do not use `PYTEST_VERSION=1` as a runtime workaround. It skips the system +configurator, but `McpServer/on_system_modules` can time out because the +cross-process LCM route is still missing.
+ +The general Go2 agentic stack is still available, but it includes unrelated +demo modules and is not the SeatGuide acceptance target. The hardware +acceptance script intentionally rejects that general stack; start +`unitree-go2-seat-guide` or `unitree-go2-seat-guide-agentic` for SeatGuide +acceptance. + +If `OPENROUTER_API_KEY` is set, `McpClient` routes the normal LLM agent through +OpenRouter's OpenAI-compatible chat API. Set `OPENROUTER_MODEL` to a +tool-calling model such as `openai/gpt-4o-mini`; otherwise DimOS maps the +default `gpt-4o` model to `openai/gpt-4o` on OpenRouter. If neither +`OPENROUTER_API_KEY` nor `OPENAI_API_KEY` is set, `McpClient` disables the LLM +agent but the direct SeatGuide voice route and MCP tools still start. + +The default camera detector uses YOLO `yolo11n.pt` for low-latency chair/person detection. Moondream and Qwen are VLM fallback options only when `vlm_fallback_enabled` is enabled. If `qwen` is selected, +set `ALIBABA_API_KEY`; otherwise `camera_seat_provider_status` reports +`credential=missing` and `seat_guide_status` reports +`source=camera_detection_error`. Use logs or `set_seat_scene` only as an +explicit fallback/calibration path. + +Confirm the SeatGuide tools are exposed: + +```bash +dimos mcp list-tools +dimos mcp modules +``` + +Run the no-motion smoke script against the already-running stack. It checks the +SeatGuide MCP tools and fails early unless `web_input_status` reports +`web=started`, `thread=running`, `seat_route=seat_guide_direct`, +`responses=connected`, `voice_upload=connected`, `stt=connected`, and `human_transport=connected`, then +runs the no-motion scene/status/preflight/preview checks: + +```bash +bin/demo_seat_guide_smoke +``` + +Run the hardware acceptance script against an already-running real Go2 stack. +It performs the no-motion WebInput, camera/VLM, scene, preflight, request +preview, and goal preview checks first. 
It only offers the `LIVE` prompt after +automated gates pass: the DimOS run registry must show a hardware run, not +`--replay` or `--simulation`, and the blueprint must be +`unitree-go2-seat-guide` or `unitree-go2-seat-guide-agentic`; WebInput must report `web=started`, +`thread=running`, `seat_route=seat_guide_direct`, `responses=connected`, +`voice_upload=connected`, `stt=connected`, and +`human_transport=connected`; camera frames, odometry, and VLM credentials must +be present, the camera provider runtime override must be inactive, and +configured fallback seats/people must both be zero; preflight must be ready with +`navigation=IDLE`; the goal preview must select a seat; +and the script must resolve the active WebInput URL from `web_input_status`. Posting +`预检帮我找一个空位` to that WebInput HTTP text +endpoint must publish a `SeatGuide preflight ready` response on the web +`agent_responses` stream before `SEAT_GUIDE_WEBINPUT_TEXT_WAIT_S` seconds +(default `20`). It then opens a manual no-motion voice gate: the +operator presses Enter when ready, then uses the browser microphone to say +`预检帮我找一个空位`; the script only accepts a `SeatGuide preflight ready` response +that appears in the web response stream after that readiness point, before +`SEAT_GUIDE_WEBINPUT_VOICE_PREVIEW_WAIT_S` seconds (default `120`). After the +operator types `LIVE`, the script still does not call +`handle_seat_request` through MCP; it opens a live voice gate and requires the +operator to press Enter when ready, then say `帮我找一个空位` through the browser +microphone. The live gate passes only if the web response stream reports +`Navigating to` after that readiness point and before +`SEAT_GUIDE_WEBINPUT_VOICE_LIVE_WAIT_S` seconds (default `150`); after that, the +script polls `seat_guide_navigation_status()` until it reports both a new +`goal_sequence` and `goal_reached=true`; stale completion from a previous +navigation goal is suppressed until SeatGuide observes a reset. 
The transcript is saved under +`logs/seat_guide_acceptance/` by default; override with +`SEAT_GUIDE_ACCEPTANCE_LOG_DIR=/path/to/logs` when needed. The transcript +includes the MCP command outputs plus `dimos log -n 200` snapshots after the +no-motion checks, after the live navigation request, and on WebInput/navigation +failure paths. The saved logs must show `WebInput routing text to SeatGuide +preview` for the no-motion voice gate and `WebInput routing text to SeatGuide +live request` for the live gate, proving the request went through WebInput +rather than a direct MCP call: + +```bash +bin/demo_seat_guide_hardware_acceptance +``` + +The hardware script automatically audits the saved transcript after the live +request completes. It requires the DimOS run registry path, hardware run mode, +SeatGuide Go2 blueprint name, running WebInput server/thread/transport, direct +SeatGuide routing, resolved WebInput URL, camera/odometry/VLM readiness, +`image_fresh=true` and `odom_fresh=true`, typed and spoken no-motion +responses, explicit browser microphone no-motion/live gates with the required +spoken phrases, WebInput preview/live route logs, empty/occupied seat counts, +DimOS log snapshots after no-motion checks and after the live request, the +no-motion completion marker, `LIVE` confirmation, live voice navigation start, +goal-sequence polling, and +`goal_reached=true` completion. It also requires at least three +`WebInput received text` log events, +covering typed no-motion input, no-motion browser microphone input, and live +browser microphone input; those log events must include the recognized +SeatGuide phrases `预检帮我找一个空位` and `帮我找一个空位`, not just arbitrary +WebInput text. The verifier rejects transcripts that contain direct MCP live +calls to `handle_seat_request`, fallback scene calibration with +`set_seat_scene`, clearing fallback overrides, or +`require_live_perception=false`. 
The no-motion flow must appear in order: typed +WebInput preview, browser microphone no-motion gate, readiness prompt before +the spoken phrase, WebInput preview route, no-motion log snapshot, no-motion +completion marker, and `LIVE` confirmation. The live flow must then appear in +order: `LIVE` confirmation, browser microphone live gate, readiness prompt +before the spoken phrase, WebInput live SeatGuide route, navigation start, +`goal_reached=true`, and completion marker. + +To re-audit an existing transcript manually, run: + +```bash +bin/demo_seat_guide_verify_acceptance_log logs/seat_guide_acceptance/<transcript>.log +``` + +Run the replay smoke wrapper when no Go2 is connected. It checks the macOS +multicast route before starting replay, starts `unitree-go2-seat-guide-agentic` +with `--replay`, runs the no-motion smoke, and stops the stack: + +```bash +bin/demo_seat_guide_replay_smoke +``` + +Run the no-motion readiness path without relying on microphone or LLM behavior: + +```bash +dimos mcp call seat_guide_status +dimos mcp call web_input_status +dimos mcp call camera_seat_provider_status +dimos mcp call seat_guide_readiness_report +dimos mcp call seat_guide_preflight +dimos mcp call seat_guide_navigation_status +dimos mcp call preview_seat_request --json-args '{"text": "预检帮我找一个空位"}' +dimos mcp call preview_empty_seat_goal +``` + +Do not use a direct MCP `handle_seat_request` call as the live hardware demo +evidence. The hardware acceptance verifier rejects that path because it bypasses +the required browser microphone -> Whisper -> WebInput route. + +Run the real voice path: + +1. Open the web interface printed by `WebInput` (`http://localhost:5555` by default). +2. Allow browser microphone access. +3. First speak "预检帮我找一个空位" or "preview find me an empty seat" to validate the real microphone path without motion. +4. Then speak "帮我找一个空位" or "Please find me an empty seat" when live navigation is intended. +5.
The browser audio is transcribed by `WhisperNode` with language auto-detection; no-motion preview text is routed to `preview_seat_request()`, and live SeatGuide text is routed directly to `handle_seat_request()`. +6. Watch the web `agent_responses` stream for the exact SeatGuide result; use the phone speaker page only when audible feedback is required. + +If MCP is healthy, this should route through: + +`seat_guide_readiness_report` for combined no-motion checks -> +`WebInput` -> `WhisperNode` -> `handle_seat_request` -> +`CameraSeatObservationProvider.get_seat_scene` using camera frames and odom -> +`SeatGuidePlanner.find_empty_seat` -> `NavigationInterfaceSpec.set_goal`. + +Calibrate the fallback scene at runtime if camera/VLM detection is not reliable: + +```bash +dimos mcp call set_seat_scene --json-args '{"seats": [0.0, -1.0, 0.0, 1.5, -1.0, 0.0, 3.0, -1.0, 0.0], "people": [0.1, -1.0, 1.6, -1.0], "robot_x": -1.0, "robot_y": -1.0}' +``` + +After fallback calibration, use explicit fallback preflight: + +```bash +dimos mcp call seat_guide_preflight --json-args '{"require_live_perception": false}' +``` + +To intentionally test fallback navigation without live camera recognition, also +pass the override on the request itself: + +```bash +dimos mcp call handle_seat_request --json-args '{"text": "帮我找一个空位", "require_live_perception": false}' +``` + +For the real Go2 demo, use map-frame chair and aisle coordinates that are +reachable by the active map. A successful `handle_seat_request` response only +proves the goal was submitted; confirm `dimos log -f` does not show `No path +found to the goal` before relying on that calibration.
+ +Clear runtime calibration and return to blueprint defaults: + +```bash +dimos mcp call clear_seat_scene_override +``` + +Use the direct skill only for debugging the configured coordinates: + +```bash +dimos mcp call find_empty_seat --json-args '{"seats": [0.0, -1.0, 0.0, 1.5, -1.0, 0.0, 3.0, -1.0, 0.0], "people": [0.1, -1.0, 1.6, -1.0], "robot_x": -1.0, "robot_y": -1.0}' +``` + +Stop after testing: + +```bash +dimos stop +``` diff --git a/docs/agents/seat_guide_step_by_step_plan.md b/docs/agents/seat_guide_step_by_step_plan.md new file mode 100644 index 0000000000..631fcc0d6b --- /dev/null +++ b/docs/agents/seat_guide_step_by_step_plan.md @@ -0,0 +1,314 @@ +# SeatGuide 机器狗空位引导 Step-by-Step 计划 + +目标:让用户通过浏览器麦克风或文字对 Go2 说“帮我找一个空位”,系统用真实相机识别椅子和人,判断空位,给导航下发目标,并通过网页/手机反馈结果。没有连接 G2 时,所有能本地验证的模块都必须有单测或 smoke 验证;连接 G2 后只跑硬件验收,不再临场拼功能。 + +## 总体模块拆分 + +| 模块 | 负责什么 | 输入 | 输出 | 是否可并行 | 当前验证方式 | +| --- | --- | --- | --- | --- | --- | +| 1. 基础语音/文字控制入口 | 接收浏览器麦克风、浏览器文字、普通 agent text,先识别普通运动/姿态命令,再识别找座位请求 | WebInput `/submit_query`、`/upload_audio`、Whisper 文本、agent text | 普通 agent tool call,或 SeatGuide preview/live 请求 | 是 | MCP tool 验收、WebInput 单测、HTTP TestClient、硬件验收脚本 | +| 2. 场景感知 | 用 Go2 RGB 图像 + odom + YOLO 快速识别椅子和人,必要时用 VLM fallback,并投影成 map 坐标 | `color_image`、`odom`、YOLO `yolo11n.pt` | `SeatSceneObservation` | 是 | Camera provider 单测、`camera_seat_provider_status` | +| 3. 空位规划 | 判断哪些椅子被占用,选择最近空位,生成机器人应到达的引导点 | 椅子位姿、人员位置、机器人位置 | 选中椅子、导航目标 pose | 是 | Planner 单测、`preview_empty_seat_goal` | +| 4. 导航执行 | 把目标 pose 发给已有导航模块,并读取完成状态 | SeatGuide goal pose | `set_goal()`、`goal_reached` | 部分并行 | fake navigator 单测、`seat_guide_navigation_status` | +| 5. 手机/网页反馈 | 告诉用户找到哪个位置、是否需要跟随、失败原因 | SeatGuide 结果文本 | web response text、手机扬声器 relay | 是 | `web_input_status`、可选 `phone_speaker_test` | +| 6. 
验收脚本 | 把 no-motion、真实语音、真实导航串起来,保存 transcript | 当前 DimOS stack | 通过/失败原因、验收日志 | 是 | `bin/demo_seat_guide_*` | + +## 阶段 1:基础语音控制验收 + +目的:先证明“人说一句话或输入一句话 -> 系统识别意图 -> 下发到 Go2 -> Go2 执行动作”这条最小闭环成立。这个阶段不做找空位,不依赖 VLM,不依赖座椅识别。 + +要做的工作: + +1. 用 MCP 直接调用姿态/移动工具,证明 Go2 控制工具本身可用。 +2. 用浏览器文字输入普通运动命令,证明文字能进入 agent 并触发 Go2 tool。 +3. 用浏览器麦克风说普通运动命令,证明麦克风 -> Whisper -> agent -> Go2 tool 链路可用。 +4. 验证停止/安全命令,确保每次小距离动作后可以停下。 +5. 只验收低风险动作:站立、恢复站立、小距离前进/后退、小角度转向;不要在第一阶段测试跳跃、翻滚等动态动作。 + +### 阶段 1 的可验收路径拆分 + +| 路径 | 入口 | 是否会让 Go2 移动 | 验收命令/动作 | 通过标准 | +| --- | --- | --- | --- | --- | +| 1A. MCP 姿态命令 | MCP tool | 可能改变姿态,不走位 | `dimos mcp call execute_sport_command --json-args '{"command_name":"BalanceStand"}'` | tool 返回成功,Go2 进入稳定站立/平衡状态 | +| 1B. MCP 小距离移动 | MCP tool | 是,小距离 | `relative_move` 前进 0.3m、后退 0.3m、左转 30 度 | Go2 按命令小幅移动或导航状态显示目标完成 | +| 1C. 浏览器文字 -> agent -> Go2 tool | Web 页面文字框或 `/submit_query` | 是,小距离 | 输入 `walk forward 30 centimeters`、`walk backward 30 centimeters` | 日志显示 WebInput 收到文本,非找座位请求进入普通 agent path,agent 调用 `relative_move` | +| 1D. 浏览器麦克风 -> Whisper -> agent -> Go2 tool | 电脑浏览器麦克风 | 是,小距离 | 对浏览器说 `walk forward 30 centimeters` 或中文等价命令 | 日志显示 Whisper 识别文本,agent 调用对应 Go2 tool,Go2 执行动作 | +| 1E. 
停止/安全 | MCP tool 或 agent tool | 停止当前导航/动作 | `dimos mcp call stop_navigation` | 导航状态回到停止/空闲,不再继续移动 | + +推荐验收命令: + +```bash +dimos mcp call execute_sport_command --json-args '{"command_name":"BalanceStand"}' +dimos mcp call relative_move --json-args '{"forward":0.3,"left":0,"degrees":0}' +dimos mcp call relative_move --json-args '{"forward":-0.3,"left":0,"degrees":0}' +dimos mcp call relative_move --json-args '{"forward":0,"left":0,"degrees":30}' +dimos mcp call stop_navigation +``` + +通过标准: + +- MCP 直接调用能让 Go2 执行姿态和小距离移动。 +- 浏览器文字命令能触发普通 agent tool,而不是误进 SeatGuide。 +- 浏览器麦克风命令能完成语音识别,并触发同一个 Go2 tool。 +- 任何一次动作失败时,可以定位失败点是控制工具、agent tool selection、Whisper、WebInput,还是 Go2 连接。 +- 当前默认输入设备是 **电脑浏览器麦克风**,不是 Go2 机身麦克风;如果要使用 Go2 自带麦克风,需要后续单独增加输入模块。 + +## 阶段 2:SeatGuide 基础模块开发和本地单测 + +目的:不接机器狗也能证明核心逻辑正确。 + +要做的工作: + +1. 实现 SeatGuide 数据模型:椅子、人员、场景、规划结果、语音意图。 +2. 实现空位判断:人在椅子附近 0.75m 内则认为占用。 +3. 实现最近空位选择:从机器人当前位置选最近的空椅子。 +4. 实现引导点生成:目标点在椅子旁边的过道方向,而不是椅子中心。 +5. 实现 preview 和 live 两种路径:preview 不移动,live 才下发导航。 + +验收方式: + +```bash +uv run pytest dimos/agents/skills/test_seat_guide.py -q -k 'planner or find_empty_seat or preview_empty_seat_goal' +``` + +通过标准: + +- 能选出正确空位。 +- 被占用椅子不会被选中。 +- preview 不调用导航。 +- live 只在可导航且场景来源可信时调用导航。 + +## 阶段 3:SeatGuide 语音和 WebInput 链路 + +目的:用户可以通过浏览器输入或麦克风触发 SeatGuide,而不是必须手动 MCP call。 + +要做的工作: + +1. WebInput 文字输入 `/submit_query` 直接路由 SeatGuide 请求。 +2. 浏览器音频上传 `/upload_audio` 推入 `audio_subject`。 +3. Whisper 自动识别语言,不强制英文。 +4. 中文 preview 语句:`预检帮我找一个空位` 只做 no-motion 检查。 +5. 中文 live 语句:`帮我找一个空位` 才触发导航。 +6. WebInput 把 SeatGuide 返回结果推到 `agent_responses`,浏览器可见。 + +### 阶段 3 的可验收路径拆分 + +| 路径 | 入口 | 是否会让 Go2 移动 | 验收命令/动作 | 通过标准 | +| --- | --- | --- | --- | --- | +| 3A. 模块状态检查 | MCP tool | 否 | `dimos mcp call web_input_status` | 输出包含 `web=started`、`voice_upload=connected`、`stt=connected`、`seat_route=seat_guide_direct` | +| 3B. 
浏览器文字 preview | Web 页面文字框或 `/submit_query` | 否 | 在 WebInput 页面输入 `预检帮我找一个空位`,或用硬件脚本自动 POST | `agent_responses` 出现 `SeatGuide preflight ready` 或明确 no-go 原因;导航目标不会下发 | +| 3C. 浏览器麦克风 preview | 电脑浏览器麦克风 | 否 | 打开 WebInput URL,允许麦克风,点击麦克风后说 `预检帮我找一个空位` | Whisper 识别文本后,WebInput 日志包含 `WebInput received text` 和 `WebInput routing text to SeatGuide preview` | +| 3D. 普通 agent text fallback | `/human_input` agent path | 否,除非 agent 后续显式调用工具 | 输入非找座位文本,例如 `what time is the meeting` | WebInput 不调用 SeatGuide,文本进入普通 agent path | +| 3E. 浏览器麦克风 live | 电脑浏览器麦克风 | 是 | 只在 no-motion 通过且现场安全后,说 `帮我找一个空位` | WebInput 日志包含 `WebInput routing text to SeatGuide live request`,SeatGuide 返回 `Navigating to ...` | + +推荐验收顺序: + +1. 先跑 3A,确认 WebInput/STT/浏览器音频上传都在线。 +2. 再跑 3B,确认文字入口和 SeatGuide preview 直连。 +3. 再跑 3C,确认电脑浏览器麦克风 -> Whisper -> SeatGuide preview 直连。 +4. 最后才跑 3E,因为它会下发真实导航目标。 + +注意:当前方案默认使用 **电脑浏览器麦克风**,不是直接使用 Go2 机身麦克风。用户对电脑浏览器说话,电脑把音频上传到 DimOS,Whisper 识别文本,然后 SeatGuide 给 Go2 下发导航。 + +验收方式: + +```bash +uv run pytest dimos/agents/skills/test_seat_guide.py -q -k 'web_input or upload_audio or submit_query' +``` + +通过标准: + +- `/submit_query` 能触发 SeatGuide preview。 +- `/upload_audio` 能产生 `AudioEvent`。 +- 未配置音频入口或音频解码失败时不能误报成功。 +- `web_input_status` 必须包含: + - `web=started` + - `thread=running` + - `seat_route=seat_guide_direct` + - `responses=connected` + - `voice_upload=connected` + - `stt=connected` + - `human_transport=connected` + +## 阶段 4:真实相机/VLM/odom 感知 + +目的:不能用假的 mock 当最终结果,硬件验收必须证明来自真实 camera source。 + +要做的工作: + +1. `CameraSeatObservationProvider` 订阅 `color_image` 和 `odom`。 +2. 使用 Qwen/VLM 分别检测 `chair` 和 `person`。 +3. 根据图像框中心和 odom 估算 map-frame 椅子/人员位置。 +4. 检查 stale image、stale odom、missing key、missing camera 等 no-go 状态。 +5. 
保留 `set_seat_scene` 作为 fallback/calibration,但硬件验收不接受 fallback。 + +验收方式: + +```bash +uv run pytest dimos/agents/skills/test_seat_guide.py -q -k 'camera_observation_provider or camera_seat_provider_status' +``` + +硬件前检查: + +```bash +dimos mcp call camera_seat_provider_status +dimos mcp call seat_guide_status +``` + +通过标准: + +- `camera_seat_provider_status` 显示: + - `image=<width>x<height>` + - `image_fresh=true` + - `odom=(...)` + - `odom_fresh=true` + - `credential=present` + - `override=inactive` + - `configured_fallback_seats=0` + - `configured_fallback_people=0` +- `seat_guide_status` 必须以 `SeatGuide scene source=camera:` 开头。 + +## 阶段 5:导航和手机/网页反馈 + +目的:找到空位以后,Go2 能真正下发导航目标,并通过网页或绑在机器狗上的手机给用户可见/可听反馈。 + +要做的工作: + +1. SeatGuide 注入 `NavigationInterfaceSpec`。 +2. live request 时调用 `set_goal(PoseStamped)`。 +3. 如果导航忙,拒绝覆盖当前任务。 +4. 读取 `navigation_state` 和 `goal_reached`。 +5. SeatGuide 返回明确的结果文本,并让 WebInput 发布到 `agent_responses`。 +6. 如果需要可听反馈,让手机打开 speaker relay 页面并用 `phone_speaker_test` 验证。 + +验收方式: + +```bash +uv run pytest dimos/agents/skills/test_seat_guide.py -q +``` + +硬件前检查: + +```bash +dimos mcp call seat_guide_preflight +dimos mcp call preview_empty_seat_goal +dimos mcp call seat_guide_navigation_status +``` + +通过标准: + +- preflight 显示 `navigation=IDLE`。 +- preview 有 `selected=...` 和 `goal=(...)`。 +- WebInput response stream 可见;需要声音时手机 relay 可播放测试消息。 +- live 后 `seat_guide_navigation_status` 最终显示新的 `goal_sequence` 且 `goal_reached=true`。 + +## 阶段 6:Mac replay / SHM 视频流修复 + +目的:Mac 上 replay 的高带宽视频/点云/地图流不再因为 UDP/LCM 大包路径缺失导致看不到视频。 + +要做的工作: + +1. 集成 `dimensionalOS/dimos#2245` / `danvi/experimental/route-replay-through-SHM`。 +2. 将 Go2 replay 的高带宽流 route 到 `pSHMTransport`: + - `color_image` + - `lidar` + - `pointcloud` + - `global_map` + - `merged_map` + - `global_costmap` + - `navigation_costmap` +3. 
Rerun bridge 接收 SHM visual transports。 + +验收方式: + +```bash +uv run pytest dimos/protocol/pubsub/test_registry.py dimos/visualization/rerun/test_viewer_integration.py -q +bin/demo_seat_guide_replay_smoke +``` + +通过标准: + +- replay stack 能启动。 +- 日志里高带宽流显示 `transport=pSHMTransport`。 +- `bin/demo_seat_guide_replay_smoke` 能完整跑完并停止 stack。 + +## 阶段 7:真实 Go2 bring-up + +目的:你接上机器狗后只跑一个入口,不需要手动拼命令。 + +你需要手动准备: + +1. 机器狗上电,和 Mac 在同一网络。 +2. 确认 Go2 IP,默认示例是 `192.168.123.161`。 +3. 可选准备普通 agent 的 API key。SeatGuide 直连语音/MCP 路径不需要 LLM key;如果选择 Qwen 作为找座位 VLM,仍然需要 Alibaba/Qwen: + +```bash +export OPENROUTER_API_KEY="你的 OpenRouter key" +export OPENROUTER_MODEL="openai/gpt-4o-mini" +``` + +启动一键 bring-up: + +```bash +bin/demo_seat_guide_hardware_bringup --robot-ip 192.168.123.161 +``` + +这个脚本会自动执行: + +1. 检查 YOLO 快速检测路径;如果没有 agent key,普通 agent chat 会禁用,但 SeatGuide 直连路径仍然可用。 +2. 启动 `unitree-go2-seat-guide-agentic`。 +3. 跑 `bin/demo_seat_guide_smoke` 做 no-motion 检查。 +4. 跑 `bin/demo_seat_guide_hardware_acceptance` 做真实浏览器语音和导航验收。 + +你在脚本过程中需要手动做: + +1. 打开脚本打印的 WebInput URL。 +2. 允许浏览器麦克风权限。 +3. no-motion 阶段对浏览器说:`预检帮我找一个空位`。 +4. 确认 Go2 周围安全后,在终端输入 `LIVE`。 +5. 
live 阶段对浏览器说:`帮我找一个空位`。 + +通过标准: + +- no-motion 阶段所有 gate 通过。 +- live 阶段 WebInput 日志包含中文识别文本。 +- SeatGuide 返回 `Navigating to ...`。 +- 最终 `seat_guide_navigation_status` 显示新的 `goal_sequence` 和 `goal_reached=true`。 +- `bin/demo_seat_guide_verify_acceptance_log <acceptance-log>` 通过。 + +## 阶段 8:失败时怎么分模块排查 + +| 失败位置 | 看什么命令 | 常见原因 | 处理方式 | +| --- | --- | --- | --- | +| WebInput 未启动 | `dimos mcp call web_input_status` | 端口占用、WebInput 模块未启动 | 检查 `dimos status`、重启 stack | +| 麦克风没进来 | `web_input_status`、浏览器权限 | `voice_upload=missing`、浏览器拒绝麦克风 | 允许麦克风权限,刷新 WebInput 页面 | +| STT 不工作 | `web_input_status` | Whisper/faster-whisper 初始化失败 | 看 DimOS log,确认依赖安装 | +| 没有图像 | `camera_seat_provider_status` | Go2 camera/replay stream 没到 | 转向桌子,确认 replay/SHM 流 | +| odom 缺失或过期 | `camera_seat_provider_status` | localization 没启动或 stale | 等待 odom,检查 Go2/replay stack | +| YOLO/VLM 失败 | `seat_guide_status` | YOLO 模型加载失败,或启用 VLM fallback 后远程 VLM key 缺失 | 确认 `yolo11n.pt` 可加载;如果使用 Qwen fallback,重新 export 对应 key,并重启 stack | +| 找不到椅子 | `seat_guide_status` | 摄像头没朝向桌子、光照/识别失败 | 调整机器人视角;只调试时可 fallback | +| 导航忙 | `seat_guide_preflight` | `navigation=FOLLOWING_PATH` 或 `RECOVERY` | 等任务结束或停止导航后重跑 | +| 手机反馈不可用 | `web_input_status` / `phone_speaker_test` | 手机没有打开 relay 页面或网络不可达 | 先确认 web response stream;需要声音时让手机访问可用的 relay 页面 | + +## 当前已完成状态 + +已完成: + +- SeatGuide planner / scene / intent / navigation integration。 +- WebInput 中文语音和文字直连 SeatGuide。 +- Camera/VLM/odom provider。 +- WebInput response stream 和可选手机 speaker relay。 +- Go2 SeatGuide blueprints。 +- macOS replay SHM route 集成。 +- 一键硬件 bring-up 脚本。 +- no-motion smoke、hardware acceptance、acceptance log verifier。 + +已验证: + +- SeatGuide/MCP 相关测试通过。 +- pubsub/Rerun SHM 相关测试通过。 +- `bin/demo_seat_guide_replay_smoke` 在 Mac 上完整跑完。 + +未完成: + +- 真实 Go2 硬件 transcript。最终完成标准必须包含真实浏览器麦克风输入、真实 camera/VLM/odom、真实导航和 `goal_reached=true`。 diff --git a/docs/agents/seat_guide_step_by_step_plan_en.md b/docs/agents/seat_guide_step_by_step_plan_en.md new file mode 100644 index 
0000000000..fc535094a6 --- /dev/null +++ b/docs/agents/seat_guide_step_by_step_plan_en.md @@ -0,0 +1,314 @@ +# SeatGuide Robot Dog Empty-Seat Guidance Step-by-Step Plan + +Goal: let a user tell the Go2, through browser microphone or typed text, "find me an empty seat." The system should use the real camera to recognize chairs and people, decide which seats are empty, send a navigation goal, and respond through the web or phone relay. When the Go2 is not connected, every locally verifiable module must have unit or smoke coverage. After the Go2 is connected, we should only need to run the hardware acceptance flow instead of assembling functionality on the spot. + +## Overall Module Split + +| Module | Responsibility | Input | Output | Can run in parallel | Current verification | +| --- | --- | --- | --- | --- | --- | +| 1. Basic voice/text control entry | Accept browser microphone, browser text, or normal agent text; first recognize basic movement/posture commands, then recognize seat-finding intent | WebInput `/submit_query`, `/upload_audio`, Whisper text, agent text | Normal agent tool call, or SeatGuide preview/live request | Yes | MCP tool acceptance, WebInput unit tests, HTTP TestClient, hardware acceptance script | +| 2. Scene perception | Use Go2 RGB image + odom + YOLO fast detection for chairs and people, with optional VLM fallback, then project them into map coordinates | `color_image`, `odom`, YOLO `yolo11n.pt` | `SeatSceneObservation` | Yes | Camera provider unit tests, `camera_seat_provider_status` | +| 3. Empty-seat planning | Decide which chairs are occupied, select the nearest empty seat, and generate the guide pose for the robot | Chair poses, person positions, robot position | Selected chair and navigation goal pose | Yes | Planner unit tests, `preview_empty_seat_goal` | +| 4. 
Navigation execution | Send the target pose to the existing navigation module and read completion status | SeatGuide goal pose | `set_goal()`, `goal_reached` | Partially | Fake navigator unit tests, `seat_guide_navigation_status` | +| 5. Phone/web feedback | Tell the user which seat was found, whether to follow, or why the request failed | SeatGuide result text | Web response text, phone speaker relay | Yes | `web_input_status`, optional `phone_speaker_test` | +| 6. Acceptance scripts | Chain no-motion checks, real voice input, and real navigation, then save a transcript | Current DimOS stack | Pass/fail reason and acceptance log | Yes | `bin/demo_seat_guide_*` | + +## Stage 1: Basic Voice-Control Acceptance + +Purpose: first prove the smallest working loop: "a person says or types a command -> the system recognizes the intent -> the command reaches the Go2 -> the Go2 executes it." This stage does not find seats, does not depend on VLM, and does not depend on chair detection. + +Work items: + +1. Call posture and movement tools directly through MCP to prove the Go2 control tools work. +2. Type normal movement commands in the browser to prove text enters the agent and triggers a Go2 tool. +3. Speak normal movement commands through the browser microphone to prove microphone -> Whisper -> agent -> Go2 tool works. +4. Verify stop/safety commands so every small movement can be stopped. +5. Accept only low-risk actions: stand, recovery stand, short forward/backward movement, and small turns. Do not test jumps, flips, or other dynamic motions in this first stage. + +### Stage 1 Acceptance Path Breakdown + +| Path | Entry point | Will it move the Go2? | Verification command/action | Pass criteria | +| --- | --- | --- | --- | --- | +| 1A. MCP posture command | MCP tool | May change posture, no walking | `dimos mcp call execute_sport_command --json-args '{"command_name":"BalanceStand"}'` | The tool returns success and the Go2 enters a stable standing/balancing state | +| 1B. 
MCP short movement | MCP tool | Yes, short distance | Run `relative_move` forward 0.3m, backward 0.3m, and turn left 30 degrees | The Go2 makes the small movement, or navigation status reports the goal completed | +| 1C. Browser text -> agent -> Go2 tool | Web page text box or `/submit_query` | Yes, short distance | Type `walk forward 30 centimeters` and `walk backward 30 centimeters` | Logs show WebInput received the text, non-seat text went to the normal agent path, and the agent called `relative_move` | +| 1D. Browser microphone -> Whisper -> agent -> Go2 tool | Computer browser microphone | Yes, short distance | Say `walk forward 30 centimeters`, or the equivalent Chinese command, into the browser | Logs show Whisper recognized the text, the agent called the matching Go2 tool, and the Go2 executed it | +| 1E. Stop/safety | MCP tool or agent tool | Stops current navigation/action | `dimos mcp call stop_navigation` | Navigation returns to stopped/idle and the robot does not continue moving | + +Recommended acceptance commands: + +```bash +dimos mcp call execute_sport_command --json-args '{"command_name":"BalanceStand"}' +dimos mcp call relative_move --json-args '{"forward":0.3,"left":0,"degrees":0}' +dimos mcp call relative_move --json-args '{"forward":-0.3,"left":0,"degrees":0}' +dimos mcp call relative_move --json-args '{"forward":0,"left":0,"degrees":30}' +dimos mcp call stop_navigation +``` + +Pass criteria: + +- Direct MCP calls can make the Go2 execute posture and short movement commands. +- Browser text commands trigger normal agent tools instead of incorrectly entering SeatGuide. +- Browser microphone commands are transcribed and trigger the same Go2 tool. +- If any action fails, the failure can be attributed to one layer: control tool, agent tool selection, Whisper, WebInput, or Go2 connection. +- The current default input device is the **computer browser microphone**, not the Go2 body microphone. 
Using the Go2 onboard microphone would require a separate input module later. + +## Stage 2: SeatGuide Core Module Development And Local Unit Tests + +Purpose: prove the core logic without connecting the robot dog. + +Work items: + +1. Implement SeatGuide data models: chairs, people, scene, planner result, and voice intent. +2. Implement occupancy detection: a chair is occupied if a person is within 0.75 meters. +3. Implement nearest-empty-seat selection from the robot's current position. +4. Implement guide-pose generation beside the chair in the aisle direction, not at the chair center. +5. Implement preview and live paths: preview never moves; live sends the navigation goal. + +Verification: + +```bash +uv run pytest dimos/agents/skills/test_seat_guide.py -q -k 'planner or find_empty_seat or preview_empty_seat_goal' +``` + +Pass criteria: + +- The correct empty seat is selected. +- Occupied chairs are not selected. +- Preview does not call navigation. +- Live only calls navigation when navigation is available and the scene source is trusted. + +## Stage 3: SeatGuide Voice And WebInput Path + +Purpose: allow the user to trigger SeatGuide through the browser text box or microphone instead of requiring a manual MCP call. + +Work items: + +1. Route WebInput text input from `/submit_query` directly to SeatGuide requests. +2. Push browser audio uploads from `/upload_audio` into `audio_subject`. +3. Let Whisper auto-detect language instead of forcing English. +4. Chinese preview phrase: `预检帮我找一个空位` only runs no-motion checks. +5. Chinese live phrase: `帮我找一个空位` triggers navigation. +6. Publish the SeatGuide response to `agent_responses` so it is visible in the browser. + +### Stage 3 Acceptance Path Breakdown + +| Path | Entry point | Will it move the Go2? | Verification command/action | Pass criteria | +| --- | --- | --- | --- | --- | +| 3A. 
Module status check | MCP tool | No | `dimos mcp call web_input_status` | Output includes `web=started`, `voice_upload=connected`, `stt=connected`, and `seat_route=seat_guide_direct` | +| 3B. Browser text preview | Web page text box or `/submit_query` | No | Type `预检帮我找一个空位` in the WebInput page, or let the hardware script POST it automatically | `agent_responses` shows `SeatGuide preflight ready` or a clear no-go reason; no navigation goal is sent | +| 3C. Browser microphone preview | Computer browser microphone | No | Open the WebInput URL, allow microphone access, click the microphone button, and say `预检帮我找一个空位` | After Whisper recognition, DimOS logs include `WebInput received text` and `WebInput routing text to SeatGuide preview` | +| 3D. Normal agent text fallback | `/human_input` agent path | No, unless the agent later explicitly calls a tool | Enter non-seat text such as `what time is the meeting` | WebInput does not call SeatGuide; the text goes to the normal agent path | +| 3E. Browser microphone live | Computer browser microphone | Yes | Only after no-motion checks pass and the physical area is safe, say `帮我找一个空位` | Logs include `WebInput routing text to SeatGuide live request`, and SeatGuide returns `Navigating to ...` | + +Recommended acceptance order: + +1. Run 3A first to confirm WebInput, STT, and browser audio upload are online. +2. Run 3B next to confirm browser text input routes directly to SeatGuide preview. +3. Run 3C next to confirm computer browser microphone -> Whisper -> SeatGuide preview. +4. Run 3E last because it sends a real navigation goal. + +Note: the current default path uses the **computer browser microphone**, not the Go2 body microphone. The user speaks to the computer browser, the computer uploads audio to DimOS, Whisper transcribes it, and then SeatGuide sends navigation to the Go2. 
+ +Verification: + +```bash +uv run pytest dimos/agents/skills/test_seat_guide.py -q -k 'web_input or upload_audio or submit_query' +``` + +Pass criteria: + +- `/submit_query` can trigger SeatGuide preview. +- `/upload_audio` can produce an `AudioEvent`. +- Missing voice input configuration or audio decode failure must not be reported as success. +- `web_input_status` must include: + - `web=started` + - `thread=running` + - `seat_route=seat_guide_direct` + - `responses=connected` + - `voice_upload=connected` + - `stt=connected` + - `human_transport=connected` + +## Stage 4: Real Camera/VLM/Odom Perception + +Purpose: the final result must not rely on fake mocks. Hardware acceptance must prove the scene came from the real camera source. + +Work items: + +1. `CameraSeatObservationProvider` subscribes to `color_image` and `odom`. +2. Use Qwen/VLM to detect `chair` and `person` separately. +3. Estimate map-frame chair/person positions from the bounding-box center and odometry. +4. Diagnose no-go states such as stale image, stale odom, missing key, and missing camera. +5. Keep `set_seat_scene` as fallback/calibration, but reject fallback for official hardware acceptance. + +Verification: + +```bash +uv run pytest dimos/agents/skills/test_seat_guide.py -q -k 'camera_observation_provider or camera_seat_provider_status' +``` + +Pre-hardware checks: + +```bash +dimos mcp call camera_seat_provider_status +dimos mcp call seat_guide_status +``` + +Pass criteria: + +- `camera_seat_provider_status` shows: + - `image=<width>x<height>` + - `image_fresh=true` + - `odom=(...)` + - `odom_fresh=true` + - `credential=present` + - `override=inactive` + - `configured_fallback_seats=0` + - `configured_fallback_people=0` +- `seat_guide_status` must start with `SeatGuide scene source=camera:`. 
+ +## Stage 5: Navigation And Phone/Web Feedback + +Purpose: after finding an empty seat, the Go2 must actually receive a navigation goal and provide visible feedback through the web, or audible feedback through a phone mounted on the robot. + +Work items: + +1. Inject `NavigationInterfaceSpec` into SeatGuide. +2. On a live request, call `set_goal(PoseStamped)`. +3. If navigation is busy, refuse to overwrite the current task. +4. Read `navigation_state` and `goal_reached`. +5. Return clear SeatGuide result text and publish it to the WebInput `agent_responses` stream. +6. If audible feedback is required, open the speaker relay on the mounted phone and verify it with `phone_speaker_test`. + +Verification: + +```bash +uv run pytest dimos/agents/skills/test_seat_guide.py -q +``` + +Pre-hardware checks: + +```bash +dimos mcp call seat_guide_preflight +dimos mcp call preview_empty_seat_goal +dimos mcp call seat_guide_navigation_status +``` + +Pass criteria: + +- Preflight shows `navigation=IDLE`. +- Preview includes `selected=...` and `goal=(...)`. +- The WebInput response stream is visible; if sound is needed, the phone relay can play a test message. +- After live navigation, `seat_guide_navigation_status` eventually shows a new `goal_sequence` and `goal_reached=true`. + +## Stage 6: Mac Replay / SHM Video Stream Fix + +Purpose: on macOS replay, high-bandwidth video/pointcloud/map streams should not disappear because of UDP/LCM large-packet issues. + +Work items: + +1. Integrate `dimensionalOS/dimos#2245` / `danvi/experimental/route-replay-through-SHM`. +2. Route high-bandwidth Go2 replay streams through `pSHMTransport`: + - `color_image` + - `lidar` + - `pointcloud` + - `global_map` + - `merged_map` + - `global_costmap` + - `navigation_costmap` +3. Let the Rerun bridge receive SHM visual transports. 
+ +Verification: + +```bash +uv run pytest dimos/protocol/pubsub/test_registry.py dimos/visualization/rerun/test_viewer_integration.py -q +bin/demo_seat_guide_replay_smoke +``` + +Pass criteria: + +- The replay stack starts. +- Logs show high-bandwidth streams with `transport=pSHMTransport`. +- `bin/demo_seat_guide_replay_smoke` completes and stops the stack. + +## Stage 7: Real Go2 Bring-Up + +Purpose: after connecting the robot dog, you should run one entry point instead of manually composing commands. + +Manual preparation: + +1. Power on the robot dog and connect it to the same network as the Mac. +2. Confirm the Go2 IP. The default example is `192.168.123.161`. +3. Optionally prepare an API key for the normal agent. The SeatGuide direct voice/MCP path does not need an LLM key; seat/person VLM still requires Alibaba/Qwen when Qwen is selected: + +```bash +export OPENROUTER_API_KEY="your OpenRouter key" +export OPENROUTER_MODEL="openai/gpt-4o-mini" +``` + +Start one-command bring-up: + +```bash +bin/demo_seat_guide_hardware_bringup --robot-ip 192.168.123.161 +``` + +The script automatically: + +1. Checks the YOLO fast detection path; if no agent key is set, normal agent chat is disabled but the direct SeatGuide route still works. +2. Starts `unitree-go2-seat-guide-agentic`. +3. Runs `bin/demo_seat_guide_smoke` for no-motion checks. +4. Runs `bin/demo_seat_guide_hardware_acceptance` for real browser voice input and navigation acceptance. + +Manual actions during the script: + +1. Open the WebInput URL printed by the script. +2. Allow browser microphone access. +3. During the no-motion stage, say into the browser: `预检帮我找一个空位`. +4. After confirming the area around the Go2 is physically safe, type `LIVE` in the terminal. +5. During the live stage, say into the browser: `帮我找一个空位`. + +Pass criteria: + +- All no-motion gates pass. +- The live stage WebInput logs include the recognized Chinese text. +- SeatGuide returns `Navigating to ...`. 
+- `seat_guide_navigation_status` eventually shows a new `goal_sequence` and `goal_reached=true`. +- `bin/demo_seat_guide_verify_acceptance_log <acceptance-log>` passes. + +## Stage 8: Module-Level Debugging When Something Fails + +| Failure point | Command to inspect | Common cause | Fix | +| --- | --- | --- | --- | +| WebInput is not started | `dimos mcp call web_input_status` | Port conflict or WebInput module did not start | Check `dimos status`, restart the stack | +| Microphone input does not arrive | `web_input_status`, browser permissions | `voice_upload=missing`, browser denied microphone access | Allow microphone permission, refresh the WebInput page | +| STT is not working | `web_input_status` | Whisper/faster-whisper initialization failed | Inspect DimOS logs and confirm dependencies | +| No image | `camera_seat_provider_status` | Go2 camera/replay stream did not arrive | Turn toward the table, confirm replay/SHM stream | +| Odom missing or stale | `camera_seat_provider_status` | Localization did not start, or odom is stale | Wait for odom, inspect Go2/replay stack | +| YOLO/VLM failed | `seat_guide_status` | YOLO model load failure, or missing remote VLM key after enabling VLM fallback | Confirm `yolo11n.pt` can load; if using Qwen fallback, re-export the matching key, then restart the stack | +| No chairs found | `seat_guide_status` | Camera not facing the table, lighting or recognition issue | Adjust robot view; use fallback only for debugging | +| Navigation busy | `seat_guide_preflight` | `navigation=FOLLOWING_PATH` or `RECOVERY` | Wait for the task to finish or stop navigation before retrying | +| Phone feedback unavailable | `web_input_status` / `phone_speaker_test` | Phone has not opened the relay page or the relay is unreachable | First confirm the web response stream; if sound is needed, open a reachable relay page on the phone | + +## Current Completion Status + +Completed: + +- SeatGuide planner / scene / intent / navigation integration. 
+- WebInput Chinese voice and text directly routed to SeatGuide. +- Camera/VLM/odom provider. +- WebInput response stream and optional phone speaker relay. +- Go2 SeatGuide blueprints. +- macOS replay SHM route integration. +- One-command hardware bring-up script. +- No-motion smoke, hardware acceptance, and acceptance log verifier. + +Verified: + +- SeatGuide/MCP tests pass. +- pubsub/Rerun SHM tests pass. +- `bin/demo_seat_guide_replay_smoke` completes on Mac. + +Not complete yet: + +- Real Go2 hardware transcript. Final completion requires proof of real browser microphone input, real camera/VLM/odom, real navigation, and `goal_reached=true`. diff --git a/docs/capabilities/agents/readme.md b/docs/capabilities/agents/readme.md index 7cb26b7463..4c240a9925 100644 --- a/docs/capabilities/agents/readme.md +++ b/docs/capabilities/agents/readme.md @@ -17,7 +17,7 @@ Human Input ──→ Agent ──→ Skill Calls ──→ Robot - `agent: Out[BaseMessage]`: publishes agent responses (text, tool calls, images) - `agent_idle: Out[bool]`: signals when the agent is waiting for input -The agent uses LangGraph with a configurable LLM. The default is `gpt-4o` and you need to provide an `OPENAI_API_KEY` environment variable. On startup, it discovers all `@skill`-annotated methods across deployed modules via RPC and exposes them as LangChain tools. +The agent uses LangGraph with a configurable LLM. The default is `gpt-4o`; provide `OPENAI_API_KEY` for direct OpenAI, or provide `OPENROUTER_API_KEY` to route the agent through OpenRouter's OpenAI-compatible chat API. On startup, it discovers all `@skill`-annotated methods across deployed modules via RPC and exposes them as LangChain tools. 
## Skills @@ -86,6 +86,8 @@ dimos mcp status # Server status | Config | Model | Notes | |--------|-------|-------| -| Default | `gpt-4o` | Best quality, requires `OPENAI_API_KEY` | +| Default | `gpt-4o` | Uses `OPENAI_API_KEY` unless `OPENROUTER_API_KEY` is set | +| OpenRouter default | `gpt-4o` mapped to `openai/gpt-4o` | Set `OPENROUTER_API_KEY`; optionally override with `OPENROUTER_MODEL` | +| `openrouter:<model-id>` | OpenRouter model id | Example: `McpClient.blueprint(model="openrouter:openai/gpt-4o-mini")`; choose a model that supports tool calling | | `ollama:llama3.1` | Local Ollama | Requires `ollama serve` running | | Custom | Any LangChain-compatible | Set via `McpClient.blueprint(model="...")` |